diff --git a/zddc/internal/convert/convert.go b/zddc/internal/convert/convert.go index b9d7b0b..7ff3f43 100644 --- a/zddc/internal/convert/convert.go +++ b/zddc/internal/convert/convert.go @@ -154,12 +154,20 @@ func currentChromiumBinary() string { // full file content (envelope + body); pandoc handles // `markdown+yaml_metadata_block` natively. func ToDocx(ctx context.Context, source []byte, m Metadata) ([]byte, error) { + return convertToDocx(ctx, "markdown+yaml_metadata_block", source, m) +} + +// convertToDocx renders source (in pandoc input format fromFmt) to DOCX bytes +// via a single pandoc exec (stdin → stdout; no scratch dir). Images in the +// source's mediabag — present when fromFmt is "html" — are embedded into the +// .docx natively by pandoc's docx writer. +func convertToDocx(ctx context.Context, fromFmt string, source []byte, m Metadata) ([]byte, error) { r := currentRunner() if r == nil { return nil, ErrUnavailable } cmd := []string{ - "--from=markdown+yaml_metadata_block", + "--from=" + fromFmt, "--to=docx", "--output=-", } @@ -168,6 +176,68 @@ func ToDocx(ctx context.Context, source []byte, m Metadata) ([]byte, error) { return r.Run(ctx, currentPandocBinary(), source, "", cmd) } +// convertToMarkdown renders source (DOCX or HTML, per fromFmt) to GitHub- +// flavored markdown. Embedded images are inlined as base64 data: URIs via the +// inline-media.lua filter so the output .md is self-contained; --wrap=none keeps +// paragraphs on one line (no hard line breaks). +func convertToMarkdown(ctx context.Context, fromFmt string, source []byte) ([]byte, error) { + r := currentRunner() + if r == nil { + return nil, ErrUnavailable + } + scratch, err := writeScratchFiles(currentScratchDir(), map[string][]byte{"inline-media.lua": inlineMediaLua}) + if err != nil { + return nil, fmt.Errorf("scratch: %w", err) + } + defer os.RemoveAll(scratch) + cmd := []string{ + "--from=" + fromFmt, + "--to=gfm", + "--wrap=none", + "--lua-filter=" + filepath.Join(scratch, "inline-media.lua"), + "--output=-", + "-", + } + return r.Run(ctx, currentPandocBinary(), source, scratch, cmd) +} + +// Convert renders source from one document format to another. Supported pairs: +// +// md → docx | html | pdf +// docx → md | html +// html → md | docx +// +// ts is the resolved HTML template set, used only for the *→html and md→pdf +// directions and ignored otherwise. Unsupported pairs return an error. +func Convert(ctx context.Context, from, to string, source []byte, m Metadata, ts TemplateSet) ([]byte, error) { + switch from { + case "md", "markdown": + switch to { + case "docx": + return ToDocx(ctx, source, m) + case "html": + return ToHTML(ctx, source, m, ts) + case "pdf": + return ToPDF(ctx, source, m, ts) + } + case "docx": + switch to { + case "md": + return convertToMarkdown(ctx, "docx", source) + case "html": + return convertToHTML(ctx, "docx", source, m, ts) + } + case "html", "htm": + switch to { + case "md": + return convertToMarkdown(ctx, "html", source) + case "docx": + return convertToDocx(ctx, "html", source, m) + } + } + return nil, fmt.Errorf("unsupported conversion %s→%s", from, to) +} + // ToHTML renders source markdown to standalone HTML using the doctype // template in ts. Embeds CSS + images via --embed-resources. The // template + its partials live in a per-call scratch dir; the host path @@ -175,6 +245,14 @@ func ToDocx(ctx context.Context, source []byte, m Metadata) ([]byte, error) { // sandbox at the same path. A zero-value ts falls back to the embedded // default template. func ToHTML(ctx context.Context, source []byte, m Metadata, ts TemplateSet) ([]byte, error) { + return convertToHTML(ctx, "markdown+yaml_metadata_block", source, m, ts) +} + +// convertToHTML renders source (in pandoc input format fromFmt) to standalone +// HTML through the doctype template in ts. --embed-resources base64-inlines CSS +// and any mediabag images (so DOCX images survive docx→html with no extra +// filter). The template + partials are written to a per-call scratch dir. +func convertToHTML(ctx context.Context, fromFmt string, source []byte, m Metadata, ts TemplateSet) ([]byte, error) { r := currentRunner() if r == nil { return nil, ErrUnavailable @@ -182,7 +260,7 @@ func ToHTML(ctx context.Context, source []byte, m Metadata, ts TemplateSet) ([]b if ts.Name == "" || len(ts.Files) == 0 { ts = DefaultTemplateSet(DefaultTemplateName) } - scratch, err := writeTemplateSetToScratch(currentScratchDir(), ts) + scratch, err := writeScratchFiles(currentScratchDir(), ts.Files) if err != nil { return nil, fmt.Errorf("scratch: %w", err) } @@ -190,7 +268,7 @@ func ToHTML(ctx context.Context, source []byte, m Metadata, ts TemplateSet) ([]b tplPath := filepath.Join(scratch, ts.Name) cmd := []string{ - "--from=markdown+yaml_metadata_block", + "--from=" + fromFmt, "--to=html5", "--standalone", "--embed-resources", diff --git a/zddc/internal/convert/convert_test.go b/zddc/internal/convert/convert_test.go index 93e81f3..c973514 100644 --- a/zddc/internal/convert/convert_test.go +++ b/zddc/internal/convert/convert_test.go @@ -41,6 +41,77 @@ func (f *fakeRunner) lastCall() (string, []string) { return f.binaries[len(f.binaries)-1], f.calls[len(f.calls)-1] } +func TestConvert_Directions(t *testing.T) { + cases := []struct { + from, to string + wantArgs []string // substrings that must appear in the pandoc command + wantErr bool + }{ + {"docx", "md", []string{"--from=docx", "--to=gfm", "--wrap=none"}, false}, + {"html", "md", []string{"--from=html", "--to=gfm", "--wrap=none"}, false}, + {"docx", "html", []string{"--from=docx", "--to=html5", "--embed-resources"}, false}, + {"html", "docx", []string{"--from=html", "--to=docx"}, false}, + {"md", "docx", []string{"--from=markdown+yaml_metadata_block", "--to=docx"}, false}, + {"md", "html", []string{"--from=markdown+yaml_metadata_block", "--to=html5"}, false}, + {"docx", "pdf", nil, true}, // pdf is markdown-only + {"docx", "docx", nil, true}, // same-format is unsupported + {"html", "html", nil, true}, + } + for _, c := range cases { + t.Run(c.from+"_to_"+c.to, func(t *testing.T) { + f := &fakeRunner{resp: []byte("OUT")} + InstallRunner(f) + t.Cleanup(func() { InstallRunner(nil) }) + SetBinaries("pandoc", "chromium-browser") + + _, err := Convert(context.Background(), c.from, c.to, []byte("x"), Metadata{}, TemplateSet{}) + if c.wantErr { + if err == nil { + t.Fatalf("Convert(%s→%s): expected error, got nil", c.from, c.to) + } + return + } + if err != nil { + t.Fatalf("Convert(%s→%s): %v", c.from, c.to, err) + } + binary, call := f.lastCall() + if binary != "pandoc" { + t.Errorf("expected pandoc, got %q", binary) + } + for _, want := range c.wantArgs { + if !contains(call, want) { + t.Errorf("Convert(%s→%s) missing %q in %v", c.from, c.to, want, call) + } + } + // To-markdown directions inline images via the lua filter. + if c.to == "md" { + if !hasPrefArg(call, "--lua-filter=") || !hasSuffArg(call, "inline-media.lua") { + t.Errorf("Convert(%s→md) missing inline-media.lua filter: %v", c.from, call) + } + } + }) + } +} + +// hasPrefArg / hasSuffArg report whether any arg has the given prefix/suffix. +func hasPrefArg(args []string, prefix string) bool { + for _, a := range args { + if strings.HasPrefix(a, prefix) { + return true + } + } + return false +} + +func hasSuffArg(args []string, suffix string) bool { + for _, a := range args { + if strings.HasSuffix(a, suffix) { + return true + } + } + return false +} + func TestToDocx_UsesPandocBinary(t *testing.T) { f := &fakeRunner{resp: []byte("FAKE-DOCX")} InstallRunner(f) diff --git a/zddc/internal/convert/embed.go b/zddc/internal/convert/embed.go index 2a566dd..702615e 100644 --- a/zddc/internal/convert/embed.go +++ b/zddc/internal/convert/embed.go @@ -32,6 +32,14 @@ import ( //go:embed all:templates var templatesFS embed.FS +// inlineMediaLua is the pandoc filter that base64-inlines images into markdown +// output (docx→md / html→md), written to the per-call scratch dir alongside the +// conversion. Server-only — the CLI convert script extracts media to a folder +// instead. +// +//go:embed inline-media.lua +var inlineMediaLua []byte + // DefaultTemplateName is used when a document declares no `template:` field or // names one that doesn't resolve. const DefaultTemplateName = "report" diff --git a/zddc/internal/convert/inline-media.lua b/zddc/internal/convert/inline-media.lua new file mode 100644 index 0000000..b51f94a --- /dev/null +++ b/zddc/internal/convert/inline-media.lua @@ -0,0 +1,31 @@ +-- inline-media.lua — pandoc filter that rewrites every image to a self-contained +-- base64 data: URI, pulling the bytes from pandoc's mediabag (populated when +-- reading DOCX, or fetched for HTML). Used by the docx→md / html→md conversions +-- so the resulting markdown carries its images inline (markdown output has no +-- native --embed-resources equivalent). + +local b = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/' + +local function base64(data) + return ((data:gsub('.', function(x) + local r, byte = '', x:byte() + for i = 8, 1, -1 do r = r .. (byte % 2 ^ i - byte % 2 ^ (i - 1) > 0 and '1' or '0') end + return r + end) .. '0000'):gsub('%d%d%d?%d?%d?%d?', function(x) + if #x < 6 then return '' end + local c = 0 + for i = 1, 6 do c = c + (x:sub(i, i) == '1' and 2 ^ (6 - i) or 0) end + return b:sub(c + 1, c + 1) + end) .. ({ '', '==', '=' })[#data % 3 + 1]) +end + +function Image(img) + local mt, data = pandoc.mediabag.lookup(img.src) + if not data then + mt, data = pandoc.mediabag.fetch(img.src) + end + if data then + img.src = 'data:' .. (mt or 'application/octet-stream') .. ';base64,' .. base64(data) + end + return img +end diff --git a/zddc/internal/convert/runner.go b/zddc/internal/convert/runner.go index c378e67..aedd84d 100644 --- a/zddc/internal/convert/runner.go +++ b/zddc/internal/convert/runner.go @@ -274,26 +274,26 @@ func (r *ringWriter) String() string { return string(r.buf) } -// writeTemplateSetToScratch materialises a TemplateSet (the chosen doctype -// template plus its partials) into a fresh scratch dir and returns the host -// path. Caller is responsible for os.RemoveAll(dir) when done. Used by ToHTML, -// which needs the template + partials visible inside the sandbox (pandoc -// resolves `$partial()$` includes from the template's own directory). +// writeScratchFiles materialises a set of named byte buffers (template + +// partials, or a lua filter) into a fresh scratch dir and returns the host +// path. Caller is responsible for os.RemoveAll(dir) when done. pandoc resolves +// `$partial()$` includes and --lua-filter paths from this dir, so everything +// lands flat alongside the entry file. // // scratchRoot controls where the temp dir lands. Empty means "use $TMPDIR". // // Files are written world-readable so the binary's default user can read them -// through the wrapper's bind mount regardless of the host's umask. File names -// are base names only (no path separators) — they all land flat in the dir. -func writeTemplateSetToScratch(scratchRoot string, ts TemplateSet) (string, error) { +// through the wrapper's bind mount regardless of the host's umask. Keys are +// reduced to base names only (no path separators). +func writeScratchFiles(scratchRoot string, files map[string][]byte) (string, error) { dir, err := os.MkdirTemp(scratchRoot, "zddc-convert-") if err != nil { return "", fmt.Errorf("scratch dir: %w", err) } - for name, b := range ts.Files { + for name, b := range files { if err := os.WriteFile(filepath.Join(dir, filepath.Base(name)), b, 0o644); err != nil { os.RemoveAll(dir) - return "", fmt.Errorf("write template %q: %w", name, err) + return "", fmt.Errorf("write scratch file %q: %w", name, err) } } if err := chmodTree(dir, 0o755, 0o644); err != nil { diff --git a/zddc/internal/handler/converthandler.go b/zddc/internal/handler/converthandler.go index a2d648f..1059274 100644 --- a/zddc/internal/handler/converthandler.go +++ b/zddc/internal/handler/converthandler.go @@ -48,35 +48,50 @@ var convertSF singleflightGroup // runner itself enforces a finer-grained timeout on the container. const convertTimeout = 90 * time.Second +// convertSourceExts maps a requested target extension to the candidate source +// extensions in precedence order — the first existing real sibling wins. The +// matrix: md↔docx↔html all directions, plus md→pdf (PDF stays markdown-only). +var convertSourceExts = map[string][]string{ + "md": {"docx", "html"}, + "docx": {"md", "html"}, + "html": {"md", "docx"}, + "pdf": {"md"}, +} + // RecognizeVirtualConvert reports whether urlPath names a virtual -// "." — a rendered form of a sibling markdown source. -// Returns (mdAbsPath, format, true) when .md exists on disk and -// the requested extension is one of docx / html / pdf. The caller -// (the dispatcher) only invokes this when a stat on the requested -// path itself fails — a real on-disk file always wins. +// "." — a rendered form of a sibling source document in a +// different format. Returns (srcAbsPath, format, true) when the requested +// extension is convertible (md/docx/html/pdf) and a sibling source exists on +// disk, picked by convertSourceExts precedence. The caller (the dispatcher) only +// invokes this when a stat on the requested path itself fails — a real on-disk +// file always wins. // // A virtual file URL means `` works without any -// query-string handling, and a script's `curl -O …/foo.pdf` writes the -// expected filename. -func RecognizeVirtualConvert(fsRoot, urlPath string) (mdAbs, format string, ok bool) { +// query-string handling, and a script's `curl -O …/foo.md` writes the expected +// filename. +func RecognizeVirtualConvert(fsRoot, urlPath string) (srcAbs, format string, ok bool) { lower := strings.ToLower(urlPath) - for _, ext := range []string{".docx", ".html", ".pdf"} { + for target, sources := range convertSourceExts { + ext := "." + target if !strings.HasSuffix(lower, ext) { - continue + continue // distinct suffixes — at most one target matches } base := urlPath[:len(urlPath)-len(ext)] if base == "" || strings.HasSuffix(base, "/") { - continue + return "", "", false } - rel := strings.Trim(base, "/") + ".md" - abs := filepath.Join(fsRoot, filepath.FromSlash(rel)) - // Path containment. - if abs != fsRoot && !strings.HasPrefix(abs, fsRoot+string(filepath.Separator)) { - continue - } - if info, err := os.Stat(abs); err == nil && !info.IsDir() { - return abs, ext[1:], true + stem := strings.Trim(base, "/") + for _, srcExt := range sources { + abs := filepath.Join(fsRoot, filepath.FromSlash(stem+"."+srcExt)) + // Path containment. + if abs != fsRoot && !strings.HasPrefix(abs, fsRoot+string(filepath.Separator)) { + continue + } + if info, err := os.Stat(abs); err == nil && !info.IsDir() { + return abs, target, true + } } + return "", "", false } return "", "", false } @@ -87,9 +102,9 @@ func RecognizeVirtualConvert(fsRoot, urlPath string) (mdAbs, format string, ok b func ServeConverted(cfg config.Config, w http.ResponseWriter, r *http.Request, srcAbs, format string, chain zddc.PolicyChain) { format = strings.ToLower(strings.TrimSpace(format)) switch format { - case "docx", "html", "pdf": + case "md", "docx", "html", "pdf": default: - http.Error(w, "Bad Request — convert must be docx, html, or pdf", http.StatusBadRequest) + http.Error(w, "Bad Request — convert must be md, docx, html, or pdf", http.StatusBadRequest) return } @@ -159,17 +174,13 @@ func buildAndStore(ctx context.Context, fsRoot, srcAbs string, srcInfo os.FileIn ctx, cancel := context.WithTimeout(ctx, convertTimeout) defer cancel() - var out []byte - switch format { - case "docx": - out, err = convert.ToDocx(ctx, source, meta) - case "html": - out, err = convert.ToHTML(ctx, source, meta, resolveTemplateSet(fsRoot, filepath.Dir(srcAbs), source)) - case "pdf": - out, err = convert.ToPDF(ctx, source, meta, resolveTemplateSet(fsRoot, filepath.Dir(srcAbs), source)) - default: - return fmt.Errorf("unsupported format %q", format) + // Source format is the on-disk extension; target is the requested format. + from := strings.TrimPrefix(strings.ToLower(filepath.Ext(srcAbs)), ".") + var ts convert.TemplateSet + if format == "html" || format == "pdf" { + ts = resolveTemplateSet(fsRoot, filepath.Dir(srcAbs), source) } + out, err := convert.Convert(ctx, from, format, source, meta, ts) if err != nil { return err } @@ -290,20 +301,21 @@ func contentDispositionFor(format, base string) string { return fmt.Sprintf(`inline; filename="%s.%s"`, base, format) } -// purgeConverted removes the cached .zddc.d/converted/.{docx,html,pdf} -// sidecars for an .md source. Called from the file API after a -// successful PUT/DELETE/MOVE so the next GET ?convert= regenerates. -// Best-effort: errors (including "directory doesn't exist") are -// swallowed. Non-.md sources are a no-op so this is safe to call +// purgeConverted removes the cached .zddc.d/converted/.{md,docx,html,pdf} +// sidecars for a convertible source. Called from the file API after a successful +// PUT/DELETE/MOVE so the next virtual-convert GET regenerates. Best-effort: +// errors (including "directory doesn't exist") are swallowed. Sources whose +// extension isn't convertible are a no-op, so this is safe to call // unconditionally after any write. func purgeConverted(srcAbs string) { - if !strings.HasSuffix(strings.ToLower(srcAbs), ".md") { + ext := strings.TrimPrefix(strings.ToLower(filepath.Ext(srcAbs)), ".") + if _, ok := convertSourceExts[ext]; !ok { return } dir := filepath.Dir(srcAbs) base := strings.TrimSuffix(filepath.Base(srcAbs), filepath.Ext(srcAbs)) - for _, ext := range []string{".docx", ".html", ".pdf"} { - _ = os.Remove(filepath.Join(dir, ReservedSidecar, "converted", base+ext)) + for target := range convertSourceExts { + _ = os.Remove(filepath.Join(dir, ReservedSidecar, "converted", base+"."+target)) } } diff --git a/zddc/internal/handler/converthandler_test.go b/zddc/internal/handler/converthandler_test.go new file mode 100644 index 0000000..dce4d43 --- /dev/null +++ b/zddc/internal/handler/converthandler_test.go @@ -0,0 +1,65 @@ +package handler + +import ( + "os" + "path/filepath" + "testing" +) + +func TestRecognizeVirtualConvert_MatrixAndPrecedence(t *testing.T) { + root := t.TempDir() + write := func(rel string) { + p := filepath.Join(root, filepath.FromSlash(rel)) + if err := os.MkdirAll(filepath.Dir(p), 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(p, []byte("x"), 0o644); err != nil { + t.Fatal(err) + } + } + + // Sources on disk: doc.md, only.docx, both.md + both.docx, page.html. + write("doc.md") + write("only.docx") + write("both.md") + write("both.docx") + write("page.html") + + cases := []struct { + name string + url string + wantOK bool + wantSrcExt string + wantFormat string + }{ + {"md→docx", "/doc.docx", true, ".md", "docx"}, + {"md→html", "/doc.html", true, ".md", "html"}, + {"md→pdf", "/doc.pdf", true, ".md", "pdf"}, + {"docx→md (only docx present)", "/only.md", true, ".docx", "md"}, + {"docx→html (only docx present)", "/only.html", true, ".docx", "html"}, + {"docx has no pdf source", "/only.pdf", false, "", ""}, + {"both present, html prefers md source", "/both.html", true, ".md", "html"}, + {"html→md", "/page.md", true, ".html", "md"}, + {"html→docx", "/page.docx", true, ".html", "docx"}, + {"no source at all", "/missing.html", false, "", ""}, + {"directory url ignored", "/doc/", false, "", ""}, + {"non-convertible target", "/doc.txt", false, "", ""}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + src, format, ok := RecognizeVirtualConvert(root, c.url) + if ok != c.wantOK { + t.Fatalf("ok=%v want %v (src=%q format=%q)", ok, c.wantOK, src, format) + } + if !ok { + return + } + if format != c.wantFormat { + t.Errorf("format=%q want %q", format, c.wantFormat) + } + if filepath.Ext(src) != c.wantSrcExt { + t.Errorf("source ext=%q want %q (src=%q)", filepath.Ext(src), c.wantSrcExt, src) + } + }) + } +}