feat(server): full md/docx/html conversion matrix + base64 image inlining

Generalize the conversion engine from markdown-source-only to a (from→to)
dispatcher, convert.Convert, supporting:

  md   → docx | html | pdf
  docx → md   | html
  html → md   | docx

- convertToMarkdown (docx→md, html→md): pandoc -t gfm --wrap=none with an
  embedded inline-media.lua filter that base64-inlines mediabag images as data:
  URIs, so the output .md is self-contained (markdown has no --embed-resources).
- convertToHTML now takes a source format: docx→html reuses the doctype template
  and --embed-resources base64-inlines the docx's images automatically.
- convertToDocx takes a source format: html→docx embeds images natively.
- ToDocx/ToHTML/ToPDF are kept as the md-source entry points, delegating to the
  shared internals. writeScratchFiles generalizes the old template-set writer.

Routing (converthandler.go):
- RecognizeVirtualConvert maps any target ext {md,docx,html,pdf} to the first
  existing real sibling source by precedence (md←docx,html; docx←md,html;
  html←md,docx; pdf←md). Real files still win (dispatcher stats first).
- ServeConverted accepts md; buildAndStore dispatches on (ext(src), format) via
  convert.Convert; purgeConverted clears all derived siblings on any write.

Tests: per-direction command-shape assertions (convert) + recognizer matrix and
precedence (handler). Verified end-to-end with real pandoc (docx→md/html,
html→md/docx, base64 images). Full ./... suite green.

PDF stays markdown-only for now (docx/html→pdf would need a two-stage hop).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
ZDDC 2026-06-04 21:02:11 -05:00
parent 894610d59e
commit 16d88010a6
7 changed files with 317 additions and 52 deletions

View file

@ -154,12 +154,20 @@ func currentChromiumBinary() string {
// full file content (envelope + body); pandoc handles
// `markdown+yaml_metadata_block` natively.
func ToDocx(ctx context.Context, source []byte, m Metadata) ([]byte, error) {
// Markdown-source entry point: pin the pandoc input format and hand off to
// the format-generic implementation.
return convertToDocx(ctx, "markdown+yaml_metadata_block", source, m)
}
// convertToDocx renders source (in pandoc input format fromFmt) to DOCX bytes
// via a single pandoc exec (stdin → stdout; no scratch dir). Images in the
// source's mediabag — present when fromFmt is "html" — are embedded into the
// .docx natively by pandoc's docx writer.
func convertToDocx(ctx context.Context, fromFmt string, source []byte, m Metadata) ([]byte, error) {
r := currentRunner()
if r == nil {
return nil, ErrUnavailable
}
cmd := []string{
"--from=markdown+yaml_metadata_block",
"--from=" + fromFmt,
"--to=docx",
"--output=-",
}
@ -168,6 +176,68 @@ func ToDocx(ctx context.Context, source []byte, m Metadata) ([]byte, error) {
return r.Run(ctx, currentPandocBinary(), source, "", cmd)
}
// convertToMarkdown converts source, read as pandoc input format fromFmt
// (DOCX or HTML), into GitHub-flavored markdown bytes.
//
// The embedded inline-media.lua filter is written to a per-call scratch dir
// and passed to pandoc via --lua-filter so that images end up as base64
// data: URIs and the resulting .md is self-contained. --wrap=none keeps each
// paragraph on a single line (no hard line breaks). The scratch dir is
// removed before returning.
//
// Returns ErrUnavailable when no runner is installed, and a wrapped
// "scratch" error when the filter cannot be materialised.
func convertToMarkdown(ctx context.Context, fromFmt string, source []byte) ([]byte, error) {
	runner := currentRunner()
	if runner == nil {
		return nil, ErrUnavailable
	}

	const filterName = "inline-media.lua"
	filterFiles := map[string][]byte{filterName: inlineMediaLua}
	dir, err := writeScratchFiles(currentScratchDir(), filterFiles)
	if err != nil {
		return nil, fmt.Errorf("scratch: %w", err)
	}
	defer os.RemoveAll(dir)

	// Source arrives on stdin ("-") and the markdown leaves on stdout
	// ("--output=-").
	args := []string{
		"--from=" + fromFmt,
		"--to=gfm",
		"--wrap=none",
		"--lua-filter=" + filepath.Join(dir, filterName),
		"--output=-",
		"-",
	}
	return runner.Run(ctx, currentPandocBinary(), source, dir, args)
}
// Convert is the (from → to) dispatcher for document conversion. The
// supported pairs are:
//
//	md   → docx | html | pdf
//	docx → md   | html
//	html → md   | docx
//
// from additionally accepts the aliases "markdown" (for md) and "htm" (for
// html); to must be one of the exact names above. m is forwarded to the
// docx/html/pdf writers and is not used for the *→md directions. ts is the
// resolved HTML template set; only the *→html and md→pdf directions consume
// it. Any other pair yields an "unsupported conversion" error that reports
// from and to exactly as the caller passed them.
func Convert(ctx context.Context, from, to string, source []byte, m Metadata, ts TemplateSet) ([]byte, error) {
	// Fold the accepted source aliases onto one canonical name so the routing
	// table below needs a single entry per direction.
	src := from
	switch from {
	case "markdown":
		src = "md"
	case "htm":
		src = "html"
	}

	type route struct{ src, dst string }
	switch (route{src, to}) {
	case route{"md", "docx"}:
		return ToDocx(ctx, source, m)
	case route{"md", "html"}:
		return ToHTML(ctx, source, m, ts)
	case route{"md", "pdf"}:
		return ToPDF(ctx, source, m, ts)
	case route{"docx", "md"}:
		return convertToMarkdown(ctx, "docx", source)
	case route{"docx", "html"}:
		return convertToHTML(ctx, "docx", source, m, ts)
	case route{"html", "md"}:
		return convertToMarkdown(ctx, "html", source)
	case route{"html", "docx"}:
		return convertToDocx(ctx, "html", source, m)
	}
	return nil, fmt.Errorf("unsupported conversion %s→%s", from, to)
}
// ToHTML renders source markdown to standalone HTML using the doctype
// template in ts. Embeds CSS + images via --embed-resources. The
// template + its partials live in a per-call scratch dir; the host path
@ -175,6 +245,14 @@ func ToDocx(ctx context.Context, source []byte, m Metadata) ([]byte, error) {
// sandbox at the same path. A zero-value ts falls back to the embedded
// default template.
func ToHTML(ctx context.Context, source []byte, m Metadata, ts TemplateSet) ([]byte, error) {
// Markdown-source entry point: pin the pandoc input format and hand off to
// the format-generic implementation.
return convertToHTML(ctx, "markdown+yaml_metadata_block", source, m, ts)
}
// convertToHTML renders source (in pandoc input format fromFmt) to standalone
// HTML through the doctype template in ts. --embed-resources base64-inlines CSS
// and any mediabag images (so DOCX images survive docx→html with no extra
// filter). The template + partials are written to a per-call scratch dir.
func convertToHTML(ctx context.Context, fromFmt string, source []byte, m Metadata, ts TemplateSet) ([]byte, error) {
r := currentRunner()
if r == nil {
return nil, ErrUnavailable
@ -182,7 +260,7 @@ func ToHTML(ctx context.Context, source []byte, m Metadata, ts TemplateSet) ([]b
if ts.Name == "" || len(ts.Files) == 0 {
ts = DefaultTemplateSet(DefaultTemplateName)
}
scratch, err := writeTemplateSetToScratch(currentScratchDir(), ts)
scratch, err := writeScratchFiles(currentScratchDir(), ts.Files)
if err != nil {
return nil, fmt.Errorf("scratch: %w", err)
}
@ -190,7 +268,7 @@ func ToHTML(ctx context.Context, source []byte, m Metadata, ts TemplateSet) ([]b
tplPath := filepath.Join(scratch, ts.Name)
cmd := []string{
"--from=markdown+yaml_metadata_block",
"--from=" + fromFmt,
"--to=html5",
"--standalone",
"--embed-resources",

View file

@ -41,6 +41,77 @@ func (f *fakeRunner) lastCall() (string, []string) {
return f.binaries[len(f.binaries)-1], f.calls[len(f.calls)-1]
}
// TestConvert_Directions runs Convert over the supported direction matrix and
// a few unsupported pairs, using a fake runner so no real pandoc is needed.
// For supported pairs it checks the command shape handed to the runner; for
// unsupported pairs it only checks that an error comes back.
func TestConvert_Directions(t *testing.T) {
	type direction struct {
		from, to string
		wantArgs []string // substrings that must appear in the pandoc command
		wantErr  bool
	}
	table := []direction{
		{from: "docx", to: "md", wantArgs: []string{"--from=docx", "--to=gfm", "--wrap=none"}},
		{from: "html", to: "md", wantArgs: []string{"--from=html", "--to=gfm", "--wrap=none"}},
		{from: "docx", to: "html", wantArgs: []string{"--from=docx", "--to=html5", "--embed-resources"}},
		{from: "html", to: "docx", wantArgs: []string{"--from=html", "--to=docx"}},
		{from: "md", to: "docx", wantArgs: []string{"--from=markdown+yaml_metadata_block", "--to=docx"}},
		{from: "md", to: "html", wantArgs: []string{"--from=markdown+yaml_metadata_block", "--to=html5"}},
		{from: "docx", to: "pdf", wantErr: true},  // pdf is markdown-only
		{from: "docx", to: "docx", wantErr: true}, // same-format is unsupported
		{from: "html", to: "html", wantErr: true},
	}

	for _, tc := range table {
		t.Run(tc.from+"_to_"+tc.to, func(t *testing.T) {
			runner := &fakeRunner{resp: []byte("OUT")}
			InstallRunner(runner)
			t.Cleanup(func() { InstallRunner(nil) })
			SetBinaries("pandoc", "chromium-browser")

			_, err := Convert(context.Background(), tc.from, tc.to, []byte("x"), Metadata{}, TemplateSet{})
			switch {
			case tc.wantErr && err == nil:
				t.Fatalf("Convert(%s→%s): expected error, got nil", tc.from, tc.to)
			case tc.wantErr:
				// Rejected as expected; there is no command to inspect.
				return
			case err != nil:
				t.Fatalf("Convert(%s→%s): %v", tc.from, tc.to, err)
			}

			binary, args := runner.lastCall()
			if binary != "pandoc" {
				t.Errorf("expected pandoc, got %q", binary)
			}
			for _, want := range tc.wantArgs {
				if !contains(args, want) {
					t.Errorf("Convert(%s→%s) missing %q in %v", tc.from, tc.to, want, args)
				}
			}

			// Only the to-markdown directions inline images via the lua filter.
			if tc.to != "md" {
				return
			}
			if !(hasPrefArg(args, "--lua-filter=") && hasSuffArg(args, "inline-media.lua")) {
				t.Errorf("Convert(%s→md) missing inline-media.lua filter: %v", tc.from, args)
			}
		})
	}
}
// hasPrefArg reports whether at least one element of args begins with prefix.
// An empty or nil args never matches.
func hasPrefArg(args []string, prefix string) bool {
	found := false
	for idx := range args {
		if strings.HasPrefix(args[idx], prefix) {
			found = true
			break
		}
	}
	return found
}
// hasSuffArg reports whether at least one element of args ends with suffix.
// An empty or nil args never matches.
func hasSuffArg(args []string, suffix string) bool {
	found := false
	for idx := range args {
		if strings.HasSuffix(args[idx], suffix) {
			found = true
			break
		}
	}
	return found
}
func TestToDocx_UsesPandocBinary(t *testing.T) {
f := &fakeRunner{resp: []byte("FAKE-DOCX")}
InstallRunner(f)

View file

@ -32,6 +32,14 @@ import (
//go:embed all:templates
var templatesFS embed.FS
// inlineMediaLua is the pandoc filter that base64-inlines images into markdown
// output (docx→md / html→md), written to the per-call scratch dir alongside the
// conversion. Server-only — the CLI convert script extracts media to a folder
// instead.
//
//go:embed inline-media.lua
var inlineMediaLua []byte
// DefaultTemplateName is used when a document declares no `template:` field or
// names one that doesn't resolve.
const DefaultTemplateName = "report"

View file

@ -0,0 +1,31 @@
-- inline-media.lua — pandoc filter that rewrites every image to a self-contained
-- base64 data: URI, pulling the bytes from pandoc's mediabag (populated when
-- reading DOCX, or fetched for HTML). Used by the docx→md / html→md conversions
-- so the resulting markdown carries its images inline (markdown output has no
-- native --embed-resources equivalent).
-- Standard base64 alphabet (RFC 4648, with '+' and '/').
local alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'

-- base64 encodes the byte string `data` as padded base64 text.
-- Input is consumed three bytes at a time; each group is packed into a 24-bit
-- number and split into four 6-bit alphabet indices. A trailing group of one
-- or two bytes is zero-filled and padded with '==' or '=' respectively.
-- An empty input yields an empty string.
local function base64(data)
  local out = {}
  for pos = 1, #data, 3 do
    -- b2 / b3 are nil when the final group is short.
    local b1, b2, b3 = data:byte(pos, pos + 2)
    local n = b1 * 65536 + (b2 or 0) * 256 + (b3 or 0)
    local i1 = math.floor(n / 262144) % 64
    local i2 = math.floor(n / 4096) % 64
    local i3 = math.floor(n / 64) % 64
    local i4 = n % 64
    out[#out + 1] = alphabet:sub(i1 + 1, i1 + 1)
      .. alphabet:sub(i2 + 1, i2 + 1)
      .. (b2 and alphabet:sub(i3 + 1, i3 + 1) or '=')
      .. (b3 and alphabet:sub(i4 + 1, i4 + 1) or '=')
  end
  return table.concat(out)
end
-- Image is the pandoc filter hook for image elements. It replaces img.src
-- with a self-contained base64 data: URI built from the image bytes, looked
-- up in the mediabag first and fetched from img.src otherwise. When the bytes
-- cannot be obtained the image is returned unchanged, so one unreachable
-- image never fails the whole conversion.
function Image(img)
  local mt, data = pandoc.mediabag.lookup(img.src)
  if not data then
    -- fetch may raise a Lua error when the resource cannot be retrieved
    -- (missing file, failed download). Guard it with pcall so the document
    -- still converts and this image simply keeps its original src.
    local ok, fetched_mt, fetched_data = pcall(pandoc.mediabag.fetch, img.src)
    if ok then
      mt, data = fetched_mt, fetched_data
    end
  end
  if data then
    -- An empty-string MIME type is truthy in Lua, so test for it explicitly
    -- rather than relying on `mt or ...`; otherwise the URI would start with
    -- the malformed "data:;base64,".
    if not mt or mt == '' then
      mt = 'application/octet-stream'
    end
    img.src = 'data:' .. mt .. ';base64,' .. base64(data)
  end
  return img
end

View file

@ -274,26 +274,26 @@ func (r *ringWriter) String() string {
return string(r.buf)
}
// writeTemplateSetToScratch materialises a TemplateSet (the chosen doctype
// template plus its partials) into a fresh scratch dir and returns the host
// path. Caller is responsible for os.RemoveAll(dir) when done. Used by ToHTML,
// which needs the template + partials visible inside the sandbox (pandoc
// resolves `$partial()$` includes from the template's own directory).
// writeScratchFiles materialises a set of named byte buffers (template +
// partials, or a lua filter) into a fresh scratch dir and returns the host
// path. Caller is responsible for os.RemoveAll(dir) when done. pandoc resolves
// `$partial()$` includes and --lua-filter paths from this dir, so everything
// lands flat alongside the entry file.
//
// scratchRoot controls where the temp dir lands. Empty means "use $TMPDIR".
//
// Files are written world-readable so the binary's default user can read them
// through the wrapper's bind mount regardless of the host's umask. File names
// are base names only (no path separators) — they all land flat in the dir.
func writeTemplateSetToScratch(scratchRoot string, ts TemplateSet) (string, error) {
// through the wrapper's bind mount regardless of the host's umask. Keys are
// reduced to base names only (no path separators).
func writeScratchFiles(scratchRoot string, files map[string][]byte) (string, error) {
dir, err := os.MkdirTemp(scratchRoot, "zddc-convert-")
if err != nil {
return "", fmt.Errorf("scratch dir: %w", err)
}
for name, b := range ts.Files {
for name, b := range files {
if err := os.WriteFile(filepath.Join(dir, filepath.Base(name)), b, 0o644); err != nil {
os.RemoveAll(dir)
return "", fmt.Errorf("write template %q: %w", name, err)
return "", fmt.Errorf("write scratch file %q: %w", name, err)
}
}
if err := chmodTree(dir, 0o755, 0o644); err != nil {

View file

@ -48,35 +48,50 @@ var convertSF singleflightGroup
// runner itself enforces a finer-grained timeout on the container.
const convertTimeout = 90 * time.Second
// convertSourceExts maps a requested target extension to the candidate source
// extensions in precedence order — the first existing real sibling wins. The
// matrix: md↔docx↔html all directions, plus md→pdf (PDF stays markdown-only).
//
// Keys and values are bare lowercase extensions without the leading dot. The
// key set is also what purgeConverted consults to decide whether a written
// file is convertible and which cached sidecars to remove, so adding a key
// here widens both recognition and purging.
var convertSourceExts = map[string][]string{
"md": {"docx", "html"},
"docx": {"md", "html"},
"html": {"md", "docx"},
"pdf": {"md"},
}
// RecognizeVirtualConvert reports whether urlPath names a virtual
// "<file>.<format>" — a rendered form of a sibling markdown source.
// Returns (mdAbsPath, format, true) when <file>.md exists on disk and
// the requested extension is one of docx / html / pdf. The caller
// (the dispatcher) only invokes this when a stat on the requested
// path itself fails — a real on-disk file always wins.
// "<file>.<format>" — a rendered form of a sibling source document in a
// different format. Returns (srcAbsPath, format, true) when the requested
// extension is convertible (md/docx/html/pdf) and a sibling source exists on
// disk, picked by convertSourceExts precedence. The caller (the dispatcher) only
// invokes this when a stat on the requested path itself fails — a real on-disk
// file always wins.
//
// A virtual file URL means `<a href="…/foo.docx">` works without any
// query-string handling, and a script's `curl -O …/foo.pdf` writes the
// expected filename.
func RecognizeVirtualConvert(fsRoot, urlPath string) (mdAbs, format string, ok bool) {
// query-string handling, and a script's `curl -O …/foo.md` writes the expected
// filename.
func RecognizeVirtualConvert(fsRoot, urlPath string) (srcAbs, format string, ok bool) {
lower := strings.ToLower(urlPath)
for _, ext := range []string{".docx", ".html", ".pdf"} {
for target, sources := range convertSourceExts {
ext := "." + target
if !strings.HasSuffix(lower, ext) {
continue
continue // distinct suffixes — at most one target matches
}
base := urlPath[:len(urlPath)-len(ext)]
if base == "" || strings.HasSuffix(base, "/") {
continue
return "", "", false
}
rel := strings.Trim(base, "/") + ".md"
abs := filepath.Join(fsRoot, filepath.FromSlash(rel))
// Path containment.
if abs != fsRoot && !strings.HasPrefix(abs, fsRoot+string(filepath.Separator)) {
continue
}
if info, err := os.Stat(abs); err == nil && !info.IsDir() {
return abs, ext[1:], true
stem := strings.Trim(base, "/")
for _, srcExt := range sources {
abs := filepath.Join(fsRoot, filepath.FromSlash(stem+"."+srcExt))
// Path containment.
if abs != fsRoot && !strings.HasPrefix(abs, fsRoot+string(filepath.Separator)) {
continue
}
if info, err := os.Stat(abs); err == nil && !info.IsDir() {
return abs, target, true
}
}
return "", "", false
}
return "", "", false
}
@ -87,9 +102,9 @@ func RecognizeVirtualConvert(fsRoot, urlPath string) (mdAbs, format string, ok b
func ServeConverted(cfg config.Config, w http.ResponseWriter, r *http.Request, srcAbs, format string, chain zddc.PolicyChain) {
format = strings.ToLower(strings.TrimSpace(format))
switch format {
case "docx", "html", "pdf":
case "md", "docx", "html", "pdf":
default:
http.Error(w, "Bad Request — convert must be docx, html, or pdf", http.StatusBadRequest)
http.Error(w, "Bad Request — convert must be md, docx, html, or pdf", http.StatusBadRequest)
return
}
@ -159,17 +174,13 @@ func buildAndStore(ctx context.Context, fsRoot, srcAbs string, srcInfo os.FileIn
ctx, cancel := context.WithTimeout(ctx, convertTimeout)
defer cancel()
var out []byte
switch format {
case "docx":
out, err = convert.ToDocx(ctx, source, meta)
case "html":
out, err = convert.ToHTML(ctx, source, meta, resolveTemplateSet(fsRoot, filepath.Dir(srcAbs), source))
case "pdf":
out, err = convert.ToPDF(ctx, source, meta, resolveTemplateSet(fsRoot, filepath.Dir(srcAbs), source))
default:
return fmt.Errorf("unsupported format %q", format)
// Source format is the on-disk extension; target is the requested format.
from := strings.TrimPrefix(strings.ToLower(filepath.Ext(srcAbs)), ".")
var ts convert.TemplateSet
if format == "html" || format == "pdf" {
ts = resolveTemplateSet(fsRoot, filepath.Dir(srcAbs), source)
}
out, err := convert.Convert(ctx, from, format, source, meta, ts)
if err != nil {
return err
}
@ -290,20 +301,21 @@ func contentDispositionFor(format, base string) string {
return fmt.Sprintf(`inline; filename="%s.%s"`, base, format)
}
// purgeConverted removes the cached .zddc.d/converted/<base>.{docx,html,pdf}
// sidecars for an .md source. Called from the file API after a
// successful PUT/DELETE/MOVE so the next GET ?convert= regenerates.
// Best-effort: errors (including "directory doesn't exist") are
// swallowed. Non-.md sources are a no-op so this is safe to call
// purgeConverted removes the cached .zddc.d/converted/<base>.{md,docx,html,pdf}
// sidecars for a convertible source. Called from the file API after a successful
// PUT/DELETE/MOVE so the next virtual-convert GET regenerates. Best-effort:
// errors (including "directory doesn't exist") are swallowed. Sources whose
// extension isn't convertible are a no-op, so this is safe to call
// unconditionally after any write.
func purgeConverted(srcAbs string) {
if !strings.HasSuffix(strings.ToLower(srcAbs), ".md") {
ext := strings.TrimPrefix(strings.ToLower(filepath.Ext(srcAbs)), ".")
if _, ok := convertSourceExts[ext]; !ok {
return
}
dir := filepath.Dir(srcAbs)
base := strings.TrimSuffix(filepath.Base(srcAbs), filepath.Ext(srcAbs))
for _, ext := range []string{".docx", ".html", ".pdf"} {
_ = os.Remove(filepath.Join(dir, ReservedSidecar, "converted", base+ext))
for target := range convertSourceExts {
_ = os.Remove(filepath.Join(dir, ReservedSidecar, "converted", base+"."+target))
}
}

View file

@ -0,0 +1,65 @@
package handler
import (
"os"
"path/filepath"
"testing"
)
// TestRecognizeVirtualConvert_MatrixAndPrecedence lays out a handful of real
// source files in a temp root and checks, per requested URL, whether
// RecognizeVirtualConvert resolves a virtual target, which sibling source it
// picks (by extension), and which format it reports.
func TestRecognizeVirtualConvert_MatrixAndPrecedence(t *testing.T) {
	root := t.TempDir()

	// touch creates a one-byte file at rel (slash-separated) under root,
	// creating parent directories as needed.
	touch := func(rel string) {
		full := filepath.Join(root, filepath.FromSlash(rel))
		if err := os.MkdirAll(filepath.Dir(full), 0o755); err != nil {
			t.Fatal(err)
		}
		if err := os.WriteFile(full, []byte("x"), 0o644); err != nil {
			t.Fatal(err)
		}
	}

	// Sources on disk: doc.md, only.docx, both.md + both.docx, page.html.
	for _, rel := range []string{"doc.md", "only.docx", "both.md", "both.docx", "page.html"} {
		touch(rel)
	}

	type probe struct {
		name       string
		url        string
		wantOK     bool
		wantSrcExt string
		wantFormat string
	}
	probes := []probe{
		{name: "md→docx", url: "/doc.docx", wantOK: true, wantSrcExt: ".md", wantFormat: "docx"},
		{name: "md→html", url: "/doc.html", wantOK: true, wantSrcExt: ".md", wantFormat: "html"},
		{name: "md→pdf", url: "/doc.pdf", wantOK: true, wantSrcExt: ".md", wantFormat: "pdf"},
		{name: "docx→md (only docx present)", url: "/only.md", wantOK: true, wantSrcExt: ".docx", wantFormat: "md"},
		{name: "docx→html (only docx present)", url: "/only.html", wantOK: true, wantSrcExt: ".docx", wantFormat: "html"},
		{name: "docx has no pdf source", url: "/only.pdf"},
		{name: "both present, html prefers md source", url: "/both.html", wantOK: true, wantSrcExt: ".md", wantFormat: "html"},
		{name: "html→md", url: "/page.md", wantOK: true, wantSrcExt: ".html", wantFormat: "md"},
		{name: "html→docx", url: "/page.docx", wantOK: true, wantSrcExt: ".html", wantFormat: "docx"},
		{name: "no source at all", url: "/missing.html"},
		{name: "directory url ignored", url: "/doc/"},
		{name: "non-convertible target", url: "/doc.txt"},
	}

	for _, p := range probes {
		t.Run(p.name, func(t *testing.T) {
			src, format, ok := RecognizeVirtualConvert(root, p.url)
			if ok != p.wantOK {
				t.Fatalf("ok=%v want %v (src=%q format=%q)", ok, p.wantOK, src, format)
			}
			if !ok {
				// Nothing further to compare for a rejected URL.
				return
			}
			if format != p.wantFormat {
				t.Errorf("format=%q want %q", format, p.wantFormat)
			}
			if gotExt := filepath.Ext(src); gotExt != p.wantSrcExt {
				t.Errorf("source ext=%q want %q (src=%q)", gotExt, p.wantSrcExt, src)
			}
		})
	}
}