// Package convert turns a markdown source byte-buffer into DOCX, HTML,
// or PDF by exec'ing pandoc and chromium-browser. Each conversion runs
// inside a sandbox provided by the IMAGE — typically a wrapper script
// at /usr/local/bin/ that puts the real binary into a cgroup
// v2 + bubblewrap sandbox before exec'ing it. See
// zddc/runtime.Containerfile for the production setup.
//
// zddc-server's Go code is unaware of sandboxing: it just exec's
// "pandoc" or "chromium-browser" and gets the corresponding tool's
// behavior back. Operators who want a different isolation strategy
// (firejail, systemd-nspawn, podman-run, raw exec for dev) replace
// the wrapper script in their image; the Go binary doesn't change.
//
// Public surface:
//
//	ToDocx(ctx, source, meta)      → []byte  (DOCX bytes)
//	ToHTML(ctx, source, meta, ts)  → []byte  (standalone HTML)
//	ToPDF (ctx, source, meta, ts)  → []byte  (PDF, via HTML + chromium)
//
//	Probe(ctx)                     → Capabilities  (call once at startup)
//	Available()                    → (Capabilities, bool)
//	SetBinaries(pandoc, chromium)  — install binary names from config
//	SetScratchDir(dir)             — install scratch root from config
//
// All three converters are safe for concurrent use; each call gets a
// fresh scratch dir + (image-provided) sandbox.
//
// Metadata maps to the placeholders consumed by the doctype templates.
// title/tracking_number/revision/status/is_draft typically come from
// the source filename (zddc.ParseFilename); client/project/contractor/
// project_number from the .zddc cascade `convert:` block.
package convert

import (
	"context"
	"fmt"
	"net/url"
	"os"
	"path/filepath"
	"strings"
	"sync/atomic"
	"time"
)

// Metadata is the variable bag passed to pandoc as `--variable k=v`
// pairs. Fields with zero values are omitted. The templates use
// `$if(field)$ … $endif$` blocks so absent fields render cleanly.
type Metadata struct { Title string TrackingNumber string Revision string Status string Client string Project string Contractor string ProjectNumber string GenerationTime time.Time IsDraft bool NoTOC bool } // TemplateSet is the bundle of files written to the per-call scratch dir for an // HTML render: the chosen doctype template (Name) plus every partial it may // include. pandoc resolves `$partial()$` includes from the template's own // directory, so Files must contain Name and all referenced partials. type TemplateSet struct { Name string // primary template filename, e.g. "report.html" Files map[string][]byte // base filename -> bytes (must include Name) } // DefaultTemplateSet returns the baked-in template set for doctype `name` // (e.g. "report"). An empty or unknown name falls back to DefaultTemplateName. // The set includes every embedded partial so `$..()$` includes resolve; handlers // may overlay .zddc.d/templates/ overrides onto the returned Files map. func DefaultTemplateSet(name string) TemplateSet { files := embeddedTemplateFiles() primary := name + ".html" if name == "" || files[primary] == nil { primary = DefaultTemplateName + ".html" } return TemplateSet{Name: primary, Files: files} } // Default binary names. The runtime image installs WRAPPER scripts at // /usr/local/bin/pandoc and /usr/local/bin/chromium-browser (shadowing // the real binaries in /usr/bin/) so these names resolve through the // sandbox automatically. Operators running zddc-server outside the // runtime image with raw binaries on PATH still get a working // conversion endpoint — just without the per-call sandbox. // // Alpine's chromium package installs the binary as "chromium-browser"; // debian/ubuntu ships "chromium". Operators override via // --convert-chromium-binary when the package on their image differs. 
const ( DefaultPandocBinary = "pandoc" DefaultChromiumBinary = "chromium-browser" ) var ( pandocBinary atomic.Pointer[string] chromiumBinary atomic.Pointer[string] scratchDir atomic.Pointer[string] ) // SetBinaries installs the binary names used by Probe/Run. Empty // values keep the previous setting (or the DefaultPandocBinary / // DefaultChromiumBinary constants on first call). The values are // PATH-resolved names (e.g. "pandoc", "chromium-browser") or // absolute paths. Called from cmd/zddc-server/main.go after flag // parsing. func SetBinaries(pandoc, chromium string) { if pandoc != "" { s := pandoc pandocBinary.Store(&s) } if chromium != "" { s := chromium chromiumBinary.Store(&s) } } // SetScratchDir installs the host-side scratch root used for // per-call intermediates (template, HTML, PDF). Empty means "use // $TMPDIR". The runtime-image wrapper bind-mounts the per-call // scratch dir into its sandbox at the same path, so any path under // this root works. func SetScratchDir(dir string) { s := dir scratchDir.Store(&s) } func currentScratchDir() string { if p := scratchDir.Load(); p != nil { return *p } return "" } func currentPandocBinary() string { if p := pandocBinary.Load(); p != nil && *p != "" { return *p } return DefaultPandocBinary } func currentChromiumBinary() string { if p := chromiumBinary.Load(); p != nil && *p != "" { return *p } return DefaultChromiumBinary } // ToDocx renders source markdown to DOCX bytes. Single pandoc exec; // no scratch dir needed (stdin → stdout). The caller passes the // full file content (envelope + body); pandoc handles // `markdown+yaml_metadata_block` natively. func ToDocx(ctx context.Context, source []byte, m Metadata) ([]byte, error) { return convertToDocx(ctx, "markdown+yaml_metadata_block", source, m) } // convertToDocx renders source (in pandoc input format fromFmt) to DOCX bytes // via a single pandoc exec (stdin → stdout; no scratch dir). 
Images in the // source's mediabag — present when fromFmt is "html" — are embedded into the // .docx natively by pandoc's docx writer. func convertToDocx(ctx context.Context, fromFmt string, source []byte, m Metadata) ([]byte, error) { r := currentRunner() if r == nil { return nil, ErrUnavailable } cmd := []string{ "--from=" + fromFmt, "--to=docx", "--output=-", } cmd = append(cmd, metadataArgs(m)...) cmd = append(cmd, "-") return r.Run(ctx, currentPandocBinary(), source, "", cmd) } // convertToMarkdown renders source (DOCX or HTML, per fromFmt) to GitHub- // flavored markdown. Embedded images are inlined as base64 data: URIs via the // inline-media.lua filter so the output .md is self-contained; --wrap=none keeps // paragraphs on one line (no hard line breaks). func convertToMarkdown(ctx context.Context, fromFmt string, source []byte) ([]byte, error) { r := currentRunner() if r == nil { return nil, ErrUnavailable } scratch, err := writeScratchFiles(currentScratchDir(), map[string][]byte{"inline-media.lua": inlineMediaLua}) if err != nil { return nil, fmt.Errorf("scratch: %w", err) } defer os.RemoveAll(scratch) cmd := []string{ "--from=" + fromFmt, "--to=gfm", "--wrap=none", "--lua-filter=" + filepath.Join(scratch, "inline-media.lua"), "--output=-", "-", } return r.Run(ctx, currentPandocBinary(), source, scratch, cmd) } // Convert renders source from one document format to another. Supported pairs: // // md → docx | html | pdf // docx → md | html // html → md | docx // // ts is the resolved HTML template set, used only for the *→html and md→pdf // directions and ignored otherwise. Unsupported pairs return an error. 
func Convert(ctx context.Context, from, to string, source []byte, m Metadata, ts TemplateSet) ([]byte, error) { switch from { case "md", "markdown": switch to { case "docx": return ToDocx(ctx, source, m) case "html": return ToHTML(ctx, source, m, ts) case "pdf": return ToPDF(ctx, source, m, ts) } case "docx": switch to { case "md": return convertToMarkdown(ctx, "docx", source) case "html": return convertToHTML(ctx, "docx", source, m, ts) } case "html", "htm": switch to { case "md": return convertToMarkdown(ctx, "html", source) case "docx": return convertToDocx(ctx, "html", source, m) } } return nil, fmt.Errorf("unsupported conversion %s→%s", from, to) } // ToHTML renders source markdown to standalone HTML using the doctype // template in ts. Embeds CSS + images via --embed-resources. The // template + its partials live in a per-call scratch dir; the host path // is passed via ZDDC_SCRATCH so the wrapper bind-mounts it into the // sandbox at the same path. A zero-value ts falls back to the embedded // default template. func ToHTML(ctx context.Context, source []byte, m Metadata, ts TemplateSet) ([]byte, error) { return convertToHTML(ctx, "markdown+yaml_metadata_block", source, m, ts) } // convertToHTML renders source (in pandoc input format fromFmt) to standalone // HTML through the doctype template in ts. --embed-resources base64-inlines CSS // and any mediabag images (so DOCX images survive docx→html with no extra // filter). The template + partials are written to a per-call scratch dir. 
func convertToHTML(ctx context.Context, fromFmt string, source []byte, m Metadata, ts TemplateSet) ([]byte, error) { r := currentRunner() if r == nil { return nil, ErrUnavailable } if ts.Name == "" || len(ts.Files) == 0 { ts = DefaultTemplateSet(DefaultTemplateName) } scratch, err := writeScratchFiles(currentScratchDir(), ts.Files) if err != nil { return nil, fmt.Errorf("scratch: %w", err) } defer os.RemoveAll(scratch) tplPath := filepath.Join(scratch, ts.Name) cmd := []string{ "--from=" + fromFmt, "--to=html5", "--standalone", "--embed-resources", "--section-divs", "--id-prefix=", "--html-q-tags", "--template=" + tplPath, } if !m.NoTOC { cmd = append(cmd, "--toc", "--toc-depth=6") } cmd = append(cmd, metadataArgs(m)...) cmd = append(cmd, "--output=-", "-") return r.Run(ctx, currentPandocBinary(), source, scratch, cmd) } // ToPDF renders source markdown to PDF in two stages: pandoc // produces HTML using the doctype template in ts (stage 1), then // headless chromium prints that HTML to PDF (stage 2). The two-stage // choice preserves the print-media CSS authored in the templates — // pandoc's native --pdf-engine path uses LaTeX which would bypass it // entirely. // // Both stages share a single per-call scratch dir: pandoc writes // `in.html` and chromium reads it, then chromium writes `out.pdf` // which the host reads back. The wrapper bind-mounts the scratch // dir read-write into the sandbox at the same path. 
func ToPDF(ctx context.Context, source []byte, m Metadata, ts TemplateSet) ([]byte, error) { html, err := ToHTML(ctx, source, m, ts) if err != nil { return nil, err } r := currentRunner() if r == nil { return nil, ErrUnavailable } scratch, err := os.MkdirTemp(currentScratchDir(), "zddc-pdf-") if err != nil { return nil, fmt.Errorf("scratch: %w", err) } defer os.RemoveAll(scratch) htmlPath := filepath.Join(scratch, "in.html") pdfPath := filepath.Join(scratch, "out.pdf") if err := os.WriteFile(htmlPath, html, 0o644); err != nil { return nil, fmt.Errorf("write html: %w", err) } if err := chmodTree(scratch, 0o755, 0o644); err != nil { return nil, err } // --no-sandbox: the wrapper provides the sandbox; chromium's // own setuid sandbox would conflict (and fails inside our // user-namespace anyway). --disable-dev-shm-usage: chromium's // shared-memory fallback writes to /dev/shm which our sandbox // doesn't expose; redirect to /tmp (the wrapper's tmpfs). cmd := []string{ "--headless", "--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage", "--user-data-dir=/tmp/chrome", "--no-pdf-header-footer", "--virtual-time-budget=10000", "--print-to-pdf=" + pdfPath, "file://" + htmlPath, } if _, err := r.Run(ctx, currentChromiumBinary(), nil, scratch, cmd); err != nil { return nil, err } out, err := os.ReadFile(pdfPath) if err != nil { return nil, fmt.Errorf("read pdf: %w", err) } if len(out) < 4 || string(out[:4]) != "%PDF" { return nil, &ConvertError{ Tool: currentChromiumBinary(), ExitCode: 0, Stderr: "chromium did not produce a valid PDF", Cause: fmt.Errorf("invalid PDF magic in output (got %d bytes)", len(out)), } } return out, nil } // metadataArgs renders Metadata into pandoc -V flags. Order is // stable so test fixtures don't churn. Empty values are omitted // (the template uses $if(...)$ blocks). 
func metadataArgs(m Metadata) []string { var out []string add := func(k, v string) { v = strings.TrimSpace(v) if v == "" { return } out = append(out, "-V", k+"="+v) } add("title", m.Title) add("tracking_number", m.TrackingNumber) add("revision", m.Revision) add("status", m.Status) add("client", m.Client) add("project", m.Project) add("contractor", m.Contractor) add("project_number", m.ProjectNumber) if !m.GenerationTime.IsZero() { add("generation_time", m.GenerationTime.Format("January 02, 2006 at 3:04:05 PM MST")) } if m.IsDraft { add("is_draft", "true") } if m.NoTOC { add("no-toc", "true") } return out }