// Package convert turns a markdown source byte-buffer into DOCX, HTML,
// or PDF via two stock upstream container images: pandoc (default
// `docker.io/pandoc/latex:latest`) handles MD↔DOCX and MD→HTML, and
// a headless-chromium image (default `docker.io/zenika/alpine-chrome:latest`)
// handles HTML→PDF. No custom image build is required — the operator
// just needs `podman` or `docker` on PATH and the runner pulls each
// image on first use via `--pull=missing`.
//
// Public surface:
//
//	ToDocx(ctx, source, meta) → []byte   (DOCX bytes)
//	ToHTML(ctx, source, meta) → []byte   (standalone HTML)
//	ToPDF (ctx, source, meta) → []byte   (PDF, via HTML + chromium)
//
//	Probe(ctx, override)        → Capabilities (call once at startup)
//	Available()                 → (Capabilities, bool)
//	SetImages(pandoc, chromium) — install image refs from config
//
// All three converters are safe for concurrent use; each call gets a
// fresh container. The pandoc image's entrypoint is `pandoc`, so the
// argv we pass after the image flows straight into pandoc. The
// alpine-chrome image's entrypoint is `chromium-browser`, so the argv
// flows into chromium-browser. No `sh -c` wrappers, no shell quoting.
//
// Metadata maps to the placeholders consumed by viewer-template.html.
// title/tracking_number/revision/status/is_draft typically come from
// the source filename (zddc.ParseFilename); client/project/contractor/
// project_number from the .zddc cascade `convert:` block.
package convert

import (
	"context"
	"fmt"
	"os"
	"path/filepath"
	"strings"
	"sync/atomic"
	"time"
)

// Metadata is the variable bag passed to pandoc as `--variable k=v`
// pairs. Fields with zero values are omitted. The viewer-template.html
// uses `$if(field)$ … $endif$` blocks so absent fields render cleanly.
//
// Each field below is emitted by metadataArgs under the template
// variable name given in its comment; string values are trimmed of
// surrounding whitespace before the emptiness check.
type Metadata struct {
	// Title is emitted as the `title` variable.
	Title string
	// TrackingNumber is emitted as the `tracking_number` variable.
	TrackingNumber string
	// Revision is emitted as the `revision` variable.
	Revision string
	// Status is emitted as the `status` variable.
	Status string
	// Client is emitted as the `client` variable.
	Client string
	// Project is emitted as the `project` variable.
	Project string
	// Contractor is emitted as the `contractor` variable.
	Contractor string
	// ProjectNumber is emitted as the `project_number` variable.
	ProjectNumber string
	// GenerationTime is emitted as the `generation_time` variable,
	// formatted like "January 02, 2006 at 3:04:05 PM MST". The zero
	// time is omitted.
	GenerationTime time.Time
	// IsDraft, when true, emits `is_draft=true`.
	IsDraft bool
	// NoTOC, when true, emits `no-toc=true`; ToHTML additionally
	// leaves out pandoc's --toc/--toc-depth flags.
	NoTOC bool
}

// Default images.
Operator overrides via --convert-pandoc-image / // --convert-chromium-image (see cmd/zddc-server). pandoc/latex carries // TeX Live for native PDF too, so it's a superset of pandoc/core; // operators wanting a slimmer footprint can switch to pandoc/core. const ( DefaultPandocImage = "docker.io/pandoc/latex:latest" DefaultChromiumImage = "docker.io/zenika/alpine-chrome:latest" ) var ( pandocImage atomic.Pointer[string] chromiumImage atomic.Pointer[string] ) // SetImages installs the image refs used for subsequent ToDocx/ToHTML/ // ToPDF calls. Empty values keep the previous setting (or the // DefaultPandocImage / DefaultChromiumImage constants on first call). // Called from cmd/zddc-server/main.go after flag parsing. func SetImages(pandoc, chromium string) { if pandoc != "" { s := pandoc pandocImage.Store(&s) } if chromium != "" { s := chromium chromiumImage.Store(&s) } } func currentPandocImage() string { if p := pandocImage.Load(); p != nil && *p != "" { return *p } return DefaultPandocImage } func currentChromiumImage() string { if p := chromiumImage.Load(); p != nil && *p != "" { return *p } return DefaultChromiumImage } // ToDocx renders source markdown to DOCX bytes. One container run via // the pandoc image. Caller passes the full file content (envelope + // body); pandoc handles `markdown+yaml_metadata_block` natively. func ToDocx(ctx context.Context, source []byte, m Metadata) ([]byte, error) { r := currentRunner() if r == nil { return nil, ErrUnavailable } cmd := []string{ "--from=markdown+yaml_metadata_block", "--to=docx", "--output=-", } cmd = append(cmd, metadataArgs(m)...) cmd = append(cmd, "-") return r.Run(ctx, currentPandocImage(), source, nil, cmd) } // ToHTML renders source markdown to standalone HTML using // viewer-template.html. Embeds CSS + images via --embed-resources. // Template + custom.css are bind-mounted into the container at /tpl // from a per-call scratch dir. 
func ToHTML(ctx context.Context, source []byte, m Metadata) ([]byte, error) { r := currentRunner() if r == nil { return nil, ErrUnavailable } scratch, err := writeAssetsToScratch() if err != nil { return nil, fmt.Errorf("scratch: %w", err) } defer os.RemoveAll(scratch) cmd := []string{ "--from=markdown+yaml_metadata_block", "--to=html5", "--standalone", "--embed-resources", "--section-divs", "--id-prefix=", "--html-q-tags", "--template=/tpl/viewer-template.html", } if !m.NoTOC { cmd = append(cmd, "--toc", "--toc-depth=6") } cmd = append(cmd, metadataArgs(m)...) cmd = append(cmd, "--output=-", "-") mounts := []string{scratch + ":/tpl:ro"} return r.Run(ctx, currentPandocImage(), source, mounts, cmd) } // ToPDF renders source markdown to PDF in two stages: pandoc produces // HTML using viewer-template.html (stage 1, pandoc image), then headless // Chromium prints that HTML to PDF (stage 2, chromium image). The // two-stage choice preserves the print-media CSS already authored in // viewer-template.html — pandoc's native --pdf-engine path uses LaTeX // which would bypass it entirely. // // Chromium runs from the alpine-chrome image whose entrypoint is // `chromium-browser`; our cmd is the flag list passed straight to that // binary. The host scratch dir is bind-mounted read-write at /pdf so // chromium can write out.pdf and we read it back afterward. 
func ToPDF(ctx context.Context, source []byte, m Metadata) ([]byte, error) { html, err := ToHTML(ctx, source, m) if err != nil { return nil, err } r := currentRunner() if r == nil { return nil, ErrUnavailable } scratch, err := os.MkdirTemp("", "zddc-pdf-") if err != nil { return nil, fmt.Errorf("scratch: %w", err) } defer os.RemoveAll(scratch) htmlPath := filepath.Join(scratch, "in.html") pdfPath := filepath.Join(scratch, "out.pdf") if err := os.WriteFile(htmlPath, html, 0o644); err != nil { return nil, fmt.Errorf("write html: %w", err) } if err := chmodTree(scratch, 0o755, 0o644); err != nil { return nil, err } mounts := []string{scratch + ":/pdf:rw"} // alpine-chrome's entrypoint is `chromium-browser`. --no-sandbox is // required because the container drops CAP_SYS_ADMIN; the threat // model is "malicious markdown drives chromium RCE", contained by // --network=none + --cap-drop=ALL + --read-only + tmpfs. cmd := []string{ "--headless", "--disable-gpu", "--no-sandbox", "--user-data-dir=/tmp/chrome", "--no-pdf-header-footer", "--virtual-time-budget=10000", "--print-to-pdf=/pdf/out.pdf", "file:///pdf/in.html", } if _, err := r.Run(ctx, currentChromiumImage(), nil, mounts, cmd); err != nil { return nil, err } out, err := os.ReadFile(pdfPath) if err != nil { return nil, fmt.Errorf("read pdf: %w", err) } if len(out) < 4 || string(out[:4]) != "%PDF" { return nil, &ConvertError{ Tool: "chromium", ExitCode: 0, Stderr: "chromium did not produce a valid PDF", Cause: fmt.Errorf("invalid PDF magic in output (got %d bytes)", len(out)), } } return out, nil } // metadataArgs renders Metadata into pandoc -V flags. Order is stable // so test fixtures don't churn. Empty values are omitted (the template // uses $if(...)$ blocks). 
func metadataArgs(m Metadata) []string { var out []string add := func(k, v string) { v = strings.TrimSpace(v) if v == "" { return } out = append(out, "-V", k+"="+v) } add("title", m.Title) add("tracking_number", m.TrackingNumber) add("revision", m.Revision) add("status", m.Status) add("client", m.Client) add("project", m.Project) add("contractor", m.Contractor) add("project_number", m.ProjectNumber) if !m.GenerationTime.IsZero() { add("generation_time", m.GenerationTime.Format("January 02, 2006 at 3:04:05 PM MST")) } if m.IsDraft { add("is_draft", "true") } if m.NoTOC { add("no-toc", "true") } return out }