ZDDC/zddc/internal/convert/convert_test.go
ZDDC b5aab81d31 feat(zddc): MD→{docx,html,pdf} server-side conversion via stock pandoc + chromium containers
New endpoint GET /<path>/foo.md?convert=docx|html|pdf renders a markdown
source on demand. Surfaced as the Download buttons in browse's markdown
editor (separate commit).

Execution model — two upstream container images, lazy-pulled:

  • docker.io/pandoc/latex:latest  — MD→DOCX, MD→HTML (entrypoint pandoc)
  • docker.io/zenika/alpine-chrome — HTML→PDF (entrypoint chromium-browser)

No custom image build. The runner passes --pull=missing on every podman/
docker invocation so the operator only needs the runtime installed —
first request pulls the image, subsequent requests use the local cache.
Overrides: --convert-pandoc-image / --convert-chromium-image (and the
matching ZDDC_CONVERT_* env vars). Engine: --convert-engine (podman
preferred, docker fallback). Resource caps: --convert-mem-mib (512),
--convert-cpus (2), --convert-pids (100), --convert-timeout (30s).

PDF flow is two-stage: pandoc renders the markdown through the embedded
viewer-template.html to standalone HTML, then chromium prints that HTML
via --print-to-pdf. Preserves the print-media CSS already authored in
viewer-template.html rather than going through pandoc's LaTeX template.

Each conversion runs in a throw-away container with --rm --network=none
--read-only --tmpfs=/tmp --cap-drop=ALL --security-opt=no-new-privileges
--env=HOME=/tmp plus a bind-mounted scratch dir for I/O. Pandoc reads
markdown from stdin / writes to stdout; the viewer template lives at
/tpl (ro). Chromium reads HTML from a read-write bind mount at /pdf
and writes the PDF to the same mount; the host reads it back. No shell
wrappers, no shell quoting — argv flows straight into each image's
entrypoint.

On-disk cache at <dir>/.converted/<base>.<ext> with mtime synced to the
source. Fast path is a stat-and-serve with no exec; slow path
singleflights concurrent requests for the same target. PUT/DELETE/MOVE
on the source .md purges the .converted/ sidecars.

Per-project template variables (client/project/contractor/project_number)
come from a new .zddc `convert:` cascade block, walked leaf→root with
per-key latest-wins. Filename-derived variables (title, tracking_number,
revision, status, is_draft) come from a new zddc.ParseFilename helper.

If neither podman nor docker is on PATH, the endpoint serves 503 with
a clear Retry-After. The rest of the server keeps working.

This is the first os/exec site in the codebase. The hardening in
internal/convert/runner.go — context.CancelFunc → process kill,
cmd.WaitDelay, platform-specific SysProcAttr (Setpgid + Pdeathsig on
Linux), minimal env, stdout cap via limitWriter, stderr ring buffer —
sets the pattern for any future shell-outs.

Public surface:
  convert.ToDocx(ctx, source, meta) / .ToHTML / .ToPDF
  convert.Probe(ctx, engineOverride) → install Runner if engine present
  convert.SetImages(pandoc, chromium)
  convert.ConfigureLimits(memMiB, cpus, pids, timeout)
  convert.Available()

Container handler at internal/handler/converthandler.go; dispatcher
branch in cmd/zddc-server/main.go inserts the convert lookup after the
existing ACL gate, reusing the source file's read policy verbatim.
2026-05-13 10:33:56 -05:00

286 lines
8.1 KiB
Go

package convert
import (
"context"
"errors"
"strings"
"sync"
"testing"
"time"
)
// fakeRunner is a test double for the container runner. Every Run call is
// logged (image, command line, stdin payload, mounts) and answered with the
// same preconfigured resp/err pair, so tests can inspect the command lines
// and image refs without needing podman.
type fakeRunner struct {
	mu     sync.Mutex // guards the recorded slices below
	calls  [][]string // cmd of each Run call, in call order
	images []string   // image of each Run call, in call order
	stdin  [][]byte   // stdin payload of each Run call, in call order
	mounts [][]string // mounts of each Run call, in call order
	resp   []byte     // returned as the output of every Run call
	err    error      // returned as the error of every Run call
}

// Run logs private copies of its arguments and returns the canned
// resp/err pair. The context is ignored.
func (f *fakeRunner) Run(_ context.Context, image string, stdin []byte, mounts []string, cmd []string) ([]byte, error) {
	// Copy up front so later mutation by the caller cannot alter what
	// was recorded.
	argv := append([]string(nil), cmd...)
	payload := append([]byte(nil), stdin...)
	binds := append([]string(nil), mounts...)

	f.mu.Lock()
	defer f.mu.Unlock()

	f.calls = append(f.calls, argv)
	f.images = append(f.images, image)
	f.stdin = append(f.stdin, payload)
	f.mounts = append(f.mounts, binds)
	return f.resp, f.err
}

// lastCall returns the image and command line of the most recent Run
// call, or ("", nil) when Run has not been called yet.
func (f *fakeRunner) lastCall() (string, []string) {
	f.mu.Lock()
	defer f.mu.Unlock()

	if n := len(f.calls); n > 0 {
		return f.images[len(f.images)-1], f.calls[n-1]
	}
	return "", nil
}
// TestToDocx_UsesPandocImage checks that ToDocx returns the runner's output
// and that the recorded invocation used the configured pandoc image, asked
// for docx output, carried the title/client metadata, and ended with the
// stdin marker.
func TestToDocx_UsesPandocImage(t *testing.T) {
	f := &fakeRunner{resp: []byte("FAKE-DOCX")}
	InstallRunner(f)
	t.Cleanup(func() { InstallRunner(nil) })
	SetImages("docker.io/pandoc/latex:latest", "")

	out, err := ToDocx(context.Background(), []byte("# Hello\n"), Metadata{
		Title:  "Hello",
		Client: "Acme",
	})
	if err != nil {
		t.Fatalf("ToDocx: %v", err)
	}
	if string(out) != "FAKE-DOCX" {
		t.Errorf("unexpected output: %q", out)
	}

	image, call := f.lastCall()
	// Stop here if nothing was recorded: indexing the last element of an
	// empty command line below would panic instead of failing cleanly.
	if len(call) == 0 {
		t.Fatalf("runner recorded no command line (image %q)", image)
	}
	if image != "docker.io/pandoc/latex:latest" {
		t.Errorf("expected pandoc image, got %q", image)
	}
	if !contains(call, "--to=docx") {
		t.Errorf("missing --to=docx: %v", call)
	}
	if !contains(call, "title=Hello") {
		t.Errorf("missing title metadata: %v", call)
	}
	if !contains(call, "client=Acme") {
		t.Errorf("missing client metadata: %v", call)
	}
	// Last arg must be "-" so pandoc reads from stdin.
	if last := call[len(call)-1]; last != "-" {
		t.Errorf("expected stdin marker as last arg, got %q", last)
	}
}
// TestToHTML_UsesTemplateAndMountsScratch checks the recorded ToHTML
// invocation: pandoc image, the /tpl viewer template flag, the TOC flag
// (NoTOC is left at its zero value), and a first bind mount targeting /tpl.
func TestToHTML_UsesTemplateAndMountsScratch(t *testing.T) {
	const pandocImage = "docker.io/pandoc/latex:latest"

	runner := &fakeRunner{resp: []byte("<html>fake</html>")}
	InstallRunner(runner)
	t.Cleanup(func() { InstallRunner(nil) })
	SetImages(pandocImage, "")

	if _, err := ToHTML(context.Background(), []byte("# Hi\n"), Metadata{Title: "Hi"}); err != nil {
		t.Fatalf("ToHTML: %v", err)
	}

	gotImage, argv := runner.lastCall()
	if gotImage != pandocImage {
		t.Errorf("expected pandoc image, got %q", gotImage)
	}
	if !contains(argv, "--template=/tpl/viewer-template.html") {
		t.Errorf("template flag missing: %v", argv)
	}
	if !contains(argv, "--toc") {
		t.Errorf("TOC flag missing (default NoTOC=false): %v", argv)
	}

	// The first recorded call must carry at least one mount before we
	// look at it.
	if len(runner.mounts) == 0 || len(runner.mounts[0]) == 0 {
		t.Fatalf("expected at least one bind mount for /tpl")
	}
	if first := runner.mounts[0][0]; !strings.Contains(first, ":/tpl:") {
		t.Errorf("mount missing /tpl: %q", first)
	}
}
// TestToHTML_NoTOCSuppressesTOC checks that with Metadata.NoTOC set the
// recorded pandoc command line omits --toc and carries the no-toc=true
// variable instead.
func TestToHTML_NoTOCSuppressesTOC(t *testing.T) {
	f := &fakeRunner{resp: []byte("<html/>")}
	InstallRunner(f)
	t.Cleanup(func() { InstallRunner(nil) })

	// Fail fast on a conversion error: with no recorded call the negative
	// --toc assertion below would pass vacuously.
	if _, err := ToHTML(context.Background(), []byte("# Hi\n"), Metadata{NoTOC: true}); err != nil {
		t.Fatalf("ToHTML: %v", err)
	}

	_, call := f.lastCall()
	if len(call) == 0 {
		t.Fatalf("runner recorded no command line; nothing to inspect")
	}
	if contains(call, "--toc") {
		t.Errorf("TOC should be suppressed when NoTOC=true: %v", call)
	}
	if !contains(call, "no-toc=true") {
		t.Errorf("no-toc metadata variable missing: %v", call)
	}
}
// recordingRunner records every call and returns canned responses
// in sequence. Lets ToPDF tests assert the two-stage pipeline
// (pandoc image then chromium image).
//
// resp and err are both indexed by call number (0 for the first Run, 1 for
// the second, ...) and are looked up independently: a call past the end of
// either slice gets nil for that half of the result. An error can therefore
// be queued for a call that has no canned output.
type recordingRunner struct {
	mu     sync.Mutex     // guards calls and cursor
	calls  []recordedCall // one entry per Run call, in call order
	resp   [][]byte       // canned output, indexed by call number
	err    []error        // canned error, indexed by call number
	cursor int            // number of Run calls served so far
}

// recordedCall is the snapshot recordingRunner keeps for one Run call.
type recordedCall struct {
	image  string
	cmd    []string
	mounts []string
}

// Run stores a copy of the call's image, command line and mounts, then
// returns the canned output and error for this call number. The context
// and stdin payload are ignored.
func (r *recordingRunner) Run(_ context.Context, image string, _ []byte, mounts []string, cmd []string) ([]byte, error) {
	r.mu.Lock()
	defer r.mu.Unlock()

	r.calls = append(r.calls, recordedCall{
		image:  image,
		cmd:    append([]string(nil), cmd...),
		mounts: append([]string(nil), mounts...),
	})

	idx := r.cursor
	r.cursor++

	// Look the two halves up separately so a queued error is never lost
	// just because resp is shorter than err.
	var out []byte
	if idx < len(r.resp) {
		out = r.resp[idx]
	}
	var e error
	if idx < len(r.err) {
		e = r.err[idx]
	}
	return out, e
}
// TestToPDF_TwoStagePipeline inspects the shape of the two container calls
// ToPDF makes: pandoc first (emitting HTML), then chromium (printing that
// HTML to /pdf/out.pdf on a bind mount).
//
// The recording runner cannot actually produce the PDF file, so ToPDF is
// expected to return an error at the read-back step; only the recorded
// calls are asserted.
func TestToPDF_TwoStagePipeline(t *testing.T) {
	const (
		pandocImage   = "docker.io/pandoc/latex:latest"
		chromiumImage = "docker.io/zenika/alpine-chrome:latest"
	)

	r := &recordingRunner{
		resp: [][]byte{
			[]byte("<html><body>fake</body></html>"), // stage 1 stdout
			nil,                                      // stage 2 stdout (chromium writes PDF to bind mount)
		},
	}
	InstallRunner(r)
	t.Cleanup(func() { InstallRunner(nil) })
	SetImages(pandocImage, chromiumImage)

	// A nil error would mean the PDF was read back, which the fake
	// runner cannot make possible.
	if _, err := ToPDF(context.Background(), []byte("# Hi\n"), Metadata{}); err == nil {
		t.Fatalf("expected error from PDF read-back; got nil")
	}

	if len(r.calls) != 2 {
		t.Fatalf("expected 2 container calls (pandoc + chromium); got %d", len(r.calls))
	}
	pandocCall, chromiumCall := r.calls[0], r.calls[1]

	if pandocCall.image != pandocImage {
		t.Errorf("stage 1 image: got %q want pandoc/latex", pandocCall.image)
	}
	if chromiumCall.image != chromiumImage {
		t.Errorf("stage 2 image: got %q want alpine-chrome", chromiumCall.image)
	}

	// Stage 2 must include the --print-to-pdf flag pointing at /pdf.
	if !contains(chromiumCall.cmd, "--print-to-pdf=/pdf/out.pdf") {
		t.Errorf("chromium call missing --print-to-pdf flag: %v", chromiumCall.cmd)
	}
	if !contains(chromiumCall.cmd, "--no-sandbox") {
		t.Errorf("chromium call missing --no-sandbox: %v", chromiumCall.cmd)
	}

	// Stage 2's bind mount must be writable (chromium writes the PDF).
	writable := len(chromiumCall.mounts) > 0 && strings.Contains(chromiumCall.mounts[0], ":rw")
	if !writable {
		t.Errorf("chromium mount must be :rw, got %v", chromiumCall.mounts)
	}
}
// TestErrUnavailable_WhenNoRunner checks that ToDocx reports ErrUnavailable
// after the runner has been cleared with InstallRunner(nil).
func TestErrUnavailable_WhenNoRunner(t *testing.T) {
	InstallRunner(nil)

	if _, err := ToDocx(context.Background(), []byte("x"), Metadata{}); !errors.Is(err, ErrUnavailable) {
		t.Errorf("expected ErrUnavailable, got %v", err)
	}
}
// TestMetadataArgs_OmitsEmptyAndOrdersStably checks three properties of
// metadataArgs for a Metadata with only Title, Project and GenerationTime
// set: the result starts with the title then project variables, it carries
// a generation_time entry mentioning the year, and no client= entry is
// emitted for the empty Client field.
func TestMetadataArgs_OmitsEmptyAndOrdersStably(t *testing.T) {
	stamp := time.Date(2026, 5, 13, 14, 30, 22, 0, time.UTC)
	args := metadataArgs(Metadata{
		Title:          "T",
		Project:        "P",
		GenerationTime: stamp,
	})

	// Expected leading arguments, in order.
	want := []string{
		"-V", "title=T",
		"-V", "project=P",
	}
	for i := range want {
		if i < len(args) && args[i] == want[i] {
			continue
		}
		t.Fatalf("args[%d]: got %v want prefix %v", i, args, want)
	}

	joined := strings.Join(args, "|")
	hasStamp := strings.Contains(joined, "generation_time=") && strings.Contains(joined, "2026")
	if !hasStamp {
		t.Errorf("generation_time missing or malformed: %v", args)
	}
	if strings.Contains(joined, "client=") {
		t.Errorf("empty client should not be passed: %v", args)
	}
}
// TestImageTag runs imageTag over a table of image references and checks
// each result against the expected short name.
func TestImageTag(t *testing.T) {
	table := []struct {
		ref  string
		want string
	}{
		{ref: "docker.io/pandoc/latex:latest", want: "pandoc/latex"},
		{ref: "docker.io/zenika/alpine-chrome:latest", want: "zenika/alpine-chrome"},
		{ref: "pandoc/core", want: "pandoc/core"},
		{ref: "quay.io/example/foo:v1", want: "example/foo"},
		{ref: "alpine", want: "alpine"},
	}

	for _, tc := range table {
		got := imageTag(tc.ref)
		if got == tc.want {
			continue
		}
		t.Errorf("imageTag(%q) = %q, want %q", tc.ref, got, tc.want)
	}
}
// TestSingleflight_Collapses launches 50 goroutines that all call Do with
// the same key and checks that the supplied function ran exactly once.
func TestSingleflight_Collapses(t *testing.T) {
	const callers = 50

	var (
		g    singleflightGroup
		wg   sync.WaitGroup
		mu   sync.Mutex // guards hits
		hits int32
	)

	// work counts its own invocations. The sleep presumably keeps the
	// first call in progress long enough for the other goroutines to
	// arrive while it is still running.
	work := func() (any, error) {
		mu.Lock()
		hits++
		mu.Unlock()
		time.Sleep(20 * time.Millisecond)
		return "v", nil
	}

	for i := 0; i < callers; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			_, _ = g.Do("k", work)
		}()
	}
	wg.Wait()

	if hits != 1 {
		t.Errorf("singleflight collapse: got %d invocations, want 1", hits)
	}
}
// contains reports whether needle is exactly equal to at least one element
// of haystack. Substring matches do not count.
func contains(haystack []string, needle string) bool {
	for i := range haystack {
		if haystack[i] != needle {
			continue
		}
		return true
	}
	return false
}