package convert

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"io"
	"io/fs"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"sync"
	"time"
)

// ToolSpec identifies the conversion tool to invoke. Runners pick
// whichever field applies to them:
//
//   - bwrapRunner uses Binary — the path or PATH-name of the tool on
//     the zddc-server host (or container). pandoc/latex's entrypoint
//     becomes `pandoc`; alpine-chrome's becomes `chromium-browser`.
//     This is the production-default engine: lightest sandbox, no
//     daemon, no privileged outer container.
//
//   - containerRunner uses Image — the OCI image ref pulled into a
//     fresh container for each conversion (legacy/fallback engine,
//     kept for environments that already host a podman/docker daemon
//     and want OCI-image isolation per conversion).
//
// Both fields are populated by the entry points in convert.go so a
// single call site works regardless of which engine is installed.
type ToolSpec struct {
	Image  string // OCI image ref (containerRunner)
	Binary string // binary name on PATH (bwrapRunner)
}

// Runner executes a conversion sub-process and returns its stdout.
// The host-side implementations are bwrapRunner (default; wraps
// `bubblewrap`) and containerRunner (fallback; wraps `podman run` /
// `docker run`). Tests use a fake.
//
// stdin is piped to the tool's stdin. cmd is the argv passed *to the
// tool* — for pandoc the entrypoint accepts pandoc flags directly;
// for chromium it accepts chromium-browser flags. mounts is a list
// of "host:target[:ro|:rw]" specs (":ro" is added if no mode
// segment is present); each runner translates them to its own
// bind/--volume syntax.
//
// All exec calls in this package go through Runner.Run. This is the
// first os/exec site in the codebase; the hardening here is the
// pattern for future shell-outs.
type Runner interface {
	// Run invokes tool with cmd as its argv, feeds it stdin, exposes
	// mounts inside the sandbox, and returns the tool's stdout.
	Run(ctx context.Context, tool ToolSpec, stdin []byte, mounts []string, cmd []string) ([]byte, error)
}

// ErrUnavailable means no container runtime is present on the host.
// Handlers translate to HTTP 503.
var ErrUnavailable = errors.New("conversion unavailable")

// ConvertError carries the failure surface from a non-zero exit.
// Stderr is captured (truncated to 4 KiB by the runner) so callers can
// surface pandoc/chromium's own complaint.
type ConvertError struct {
	Tool     string // short tool label (image name fragment or binary name), used only for logging
	ExitCode int    // process exit status; the runners store -1 when no exit status is available
	Stderr   string // tail of the tool's stderr, may be empty
	Cause    error  // underlying error (exec failure, timeout, …), may be nil
}

// Error renders "<tool> exit <code>" followed by the most useful detail
// available: the trimmed stderr when it has any non-whitespace content,
// otherwise the Cause when one is set, otherwise nothing. A nil receiver
// yields the empty string.
func (e *ConvertError) Error() string {
	if e == nil {
		return ""
	}
	// Trim before testing so whitespace-only stderr does not hide Cause.
	if msg := strings.TrimSpace(e.Stderr); msg != "" {
		return fmt.Sprintf("%s exit %d: %s", e.Tool, e.ExitCode, msg)
	}
	if e.Cause != nil {
		return fmt.Sprintf("%s exit %d: %v", e.Tool, e.ExitCode, e.Cause)
	}
	// Neither detail is available; avoid printing a literal "<nil>".
	return fmt.Sprintf("%s exit %d", e.Tool, e.ExitCode)
}

// Unwrap exposes Cause to errors.Is / errors.As. Like Error, it is safe
// to call on a nil receiver.
func (e *ConvertError) Unwrap() error {
	if e == nil {
		return nil
	}
	return e.Cause
}

// containerRunner runs each conversion inside a fresh container.
// The engine ("podman" preferred, "docker" fallback) is resolved once
// at startup by Probe. Resource limits are configurable via
// SetLimits (called from main.go after flag parsing). Images are passed
// per call so the same runner handles both pandoc and chromium
// invocations.
//
// Two modes:
//
//   - **local** (remoteURL=""): the engine binary creates containers
//     directly on the host that runs zddc-server. Used for bare-metal
//     and host-podman deployments. Requires podman or docker on PATH.
//
//   - **remote** (remoteURL="unix:///var/run/podman/podman.sock" or
//     similar): the engine binary is the local podman CLIENT, invoked
//     as `podman --remote --url=<remoteURL> run …`; the actual
//     container creation happens in whatever process owns the socket
//     (typically a `podman system service` sidecar in the same pod).
//     Used for the Kubernetes sidecar pattern so zddc-server's own
//     pod stays unprivileged. Bind-mount paths must resolve identically
//     on both sides — see scratchDir.
//
// The runner relies on `--pull=missing` so the operator never has to
// pre-pull images: the first request that needs an image pulls it,
// subsequent requests use the local cache. Both podman and docker
// honour this flag identically.
type containerRunner struct { mu sync.RWMutex engine string remoteURL string memMiB int cpus string pids int timeout time.Duration } var ( // shared default runner, populated by InstallRunner (called from // the health probe at startup once the engine is known). defaultRunnerMu sync.RWMutex defaultRunner Runner ) // InstallRunner sets the package-level Runner used by ToDocx/ToHTML/ToPDF. // Tests inject a fake; production code lets the health probe install a // containerRunner. Safe to call from multiple goroutines. func InstallRunner(r Runner) { defaultRunnerMu.Lock() defaultRunner = r defaultRunnerMu.Unlock() } // ConfigureLimits applies resource limits to the package-level Runner, // if it's a containerRunner. No-op when no runner is installed yet // (the probe failed) or when the installed runner doesn't accept // limits (e.g. a test fake). Zero values keep the previous setting. // // Called from cmd/zddc-server/main.go after Probe so the limits from // the operator's flags take effect before any conversion request lands. func ConfigureLimits(memMiB int, cpus string, pids int, timeout time.Duration) { defaultRunnerMu.RLock() r := defaultRunner defaultRunnerMu.RUnlock() if cr, ok := r.(*containerRunner); ok { cr.SetLimits(memMiB, cpus, pids, timeout) } } func currentRunner() Runner { defaultRunnerMu.RLock() r := defaultRunner defaultRunnerMu.RUnlock() return r } // SetLimits updates the resource ceilings used for subsequent Run // invocations. Zero values keep the previous setting (or the defaults // set at construction). Safe to call from multiple goroutines. 
func (cr *containerRunner) SetLimits(memMiB int, cpus string, pids int, timeout time.Duration) { cr.mu.Lock() defer cr.mu.Unlock() if memMiB > 0 { cr.memMiB = memMiB } if cpus != "" { cr.cpus = cpus } if pids > 0 { cr.pids = pids } if timeout > 0 { cr.timeout = timeout } } func newContainerRunner(engine, remoteURL string) *containerRunner { return &containerRunner{ engine: engine, remoteURL: remoteURL, memMiB: 512, cpus: "2", pids: 100, timeout: 30 * time.Second, } } // Run executes one container invocation. cmd is the argv passed to the // image's entrypoint (pandoc for pandoc/latex, chromium-browser for // alpine-chrome). mounts is a list of ":" // strings; ":ro" is appended when no mode segment is present. stdin is // piped to the container, stdout is returned as bytes (capped at // 128 MiB). // // Hardening: // - --pull=missing: image is fetched on first use, cached after. // Operator only needs podman/docker installed; no manual pull. // - --rm: container is removed on exit, even if killed. // - --network=none: no network inside the container. Prevents data // exfiltration through embedded URLs in source documents. // - --read-only + tmpfs on /tmp and /run: image fs is immutable; // pandoc/chromium scratch goes to tmpfs only. // - --memory / --cpus / --pids-limit: kernel-enforced caps. // - --cap-drop=ALL + --security-opt=no-new-privileges: standard // container-escape hardening. // - context-cancel kill + WaitDelay: a wedged podman gets force- // killed; pipes drop after 2s so we don't leak goroutines. // - cmd.Env minimal: only PATH + HOME are passed through to the // engine binary; the container itself sees only what the image // bakes in plus what --env adds (HOME=/tmp). // // Note: --user is intentionally NOT set so each image uses its // default user (pandoc/latex runs as root, alpine-chrome runs as // uid 1000). 
With --read-only + tmpfs + --cap-drop=ALL + // --network=none + --no-new-privileges the additional defense from // forcing nobody is small and would break alpine-chrome's own // user-data-dir layout. func (cr *containerRunner) Run(ctx context.Context, tool ToolSpec, stdin []byte, mounts []string, cmd []string) ([]byte, error) { cr.mu.RLock() engine := cr.engine remoteURL := cr.remoteURL memMiB := cr.memMiB cpus := cr.cpus pids := cr.pids timeout := cr.timeout cr.mu.RUnlock() if engine == "" { return nil, ErrUnavailable } image := tool.Image if image == "" { return nil, fmt.Errorf("convert.Run: tool.Image is empty (containerRunner requires an OCI image ref)") } runCtx, cancel := context.WithTimeout(ctx, timeout) defer cancel() // Client args. In remote mode, prepend --remote and --url so the // podman CLI dispatches the request to the sidecar's // `podman system service` instead of creating a container locally. // The remaining flags (--rm, --pull=missing, etc.) apply to the // container that the remote daemon will create — same wire format // as local mode. var args []string if remoteURL != "" { args = append(args, "--remote", "--url="+remoteURL) } args = append(args, "run", "--rm", "--pull=missing", "-i", ) // --userns=host only in local mode: needed when zddc-server itself // is the one running podman inside a Kubernetes pod, because the // kernel won't let an inner rootless podman set up its own userns // via newuidmap. In remote (sidecar) mode the sidecar runs as root // and creates the inner container in its own (rootful) namespace, // so --userns=host is unnecessary and potentially noisy. if remoteURL == "" { args = append(args, "--userns=host") } args = append(args, "--network=none", "--read-only", // /tmp must be large enough to host chromium's shared-memory // fallback (--disable-dev-shm-usage redirects /dev/shm writes // here) plus the user-data-dir. 256 MiB is plenty for the // HTML→PDF flow; pandoc itself uses almost none. 
"--tmpfs=/tmp:size=256m,exec", "--tmpfs=/run:size=4m", fmt.Sprintf("--memory=%dm", memMiB), fmt.Sprintf("--cpus=%s", cpus), fmt.Sprintf("--pids-limit=%d", pids), "--cap-drop=ALL", "--security-opt=no-new-privileges", "--env=HOME=/tmp", "--workdir=/tmp", ) for _, m := range mounts { if !strings.Contains(m, ":ro") && !strings.Contains(m, ":rw") { m += ":ro" } args = append(args, "--volume="+m) } args = append(args, image) args = append(args, cmd...) c := exec.CommandContext(runCtx, engine, args...) c.Cancel = func() error { if c.Process == nil { return nil } return c.Process.Kill() } c.WaitDelay = 2 * time.Second c.SysProcAttr = sysProcAttr() c.Env = []string{ "PATH=" + os.Getenv("PATH"), "HOME=" + os.TempDir(), } c.Stdin = bytes.NewReader(stdin) var stdoutBuf bytes.Buffer c.Stdout = &limitWriter{w: &stdoutBuf, max: 128 << 20} stderr := newRingWriter(4 << 10) c.Stderr = stderr err := c.Run() if err != nil { exitCode := -1 if ee, ok := err.(*exec.ExitError); ok { exitCode = ee.ExitCode() } toolName := imageTag(image) if runCtx.Err() == context.DeadlineExceeded { return nil, &ConvertError{ Tool: toolName, ExitCode: exitCode, Stderr: stderr.String(), Cause: fmt.Errorf("timeout after %s: %w", timeout, runCtx.Err()), } } return nil, &ConvertError{ Tool: toolName, ExitCode: exitCode, Stderr: stderr.String(), Cause: err, } } return stdoutBuf.Bytes(), nil } // ─────────────────────────────────────────────────────────────────────────── // bwrapRunner — default conversion engine. // // Wraps `bubblewrap` to run pandoc / chromium binaries directly in a // per-call Linux-namespace sandbox. No daemon, no OCI images, no // privileged outer container. Image-build bundles pandoc + chromium // into the zddc-server image so the binaries are available on PATH; // each conversion gets a fresh set of namespaces, a read-only view // of the host's /usr (so the binary + its libs are visible), a tmpfs // /tmp, and nothing else. 
//
// This matches the threat model of the legacy containerRunner —
// untrusted source-markdown drives the binary, we contain any
// resulting RCE inside the bwrap sandbox — without the operational
// tax of running a container engine per conversion (image pull,
// daemon, socket, ~300ms startup).
//
// Hardening (mirror of containerRunner's flags):
//   - --unshare-all with no --share-net → no network
//   - --unshare-user-try → user namespace when kernel allows it
//   - --die-with-parent → cleanup on zddc-server exit
//   - --ro-bind /usr /usr, plus --ro-bind-try for /lib, /lib64, /bin,
//     /sbin and /etc (where present) → tools + libs visible read-only
//   - --proc /proc, --dev /dev → minimal pseudo-filesystems
//   - --tmpfs /tmp (256 MiB) → scratch space, matches container path
//   - --chdir /tmp → workdir
//   - --clearenv + minimal HOME/PATH/LANG → no host env leaks
//   - capabilities: the original note says bwrap drops them by default;
//     buildBwrapArgs passes no explicit --cap-drop flag.
//     NOTE(review): confirm whether an explicit flag was intended.
// ───────────────────────────────────────────────────────────────────────────
type bwrapRunner struct {
	mu      sync.RWMutex  // guards the fields below
	bin     string        // path to bwrap binary
	memMiB  int           // currently advisory; bwrap has no built-in cap
	cpus    string        // currently advisory
	pids    int           // currently advisory
	timeout time.Duration // context deadline per Run
}

// newBwrapRunner returns a bwrapRunner bound to the given bwrap binary,
// seeded with the same defaults as the container runner: 512 MiB,
// 2 CPUs, 100 pids and a 30-second timeout.
func newBwrapRunner(bin string) *bwrapRunner {
	br := new(bwrapRunner)
	br.bin = bin
	br.memMiB, br.cpus, br.pids = 512, "2", 100
	br.timeout = 30 * time.Second
	return br
}

// SetLimits — same shape as containerRunner.SetLimits. bwrap itself
// doesn't enforce cgroup limits; we capture the values so an operator
// can read them back via /.profile/config or the convert-health probe.
// Wrapping with systemd-run --scope --property MemoryMax=… is the
// follow-up if hard caps are needed; not in this iteration.
func (br *bwrapRunner) SetLimits(memMiB int, cpus string, pids int, timeout time.Duration) { br.mu.Lock() defer br.mu.Unlock() if memMiB > 0 { br.memMiB = memMiB } if cpus != "" { br.cpus = cpus } if pids > 0 { br.pids = pids } if timeout > 0 { br.timeout = timeout } } func (br *bwrapRunner) Run(ctx context.Context, tool ToolSpec, stdin []byte, mounts []string, cmd []string) ([]byte, error) { br.mu.RLock() bwrapBin := br.bin timeout := br.timeout br.mu.RUnlock() if bwrapBin == "" { return nil, ErrUnavailable } if tool.Binary == "" { return nil, fmt.Errorf("convert.Run: tool.Binary is empty (bwrapRunner requires a host-binary name)") } runCtx, cancel := context.WithTimeout(ctx, timeout) defer cancel() args, err := buildBwrapArgs(tool.Binary, mounts, cmd) if err != nil { return nil, err } c := exec.CommandContext(runCtx, bwrapBin, args...) c.Cancel = func() error { if c.Process == nil { return nil } return c.Process.Kill() } c.WaitDelay = 2 * time.Second c.SysProcAttr = sysProcAttr() c.Env = []string{ "PATH=" + os.Getenv("PATH"), "HOME=" + os.TempDir(), } c.Stdin = bytes.NewReader(stdin) var stdoutBuf bytes.Buffer c.Stdout = &limitWriter{w: &stdoutBuf, max: 128 << 20} stderr := newRingWriter(4 << 10) c.Stderr = stderr if runErr := c.Run(); runErr != nil { exitCode := -1 if ee, ok := runErr.(*exec.ExitError); ok { exitCode = ee.ExitCode() } toolName := tool.Binary if runCtx.Err() == context.DeadlineExceeded { return nil, &ConvertError{ Tool: toolName, ExitCode: exitCode, Stderr: stderr.String(), Cause: fmt.Errorf("timeout after %s: %w", timeout, runCtx.Err()), } } return nil, &ConvertError{ Tool: toolName, ExitCode: exitCode, Stderr: stderr.String(), Cause: runErr, } } return stdoutBuf.Bytes(), nil } // buildBwrapArgs assembles the bwrap argv for a single conversion. // Exposed as a package-internal helper so tests can lock the sandbox // flag shape without exec'ing bwrap. Returns an error when a mount // spec is malformed. 
func buildBwrapArgs(binary string, mounts, cmd []string) ([]string, error) {
	args := []string{
		// Namespace isolation. --unshare-all unshares user (when
		// available), ipc, pid, net, uts, cgroup; --unshare-user-try
		// downgrades cleanly when the kernel refuses (e.g. some
		// container hosts disable user-namespace creation).
		"--unshare-all",
		"--unshare-user-try",
		"--die-with-parent",

		// Read-only system view. /usr is mandatory; the --ro-bind-try
		// entries are mounted only when the path exists on the host,
		// which also covers hosts where /lib etc. are symlinks into
		// /usr.
		"--ro-bind", "/usr", "/usr",
		"--ro-bind-try", "/lib", "/lib",
		"--ro-bind-try", "/lib64", "/lib64",
		"--ro-bind-try", "/bin", "/bin",
		"--ro-bind-try", "/sbin", "/sbin",
		"--ro-bind-try", "/etc", "/etc",

		// Pseudo-filesystems. /proc and /dev are required for any
		// non-trivial binary; we make them minimal.
		"--proc", "/proc",
		"--dev", "/dev",

		// Scratch. 256 MiB tmpfs at /tmp matches containerRunner.
		// chromium spills its shared-memory fallback (--disable-dev-
		// shm-usage) here, so the budget actually matters. bwrap
		// applies --size to the NEXT --tmpfs, so it must come first.
		"--size", "268435456", // 256 MiB
		"--tmpfs", "/tmp",
		"--chdir", "/tmp",

		// Minimal env. HOME=/tmp lets chromium write its
		// user-data-dir without permission errors; PATH covers the
		// usual install locations for pandoc + chromium across
		// alpine / debian / rhel.
		"--clearenv",
		"--setenv", "HOME", "/tmp",
		"--setenv", "PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
		"--setenv", "LANG", "C.UTF-8",
	}

	// Caller-supplied bind mounts (template, output, …). Same
	// "host:target[:ro|:rw]" syntax as containerRunner; we translate
	// to bwrap's --ro-bind / --bind.
	for _, m := range mounts {
		host, target, mode, ok := splitMount(m)
		if !ok {
			return nil, fmt.Errorf("convert.Run: invalid mount spec %q (want host:target[:ro|:rw])", m)
		}
		flag := "--ro-bind"
		if mode == "rw" {
			flag = "--bind"
		}
		args = append(args, flag, host, target)
	}

	// Finally the binary + its argv. The binary path is PATH-resolved
	// inside the sandbox via the constructed PATH above; if the
	// operator passed an absolute path it bypasses PATH lookup and is
	// invoked verbatim (still subject to the /usr bind mount).
	args = append(args, binary)
	args = append(args, cmd...)
	return args, nil
}

// splitMount parses "host:target[:ro|:rw]" into its three parts.
// The mode segment is optional; absent means read-only (matches the
// containerRunner default). ok is false when the spec has fewer than
// two segments, when host or target is empty, or when the mode is
// anything other than "ro" or "rw".
func splitMount(m string) (host, target, mode string, ok bool) {
	parts := strings.SplitN(m, ":", 3)
	if len(parts) < 2 || parts[0] == "" || parts[1] == "" {
		return "", "", "", false
	}
	mode = "ro"
	if len(parts) == 3 {
		switch parts[2] {
		case "ro", "rw":
			mode = parts[2]
		default:
			return "", "", "", false
		}
	}
	return parts[0], parts[1], mode, true
}

// imageTag extracts a short name for an image reference, used as the
// "Tool" label on ConvertError. "docker.io/pandoc/latex:latest" →
// "pandoc/latex". A digest suffix ("@sha256:…") is dropped as well.
func imageTag(image string) string {
	// Drop the digest first so its ':' is not mistaken for a tag separator.
	s, _, _ := strings.Cut(image, "@")
	// Strip registry prefix: the first path component is a registry
	// host when it contains '.' or ':' or is exactly "localhost".
	if first, rest, found := strings.Cut(s, "/"); found {
		if first == "localhost" || strings.ContainsAny(first, ".:") {
			s = rest
		}
	}
	// Strip tag suffix.
	if i := strings.LastIndex(s, ":"); i >= 0 {
		s = s[:i]
	}
	return s
}

// limitWriter caps the underlying buffer at max bytes. Writes past the
// cap fail with an "output exceeded N bytes" error, which surfaces as a
// Run() error — the caller then maps to 422 (output too large) at the
// handler edge.
type limitWriter struct {
	w   io.Writer // destination
	max int64     // total bytes allowed through
	n   int64     // bytes forwarded so far
}

// Write forwards p to the underlying writer until max bytes in total
// have been written. A write that would cross the cap forwards only the
// part that still fits and reports an "output exceeded" error; once the
// cap is reached every further write fails without forwarding anything.
// An error from the underlying writer is returned as-is.
func (l *limitWriter) Write(p []byte) (int, error) {
	rem := l.max - l.n
	if rem <= 0 {
		return 0, fmt.Errorf("output exceeded %d bytes", l.max)
	}
	if int64(len(p)) <= rem {
		n, err := l.w.Write(p)
		l.n += int64(n)
		return n, err
	}
	// Only part of p fits: forward that part, then report the overflow.
	n, err := l.w.Write(p[:rem])
	l.n += int64(n)
	if err != nil {
		// The destination failed; that is more informative than the cap.
		return n, err
	}
	return n, fmt.Errorf("output exceeded %d bytes", l.max)
}

// ringWriter keeps only the tail of what's written — useful for stderr
// capture where the most-recent bytes are the ones with the actual
// error message and earlier output is usually progress noise.
type ringWriter struct {
	mu  sync.Mutex
	buf []byte // at most max bytes: the most recent output
	max int    // capacity of the tail; <= 0 retains nothing
}

// newRingWriter returns a ringWriter that retains the last max bytes.
func newRingWriter(max int) *ringWriter { return &ringWriter{max: max} }

// Write appends p and discards the oldest bytes beyond max. It always
// reports len(p) bytes written and never fails, so the producing
// process is never blocked or errored by stderr capture.
func (r *ringWriter) Write(p []byte) (int, error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.max <= 0 {
		// Nothing can be retained; also avoids slicing with a negative bound.
		return len(p), nil
	}
	if len(p) >= r.max {
		// p alone fills the window: keep just its tail.
		r.buf = append(r.buf[:0], p[len(p)-r.max:]...)
		return len(p), nil
	}
	r.buf = append(r.buf, p...)
	if over := len(r.buf) - r.max; over > 0 {
		// Shift the tail to the front so the backing array is reused
		// instead of being abandoned piece by piece.
		kept := copy(r.buf, r.buf[over:])
		r.buf = r.buf[:kept]
	}
	return len(p), nil
}

// String returns a copy of the retained tail.
func (r *ringWriter) String() string {
	r.mu.Lock()
	defer r.mu.Unlock()
	return string(r.buf)
}

// writeAssetsToScratch materialises the embedded viewer-template.html
// and custom.css into a fresh scratch dir and returns the host path.
// Caller is responsible for os.RemoveAll(dir) when done. Used by
// ToHTML which needs the template visible inside the container.
//
// scratchRoot controls where the temp dir lands. Empty means "use
// $TMPDIR" (local mode default). In remote/sidecar mode the caller
// passes the shared mount path (e.g. "/work") so the podman-service
// sidecar sees the bind-mount source at the same path.
//
// Files are written world-readable so the container's default user
// (root for pandoc/latex, uid 1000 for alpine-chrome) can read them
// through the read-only bind mount regardless of the host's umask.
func writeAssetsToScratch(scratchRoot string) (string, error) { dir, err := os.MkdirTemp(scratchRoot, "zddc-convert-") if err != nil { return "", fmt.Errorf("scratch dir: %w", err) } if err := os.WriteFile(filepath.Join(dir, "viewer-template.html"), viewerTemplate, 0o644); err != nil { os.RemoveAll(dir) return "", fmt.Errorf("write template: %w", err) } if err := os.WriteFile(filepath.Join(dir, "custom.css"), customCSS, 0o644); err != nil { os.RemoveAll(dir) return "", fmt.Errorf("write css: %w", err) } if err := chmodTree(dir, 0o755, 0o644); err != nil { os.RemoveAll(dir) return "", err } return dir, nil } func chmodTree(root string, dirMode, fileMode os.FileMode) error { return filepath.WalkDir(root, func(p string, d fs.DirEntry, err error) error { if err != nil { return err } if d.IsDir() { return os.Chmod(p, dirMode) } return os.Chmod(p, fileMode) }) }