ZDDC/zddc/internal/convert/runner.go
ZDDC dfdd767536 fix(convert): pass --userns=host to inner podman so nested invocations don't trip newuidmap
When zddc-server runs inside a Kubernetes pod and shells out to
`podman run`, the inner podman tries to set up its own user namespace
via /usr/bin/newuidmap. The mapping fails inside the pod's namespace
even with privileged: true:

  newuidmap: write to uid_map failed: Invalid argument
  Error: cannot set up namespace using "/usr/bin/newuidmap": exit status 1

Adding --userns=host to the inner `podman run` tells it to reuse the
caller's user namespace instead of creating a new one — newuidmap
isn't invoked. The chart already runs the pod privileged so reusing
its userns adds no new privilege; --cap-drop=ALL + --network=none +
--read-only + --tmpfs continue to isolate the inner container.

On a bare-metal host invocation, --userns=host means "no userns
remapping at all", which is the default for rootful podman and works
identically to the prior behavior — the bitnest test setup and any
laptop dev runs are unaffected.

Smoke-tested locally with the exact flag set: pandoc/latex:latest in
a --userns=host --read-only container produces valid HTML from
`# Hello world` on stdin.
2026-05-13 12:06:51 -05:00

400 lines
12 KiB
Go

package convert
import (
"bytes"
"context"
"errors"
"fmt"
"io"
"io/fs"
"os"
"os/exec"
"path/filepath"
"strings"
"sync"
"time"
)
// Runner executes a conversion sub-process and returns its stdout.
// The host-side implementation (containerRunner) wraps `podman run`
// or `docker run`; tests use a fake.
//
// Parameters of Run:
//   - ctx bounds the invocation; containerRunner additionally applies
//     its own configured timeout on top of it.
//   - image is the OCI image to invoke (e.g.
//     "docker.io/pandoc/latex:latest" or
//     "docker.io/zenika/alpine-chrome:latest").
//   - stdin is piped to the container's stdin.
//   - mounts is a list of "<hostPath>:<containerPath>" specs handed to
//     --volume (":ro" is added if no mode segment is present).
//   - cmd is the argv passed *to the image's entrypoint* — for
//     pandoc/latex the entrypoint is `pandoc`, for alpine-chrome it is
//     `chromium-browser`.
//
// Run returns the captured stdout on success. containerRunner reports
// a failed invocation as a *ConvertError and a missing engine as
// ErrUnavailable; other implementations (test fakes) may differ.
//
// All exec calls in this package go through Runner.Run. This is the
// first os/exec site in the codebase; the hardening here is the
// pattern for future shell-outs.
type Runner interface {
Run(ctx context.Context, image string, stdin []byte, mounts []string, cmd []string) ([]byte, error)
}
// ErrUnavailable means no container runtime is present on the host.
// containerRunner.Run returns it when no engine has been resolved
// (its engine field is empty). Handlers translate to HTTP 503.
var ErrUnavailable = errors.New("conversion unavailable")
// ConvertError carries the failure surface from a non-zero exit.
// Stderr is captured (truncated to 4 KiB by the runner) so callers can
// surface pandoc/chromium's own complaint.
type ConvertError struct {
	Tool     string // image name fragment, used only for logging
	ExitCode int    // process exit status; the runner stores -1 when none is available
	Stderr   string // tail of the sub-process's stderr, possibly empty
	Cause    error  // underlying error, exposed through Unwrap
}

// Error renders "<tool> exit <code>: <detail>". The detail is the
// whitespace-trimmed Stderr when there is any, otherwise Cause. A nil
// receiver renders as "<nil>" instead of panicking.
func (e *ConvertError) Error() string {
	if e == nil {
		return "<nil>"
	}
	if e.Stderr != "" {
		return fmt.Sprintf("%s exit %d: %s", e.Tool, e.ExitCode, strings.TrimSpace(e.Stderr))
	}
	return fmt.Sprintf("%s exit %d: %v", e.Tool, e.ExitCode, e.Cause)
}

// Unwrap returns Cause so errors.Is / errors.As can see through the
// wrapper. Like Error, it tolerates a nil receiver: a typed-nil
// *ConvertError stored in an error value must not panic when the
// errors package walks the chain.
func (e *ConvertError) Unwrap() error {
	if e == nil {
		return nil
	}
	return e.Cause
}
// containerRunner runs each conversion inside a fresh container.
// The engine ("podman" preferred, "docker" fallback) is resolved once
// at startup by Probe. Resource limits are configurable via
// SetLimits (called from main.go after flag parsing). Images are passed
// per call so the same runner handles both pandoc and chromium
// invocations.
//
// The runner relies on `--pull=missing` so the operator never has to
// pre-pull images: the first request that needs an image pulls it,
// subsequent requests use the local cache. Both podman and docker
// honour this flag identically.
type containerRunner struct {
// mu guards all fields below: Run snapshots them under RLock,
// SetLimits updates the limits under Lock.
mu sync.RWMutex
// engine is the binary Run executes; an empty value makes Run
// return ErrUnavailable.
engine string
// memMiB is rendered as --memory=<memMiB>m.
memMiB int
// cpus is passed through verbatim as --cpus=<cpus>.
cpus string
// pids is rendered as --pids-limit=<pids>.
pids int
// timeout is the per-invocation deadline Run layers on the caller's
// context.
timeout time.Duration
}
var (
// shared default runner, populated by InstallRunner (called from
// the health probe at startup once the engine is known).
// defaultRunnerMu guards every read and write of defaultRunner;
// defaultRunner stays nil until InstallRunner has been called.
defaultRunnerMu sync.RWMutex
defaultRunner Runner
)
// InstallRunner replaces the package-level Runner that
// ToDocx/ToHTML/ToPDF use. Production code has the health probe
// install a containerRunner; tests inject a fake. The swap happens
// under the package write lock, so concurrent callers are safe.
func InstallRunner(r Runner) {
	defaultRunnerMu.Lock()
	defer defaultRunnerMu.Unlock()
	defaultRunner = r
}
// ConfigureLimits forwards resource limits to the package-level Runner
// when that runner is a *containerRunner. It does nothing when no
// runner has been installed yet (the probe failed) or when the
// installed runner is some other implementation (e.g. a test fake).
// Zero values keep the previous setting.
//
// Called from cmd/zddc-server/main.go after Probe so the limits from
// the operator's flags take effect before any conversion request lands.
func ConfigureLimits(memMiB int, cpus string, pids int, timeout time.Duration) {
	// Hold the read lock only long enough to copy the interface value;
	// SetLimits takes the runner's own lock.
	defaultRunnerMu.RLock()
	installed := defaultRunner
	defaultRunnerMu.RUnlock()

	target, isContainer := installed.(*containerRunner)
	if !isContainer {
		return
	}
	target.SetLimits(memMiB, cpus, pids, timeout)
}
// currentRunner returns the package-level Runner as of the call, read
// under the package read lock. The result is nil when InstallRunner
// has not been called yet.
func currentRunner() Runner {
	defaultRunnerMu.RLock()
	defer defaultRunnerMu.RUnlock()
	return defaultRunner
}
// SetLimits changes the resource ceilings applied to later Run calls.
// Each argument is optional: a non-positive memMiB, pids or timeout,
// or an empty cpus, leaves the corresponding current value (initially
// the constructor default) untouched. The update is made under the
// runner's write lock, so it is safe to call concurrently with Run.
func (cr *containerRunner) SetLimits(memMiB int, cpus string, pids int, timeout time.Duration) {
	cr.mu.Lock()

	if timeout > 0 {
		cr.timeout = timeout
	}
	if pids > 0 {
		cr.pids = pids
	}
	if cpus != "" {
		cr.cpus = cpus
	}
	if memMiB > 0 {
		cr.memMiB = memMiB
	}

	cr.mu.Unlock()
}
// newContainerRunner builds a runner for the given engine binary with
// the default limits: 512 MiB memory, 2 CPUs, 100 pids and a 30 second
// timeout. SetLimits can override any of them afterwards.
func newContainerRunner(engine string) *containerRunner {
	cr := new(containerRunner)
	cr.engine = engine
	cr.memMiB = 512
	cr.cpus = "2"
	cr.pids = 100
	cr.timeout = 30 * time.Second
	return cr
}
// Run executes one container invocation. cmd is the argv passed to the
// image's entrypoint (pandoc for pandoc/latex, chromium-browser for
// alpine-chrome). mounts is a list of
// "<hostPath>:<containerPath>[:<options>]" strings; "ro" is added when
// the spec carries neither "ro" nor "rw". stdin is piped to the
// container, stdout is returned as bytes (capped at 128 MiB).
//
// Errors: ErrUnavailable when no engine is configured, a plain error
// when image is empty, and a *ConvertError (with the tail of stderr
// and, on deadline expiry, a "timeout after ..." cause) when the
// engine invocation fails.
//
// Hardening:
//   - --pull=missing: image is fetched on first use, cached after.
//     Operator only needs podman/docker installed; no manual pull.
//   - --rm: container is removed on exit, even if killed.
//   - --network=none: no network inside the container. Prevents data
//     exfiltration through embedded URLs in source documents.
//   - --read-only + tmpfs on /tmp and /run: image fs is immutable;
//     pandoc/chromium scratch goes to tmpfs only.
//   - --memory / --cpus / --pids-limit: kernel-enforced caps.
//   - --cap-drop=ALL + --security-opt=no-new-privileges: standard
//     container-escape hardening.
//   - context-cancel kill + WaitDelay: a wedged podman gets force-
//     killed; pipes drop after 2s so we don't leak goroutines.
//   - cmd.Env minimal: only PATH + HOME are passed through to the
//     engine binary; the container itself sees only what the image
//     bakes in plus what --env adds (HOME=/tmp).
//
// Note: --user is intentionally NOT set so each image uses its
// default user (pandoc/latex runs as root, alpine-chrome runs as
// uid 1000). With --read-only + tmpfs + --cap-drop=ALL +
// --network=none + --no-new-privileges the additional defense from
// forcing nobody is small and would break alpine-chrome's own
// user-data-dir layout.
func (cr *containerRunner) Run(ctx context.Context, image string, stdin []byte, mounts []string, cmd []string) ([]byte, error) {
	// Snapshot the configuration under the read lock so a concurrent
	// SetLimits cannot change values halfway through building argv.
	cr.mu.RLock()
	engine := cr.engine
	memMiB := cr.memMiB
	cpus := cr.cpus
	pids := cr.pids
	timeout := cr.timeout
	cr.mu.RUnlock()

	if engine == "" {
		return nil, ErrUnavailable
	}
	if image == "" {
		return nil, errors.New("convert.Run: image is empty")
	}

	runCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	args := []string{
		"run",
		"--rm",
		"--pull=missing",
		"-i",
		// --userns=host: reuse the calling process's user namespace
		// instead of creating a new one. Required for the nested-
		// podman case (zddc-server runs inside a Kubernetes pod and
		// invokes podman from there): the kernel won't let the inner
		// podman set up its own userns via newuidmap when /etc/subuid
		// mappings don't resolve through the pod's namespace, even
		// with CAP_SETUID via privileged: true. The chart already
		// runs the pod privileged, so reusing its userns adds no new
		// privilege escalation. On a bare-metal host invocation the
		// outer userns is the host's, so --userns=host means "no
		// userns remapping" — also fine; --cap-drop=ALL +
		// --network=none + --read-only continue to isolate the
		// inner container's process.
		"--userns=host",
		"--network=none",
		"--read-only",
		"--tmpfs=/tmp:size=128m,exec",
		"--tmpfs=/run:size=4m",
		fmt.Sprintf("--memory=%dm", memMiB),
		fmt.Sprintf("--cpus=%s", cpus),
		fmt.Sprintf("--pids-limit=%d", pids),
		"--cap-drop=ALL",
		"--security-opt=no-new-privileges",
		"--env=HOME=/tmp",
		"--workdir=/tmp",
	}
	for _, m := range mounts {
		args = append(args, "--volume="+withDefaultMountMode(m))
	}
	args = append(args, image)
	args = append(args, cmd...)

	c := exec.CommandContext(runCtx, engine, args...)
	// Kill (rather than the default behaviour) on context expiry; the
	// nil check covers cancellation before the process has started.
	c.Cancel = func() error {
		if c.Process == nil {
			return nil
		}
		return c.Process.Kill()
	}
	c.WaitDelay = 2 * time.Second
	c.SysProcAttr = sysProcAttr()
	c.Env = []string{
		"PATH=" + os.Getenv("PATH"),
		"HOME=" + os.TempDir(),
	}
	c.Stdin = bytes.NewReader(stdin)

	var stdoutBuf bytes.Buffer
	c.Stdout = &limitWriter{w: &stdoutBuf, max: 128 << 20}
	stderr := newRingWriter(4 << 10)
	c.Stderr = stderr

	if err := c.Run(); err != nil {
		// -1 marks "no exit status" (start failure, I/O error, ...).
		exitCode := -1
		var exitErr *exec.ExitError
		if errors.As(err, &exitErr) {
			exitCode = exitErr.ExitCode()
		}

		// A deadline hit is reported as a timeout; the raw error in
		// that case is only the side effect of the kill.
		cause := err
		if errors.Is(runCtx.Err(), context.DeadlineExceeded) {
			cause = fmt.Errorf("timeout after %s: %w", timeout, runCtx.Err())
		}
		return nil, &ConvertError{
			Tool:     imageTag(image),
			ExitCode: exitCode,
			Stderr:   stderr.String(),
			Cause:    cause,
		}
	}
	return stdoutBuf.Bytes(), nil
}

// withDefaultMountMode returns a --volume spec that is guaranteed to
// carry an access mode. A spec without an options segment gets ":ro";
// a spec whose comma-separated options contain neither "ro" nor "rw"
// gets ",ro" appended to them; anything else is returned unchanged.
// Only the third colon-separated segment is inspected, so a path that
// merely contains the text "ro" or "rw" is never mistaken for a mode.
func withDefaultMountMode(spec string) string {
	parts := strings.SplitN(spec, ":", 3)
	if len(parts) < 3 || parts[2] == "" {
		// Tolerate a dangling ":" so the result has no empty segment.
		return strings.TrimSuffix(spec, ":") + ":ro"
	}
	for _, opt := range strings.Split(parts[2], ",") {
		if opt == "ro" || opt == "rw" {
			return spec
		}
	}
	return spec + ",ro"
}
// imageTag extracts a short name for an image reference, used as the
// "Tool" label on ConvertError. "docker.io/pandoc/latex:latest" →
// "pandoc/latex".
//
// Three parts are removed, in this order:
//   - a digest suffix ("@sha256:..."), so the colon inside the digest
//     is not mistaken for a tag separator;
//   - a leading registry host, recognised as a first path component
//     that is "localhost" or contains "." or ":" (a port);
//   - a trailing ":tag", honoured only when the colon follows the last
//     "/" so it cannot be a port inside a path component.
//
// An input with none of these parts is returned unchanged.
func imageTag(image string) string {
	s := image

	// Strip digest suffix.
	if name, _, found := strings.Cut(s, "@"); found {
		s = name
	}

	// Strip registry prefix.
	if host, rest, found := strings.Cut(s, "/"); found {
		if host == "localhost" || strings.ContainsAny(host, ".:") {
			s = rest
		}
	}

	// Strip tag suffix. With no colon LastIndex is -1, which is never
	// greater than the slash index, so the name is left alone.
	if i := strings.LastIndex(s, ":"); i > strings.LastIndex(s, "/") {
		s = s[:i]
	}
	return s
}
// limitWriter forwards writes to w until max bytes in total have been
// accepted. A write that would cross the cap is truncated to the
// remaining room and reported with an "output exceeded <max> bytes"
// error; every later write is rejected with the same error and writes
// nothing. When used as a command's stdout that error surfaces from
// Run(); the handler is presumably where it becomes a 422 (output too
// large) — that mapping is not visible in this file.
type limitWriter struct {
	w   io.Writer // destination for accepted bytes
	max int64     // total byte budget
	n   int64     // bytes accepted so far
}

// Write implements io.Writer with the cap described on limitWriter.
// It returns the number of bytes actually handed to w.
func (l *limitWriter) Write(p []byte) (int, error) {
	room := l.max - l.n
	switch {
	case room <= 0:
		// Budget already exhausted: accept nothing.
		return 0, fmt.Errorf("output exceeded %d bytes", l.max)
	case int64(len(p)) <= room:
		// Fits entirely: pass through, including w's own error.
		written, err := l.w.Write(p)
		l.n += int64(written)
		return written, err
	default:
		// Crosses the cap: keep the prefix that fits. The cap error
		// takes precedence over whatever w reports for this write.
		written, _ := l.w.Write(p[:room])
		l.n += int64(written)
		return written, fmt.Errorf("output exceeded %d bytes", l.max)
	}
}
// ringWriter retains only the last max bytes written to it. It suits
// stderr capture, where the final bytes usually hold the real error
// message and whatever came before is progress noise. Write and String
// serialise on mu.
type ringWriter struct {
	mu  sync.Mutex
	buf []byte // retained tail, at most max bytes
	max int    // retention limit in bytes
}

// newRingWriter returns an empty ringWriter that keeps at most max
// bytes.
func newRingWriter(max int) *ringWriter {
	rw := new(ringWriter)
	rw.max = max
	return rw
}

// Write records p and then discards everything but the most recent max
// bytes. It always reports the whole of p as written and never fails.
func (r *ringWriter) Write(p []byte) (int, error) {
	r.mu.Lock()
	defer r.mu.Unlock()

	size := len(p)
	if size >= r.max {
		// p alone fills the window: its tail replaces the contents.
		tail := p[size-r.max:]
		r.buf = append(r.buf[:0], tail...)
		return size, nil
	}

	r.buf = append(r.buf, p...)
	if overflow := len(r.buf) - r.max; overflow > 0 {
		r.buf = r.buf[overflow:]
	}
	return size, nil
}

// String returns a copy of the retained tail as a string.
func (r *ringWriter) String() string {
	r.mu.Lock()
	tail := string(r.buf)
	r.mu.Unlock()
	return tail
}
// writeAssetsToScratch creates a fresh "zddc-convert-*" directory
// under the default temp dir, writes the embedded
// viewer-template.html and custom.css into it, and returns the host
// path. The caller owns the directory and must os.RemoveAll it when
// done. ToHTML uses this to make the template visible inside the
// container.
//
// Permissions are forced to 0755 (directory) and 0644 (files) after
// writing, so the container's default user (root for pandoc/latex,
// uid 1000 for alpine-chrome) can read them through the read-only
// bind mount regardless of the host's umask. On any failure the
// partially populated directory is removed before returning.
func writeAssetsToScratch() (string, error) {
	dir, err := os.MkdirTemp("", "zddc-convert-")
	if err != nil {
		return "", fmt.Errorf("scratch dir: %w", err)
	}

	// abort discards the scratch dir (best effort) and reports cause.
	abort := func(cause error) (string, error) {
		os.RemoveAll(dir)
		return "", cause
	}

	templatePath := filepath.Join(dir, "viewer-template.html")
	if err := os.WriteFile(templatePath, viewerTemplate, 0o644); err != nil {
		return abort(fmt.Errorf("write template: %w", err))
	}

	cssPath := filepath.Join(dir, "custom.css")
	if err := os.WriteFile(cssPath, customCSS, 0o644); err != nil {
		return abort(fmt.Errorf("write css: %w", err))
	}

	if err := chmodTree(dir, 0o755, 0o644); err != nil {
		return abort(err)
	}
	return dir, nil
}
// chmodTree walks root and sets dirMode on every directory entry
// (root included) and fileMode on every other entry. The walk stops at
// the first error, whether from traversal or from os.Chmod, and that
// error is returned as-is.
func chmodTree(root string, dirMode, fileMode os.FileMode) error {
	applyMode := func(p string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		mode := fileMode
		if d.IsDir() {
			mode = dirMode
		}
		return os.Chmod(p, mode)
	}
	return filepath.WalkDir(root, applyMode)
}