ZDDC/zddc/internal/convert/runner.go

package convert

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"io"
	"io/fs"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"sync"
	"time"
)

// ToolSpec identifies the conversion tool to invoke. Runners pick
// whichever field applies to them:
//
//   - bwrapRunner uses Binary — the path or PATH-name of the tool on
//     the zddc-server host (or container). pandoc/latex's entrypoint
//     becomes `pandoc`; alpine-chrome's becomes `chromium-browser`.
//     This is the production-default engine: lightest sandbox, no
//     daemon, no privileged outer container.
//
//   - containerRunner uses Image — the OCI image ref pulled into a
//     fresh container for each conversion (legacy/fallback engine,
//     kept for environments that already host a podman/docker daemon
//     and want OCI-image isolation per conversion).
//
// Both fields are populated by the entry points in convert.go so a
// single call site works regardless of which engine is installed.
type ToolSpec struct {
	// Neither field is validated here; each runner's Run rejects an
	// empty value for the one field it reads.
	Image  string // OCI image ref (containerRunner)
	Binary string // binary name on PATH (bwrapRunner)
}

// Runner executes a conversion sub-process and returns its stdout.
// The host-side implementations are bwrapRunner (default; wraps
// `bubblewrap`) and containerRunner (fallback; wraps `podman run` /
// `docker run`). Tests use a fake.
//
// stdin is piped to the tool's stdin. cmd is the argv passed *to the
// tool* — for pandoc the entrypoint accepts pandoc flags directly;
// for chromium it accepts chromium-browser flags. mounts is a list
// of "<hostPath>:<targetPath>" specs (":ro" is added if no mode
// segment is present); each runner translates them to its own
// bind/--volume syntax.
//
// All exec calls in this package go through Runner.Run. This is the
// first os/exec site in the codebase; the hardening here is the
// pattern for future shell-outs.
type Runner interface {
	// Run invokes tool with cmd as its argv, feeds stdin to the
	// process and makes mounts visible to it. It returns the captured
	// stdout, or a non-nil error when the invocation fails.
	Run(ctx context.Context, tool ToolSpec, stdin []byte, mounts []string, cmd []string) ([]byte, error)
}

// ErrUnavailable means no conversion engine is available on the host:
// the runners in this file return it from Run when their engine binary
// (podman/docker, or bwrap) is unset.
// Handlers translate to HTTP 503.
var ErrUnavailable = errors.New("conversion unavailable")

// ConvertError carries the failure surface from a conversion run that
// did not complete successfully (non-zero exit, timeout, or a failure
// reported by the exec layer). Stderr is captured (truncated to 4 KiB
// by the runner) so callers can surface pandoc/chromium's own complaint.
type ConvertError struct {
	Tool     string // short tool label (image name fragment or binary name), used only for logging
	ExitCode int    // exit status of the sub-process; the runners store -1 when none was obtained
	Stderr   string // tail of the sub-process's stderr; may be empty
	Cause    error  // underlying error; exposed through Unwrap
}

// Error formats the failure as "<tool> exit <code>: <detail>". The
// detail is the whitespace-trimmed Stderr when that is non-empty and
// the Cause otherwise, so a stderr consisting only of blank lines does
// not hide the underlying error. A nil receiver yields "<nil>".
func (e *ConvertError) Error() string {
	if e == nil {
		return "<nil>"
	}
	if msg := strings.TrimSpace(e.Stderr); msg != "" {
		return fmt.Sprintf("%s exit %d: %s", e.Tool, e.ExitCode, msg)
	}
	return fmt.Sprintf("%s exit %d: %v", e.Tool, e.ExitCode, e.Cause)
}

// Unwrap returns the underlying cause so errors.Is / errors.As can see
// through a ConvertError. Like Error, it tolerates a nil receiver.
func (e *ConvertError) Unwrap() error {
	if e == nil {
		return nil
	}
	return e.Cause
}

// containerRunner runs each conversion inside a fresh container.
// The engine ("podman" preferred, "docker" fallback) is resolved once
// at startup by Probe. Resource limits are configurable via
// SetLimits (called from main.go after flag parsing). Images are passed
// per call so the same runner handles both pandoc and chromium
// invocations.
//
// Two modes:
//
//   - **local** (remoteURL=""): the engine binary creates containers
//     directly on the host that runs zddc-server. Used for bare-metal
//     and host-podman deployments. Requires podman or docker on PATH.
//
//   - **remote** (remoteURL="unix:///var/run/podman/podman.sock" or
//     similar): the engine binary is the local podman CLIENT, invoked
//     as `podman --remote --url=<remoteURL> run …`; the actual
//     container creation happens in whatever process owns the socket
//     (typically a `podman system service` sidecar in the same pod).
//     Used for the Kubernetes sidecar pattern so zddc-server's own
//     pod stays unprivileged. Bind-mount paths must resolve identically
//     on both sides — see scratchDir.
//
// The runner relies on `--pull=missing` so the operator never has to
// pre-pull images: the first request that needs an image pulls it,
// subsequent requests use the local cache. Both podman and docker
// honour this flag identically.
type containerRunner struct {
	mu        sync.RWMutex  // held while the fields below are read or written
	engine    string        // engine binary Run execs; empty makes Run return ErrUnavailable
	remoteURL string        // non-empty selects remote mode (--remote --url=<remoteURL>)
	memMiB    int           // value passed as --memory, in MiB
	cpus      string        // value passed as --cpus
	pids      int           // value passed as --pids-limit
	timeout   time.Duration // per-Run context deadline
}

var (
	// shared default runner, populated by InstallRunner (called from
	// the health probe at startup once the engine is known).
	// defaultRunnerMu is taken around every read and write of
	// defaultRunner in this file; defaultRunner stays nil until
	// InstallRunner has been called.
	defaultRunnerMu sync.RWMutex
	defaultRunner   Runner
)

// InstallRunner replaces the package-level Runner used by
// ToDocx/ToHTML/ToPDF with r. Tests inject a fake through it;
// production code lets the health probe install the runner for the
// engine it detected. The swap happens under the package mutex, so
// concurrent callers are safe.
func InstallRunner(r Runner) {
	defaultRunnerMu.Lock()
	defer defaultRunnerMu.Unlock()
	defaultRunner = r
}

// ConfigureLimits applies resource limits to the package-level Runner
// when that runner has a SetLimits method of the matching shape — in
// this file both containerRunner and bwrapRunner do. Asserting on the
// method rather than on *containerRunner matters because bwrapRunner
// enforces the timeout passed here. No-op when no runner is installed
// yet (the probe failed) or when the installed runner doesn't accept
// limits (e.g. a test fake without SetLimits). Zero values keep the
// previous setting.
//
// Called from cmd/zddc-server/main.go after Probe so the limits from
// the operator's flags take effect before any conversion request lands.
func ConfigureLimits(memMiB int, cpus string, pids int, timeout time.Duration) {
	// limitSetter is the optional capability a Runner may implement.
	type limitSetter interface {
		SetLimits(memMiB int, cpus string, pids int, timeout time.Duration)
	}

	// Copy the runner out under the lock; SetLimits does its own locking.
	defaultRunnerMu.RLock()
	r := defaultRunner
	defaultRunnerMu.RUnlock()

	if ls, ok := r.(limitSetter); ok {
		ls.SetLimits(memMiB, cpus, pids, timeout)
	}
}

// currentRunner returns the Runner most recently passed to
// InstallRunner, or nil when none has been installed. The read is done
// under the package mutex's read lock.
func currentRunner() Runner {
	defaultRunnerMu.RLock()
	defer defaultRunnerMu.RUnlock()
	return defaultRunner
}

// SetLimits overrides the resource ceilings applied to later Run
// calls. An argument that is zero/negative (or, for cpus, empty) leaves
// the corresponding setting as it was. The update is made under the
// runner's write lock, so concurrent callers are safe.
func (cr *containerRunner) SetLimits(memMiB int, cpus string, pids int, timeout time.Duration) {
	cr.mu.Lock()
	defer cr.mu.Unlock()

	// Substitute the current value for every "keep" argument, then
	// store all four in one go.
	if memMiB <= 0 {
		memMiB = cr.memMiB
	}
	if cpus == "" {
		cpus = cr.cpus
	}
	if pids <= 0 {
		pids = cr.pids
	}
	if timeout <= 0 {
		timeout = cr.timeout
	}
	cr.memMiB, cr.cpus, cr.pids, cr.timeout = memMiB, cpus, pids, timeout
}

// newContainerRunner builds a containerRunner for the given engine
// binary and optional remote URL, pre-loaded with the default limits:
// 512 MiB memory, 2 CPUs, 100 pids and a 30 s timeout.
func newContainerRunner(engine, remoteURL string) *containerRunner {
	cr := new(containerRunner)
	cr.engine, cr.remoteURL = engine, remoteURL
	cr.memMiB, cr.cpus, cr.pids = 512, "2", 100
	cr.timeout = 30 * time.Second
	return cr
}

// Run executes one container invocation. cmd is the argv passed to the
// image's entrypoint (pandoc for pandoc/latex, chromium-browser for
// alpine-chrome). mounts is a list of "<hostPath>:<containerPath>"
// strings; ":ro" is appended when no mode segment is present. stdin is
// piped to the container, stdout is returned as bytes (capped at
// 128 MiB).
//
// Hardening:
//   - --pull=missing: image is fetched on first use, cached after.
//     Operator only needs podman/docker installed; no manual pull.
//   - --rm: container is removed on exit, even if killed.
//   - --network=none: no network inside the container. Prevents data
//     exfiltration through embedded URLs in source documents.
//   - --read-only + tmpfs on /tmp and /run: image fs is immutable;
//     pandoc/chromium scratch goes to tmpfs only.
//   - --memory / --cpus / --pids-limit: kernel-enforced caps.
//   - --cap-drop=ALL + --security-opt=no-new-privileges: standard
//     container-escape hardening.
//   - context-cancel kill + WaitDelay: a wedged podman gets force-
//     killed; pipes drop after 2s so we don't leak goroutines.
//   - cmd.Env minimal: only PATH + HOME are passed through to the
//     engine binary; the container itself sees only what the image
//     bakes in plus what --env adds (HOME=/tmp).
//
// Note: --user is intentionally NOT set so each image uses its
// default user (pandoc/latex runs as root, alpine-chrome runs as
// uid 1000). With --read-only + tmpfs + --cap-drop=ALL +
// --network=none + --no-new-privileges the additional defense from
// forcing nobody is small and would break alpine-chrome's own
// user-data-dir layout.
func (cr *containerRunner) Run(ctx context.Context, tool ToolSpec, stdin []byte, mounts []string, cmd []string) ([]byte, error) {
	// Snapshot the configuration under the read lock so a concurrent
	// SetLimits cannot change it while the command line is being built.
	cr.mu.RLock()
	engine := cr.engine
	remoteURL := cr.remoteURL
	memMiB := cr.memMiB
	cpus := cr.cpus
	pids := cr.pids
	timeout := cr.timeout
	cr.mu.RUnlock()

	if engine == "" {
		return nil, ErrUnavailable
	}
	image := tool.Image
	if image == "" {
		return nil, fmt.Errorf("convert.Run: tool.Image is empty (containerRunner requires an OCI image ref)")
	}

	runCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	args := containerRunArgs(remoteURL, memMiB, cpus, pids, image, mounts, cmd)

	c := exec.CommandContext(runCtx, engine, args...)
	c.Cancel = func() error {
		if c.Process == nil {
			return nil
		}
		return c.Process.Kill()
	}
	c.WaitDelay = 2 * time.Second
	c.SysProcAttr = sysProcAttr()
	// Only PATH and HOME reach the engine binary.
	c.Env = []string{
		"PATH=" + os.Getenv("PATH"),
		"HOME=" + os.TempDir(),
	}
	c.Stdin = bytes.NewReader(stdin)

	var stdoutBuf bytes.Buffer
	c.Stdout = &limitWriter{w: &stdoutBuf, max: 128 << 20}
	stderr := newRingWriter(4 << 10)
	c.Stderr = stderr

	if err := c.Run(); err != nil {
		// -1 marks "no exit status available" (start failure, pipe
		// error, kill without a recorded status, …).
		exitCode := -1
		var ee *exec.ExitError
		if errors.As(err, &ee) {
			exitCode = ee.ExitCode()
		}
		// Report an expired deadline as a timeout rather than as the
		// "signal: killed" the kill produces.
		cause := err
		if errors.Is(runCtx.Err(), context.DeadlineExceeded) {
			cause = fmt.Errorf("timeout after %s: %w", timeout, runCtx.Err())
		}
		return nil, &ConvertError{
			Tool:     imageTag(image),
			ExitCode: exitCode,
			Stderr:   stderr.String(),
			Cause:    cause,
		}
	}

	return stdoutBuf.Bytes(), nil
}

// containerRunArgs assembles the podman/docker argv for one
// containerRunner.Run call: optional remote-client flags, the hardened
// `run` flags, one --volume per mount, then the image and the tool's
// own argv. It performs no I/O, so the flag shape can be checked
// without exec'ing an engine.
func containerRunArgs(remoteURL string, memMiB int, cpus string, pids int, image string, mounts, cmd []string) []string {
	// Client args. In remote mode, prepend --remote and --url so the
	// podman CLI dispatches the request to the sidecar's
	// `podman system service` instead of creating a container locally.
	// The remaining flags (--rm, --pull=missing, etc.) apply to the
	// container that the remote daemon will create — same wire format
	// as local mode.
	var args []string
	if remoteURL != "" {
		args = append(args, "--remote", "--url="+remoteURL)
	}
	args = append(args,
		"run",
		"--rm",
		"--pull=missing",
		"-i",
	)
	// --userns=host only in local mode: needed when zddc-server itself
	// is the one running podman inside a Kubernetes pod, because the
	// kernel won't let an inner rootless podman set up its own userns
	// via newuidmap. In remote (sidecar) mode the sidecar runs as root
	// and creates the inner container in its own (rootful) namespace,
	// so --userns=host is unnecessary and potentially noisy.
	if remoteURL == "" {
		args = append(args, "--userns=host")
	}
	args = append(args,
		"--network=none",
		"--read-only",
		// /tmp must be large enough to host chromium's shared-memory
		// fallback (--disable-dev-shm-usage redirects /dev/shm writes
		// here) plus the user-data-dir. 256 MiB is plenty for the
		// HTML→PDF flow; pandoc itself uses almost none.
		"--tmpfs=/tmp:size=256m,exec",
		"--tmpfs=/run:size=4m",
		fmt.Sprintf("--memory=%dm", memMiB),
		fmt.Sprintf("--cpus=%s", cpus),
		fmt.Sprintf("--pids-limit=%d", pids),
		"--cap-drop=ALL",
		"--security-opt=no-new-privileges",
		"--env=HOME=/tmp",
		"--workdir=/tmp",
	)
	for _, m := range mounts {
		// "<host>:<target>" has fewer than two colons, i.e. no mode
		// segment, and defaults to read-only. Counting segments instead
		// of searching for the substrings ":ro"/":rw" keeps a path that
		// merely contains those characters from being mistaken for a
		// mode, and leaves other explicit option segments untouched.
		if strings.Count(m, ":") < 2 {
			m += ":ro"
		}
		args = append(args, "--volume="+m)
	}
	args = append(args, image)
	return append(args, cmd...)
}

// ───────────────────────────────────────────────────────────────────────────
// bwrapRunner — default conversion engine.
//
// Wraps `bubblewrap` to run pandoc / chromium binaries directly in a
// per-call Linux-namespace sandbox. No daemon, no OCI images, no
// privileged outer container. Image-build bundles pandoc + chromium
// into the zddc-server image so the binaries are available on PATH;
// each conversion gets a fresh set of namespaces, a read-only view
// of the host's /usr (so the binary + its libs are visible), a tmpfs
// /tmp, and nothing else.
//
// This matches the threat model of the legacy containerRunner —
// untrusted source-markdown drives the binary, we contain any
// resulting RCE inside the bwrap sandbox — without the operational
// tax of running a container engine per conversion (image pull,
// daemon, socket, ~300ms startup).
//
// Hardening (mirror of containerRunner's flags):
//   - --unshare-all with --share-net omitted → no network
//   - --unshare-user-try → user namespace when kernel allows it
//   - --die-with-parent → cleanup on zddc-server exit
//   - --ro-bind /usr /usr, plus --ro-bind-try for /lib, /lib64, /bin,
//     /sbin and /etc (mounted where present) → tools + libs visible
//     read-only
//   - --proc /proc, --dev /dev → minimal pseudo-filesystems
//   - --tmpfs /tmp (256 MiB) → scratch space, matches container path
//   - --chdir /tmp → workdir
//   - --clearenv + minimal HOME/PATH/LANG → no host env leaks
//   - no --cap-add → bwrap's default of leaving no capabilities in the
//     sandbox applies (no explicit --cap-drop is passed)
// ───────────────────────────────────────────────────────────────────────────

// bwrapRunner is the Runner that executes each conversion through the
// bubblewrap binary.
type bwrapRunner struct {
	// mu is held while the fields below are read or written.
	mu sync.RWMutex

	// bin is the path to the bwrap binary.
	bin string

	// memMiB, cpus and pids are currently advisory; bwrap has no
	// built-in cap for them.
	memMiB int
	cpus   string
	pids   int

	// timeout is the context deadline applied to each Run.
	timeout time.Duration
}

// newBwrapRunner builds a bwrapRunner around the given bwrap binary
// path with the default limits: 512 MiB memory, 2 CPUs, 100 pids and a
// 30 s timeout.
func newBwrapRunner(bin string) *bwrapRunner {
	br := &bwrapRunner{bin: bin}
	br.memMiB = 512
	br.cpus = "2"
	br.pids = 100
	br.timeout = 30 * time.Second
	return br
}

// SetLimits has the same shape as containerRunner.SetLimits: zero or
// negative numbers and an empty cpus string keep the previous value.
// bwrap itself doesn't enforce cgroup limits, so memMiB/cpus/pids are
// only recorded here (the original design note says so an operator can
// read them back via /.profile/config or the convert-health probe);
// timeout is the one value Run acts on. Wrapping with
// systemd-run --scope --property MemoryMax=… is the follow-up if hard
// caps are needed; not in this iteration.
func (br *bwrapRunner) SetLimits(memMiB int, cpus string, pids int, timeout time.Duration) {
	br.mu.Lock()
	if timeout > 0 {
		br.timeout = timeout
	}
	if pids > 0 {
		br.pids = pids
	}
	if cpus != "" {
		br.cpus = cpus
	}
	if memMiB > 0 {
		br.memMiB = memMiB
	}
	br.mu.Unlock()
}

// Run executes tool.Binary inside a fresh bwrap sandbox. cmd is the
// argv passed to the binary, mounts is a list of
// "<hostPath>:<targetPath>[:ro|:rw]" specs (read-only when the mode is
// omitted), stdin is piped to the process and stdout is returned as
// bytes (capped at 128 MiB).
//
// It returns ErrUnavailable when no bwrap binary is configured, a plain
// error for an empty tool.Binary or a malformed mount spec, and a
// *ConvertError carrying the stderr tail when the sub-process fails or
// the per-run timeout expires.
func (br *bwrapRunner) Run(ctx context.Context, tool ToolSpec, stdin []byte, mounts []string, cmd []string) ([]byte, error) {
	// Snapshot the configuration under the read lock so a concurrent
	// SetLimits cannot change it mid-run.
	br.mu.RLock()
	bwrapBin := br.bin
	timeout := br.timeout
	br.mu.RUnlock()

	if bwrapBin == "" {
		return nil, ErrUnavailable
	}
	if tool.Binary == "" {
		return nil, fmt.Errorf("convert.Run: tool.Binary is empty (bwrapRunner requires a host-binary name)")
	}

	runCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	args, err := buildBwrapArgs(tool.Binary, mounts, cmd)
	if err != nil {
		return nil, err
	}

	c := exec.CommandContext(runCtx, bwrapBin, args...)
	// Kill on context cancellation; WaitDelay bounds how long Run keeps
	// waiting for the I/O pipes afterwards.
	c.Cancel = func() error {
		if c.Process == nil {
			return nil
		}
		return c.Process.Kill()
	}
	c.WaitDelay = 2 * time.Second
	c.SysProcAttr = sysProcAttr()
	// Environment of bwrap itself; the sandboxed tool's environment is
	// set by the --clearenv/--setenv flags in the argv.
	c.Env = []string{
		"PATH=" + os.Getenv("PATH"),
		"HOME=" + os.TempDir(),
	}
	c.Stdin = bytes.NewReader(stdin)

	var stdoutBuf bytes.Buffer
	c.Stdout = &limitWriter{w: &stdoutBuf, max: 128 << 20}
	stderr := newRingWriter(4 << 10)
	c.Stderr = stderr

	if runErr := c.Run(); runErr != nil {
		// -1 marks "no exit status available".
		exitCode := -1
		var ee *exec.ExitError
		if errors.As(runErr, &ee) {
			exitCode = ee.ExitCode()
		}
		// Report an expired deadline as a timeout rather than as the
		// "signal: killed" the kill produces.
		cause := runErr
		if errors.Is(runCtx.Err(), context.DeadlineExceeded) {
			cause = fmt.Errorf("timeout after %s: %w", timeout, runCtx.Err())
		}
		return nil, &ConvertError{
			Tool:     tool.Binary,
			ExitCode: exitCode,
			Stderr:   stderr.String(),
			Cause:    cause,
		}
	}
	return stdoutBuf.Bytes(), nil
}

// buildBwrapArgs assembles the bwrap argv for a single conversion:
// the fixed sandbox flags, one bind per caller-supplied mount, then
// binary followed by cmd. Exposed as a package-internal helper so tests
// can lock the sandbox flag shape without exec'ing bwrap. Returns an
// error when a mount spec is malformed.
func buildBwrapArgs(binary string, mounts, cmd []string) ([]string, error) {
	args := []string{
		// Namespace isolation. --unshare-all unshares user (when
		// available), ipc, pid, net, uts, cgroup; --unshare-user-try
		// downgrades cleanly when the kernel refuses (e.g. some
		// container hosts disable user-namespace creation).
		"--unshare-all",
		"--unshare-user-try",
		"--die-with-parent",
		// Read-only system view. /usr is mandatory; the --ro-bind-try
		// entries are mounted only when the path exists on the host.
		// For hosts where /lib is a symlink into /usr/lib (modern
		// Linux) the symlink resolution lets bwrap mount /usr's
		// contents through.
		"--ro-bind", "/usr", "/usr",
		"--ro-bind-try", "/lib", "/lib",
		"--ro-bind-try", "/lib64", "/lib64",
		"--ro-bind-try", "/bin", "/bin",
		"--ro-bind-try", "/sbin", "/sbin",
		"--ro-bind-try", "/etc", "/etc",
		// Pseudo-filesystems. /proc and /dev are required for any
		// non-trivial binary; we make them minimal.
		"--proc", "/proc",
		"--dev", "/dev",
		// Scratch. 256 MiB tmpfs at /tmp matches containerRunner.
		// chromium spills its shared-memory fallback (--disable-dev-
		// shm-usage) here, so the budget actually matters.
		// --size is a modifier for the NEXT --tmpfs, so it must come
		// immediately before it; bwrap rejects a --size that is
		// followed by any other option.
		"--size", "268435456", // 256 MiB
		"--tmpfs", "/tmp",
		"--chdir", "/tmp",
		// Minimal env. HOME=/tmp lets chromium write its
		// user-data-dir without permission errors; PATH covers the
		// usual install locations for pandoc + chromium across
		// alpine / debian / rhel.
		"--clearenv",
		"--setenv", "HOME", "/tmp",
		"--setenv", "PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
		"--setenv", "LANG", "C.UTF-8",
	}
	// Caller-supplied bind mounts (template, output, …). Same
	// "host:target[:ro|:rw]" syntax as containerRunner; we translate
	// to bwrap's --ro-bind / --bind.
	for _, m := range mounts {
		host, target, mode, ok := splitMount(m)
		if !ok {
			return nil, fmt.Errorf("convert.Run: invalid mount spec %q (want host:target[:ro|:rw])", m)
		}
		if mode == "rw" {
			args = append(args, "--bind", host, target)
		} else {
			args = append(args, "--ro-bind", host, target)
		}
	}
	// Finally the binary + its argv. The binary path is PATH-resolved
	// inside the sandbox via the constructed PATH above; if the
	// operator passed an absolute path it bypasses PATH lookup and is
	// invoked verbatim (still subject to the /usr bind mount).
	args = append(args, binary)
	args = append(args, cmd...)
	return args, nil
}

// splitMount parses "host:target[:ro|:rw]" into its three parts.
// The mode segment is optional; absent means read-only (matches the
// containerRunner default). ok is false when the spec has no colon,
// when host or target is empty, or when the mode is anything other
// than "ro" or "rw".
func splitMount(m string) (host, target, mode string, ok bool) {
	parts := strings.SplitN(m, ":", 3)
	if len(parts) < 2 {
		return "", "", "", false
	}
	// An empty path would reach bwrap as an empty argument; reject it
	// here so the caller gets the invalid-mount-spec error instead.
	if parts[0] == "" || parts[1] == "" {
		return "", "", "", false
	}
	mode = "ro"
	if len(parts) == 3 {
		switch parts[2] {
		case "ro", "rw":
			mode = parts[2]
		default:
			return "", "", "", false
		}
	}
	return parts[0], parts[1], mode, true
}

// imageTag extracts a short name for an image reference, used as the
// "Tool" label on ConvertError. "docker.io/pandoc/latex:latest" →
// "pandoc/latex". A digest suffix ("@sha256:…") is dropped as well, and
// a leading component counts as a registry when it contains "." or ":"
// or is exactly "localhost".
func imageTag(image string) string {
	// Drop the digest first: its "sha256:…" colon would otherwise be
	// mistaken for the tag separator below.
	s, _, _ := strings.Cut(image, "@")
	// Strip registry prefix.
	if host, rest, found := strings.Cut(s, "/"); found {
		if host == "localhost" || strings.ContainsAny(host, ".:") {
			s = rest
		}
	}
	// Strip tag suffix.
	if i := strings.LastIndex(s, ":"); i >= 0 {
		s = s[:i]
	}
	return s
}

// limitWriter caps the bytes forwarded to the underlying writer at max.
// A write that would cross the cap forwards only the bytes that still
// fit and returns an "output exceeded N bytes" error; once the cap has
// been reached every further write is rejected with that same error.
// The error surfaces as a Run() error — the caller then maps to 422
// (output too large) at the handler edge.
type limitWriter struct {
	w   io.Writer // destination
	max int64     // maximum number of bytes forwarded to w
	n   int64     // bytes forwarded so far
}

// Write forwards p to the underlying writer subject to the cap and
// returns the number of bytes actually forwarded. An error from the
// underlying writer is returned as-is, taking precedence over the
// cap-exceeded error.
func (l *limitWriter) Write(p []byte) (int, error) {
	if l.n >= l.max {
		return 0, fmt.Errorf("output exceeded %d bytes", l.max)
	}
	if rem := l.max - l.n; int64(len(p)) > rem {
		// Truncating write: forward what still fits, then report the
		// overflow so the copy loop feeding this writer stops.
		n, err := l.w.Write(p[:rem])
		l.n += int64(n)
		if err != nil {
			return n, err
		}
		return n, fmt.Errorf("output exceeded %d bytes", l.max)
	}
	n, err := l.w.Write(p)
	l.n += int64(n)
	return n, err
}

// ringWriter is an io.Writer that retains only the last max bytes it
// was given. It is used for stderr capture, where the newest output
// carries the actual error message and older output tends to be
// progress noise.
type ringWriter struct {
	mu  sync.Mutex // held by Write and String
	buf []byte     // retained tail, at most max bytes
	max int        // retention limit in bytes
}

// newRingWriter returns an empty ringWriter that keeps up to max bytes.
func newRingWriter(max int) *ringWriter {
	rw := new(ringWriter)
	rw.max = max
	return rw
}

// Write records p, discarding the oldest bytes once more than max are
// held. It always reports the whole of p as written and never fails.
func (r *ringWriter) Write(p []byte) (int, error) {
	written := len(p)

	r.mu.Lock()
	defer r.mu.Unlock()

	switch {
	case written >= r.max:
		// p alone fills the window: keep just its tail.
		r.buf = append(r.buf[:0], p[written-r.max:]...)
	default:
		r.buf = append(r.buf, p...)
		if overflow := len(r.buf) - r.max; overflow > 0 {
			r.buf = r.buf[overflow:]
		}
	}
	return written, nil
}

// String returns a copy of the retained tail as a string.
func (r *ringWriter) String() string {
	r.mu.Lock()
	tail := string(r.buf)
	r.mu.Unlock()
	return tail
}

// writeAssetsToScratch creates a fresh "zddc-convert-*" directory,
// writes the embedded viewer-template.html and custom.css into it and
// returns the directory's host path. The caller owns the directory and
// must os.RemoveAll it when done. Used by ToHTML, which needs the
// template visible inside the container.
//
// scratchRoot selects the parent directory. Empty means "use $TMPDIR"
// (local mode default). In remote/sidecar mode the caller passes the
// shared mount path (e.g. "/work") so the podman-service sidecar sees
// the bind-mount source at the same path.
//
// After writing, the tree is chmod'ed to 0755 (directories) / 0644
// (files) so the container's default user (root for pandoc/latex,
// uid 1000 for alpine-chrome) can read it through the read-only bind
// mount regardless of the host's umask. On any failure the directory
// is removed again and an error is returned.
func writeAssetsToScratch(scratchRoot string) (string, error) {
	dir, err := os.MkdirTemp(scratchRoot, "zddc-convert-")
	if err != nil {
		return "", fmt.Errorf("scratch dir: %w", err)
	}

	// abort discards the partially populated directory (best effort)
	// and hands back the failure.
	abort := func(cause error) (string, error) {
		os.RemoveAll(dir)
		return "", cause
	}

	templatePath := filepath.Join(dir, "viewer-template.html")
	if werr := os.WriteFile(templatePath, viewerTemplate, 0o644); werr != nil {
		return abort(fmt.Errorf("write template: %w", werr))
	}

	cssPath := filepath.Join(dir, "custom.css")
	if werr := os.WriteFile(cssPath, customCSS, 0o644); werr != nil {
		return abort(fmt.Errorf("write css: %w", werr))
	}

	if cerr := chmodTree(dir, 0o755, 0o644); cerr != nil {
		return abort(cerr)
	}
	return dir, nil
}

// chmodTree walks root and sets dirMode on every directory (root
// included) and fileMode on every other entry. It stops at, and
// returns, the first walk or chmod error.
func chmodTree(root string, dirMode, fileMode os.FileMode) error {
	visit := func(p string, d fs.DirEntry, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		mode := fileMode
		if d.IsDir() {
			mode = dirMode
		}
		return os.Chmod(p, mode)
	}
	return filepath.WalkDir(root, visit)
}