ZDDC/zddc/internal/convert/health.go
ZDDC 7aec631a22 feat(convert): support remote podman mode + configurable scratch dir
zddc-server can now invoke podman as a CLIENT against a remote socket
instead of creating containers in its own process. The sidecar pattern
in tnd-zddc-chart will use this so zddc-server's own pod stays
unprivileged (only the podman-system-service sidecar runs privileged).

New surface:

  --convert-podman-socket / ZDDC_CONVERT_PODMAN_SOCKET
    e.g. unix:///var/run/podman/podman.sock
    Empty (default) → local mode (podman creates containers in
    zddc-server's own filesystem namespace).
    Non-empty → remote mode: `podman --remote --url=<this> run …`
    dispatches each container request to whatever process owns the
    socket. Typically a `podman system service` sidecar in the same
    Kubernetes pod.

  --convert-scratch-dir / ZDDC_CONVERT_SCRATCH_DIR
    Host-side directory for per-conversion intermediates (template,
    HTML, PDF). In remote mode this MUST be a path the sidecar sees
    at the same mountpoint — typically a shared emptyDir at /work
    in both containers. Empty = $TMPDIR (local-mode default).

Runner behaviour:

  local mode → unchanged. `podman run --userns=host --rm --pull=missing
  --network=none --read-only …`. `--userns=host` stays so nested-podman
  on a privileged host (the previous chart shape) keeps working for
  anyone still using it.

  remote mode → `podman --remote --url=<sock> run --rm --pull=missing
  --network=none --read-only …`. `--userns=host` is dropped because
  the sidecar is rootful inside its own privileged container and
  doesn't need userns juggling.

Health probe gains a Mode field ("local" | "remote") and, in remote
mode, runs `podman --remote --url=<sock> version` to confirm the
sidecar's socket is reachable. Unreachable-socket → 503 with a clear
reason (sidecar may still be starting up); reachable → ready.

Capabilities log now includes engine_version + mode + remote_url for
easier debugging of "which podman is actually doing the work".

No tests removed — the existing fake-runner table covers both modes
since the runner's args are uniform (remote prefix is the only thing
that differs).
2026-05-13 12:17:40 -05:00

219 lines
6.4 KiB
Go

package convert
import (
"context"
"fmt"
"log/slog"
"os/exec"
"strings"
"sync"
"sync/atomic"
"time"
)
// remoteURL holds the podman remote socket URL used for remote mode.
// It is written by SetRemoteURL and read through currentRemoteURL,
// which Probe consults on every run. A nil pointer (never set) and an
// empty string both mean local mode.
var remoteURL atomic.Pointer[string]
// Capabilities is a point-in-time answer to "can conversions run right
// now?". The one hard requirement is a container runtime that
// zddc-server can reach; image availability is deliberately not part
// of the snapshot because the runner relies on `--pull=missing` at
// conversion time, so a missing image shows up as an ordinary
// ConvertError rather than a failed probe.
//
// Mode distinguishes the two ways the engine is driven: "local" when
// containers are created by the engine binary invoked directly by
// zddc-server, "remote" when zddc-server acts as a client of a
// podman-system-service sidecar (see the ContainerRunner doc).
type Capabilities struct {
	Engine        string    // resolved engine binary (exec.LookPath result for podman/docker/override); "" when none was found
	EngineVer     string    // first line of "<engine> --version"; empty if that call failed
	Mode          string    // "local" or "remote"
	RemoteURL     string    // remote socket URL; only non-empty in remote mode
	PandocImage   string    // resolved pandoc image ref
	ChromiumImage string    // resolved chromium image ref
	ProbedAt      time.Time // when this snapshot was taken
	Err           error     // why the probe failed; nil on success
}

// Ready reports whether a conversion may be attempted: an engine was
// found and the probe recorded no error. A true result is not a
// guarantee — the first conversion can still fail if the configured
// image cannot be pulled, in which case the runner surfaces the
// engine's stderr.
func (c Capabilities) Ready() bool {
	if c.Err != nil {
		return false
	}
	return c.Engine != ""
}

// Reason gives a short, human-readable explanation for a not-Ready
// snapshot, intended as the body of a 503 response. For a snapshot
// that has an engine and no error it returns the generic
// "unavailable".
func (c Capabilities) Reason() string {
	switch {
	case c.Engine == "":
		// A missing engine takes precedence over whatever Err holds.
		return "no container runtime (podman or docker) found on PATH"
	case c.Err == nil:
		return "unavailable"
	case c.Mode == "remote":
		return "podman remote socket unreachable (" + c.RemoteURL + "): " + c.Err.Error()
	default:
		return c.Err.Error()
	}
}
var (
// caps is the most recent Capabilities snapshot stored by Probe. It
// stays nil until a probe has stored its first result; Available and
// Reprobe read it without taking any lock.
caps atomic.Pointer[Capabilities]
// probeCool is held for the whole of Probe so that overlapping probe
// calls run one at a time.
probeCool sync.Mutex
)
// Available returns a copy of the latest Capabilities snapshot together
// with its Ready() verdict. Before any probe has stored a snapshot it
// returns the zero Capabilities and false.
func Available() (Capabilities, bool) {
	snap := caps.Load()
	if snap != nil {
		return *snap, snap.Ready()
	}
	return Capabilities{}, false
}
// SetRemoteURL records the podman remote socket URL that later Probe /
// Reprobe calls will use. Passing the empty string selects local mode
// (the engine binary creates containers itself). It is meant to be
// called from cmd/zddc-server/main.go once flags are parsed and before
// the first Probe.
func SetRemoteURL(url string) {
	// url is this call's own copy of the argument, so its address can
	// be published directly.
	remoteURL.Store(&url)
}
// currentRemoteURL returns the remote socket URL last stored in
// remoteURL, or "" when nothing has been stored yet.
func currentRemoteURL() string {
	stored := remoteURL.Load()
	if stored == nil {
		return ""
	}
	return *stored
}
// Probe finds a container engine, hands a runner built for it to
// InstallRunner, and publishes the resulting Capabilities snapshot.
// Intended to be called once at server startup; the returned value is
// the same snapshot that was published, for logging.
//
// Engine selection: engineOverride when non-empty, otherwise podman
// and then docker; the first one found wins. Images are NOT checked
// here — the runner uses `--pull=missing`, so the first conversion
// pulls whatever it needs.
//
// When a remote URL has been configured (SetRemoteURL with a non-empty
// value) the probe additionally runs
// `<engine> --remote --url=<url> version` to verify the sidecar's
// socket answers. An engine that exists but cannot reach the socket
// yields a snapshot with Err set (Ready() == false), so conversion
// requests get 503 until the sidecar is up. In that case, and when no
// engine is found, no runner is installed by this call.
//
// No failure here is fatal: the snapshot is always stored and
// returned, and the server keeps running.
func Probe(ctx context.Context, engineOverride string) Capabilities {
	probeCool.Lock()
	defer probeCool.Unlock()

	probedAt := time.Now()
	socketURL := currentRemoteURL()
	remote := socketURL != ""

	mode := "local"
	if remote {
		mode = "remote"
	}
	snap := Capabilities{
		PandocImage:   currentPandocImage(),
		ChromiumImage: currentChromiumImage(),
		Mode:          mode,
		RemoteURL:     socketURL,
		ProbedAt:      probedAt,
	}

	enginePath := resolveEngine(engineOverride)
	if enginePath == "" {
		tried := strings.Join(enginesTried(engineOverride), ", ")
		snap.Err = fmt.Errorf("no container runtime found (tried: %s)", tried)
		caps.Store(&snap)
		slog.Warn("convert: probe failed", "reason", snap.Err.Error())
		return snap
	}
	snap.Engine = enginePath

	// The version string is informational only; a failure to read it
	// does not make the engine unusable, so the error is dropped.
	version, verErr := probeVersion(ctx, enginePath)
	if verErr == nil {
		snap.EngineVer = version
	}

	if remote {
		sockErr := probeRemoteSocket(ctx, enginePath, socketURL)
		if sockErr != nil {
			snap.Err = sockErr
			caps.Store(&snap)
			slog.Warn("convert: remote socket probe failed",
				"engine", enginePath, "remote_url", socketURL, "err", sockErr)
			return snap
		}
	}

	InstallRunner(newContainerRunner(enginePath, socketURL))
	caps.Store(&snap)
	slog.Info("convert: ready",
		"engine", enginePath,
		"engine_version", snap.EngineVer,
		"mode", snap.Mode,
		"remote_url", snap.RemoteURL,
		"pandoc_image", snap.PandocImage,
		"chromium_image", snap.ChromiumImage)
	return snap
}
// remoteProbeTimeout bounds how long probeRemoteSocket waits for the
// engine's remote `version` call. It is a variable rather than a
// constant so tests can shorten it.
var remoteProbeTimeout = 5 * time.Second

// probeRemoteSocket runs
// `<engine> --remote --url=<url> version --format={{.Client.Version}}`
// to check that the remote socket answers. The call is limited to
// remoteProbeTimeout (or the caller's ctx deadline, whichever comes
// first) so an unresponsive socket cannot stall the caller
// indefinitely.
//
// engine is the engine binary to execute; url is the remote endpoint,
// typically a Unix socket (unix:///var/run/podman/podman.sock) in the
// sidecar pattern, though a TCP form (tcp://host:port) is passed
// through unchanged as well.
//
// Returns nil when the command exits successfully. Otherwise returns
// an error that wraps the cause and includes the command's combined
// stdout/stderr; if the context expired or was cancelled, the wrapped
// cause is the context error rather than the kill signal.
func probeRemoteSocket(ctx context.Context, engine, url string) error {
	ctx, cancel := context.WithTimeout(ctx, remoteProbeTimeout)
	defer cancel()

	cmd := exec.CommandContext(ctx, engine, "--remote", "--url="+url, "version", "--format={{.Client.Version}}")
	out, err := cmd.CombinedOutput()
	if err != nil {
		// A process killed on context expiry reports only
		// "signal: killed"; the context error says why it was killed.
		if ctxErr := ctx.Err(); ctxErr != nil {
			err = ctxErr
		}
		return fmt.Errorf("podman --remote version: %w (output: %s)", err, strings.TrimSpace(string(out)))
	}
	return nil
}
// Reprobe runs Probe again unless the last snapshot is still fresh.
// The handler uses it when a request finds the converter not Ready, so
// an operator can recover (for example by installing podman after the
// server started) without restarting. A snapshot younger than 60 s is
// returned as-is, which keeps requests on the error path cheap.
func Reprobe(ctx context.Context, engineOverride string) Capabilities {
	const cooldown = 60 * time.Second

	last := caps.Load()
	if last == nil || time.Since(last.ProbedAt) >= cooldown {
		return Probe(ctx, engineOverride)
	}
	return *last
}
// resolveEngine returns the executable path of the container engine to
// use, or "" when none can be found. A non-empty override is the only
// candidate considered; otherwise podman is tried first and docker
// second. Lookup is done with exec.LookPath.
func resolveEngine(override string) string {
	candidates := []string{"podman", "docker"}
	if override != "" {
		candidates = []string{override}
	}
	for _, name := range candidates {
		if path, err := exec.LookPath(name); err == nil {
			return path
		}
	}
	return ""
}
// enginesTried lists the engine names that resolveEngine would have
// looked for given the same override, for use in error messages: just
// the override when one is set, otherwise podman and docker.
func enginesTried(override string) []string {
	if override == "" {
		return []string{"podman", "docker"}
	}
	return []string{override}
}
// probeVersion runs `<engine> --version` and returns the first line of
// its combined stdout/stderr with surrounding whitespace trimmed. If
// the command cannot be run or exits unsuccessfully it returns "" and
// that error.
func probeVersion(ctx context.Context, engine string) (string, error) {
	raw, err := exec.CommandContext(ctx, engine, "--version").CombinedOutput()
	if err != nil {
		return "", err
	}
	firstLine, _, _ := strings.Cut(strings.TrimSpace(string(raw)), "\n")
	return firstLine, nil
}