ZDDC/zddc/internal/cache/walker.go
ZDDC 70d49ba111 fix(client): three bugs found by live smoke testing
Phase 3 + 4 live two-instance smoke tests against the synthetic
~/zddc-test-data fixture surfaced three real bugs that the unit
tests missed. All three are fixed in this commit.

1. walker: filenames with spaces/parens land on disk percent-encoded

   walkSubtree was passing the URL-encoded child URL (built via
   url.PathEscape) to fetchFileIfNeeded → cachePathFor, so a file
   named "Foo (IFI) - Bar.md" landed at <root>/.../Foo%20%28IFI%29
   %20-%20Bar.md on disk. Then purgeOrphans iterated os.ReadDir
   (which sees the encoded names) and compared against upstreamNames
   (decoded names from the listing JSON). Every fetched file was
   classified as an orphan and immediately deleted: a 180-file walk
   produced "fetched=180 purged=111" with only 70 files remaining.

   Fix: walker now maintains two parallel path strings — dirURL
   (URL-encoded for HTTP requests) and dirPath (decoded for disk
   keys). fetchFileIfNeeded, fetchListing, persistOnly, and
   purgeOrphans all take the decoded path. listingCachePathFor
   gets dirPath too. Smoke confirmed: dirs=29 files=180 fetched=179
   purged=0 (one file already cached from the user's GET that
   triggered the walk).

2. outbox: replay loop sleeps 5min after eager startup pass

   RunReplayLoop's idle-poll interval is 5min. After the eager
   startup pass with 0 entries, the loop sleeps 5min — even if a
   PUT-while-offline arrives 1 second later, replay won't fire for
   ~5 min. The cache returned 202 promptly but the queued write sat
   on disk until either a 5min nap elapsed or another PUT happened.

   Fix: Outbox gains a wake chan (buffered=1, drop-on-full).
   Enqueue posts to it after writing meta.json. RunReplayLoop selects
   on wake alongside the timer, so a new offline write triggers an
   immediate replay attempt. Smoke confirmed: PUT queued at T+0,
   master back at T+3, replay completes at T+3 (was previously a
   30s wait through the timer-based poll).

3. master: PUT/DELETE didn't honor If-Unmodified-Since

   The cache's outbox sends If-Unmodified-Since: <cached-mtime> on
   replay so the master can reject conflicting writes with 412. The
   master's checkIfMatch only evaluated If-Match (ETag-based), so
   the cache's mtime-based precondition was silently ignored. Result:
   an offline PUT staged before an external mod would clobber the
   newer external content on replay — silent data loss in the exact
   scenario the outbox is designed to detect.

   Fix: checkIfMatch now also evaluates If-Unmodified-Since per
   RFC 7232 §3.4, returning 412 when the file's current mtime is
   strictly later than the header value (1-second resolution to
   match HTTP-Date precision). Smoke confirmed: cache GET → external
   mod via direct file write → cache offline PUT → master back →
   replay sends IUS → master 412 → outbox entry renamed to
   <id>.conflict-<RFC3339>/ → master content preserved (the
   external mod, not the stale offline write).

Also added an info-level "outbox: replay attempt" log to tryReplay
so an operator watching the cache logs sees the replay loop is
alive even when every entry defers (transport error). Previously
the loop was silent unless a replay actually completed (200) or
conflicted (412).

go vet + go test ./... + go test -race ./internal/{cache,auth,handler}/...
all green. Synthetic ~/zddc-test-data fixture (553 files, 144 PDFs)
exercises the walker against realistic ZDDC filenames including
spaces, parens, and accented characters that the unit tests'
"a.txt" / "b.txt" inputs never hit.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 09:34:07 -05:00

450 lines
13 KiB
Go

package cache
import (
"context"
"encoding/json"
"fmt"
"io"
"log/slog"
"net/http"
"net/url"
"os"
"path"
"path/filepath"
"strings"
"sync"
"time"
)
// MirrorStateFile is the persisted scheduler state — last walk time
// per subtree. Lives at the cache root so state survives restarts.
const MirrorStateFile = ".zddc-mirror-state.json"
// DefaultMirrorMinInterval is the gate between walks for the same
// subtree. Active mirrors stay current via repeated visits; idle
// mirrors don't generate any upstream traffic until someone hits them.
const DefaultMirrorMinInterval = time.Hour
// DefaultMirrorParallelism caps how many concurrent file fetches a
// single walk fires. Bound is per-walk, not global — multiple
// subtrees walked at once each get their own budget. Conservative
// enough not to starve interactive requests sharing the same
// connection pool.
const DefaultMirrorParallelism = 4
// MirrorScheduler owns the per-subtree access-triggered walk policy.
// Built once at startup from cfg.MirrorSubtree[]; the cache layer
// installs Trigger as its onAccess hook.
type MirrorScheduler struct {
cache *Cache
subtrees []string // normalized: leading "/", no trailing "/"
minInterval time.Duration
parallelism int
mu sync.Mutex
state map[string]subtreeState
statePath string
inFlight map[string]bool // tracked separately from on-disk state — restarts forget
}
type subtreeState struct {
LastWalkAt time.Time `json:"last_walk_at"`
}
type persistedState struct {
Subtrees map[string]subtreeState `json:"subtrees"`
}
// NewMirrorScheduler constructs a scheduler attached to the given
// cache. Returns nil scheduler (and nil error) when no subtrees are
// configured — caller checks for that.
func NewMirrorScheduler(c *Cache, subtrees []string, minInterval time.Duration, parallelism int) (*MirrorScheduler, error) {
if len(subtrees) == 0 {
return nil, nil
}
if minInterval <= 0 {
minInterval = DefaultMirrorMinInterval
}
if parallelism <= 0 {
parallelism = DefaultMirrorParallelism
}
normalized := make([]string, 0, len(subtrees))
for _, s := range subtrees {
s = strings.TrimSpace(s)
if s == "" || s == "/" {
s = "/"
} else {
if !strings.HasPrefix(s, "/") {
s = "/" + s
}
s = strings.TrimRight(s, "/")
}
normalized = append(normalized, s)
}
statePath := filepath.Join(c.root, MirrorStateFile)
state := loadMirrorState(statePath)
s := &MirrorScheduler{
cache: c,
subtrees: normalized,
minInterval: minInterval,
parallelism: parallelism,
state: state,
statePath: statePath,
inFlight: make(map[string]bool),
}
c.onAccess = s.Trigger
return s, nil
}
// Subtrees returns the configured subtree list (read-only).
func (s *MirrorScheduler) Subtrees() []string {
out := make([]string, len(s.subtrees))
copy(out, s.subtrees)
return out
}
// MinInterval returns the configured min walk interval.
func (s *MirrorScheduler) MinInterval() time.Duration { return s.minInterval }
// Trigger inspects an incoming URL and, if it falls under a
// configured subtree AND a walk is due, kicks one in the background.
// Always non-blocking. Safe to call concurrently.
func (s *MirrorScheduler) Trigger(urlPath string) {
subtree := s.matchSubtree(urlPath)
if subtree == "" {
return
}
s.mu.Lock()
defer s.mu.Unlock()
if s.inFlight[subtree] {
return
}
st := s.state[subtree]
if !st.LastWalkAt.IsZero() && time.Since(st.LastWalkAt) < s.minInterval {
return
}
s.inFlight[subtree] = true
go s.runWalk(subtree)
}
// runWalk does one walk-and-record cycle for subtree. Always called
// in a goroutine from Trigger.
func (s *MirrorScheduler) runWalk(subtree string) {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute)
defer cancel()
slog.Info("mirror walk start", "subtree", subtree)
start := time.Now()
stats, err := s.walkSubtree(ctx, subtree)
dur := time.Since(start)
if err != nil {
slog.Warn("mirror walk failed", "subtree", subtree, "duration", dur, "err", err)
} else {
slog.Info("mirror walk done", "subtree", subtree, "duration", dur,
"dirs", stats.dirs, "files", stats.files, "fetched", stats.fetched, "purged", stats.purged)
}
s.mu.Lock()
defer s.mu.Unlock()
s.inFlight[subtree] = false
s.state[subtree] = subtreeState{LastWalkAt: time.Now()}
persistMirrorState(s.statePath, s.state)
}
type walkStats struct {
dirs int
files int
fetched int
purged int
}
// walkSubtree recursively walks the upstream listings under subtree,
// fetching missing/stale files and purging local files no longer in
// the upstream's filtered listing. Bounded parallelism via a
// semaphore. Errors fetching individual files are logged but don't
// abort the walk.
func (s *MirrorScheduler) walkSubtree(ctx context.Context, subtree string) (walkStats, error) {
sem := make(chan struct{}, s.parallelism)
var stats walkStats
var statsMu sync.Mutex
addFetched := func() { statsMu.Lock(); stats.fetched++; statsMu.Unlock() }
addPurged := func() { statsMu.Lock(); stats.purged++; statsMu.Unlock() }
addDirs := func() { statsMu.Lock(); stats.dirs++; statsMu.Unlock() }
addFiles := func() { statsMu.Lock(); stats.files++; statsMu.Unlock() }
var fetchWG sync.WaitGroup
// walkDir keeps two parallel views of the directory's path:
// dirURL — URL-encoded form, used to compose HTTP requests
// dirPath — decoded form, used as the filesystem cache key.
// Filenames with spaces / parens / etc. need URL escaping for
// HTTP but must NOT land on disk percent-encoded — otherwise the
// orphan-purge pass below can't match local entries against the
// upstream's (decoded) listing names.
var walkDir func(dirURL, dirPath string) error
walkDir = func(dirURL, dirPath string) error {
if !strings.HasSuffix(dirURL, "/") {
dirURL += "/"
}
if !strings.HasSuffix(dirPath, "/") {
dirPath += "/"
}
addDirs()
entries, err := s.fetchListing(ctx, dirURL, dirPath)
if err != nil {
return err
}
// Build a set of names the upstream listing exposes so we can
// purge anything local-but-removed-upstream after walking.
upstreamNames := make(map[string]bool, len(entries))
for _, e := range entries {
name := strings.TrimSuffix(e.Name, "/")
upstreamNames[name] = true
}
for _, e := range entries {
if ctx.Err() != nil {
return ctx.Err()
}
name := strings.TrimSuffix(e.Name, "/")
childURL := dirURL + url.PathEscape(name)
childPath := dirPath + name
if e.IsDir {
childURL += "/"
childPath += "/"
if err := walkDir(childURL, childPath); err != nil {
slog.Debug("walk subdir failed", "url", childURL, "err", err)
}
continue
}
addFiles()
fetchWG.Add(1)
sem <- struct{}{}
go func(fileURL, filePath string) {
defer fetchWG.Done()
defer func() { <-sem }()
if s.fetchFileIfNeeded(ctx, fileURL, filePath) {
addFetched()
}
}(childURL, childPath)
}
// Purge local files no longer present upstream. Only acts on
// regular files inside this directory — listing sidecars,
// dot-prefix internals, and subdirectories are left alone.
s.purgeOrphans(dirPath, upstreamNames, addPurged)
return nil
}
err := walkDir(subtree+"/", subtree+"/")
fetchWG.Wait()
return stats, err
}
// purgeOrphans removes local cached files in dirPath (decoded URL
// path) that are absent from the upstream listing. Only operates on
// plain files; ignores dot-prefix entries (cache state) and
// subdirectories (they get their own pass during walkSubtree's
// recursion).
func (s *MirrorScheduler) purgeOrphans(dirPath string, upstreamNames map[string]bool, addPurged func()) {
rel := filepath.FromSlash(strings.Trim(dirPath, "/"))
localDir := filepath.Join(s.cache.root, rel)
entries, err := os.ReadDir(localDir)
if err != nil {
return
}
for _, e := range entries {
if e.IsDir() {
continue
}
name := e.Name()
if strings.HasPrefix(name, ".") {
continue
}
if upstreamNames[name] {
continue
}
_ = os.Remove(filepath.Join(localDir, name))
addPurged()
slog.Info("purged orphan", "path", path.Join(dirPath, name))
}
}
// fetchListing GETs the upstream JSON listing for dirURL (encoded)
// and parses the entries. Caches the JSON sidecar at dirPath
// (decoded) as a side-effect so the directory is browsable offline.
func (s *MirrorScheduler) fetchListing(ctx context.Context, dirURL, dirPath string) ([]listingEntry, error) {
target := s.cache.upstream + dirURL
req, err := http.NewRequestWithContext(ctx, http.MethodGet, target, nil)
if err != nil {
return nil, err
}
req.Header.Set("Accept", "application/json")
if s.cache.bearer != "" {
req.Header.Set("Authorization", "Bearer "+s.cache.bearer)
}
resp, err := s.cache.client.Do(req)
if err != nil {
return nil, fmt.Errorf("fetch listing %s: %w", target, err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return nil, fmt.Errorf("listing %s: status %d", target, resp.StatusCode)
}
bodyBytes, err := io.ReadAll(io.LimitReader(resp.Body, 16*1024*1024))
if err != nil {
return nil, fmt.Errorf("read listing body: %w", err)
}
// Persist the JSON sidecar so the directory is browsable offline.
// listingCachePathFor expects a decoded directory URL (keyed by
// FS path), so dirPath rather than dirURL here.
if path, ok := s.cache.listingCachePathFor(dirPath, "application/json"); ok {
if err := os.MkdirAll(filepath.Dir(path), 0o755); err == nil {
tmp, err := os.CreateTemp(filepath.Dir(path), ".zddc-cache-tmp-*")
if err == nil {
tmpName := tmp.Name()
if _, err := tmp.Write(bodyBytes); err == nil {
_ = tmp.Close()
if lm := resp.Header.Get("Last-Modified"); lm != "" {
if t, err := http.ParseTime(lm); err == nil {
_ = os.Chtimes(tmpName, t, t)
}
}
_ = os.Rename(tmpName, path)
s.cache.maybeWriteMarker()
} else {
_ = tmp.Close()
_ = os.Remove(tmpName)
}
}
}
}
var parsed []listingEntry
if err := json.Unmarshal(bodyBytes, &parsed); err != nil {
return nil, fmt.Errorf("parse listing %s: %w", target, err)
}
return parsed, nil
}
// listingEntry mirrors the subset of zddc/internal/listing/types.go's
// FileInfo we need. Extra fields ignored.
type listingEntry struct {
Name string `json:"name"`
IsDir bool `json:"is_dir"`
}
// fetchFileIfNeeded conditional-GETs a file URL. fileURL is the
// URL-encoded HTTP target; filePath is the decoded URL path used as
// the on-disk cache key. Returns true when the cache was updated
// (200 + write); false when no work was needed (304 or skipped).
func (s *MirrorScheduler) fetchFileIfNeeded(ctx context.Context, fileURL, filePath string) bool {
cachePath, ok := s.cache.cachePathFor(filePath)
if !ok {
return false
}
target := s.cache.upstream + fileURL
req, err := http.NewRequestWithContext(ctx, http.MethodGet, target, nil)
if err != nil {
return false
}
if info, err := os.Stat(cachePath); err == nil {
req.Header.Set("If-Modified-Since", info.ModTime().UTC().Format(http.TimeFormat))
}
if s.cache.bearer != "" {
req.Header.Set("Authorization", "Bearer "+s.cache.bearer)
}
resp, err := s.cache.client.Do(req)
if err != nil {
return false
}
defer resp.Body.Close()
if resp.StatusCode == http.StatusNotModified {
return false
}
if resp.StatusCode != http.StatusOK {
// 4xx → ACL revoked or upstream gone; remove local cache.
if resp.StatusCode == http.StatusForbidden || resp.StatusCode == http.StatusNotFound {
_ = os.Remove(cachePath)
}
return false
}
if !s.cache.responseCacheable(resp) {
return false
}
// persistOnly resolves the disk path via cachePathFor too — pass
// the decoded filePath, not the encoded fileURL.
if err := s.cache.persistOnly(resp, filePath); err != nil {
slog.Debug("walker persist failed", "path", filePath, "err", err)
return false
}
s.cache.maybeWriteMarker()
return true
}
// matchSubtree returns the longest configured subtree that's a prefix
// of urlPath, or "" if none match. "/" matches everything.
func (s *MirrorScheduler) matchSubtree(urlPath string) string {
if urlPath == "" {
urlPath = "/"
}
var best string
for _, st := range s.subtrees {
if st == "/" {
if len(best) == 0 {
best = "/"
}
continue
}
if strings.HasPrefix(urlPath, st+"/") || urlPath == st {
if len(st) > len(best) {
best = st
}
}
}
return best
}
// loadMirrorState reads the persisted state from disk. Returns an
// empty (initialized) map on missing/corrupt input.
func loadMirrorState(path string) map[string]subtreeState {
state := make(map[string]subtreeState)
bytes, err := os.ReadFile(path)
if err != nil {
return state
}
var parsed persistedState
if err := json.Unmarshal(bytes, &parsed); err != nil {
return state
}
if parsed.Subtrees != nil {
state = parsed.Subtrees
}
return state
}
// persistMirrorState writes the state file atomically (temp + rename).
// Best-effort: errors are logged but non-fatal.
func persistMirrorState(path string, state map[string]subtreeState) {
bytes, err := json.MarshalIndent(persistedState{Subtrees: state}, "", " ")
if err != nil {
slog.Debug("marshal mirror state", "err", err)
return
}
tmp, err := os.CreateTemp(filepath.Dir(path), ".zddc-mirror-state-tmp-*")
if err != nil {
slog.Debug("create mirror state tmp", "err", err)
return
}
tmpName := tmp.Name()
if _, err := tmp.Write(bytes); err != nil {
_ = tmp.Close()
_ = os.Remove(tmpName)
return
}
if err := tmp.Close(); err != nil {
_ = os.Remove(tmpName)
return
}
if err := os.Rename(tmpName, path); err != nil {
_ = os.Remove(tmpName)
}
}