fix(cache): track background revalidation goroutines; drain on shutdown + in tests

Root cause of the flaky cache tests (TestServeHTTP_DirectoryListingsCachedAsSidecar
and the other hit-path tests, ~1-in-many under parallel load): on a cache
hit, ServeHTTP launches `go c.revalidate(...)` / `go c.revalidateListing(...)`,
which write into the cache root (MkdirAll + CreateTemp + Rename). Those
goroutines outlive the request — and in tests, the test — so they race
t.TempDir's RemoveAll cleanup, recreating the dir or dropping a temp file
mid-removal. testing then reports "TempDir RemoveAll cleanup: ... directory
not empty" and marks the test failed (with a 0.00s body, no assertion line).
It only surfaced under the full parallel suite / -count because the timing
has to collide.

Fix: track these background goroutines in a sync.WaitGroup via a goBackground
helper, and expose Wait(). newTestCache registers t.Cleanup(c.Wait) — cleanups
fire LIFO and t.TempDir registered its RemoveAll first, so the drain runs
before it (upstream Close was registered earliest, so it runs last and stays
up while goroutines finish). runClient also calls cacheLayer.Wait() after
srv.Shutdown so in-flight sidecar writes complete on graceful shutdown rather
than being abandoned.

Verified: cache package at -count=200 reliably failed before, passes clean
after (0 failures, 0 cleanup errors); full `go test ./...` + vet green.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
ZDDC 2026-05-21 16:21:37 -05:00
parent 8ef2ce01d0
commit 1402864c4c
3 changed files with 44 additions and 3 deletions

View file

@ -418,6 +418,10 @@ func runClient(cfg config.Config) {
if err := srv.Shutdown(shutdownCtx); err != nil {
slog.Error("shutdown error", "err", err)
}
// Drain background cache work (revalidation kicked off on hits)
// before exiting, so in-flight sidecar writes finish rather than
// being abandoned mid-flight.
cacheLayer.Wait()
slog.Info("stopped")
}

View file

@ -67,6 +67,13 @@ type Cache struct {
markerOnce sync.Once
// wg tracks background goroutines (cache revalidation on hits,
// mirror-walk hooks) so Wait() can drain them. Without this they
// outlive the request — fine in production until a graceful
// shutdown wants them finished, and in tests they race t.TempDir
// cleanup by writing into the cache root after the test returns.
wg sync.WaitGroup
// onAccess is invoked (when non-nil) after a request is dispatched.
// The walker scheduler installs this hook to kick mirror walks based
// on incoming traffic. Always called in a goroutine — must not
@ -85,6 +92,25 @@ type Cache struct {
// offline writes.
func (c *Cache) SetOutbox(o *Outbox) { c.outbox = o }
// goBackground runs fn in a tracked goroutine so Wait can drain
// in-flight background work — cache revalidation kicked off on a hit,
// and the mirror-walk access hook. These must never block the user
// response, but they also shouldn't outlive a graceful shutdown (or,
// in tests, a t.TempDir cleanup that they'd race by writing into the
// cache root after the test returns).
func (c *Cache) goBackground(fn func()) {
c.wg.Add(1)
go func() {
defer c.wg.Done()
fn()
}()
}
// Wait blocks until all tracked background goroutines have finished.
// Intended for graceful shutdown; tests call it before the temp-dir
// cleanup runs.
func (c *Cache) Wait() { c.wg.Wait() }
// New constructs a Cache from the loaded configuration. Validates
// upstream URL, reads the bearer-file (if configured), prepares the
// HTTP client honoring SkipTLSVerify, and ensures the cache root
@ -181,7 +207,8 @@ func (c *Cache) ServeHTTP(w http.ResponseWriter, r *http.Request) {
// after we've started serving — the user's request never blocks
// on walk scheduling.
if c.onAccess != nil {
go c.onAccess(r.URL.Path)
urlPath := r.URL.Path
c.goBackground(func() { c.onAccess(urlPath) })
}
// Directory listings: try sidecar listing-cache, fall back to
@ -201,7 +228,8 @@ func (c *Cache) ServeHTTP(w http.ResponseWriter, r *http.Request) {
if err == nil && !info.IsDir() {
c.serveFromDisk(w, r, path, info, "hit")
// Background revalidate; never block the user response.
go c.revalidate(r.URL.Path, info.ModTime())
urlPath, mtime := r.URL.Path, info.ModTime()
c.goBackground(func() { c.revalidate(urlPath, mtime) })
return
}
}
@ -221,7 +249,8 @@ func (c *Cache) serveDirectory(w http.ResponseWriter, r *http.Request) {
info, err := os.Stat(path)
if err == nil && !info.IsDir() {
c.serveListingFromDisk(w, r, path, info, "hit")
go c.revalidateListing(r.URL.Path, r.Header.Get("Accept"), info.ModTime())
urlPath, accept, mtime := r.URL.Path, r.Header.Get("Accept"), info.ModTime()
c.goBackground(func() { c.revalidateListing(urlPath, accept, mtime) })
return
}
}

View file

@ -31,6 +31,14 @@ func newTestCache(t *testing.T, mode string, upstreamHandler http.HandlerFunc) (
if err != nil {
t.Fatalf("New: %v", err)
}
// Drain background revalidation goroutines before the test's
// t.TempDir cleanup runs. Cleanups fire LIFO and t.TempDir
// registered its RemoveAll first (at the t.TempDir() call above),
// so this runs before it — preventing a revalidate goroutine from
// recreating the cache dir / dropping a temp file mid-RemoveAll
// ("directory not empty"). The upstream stays up (its Close was
// registered earliest, so it runs last).
t.Cleanup(c.Wait)
return c, upstream
}