ZDDC/zddc/internal/apps/cache.go
ZDDC b20e98b6aa fix(apps): cache key now includes scheme + full host:port (no collisions)
The previous keyForURL stripped default ports (:443 for https, :80
for http) and omitted the scheme, so:

  http://example.com/x.html   ──┐
  https://example.com/x.html  ──┴──→ same cache entry (collision)

  https://example.com/x.html      ──┐
  https://example.com:443/x.html  ──┴──→ same cache entry

This was a defensible HTTP convention but a real correctness issue
on reverse-proxy stacks where http and https legitimately serve
different bytes for the same path, or where two upstreams share a
host but answer on different default ports.

New layout: <scheme>/<host>[:<port>]/<path>. Full origin tuple in
the key, no port stripping, scheme segregation. Examples:

  https/zddc.varasys.io/releases/archive_stable.html
  https/example.com:8443/x.html
  http/example.com/y.html      (distinct from https/example.com/y.html)

Operators retain the "ls _app/ to inspect what's cached" affordance
they relied on; they just see one extra directory layer (scheme
first, then host).

Tests:
  * Updated TestKeyForURL to assert the new layout for every
    previously-covered case
  * New TestKeyForURL_NoCollisions explicitly asserts that the
    dimensions previously collapsed (default-port↔bare, http↔https,
    different non-default ports) now produce distinct keys

Doc references to the cache layout under <ZDDC_ROOT>/_app/ updated
in zddc/README.md (3 mentions).

NOTE: existing _app/ caches under the old layout will be ignored
on first request after upgrade — entries will be re-fetched and
written to the new path. Operators can `rm -rf <ZDDC_ROOT>/_app`
during the upgrade window if they prefer not to have orphans.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-04 17:57:28 -05:00

187 lines
5.1 KiB
Go

package apps
import (
"fmt"
"io/fs"
"net/url"
"os"
"path/filepath"
"strings"
)
// Cache stores fetched URL responses on disk under <ZDDC_ROOT>/_app/.
// Files are name-keyed by upstream scheme + host[:port] + path (the
// layout produced by keyForURL) so operators can list and inspect them
// by hand. There is no metadata, no SHA-256, no expiration —
// fetch-once-and-keep-forever. To force a refetch, delete the cache
// file.
type Cache struct {
// root is the cache directory; every entry path is built beneath it.
root string
}
// NewCache returns a Cache backed by the directory at root. The path
// is normalized with filepath.Clean and the directory (plus any missing
// parents) is created with mode 0o755 when absent. Before returning,
// temp files left behind by writes that were interrupted ahead of
// their final rename are removed.
//
// An error is returned when the directory cannot be created or the
// sweep fails; both are wrapped with a short description of the step.
func NewCache(root string) (*Cache, error) {
	dir := filepath.Clean(root)
	if mkErr := os.MkdirAll(dir, 0o755); mkErr != nil {
		return nil, fmt.Errorf("create cache root: %w", mkErr)
	}

	cache := &Cache{root: dir}
	sweepErr := cache.sweepTemps()
	if sweepErr != nil {
		return nil, fmt.Errorf("sweep temps: %w", sweepErr)
	}
	return cache, nil
}
// Root returns the cache directory path as stored on the Cache.
// NewCache stores its argument after filepath.Clean, so the result is
// absolute only when the caller supplied an absolute path.
func (c *Cache) Root() string {
	return c.root
}
// keyForURL converts a URL into a slash-separated relative path under
// the cache root.
//
// Layout: <scheme>/<host>[:<port>]/<path>. The full origin tuple is in
// the key so two URLs that resolve different content cannot collide:
//
//	https://example.com/x.html      → https/example.com/x.html
//	http://example.com/x.html       → http/example.com/x.html
//	https://example.com:8443/x.html → https/example.com:8443/x.html
//	https://example.com/            → https/example.com/index.html
//
// Normalization and validation:
//
//   - Only http and https are accepted, and a query string is an
//     error.
//   - The host is lowercased, bracketed IPv6 literals included, so case
//     variants of one host share a key. An explicit port is kept as
//     written: ":443" is never folded onto the bare host, which keeps
//     "operator wrote the default port" distinct from "operator wrote
//     a bare host".
//   - A host of "." or ".." is rejected. Joined onto the cache root it
//     would be resolved away and land on another origin's entry.
//   - A ".." path segment is rejected rather than resolved, so
//     /a/../b.html can never alias the entry for /b.html. Names that
//     merely contain two dots (a..b.html) are ordinary files.
//   - Runs of "/" collapse to one, "." segments are dropped, and an
//     empty path maps to index.html.
//
// The returned key always uses "/" as separator; callers convert it to
// an OS path with filepath.FromSlash.
func keyForURL(rawURL string) (string, error) {
	u, err := url.Parse(rawURL)
	if err != nil {
		return "", fmt.Errorf("parse URL: %w", err)
	}
	if u.Scheme != "http" && u.Scheme != "https" {
		return "", fmt.Errorf("unsupported scheme %q", u.Scheme)
	}
	if u.Host == "" {
		return "", fmt.Errorf("URL is missing host")
	}
	if u.RawQuery != "" {
		return "", fmt.Errorf("URL must not contain query string: %s", rawURL)
	}

	// url.Parse only accepts an all-digit port, so lowercasing the whole
	// authority leaves the port untouched while also covering IPv6
	// literals, whose hex groups sit after the first ':'.
	host := strings.ToLower(u.Host)
	if host == "." || host == ".." {
		return "", fmt.Errorf("invalid host %q", u.Host)
	}

	// Check segments on the raw path: once a rooted path has been
	// cleaned, every ".." segment is already resolved and undetectable.
	for _, seg := range strings.Split(u.Path, "/") {
		if seg == ".." {
			return "", fmt.Errorf("URL path contains '..'")
		}
	}

	// Leading slashes carry no information ("//" means the same as "/");
	// interior runs are collapsed by Clean below.
	p := strings.TrimLeft(u.Path, "/")
	if p == "" {
		p = "index.html"
	}

	// ToSlash keeps the key slash-separated on platforms where Clean
	// emits a different separator.
	cleaned := filepath.ToSlash(filepath.Clean("/" + p))
	return u.Scheme + "/" + host + cleaned, nil
}
// pathFor returns the on-disk location of the cache entry for rawURL:
// the key from keyForURL, converted to OS separators and joined onto
// the cache root. Any error from keyForURL is returned unchanged.
func (c *Cache) pathFor(rawURL string) (string, error) {
	rel, keyErr := keyForURL(rawURL)
	if keyErr != nil {
		return "", keyErr
	}
	native := filepath.FromSlash(rel)
	return filepath.Join(c.root, native), nil
}
// Has reports whether a cache entry exists for the URL. It returns
// false both when the URL cannot be mapped to a cache path and when
// os.Stat on that path fails for any reason.
func (c *Cache) Has(rawURL string) bool {
	entry, keyErr := c.pathFor(rawURL)
	if keyErr != nil {
		return false
	}
	if _, statErr := os.Stat(entry); statErr != nil {
		return false
	}
	return true
}
// Read returns the cached body for the URL. A URL that cannot be
// mapped to a cache path yields that mapping error; otherwise the
// result of os.ReadFile is returned as-is, so a missing entry reports
// an error matching os.ErrNotExist.
func (c *Cache) Read(rawURL string) ([]byte, error) {
	entry, keyErr := c.pathFor(rawURL)
	if keyErr != nil {
		return nil, keyErr
	}
	body, readErr := os.ReadFile(entry)
	return body, readErr
}
// Write stores body as the cache entry for the URL. Missing parent
// directories are created with mode 0o755, and the bytes are handed to
// writeAtomic, which stages them in a temp file and renames it into
// place so a partially written entry is never visible under the final
// name. Errors from path mapping, directory creation, or the write are
// returned unchanged.
func (c *Cache) Write(rawURL string, body []byte) error {
	dest, keyErr := c.pathFor(rawURL)
	if keyErr != nil {
		return keyErr
	}
	parent := filepath.Dir(dest)
	if mkErr := os.MkdirAll(parent, 0o755); mkErr != nil {
		return mkErr
	}
	return writeAtomic(dest, body)
}
// writeAtomic writes data to path without ever exposing a partial
// file under that name. The bytes go to a temp file named
// "<base>.tmp.<random>" in the same directory, are synced and closed,
// and the temp file is then renamed over path. If any step after the
// temp file is created fails, the temp file is removed (best effort)
// and that step's error is returned.
func writeAtomic(path string, data []byte) error {
	f, err := os.CreateTemp(filepath.Dir(path), filepath.Base(path)+".tmp.*")
	if err != nil {
		return err
	}
	staged := f.Name()

	// stage runs write → sync → close and leaves the handle closed on
	// every path; a close error after an earlier failure is ignored so
	// the first error is the one reported.
	stage := func() error {
		if _, werr := f.Write(data); werr != nil {
			_ = f.Close()
			return werr
		}
		if serr := f.Sync(); serr != nil {
			_ = f.Close()
			return serr
		}
		return f.Close()
	}

	err = stage()
	if err == nil {
		err = os.Rename(staged, path)
	}
	if err != nil {
		// Best effort: the temp file is disposable and the original
		// failure is what the caller needs to see.
		_ = os.Remove(staged)
		return err
	}
	return nil
}
// sweepTemps walks the cache tree and removes every regular entry
// whose name contains ".tmp.", the marker writeAtomic puts in its temp
// file names. The match is by substring, so a cached entry whose own
// name contains ".tmp." is removed as well. Removal is best effort:
// individual os.Remove failures are ignored. A walk error is returned
// unless it reports that a path does not exist.
func (c *Cache) sweepTemps() error {
	visit := func(entryPath string, entry fs.DirEntry, walkErr error) error {
		switch {
		case walkErr != nil:
			// Checked first: entry may be nil when the walk reports an error.
			return walkErr
		case entry.IsDir():
			return nil
		case strings.Contains(entry.Name(), ".tmp."):
			_ = os.Remove(entryPath)
		}
		return nil
	}

	walkErr := filepath.WalkDir(c.root, visit)
	if walkErr == nil || os.IsNotExist(walkErr) {
		return nil
	}
	return walkErr
}