The previous keyForURL stripped default ports (:443 for https, :80 for http) and omitted the scheme, so:

    http://example.com/x.html   ──┐
    https://example.com/x.html  ──┴──→ same cache entry (collision)

    https://example.com/x.html      ──┐
    https://example.com:443/x.html  ──┴──→ same cache entry

This was a defensible HTTP convention but a real correctness issue on reverse-proxy stacks where http and https legitimately serve different bytes for the same path, or where two upstreams share a host but answer on different default ports.

New layout: <scheme>/<host>[:<port>]/<path>. Full origin tuple in the key, no port stripping, scheme segregation. Examples:

    https/zddc.varasys.io/releases/archive_stable.html
    https/example.com:8443/x.html
    http/example.com/y.html (distinct from https/example.com/y.html)

Operators retain the "ls _app/ to inspect what's cached" affordance they relied on; they just see one extra directory layer (scheme first, then host).

Tests:
  * Updated TestKeyForURL to assert the new layout for every previously-covered case
  * New TestKeyForURL_NoCollisions explicitly asserts that the dimensions previously collapsed (default-port↔bare, http↔https, different non-default ports) now produce distinct keys

Doc references to the cache layout under <ZDDC_ROOT>/_app/ updated in zddc/README.md (3 mentions).

NOTE: existing _app/ caches under the old layout will be ignored on first request after upgrade — entries will be re-fetched and written to the new path. Operators can `rm -rf <ZDDC_ROOT>/_app` during the upgrade window if they prefer not to have orphans.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
187 lines
5.1 KiB
Go
187 lines
5.1 KiB
Go
package apps
|
|
|
|
import (
|
|
"fmt"
|
|
"io/fs"
|
|
"net/url"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
)
|
|
|
|
// Cache stores fetched URL responses on disk under <ZDDC_ROOT>/_app/.
//
// Entries are plain files keyed by <scheme>/<host>[:<port>]/<path> (see
// keyForURL), so operators can list and inspect them by hand, e.g.
// `ls _app/https/example.com/`. There is no metadata, no SHA-256, no
// expiration — fetch-once-and-keep-forever. To force a refetch, delete
// the cache file.
type Cache struct {
	// root is the cache directory, stored after filepath.Clean by NewCache.
	root string
}
|
|
|
|
// NewCache creates a Cache rooted at the given path. The directory is
|
|
// created if missing. Stale *.tmp files left over from interrupted
|
|
// writes are swept on construction.
|
|
func NewCache(root string) (*Cache, error) {
|
|
root = filepath.Clean(root)
|
|
if err := os.MkdirAll(root, 0o755); err != nil {
|
|
return nil, fmt.Errorf("create cache root: %w", err)
|
|
}
|
|
c := &Cache{root: root}
|
|
if err := c.sweepTemps(); err != nil {
|
|
return nil, fmt.Errorf("sweep temps: %w", err)
|
|
}
|
|
return c, nil
|
|
}
|
|
|
|
// Root returns the cache directory absolute path.
|
|
func (c *Cache) Root() string { return c.root }
|
|
|
|
// keyForURL converts a URL into a slash-separated relative path under
// the cache root.
//
// Layout: <scheme>/<host>[:<port>]/<path>. The full origin tuple is in
// the key so two URLs that resolve different content cannot collide:
//
//	https://example.com/x.html      → https/example.com/x.html
//	http://example.com/x.html       → http/example.com/x.html
//	https://example.com:8443/x.html → https/example.com:8443/x.html
//
// No port stripping: an explicit default port (:443, :80) stays in the
// key, so "the operator wrote a URL with the default port" and "the
// operator wrote a bare-host URL" remain distinct entries. Operators can
// still `ls _app/https/example.com/` to inspect what's cached. Scheme
// segregation prevents an http:// hit from masquerading as an https://
// hit on stacks where the two serve different bytes.
//
// Normalization applied to the key:
//   - the host is lowercased, including the hex digits of a bracketed
//     IPv6 literal; the port is kept as written;
//   - repeated slashes and "." / ".." segments are resolved against the
//     root, so the result can never climb above <scheme>/<host>/;
//   - an empty path, or one that cleans down to "/", maps to index.html.
//
// An error is returned when the URL does not parse, the scheme is not
// http or https, the host is missing, a query string is present, or the
// cleaned path still contains "..".
func keyForURL(rawURL string) (string, error) {
	u, err := url.Parse(rawURL)
	if err != nil {
		return "", fmt.Errorf("parse URL: %w", err)
	}
	if u.Scheme != "http" && u.Scheme != "https" {
		return "", fmt.Errorf("unsupported scheme %q", u.Scheme)
	}
	if u.Host == "" {
		return "", fmt.Errorf("URL is missing host")
	}
	if u.RawQuery != "" {
		return "", fmt.Errorf("URL must not contain query string: %s", rawURL)
	}

	// Lowercase the whole authority. url.Parse only accepts a numeric
	// port, so this leaves the port untouched, and unlike splitting at
	// the first ':' it also normalizes bracketed IPv6 literals such as
	// [2001:DB8::1]:8443.
	host := strings.ToLower(u.Host)

	// Clean collapses duplicate slashes and resolves "." / ".." against
	// the leading "/". Leading slashes are trimmed first so the input
	// always starts with exactly one, and ToSlash keeps the key
	// slash-separated regardless of the host OS (pathFor converts it
	// with filepath.FromSlash).
	cleaned := filepath.ToSlash(filepath.Clean("/" + strings.TrimLeft(u.Path, "/")))
	if cleaned == "/" {
		// Covers "", "/", "//" and paths like "/." that reduce to the
		// root; without this the key would end in "/" and resolve to the
		// host directory itself.
		cleaned = "/index.html"
	}
	// Real ".." segments were already resolved by Clean, so this only
	// fires for names that embed ".." (e.g. "a..b"); it is kept as a
	// conservative guard.
	if strings.Contains(cleaned, "..") {
		return "", fmt.Errorf("URL path contains '..'")
	}
	return u.Scheme + "/" + host + cleaned, nil
}
|
|
|
|
func (c *Cache) pathFor(rawURL string) (string, error) {
|
|
key, err := keyForURL(rawURL)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
return filepath.Join(c.root, filepath.FromSlash(key)), nil
|
|
}
|
|
|
|
// Has reports whether a cache entry exists for the URL.
|
|
func (c *Cache) Has(rawURL string) bool {
|
|
p, err := c.pathFor(rawURL)
|
|
if err != nil {
|
|
return false
|
|
}
|
|
_, err = os.Stat(p)
|
|
return err == nil
|
|
}
|
|
|
|
// Read returns the cached body or os.ErrNotExist.
|
|
func (c *Cache) Read(rawURL string) ([]byte, error) {
|
|
p, err := c.pathFor(rawURL)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return os.ReadFile(p)
|
|
}
|
|
|
|
// Write atomically stores body for the URL. Parent directories are
|
|
// created as needed. Writes via tmp+rename so partial files are never
|
|
// observable.
|
|
func (c *Cache) Write(rawURL string, body []byte) error {
|
|
p, err := c.pathFor(rawURL)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if err := os.MkdirAll(filepath.Dir(p), 0o755); err != nil {
|
|
return err
|
|
}
|
|
return writeAtomic(p, body)
|
|
}
|
|
|
|
// writeAtomic writes data to path by way of a temp file in the same
// directory: create "<base>.tmp.*", write, sync, close, then rename over
// path. On any failure the temp file is removed (best effort) and the
// triggering error is returned; path is only touched by the final
// rename.
//
// NOTE(review): the temp file is created by os.CreateTemp, which uses
// mode 0o600, and nothing here changes it, so the final file keeps that
// mode.
func writeAtomic(path string, data []byte) error {
	tmp, err := os.CreateTemp(filepath.Dir(path), filepath.Base(path)+".tmp.*")
	if err != nil {
		return err
	}
	tmpName := tmp.Name()

	// abort discards the temp file and hands the triggering error back.
	// stillOpen says whether the handle must be closed first.
	abort := func(cause error, stillOpen bool) error {
		if stillOpen {
			_ = tmp.Close()
		}
		_ = os.Remove(tmpName)
		return cause
	}

	if _, err = tmp.Write(data); err != nil {
		return abort(err, true)
	}
	// Flush to stable storage before the rename makes the file visible.
	if err = tmp.Sync(); err != nil {
		return abort(err, true)
	}
	if err = tmp.Close(); err != nil {
		return abort(err, false)
	}
	if err = os.Rename(tmpName, path); err != nil {
		return abort(err, false)
	}
	return nil
}
|
|
|
|
func (c *Cache) sweepTemps() error {
|
|
err := filepath.WalkDir(c.root, func(p string, d fs.DirEntry, err error) error {
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if d.IsDir() {
|
|
return nil
|
|
}
|
|
if strings.Contains(d.Name(), ".tmp.") {
|
|
_ = os.Remove(p)
|
|
}
|
|
return nil
|
|
})
|
|
if err != nil && !os.IsNotExist(err) {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|