ZDDC/tests/data/test-archive.sh
ZDDC db1f44cf74 test,docs(zip): browse/archive zip-transmittal coverage + fixture + docs
- tests/browse.spec.js: expand a .zip in the file tree (offline), drill
  into a member subdir, preview a text member — exercises shared/zip-source.js
  and the migrated offline path end to end.
- tests/archive.spec.js: a .zip whose name parses as a transmittal folder
  is scanned like an uncompressed one — members land in the file list with
  tracking numbers parsed, tied to the zip transmittal's folder.
- tests/fixtures/mock-fs-api.js: __setMockDirectoryTree now keeps binary
  leaf values (Uint8Array/ArrayBuffer/Blob) intact instead of String()-ing
  them — needed to feed real zip bytes through the mock FS.
- tests/data/test-archive.sh: each party gets one transmittal delivered as
  a single .zip in received/, so the bitnest fixture exercises the
  zip-as-virtual-directory path.
- ARCHITECTURE.md / AGENTS.md: document .zip-as-navigable-directory (server
  route + ACL model + shared client adapter + the one-level nesting limit).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-12 12:35:48 -05:00

606 lines
22 KiB
Bash
Executable file

#!/bin/sh
# test-archive.sh — build/clear a synthetic ZDDC archive for end-to-end
# testing of master + cache + mirror.
#
# The fixture mimics the SHAPE of a real ZDDC archive (project →
# archive → party → received|issued → dated transmittal folder →
# tracking-number-named files) but contains zero identifying data.
# Every file's content is a 4-line metadata block:
#
# Tracking Number: FAC1-EL-CAL-0020
# Revision: A
# Status: IFI
# Title: <synthetic lorem-ipsum phrase>
#
# rendered into the appropriate format per extension. Open any file
# and you can verify it's the right one. Tracking-number / revision /
# status / extension distributions are derived from a real archive
# CSV (~/archive-export*.csv) but the script never reads that CSV
# at runtime — distributions are baked in here as constants.
#
# Output lives at $TEST_ARCHIVE_DIR (default ~/zddc-test-data),
# OUTSIDE the repo. There is also a defensive .gitignore entry
# matching common in-repo paths in case someone redirects.
#
# PDF generation uses docker.io/pandoc/latex via podman with
# --userns=keep-id so output is owned by the host user. If podman
# isn't available (or the container run fails), PDF generation falls
# back to a hand-rolled minimal valid PDF written by a small python3
# snippet — it opens in a reader but has no formatting.
set -eu
# ---------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------
# Output root. Taken from TEST_ARCHIVE_DIR when set and non-empty,
# otherwise $HOME/zddc-test-data.
TARGET="${TEST_ARCHIVE_DIR:-$HOME/zddc-test-data}"
# Set to 1 by the --small flag; cmd_build then uses the *_SMALL values
# below instead of the *_FULL ones.
SMALL=0
PROJECTS_FULL="Project-1 Project-2 Project-3"
PROJECTS_SMALL="Project-1"
PARTIES_FULL="PartyA PartyB PartyC"
PARTIES_SMALL="PartyA"
TRANSMITTALS_PER_PARTY_FULL=6
TRANSMITTALS_PER_PARTY_SMALL=2
# Each transmittal contains at least one of every extension in
# EXTENSIONS_GUARANTEED (6 of them), plus extras up to this total.
# (With the SMALL value of 6 there is no room for extras.)
FILES_PER_TRANSMITTAL_FULL=10
FILES_PER_TRANSMITTAL_SMALL=6
# Container image handed to `podman run` by render_pdf.
PANDOC_IMAGE="docker.io/pandoc/latex:latest"
# Status / revision / extension / discipline distributions, derived
# from a 773-row sample of a real archive. Format-preserving — these
# are public ZDDC convention vocabularies, no identifying data.
# Repeating a word in a list is how it is weighted: pick_word chooses
# uniformly among the words, so duplicates are picked more often.
STATUSES="IFR IFR IFR IFR IFR IFR IFI IFI IFU IFU IFA RSB" # weighted
REVISIONS="A B 0 0A 0B C D"
# The full extension set per the test plan. Each transmittal gets one
# of each (so every fixture exercises every extension), then EXTRAS
# are sampled from the weighted distribution.
EXTENSIONS_GUARANTEED="md yaml pdf html zddc zip"
EXTENSIONS_WEIGHTED="pdf pdf pdf pdf md md yaml html zip zddc"
# Vocabulary for the discipline and doc-type segments of a synthetic
# tracking number (see make_tracking).
DISCIPLINES="EL PM CAL CPT TRN INT MEC SPC"
DOC_TYPES="CAL CPT TRN SPC DRW LST RPT MDL"
# Lorem-ipsum-style title fragments. No real-world references.
TITLE_WORDS="lorem ipsum dolor sit amet consectetur adipiscing elit \
sed eiusmod tempor incididunt labore magna aliqua veniam nostrud \
exercitation ullamco laboris nisi commodo duis aute irure dolore"
# Synthetic admin emails for .zddc ACLs. example.com is reserved
# (RFC 2606), guaranteed not to belong to anyone real.
ADMIN_EMAIL="admin@example.com"
# NOTE(review): USER_EMAILS does not appear to be referenced anywhere
# else in this script — write_zddc_config spells out the same three
# addresses literally. Confirm whether it is still needed.
USER_EMAILS="alice@example.com bob@example.com carol@example.com"
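# ---------------------------------------------------------------------
# Random helper, usage text and argument parsing (the subcommand
# dispatch itself is the `case` at the bottom of the file, after all
# functions are defined)
# ---------------------------------------------------------------------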
# ---------------------------------------------------------------------
# Random-number helper. dash (POSIX /bin/sh) has no $RANDOM, so each
# call reads 2 bytes from /dev/urandom and has od decode them as one
# unsigned 16-bit integer (0..65535). Cheap (no awk per call) and
# properly random across runs. Prints the number with no trailing
# newline.
# ---------------------------------------------------------------------
_rand() {
    # od pads its decimal output with blanks; arithmetic expansion
    # ignores that padding, leaving just the digits.
    printf '%s' "$(( $(od -An -N2 -tu2 /dev/urandom) ))"
}
# Print the help text to stdout. Interpolates the script name ($0) and
# the configured pandoc image ($PANDOC_IMAGE).
usage() {
    printf '%s\n' \
        "Usage: $0 <subcommand> [--small]" \
        'Subcommands:' \
        'build [--small] Generate the synthetic archive (small = ~10x fewer files).' \
        'clear Remove the archive directory entirely.' \
        "info Show what's there (file count, total size, top-level layout)." \
        'help Print this message.' \
        'Configuration:' \
        'TEST_ARCHIVE_DIR Output directory (default: ~/zddc-test-data).' \
        'PDF generation:' \
        "Uses $PANDOC_IMAGE via podman (with --userns=keep-id so output is" \
        'owned by the host user). Falls back to plaintext PDF when podman' \
        'is unavailable.'
}
# The first positional argument is the subcommand (default: help); any
# remaining arguments are flags.
cmd="${1:-help}"
# Only shift when there is something to shift. `shift` is a special
# builtin: with no positional parameters a non-interactive POSIX shell
# (dash, for one) may abort the whole script, which `|| true` cannot
# catch — so a bare invocation would exit instead of printing help.
if [ "$#" -gt 0 ]; then
    shift
fi
for arg in "$@"; do
    case "$arg" in
        --small) SMALL=1 ;;
        *) echo "unknown flag: $arg" >&2; exit 2 ;;
    esac
done
# ---------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------
# pick_word <list> — print one randomly chosen word from a
# whitespace-separated list, followed by a newline. Words may be
# separated by any run of spaces, tabs or newlines. Returns 1 (and
# prints nothing on stdout) when the list contains no words.
# Uses _rand as the randomness source.
pick_word() {
    # Let the shell do the splitting so that counting and selection
    # always agree (wc -w and cut -d' ' disagree as soon as two
    # separators are adjacent). Globbing is switched off around the
    # split so a word such as "*" stays literal; a noglob setting that
    # was already active is left untouched.
    case $- in
        *f*)
            # shellcheck disable=SC2086 # word-splitting is the point
            set -- $1
            ;;
        *)
            set -f
            # shellcheck disable=SC2086 # word-splitting is the point
            set -- $1
            set +f
            ;;
    esac
    if [ "$#" -eq 0 ]; then
        echo "pick_word: empty list" >&2
        return 1
    fi
    # Drop a random number (0 .. count-1) of leading words; the pick is
    # whatever is now first.
    shift $(( $(_rand) % $# ))
    printf '%s\n' "$1"
}
# random_int <min> <max> — print a random integer in the inclusive
# range [min, max], followed by a newline. Expects min <= max.
random_int() {
    # Draw from /dev/urandom instead of awk's srand()/rand(): srand()
    # seeds from the time of day, so every call made within the same
    # second produced the same "random" number. Four bytes give a
    # 32-bit unsigned value, far wider than any range used here.
    ri_raw=$(( $(od -An -N4 -tu4 /dev/urandom) ))
    printf '%d\n' "$(( $1 + ri_raw % ($2 - $1 + 1) ))"
}
# Print a date string YYYY-MM-DD that lies between 0 and 729 days
# before today (i.e. somewhere in roughly the last two years).
random_date() {
    # /dev/urandom rather than awk's srand()/rand(): srand() seeds from
    # the time of day, so calls within the same second all returned the
    # same offset and every transmittal got the same date.
    rd_days=$(( $(od -An -N2 -tu2 /dev/urandom) % 730 ))
    # GNU date understands -d "N days ago"; BSD/macOS date does not, so
    # fall back to its -v adjustment syntax.
    date -d "$rd_days days ago" +%Y-%m-%d 2>/dev/null ||
        date -v-"${rd_days}"d +%Y-%m-%d
}
# Print a lorem title: 3 to 6 words drawn from TITLE_WORDS, joined by
# single spaces, with the first letter of the first word upper-cased.
# No trailing newline.
random_title() {
    rt_left=$(random_int 3 6)
    rt_title=""
    rt_first=1
    while [ "$rt_left" -gt 0 ]; do
        rt_word=$(pick_word "$TITLE_WORDS")
        if [ "$rt_first" = 1 ]; then
            # Split the opening word into its first character and the
            # remainder, and upper-case only the former.
            rt_tail=${rt_word#?}
            rt_head=$(printf '%s' "${rt_word%"$rt_tail"}" | tr 'a-z' 'A-Z')
            rt_word="${rt_head}${rt_tail}"
            rt_first=0
        fi
        rt_title="${rt_title:+$rt_title }${rt_word}"
        rt_left=$((rt_left - 1))
    done
    printf '%s' "$rt_title"
}
# make_tracking <party> — print a synthetic tracking number of the form
# <party>-<facility>-<discipline>-<doctype>-NNNN, where <party> is the
# argument with a leading "Party" removed (PartyA → A), <facility> is
# FAC1..FAC4, discipline/doctype come from DISCIPLINES / DOC_TYPES and
# NNNN is a zero-padded number from 1 to 999. No trailing newline.
make_tracking() {
    # The substitutions are expanded left to right, so the random draws
    # happen in the order facility, discipline, doctype, sequence.
    printf '%s-FAC%s-%s-%s-%04d' \
        "${1#Party}" \
        "$(random_int 1 4)" \
        "$(pick_word "$DISCIPLINES")" \
        "$(pick_word "$DOC_TYPES")" \
        "$(random_int 1 999)"
}
# render_file <ext> <tracking> <rev> <status> <title> <outpath>
# Write the metadata block to <outpath> in the format implied by <ext>:
# md, yaml, html and zddc are written here; pdf and zip are delegated
# to render_pdf / render_zip. An unknown extension is fatal (exit 1).
render_file() {
    rf_ext="$1"; rf_track="$2"; rf_rev="$3"
    rf_status="$4"; rf_title="$5"; rf_out="$6"
    case "$rf_ext" in
        pdf)
            render_pdf "$rf_track" "$rf_rev" "$rf_status" "$rf_title" "$rf_out"
            ;;
        zip)
            render_zip "$rf_track" "$rf_rev" "$rf_status" "$rf_title" "$rf_out"
            ;;
        md)
            printf '%s\n' \
                "# $rf_track" \
                '| Field | Value |' \
                '|---|---|' \
                "| Tracking Number | $rf_track |" \
                "| Revision | $rf_rev |" \
                "| Status | $rf_status |" \
                "| Title | $rf_title |" \
                'This is a synthetic test fixture. Generated by tests/data/test-archive.sh.' \
                > "$rf_out"
            ;;
        yaml)
            printf '%s\n' \
                "tracking_number: \"$rf_track\"" \
                "revision: \"$rf_rev\"" \
                "status: \"$rf_status\"" \
                "title: \"$rf_title\"" \
                'synthetic: true' \
                'generated_by: tests/data/test-archive.sh' \
                > "$rf_out"
            ;;
        zddc)
            # *.zddc as a data file (not the special config file).
            # YAML-shape since .zddc files ARE YAML.
            printf '%s\n' \
                '# Synthetic .zddc data file (not an ACL config).' \
                "tracking_number: \"$rf_track\"" \
                "revision: \"$rf_rev\"" \
                "status: \"$rf_status\"" \
                "title: \"$rf_title\"" \
                'synthetic: true' \
                > "$rf_out"
            ;;
        html)
            cat <<EOF > "$rf_out"
<!doctype html>
<html><head><meta charset="utf-8"><title>$rf_track</title></head>
<body style="font-family:sans-serif;padding:2em">
<h1>$rf_track</h1>
<table border="1" cellpadding="6" cellspacing="0">
<tr><th>Tracking Number</th><td>$rf_track</td></tr>
<tr><th>Revision</th><td>$rf_rev</td></tr>
<tr><th>Status</th><td>$rf_status</td></tr>
<tr><th>Title</th><td>$rf_title</td></tr>
</table>
<p><em>Synthetic test fixture. Generated by tests/data/test-archive.sh.</em></p>
</body></html>
EOF
            ;;
        *)
            echo "render_file: unknown extension $rf_ext" >&2
            exit 1
            ;;
    esac
}
# render_pdf <tracking> <rev> <status> <title> <outpath>
# Produce <outpath> as a PDF. The preferred route is pandoc inside the
# $PANDOC_IMAGE container, run through podman. render_pdf_minimal is
# used instead when PDF_BACKEND=minimal, when podman is not on PATH, or
# when the container run fails.
render_pdf() {
    rp_track="$1"; rp_rev="$2"; rp_status="$3"; rp_title="$4"; rp_out="$5"
    if [ "${PDF_BACKEND:-pandoc}" != "minimal" ] && command -v podman >/dev/null 2>&1; then
        # Pandoc input: a temporary markdown file next to the target,
        # removed again whether or not the render succeeds.
        rp_md="${rp_out%.pdf}.tmp.md"
        printf '%s\n' \
            "# $rp_track" \
            '| Field | Value |' \
            '|---|---|' \
            "| Tracking Number | $rp_track |" \
            "| Revision | $rp_rev |" \
            "| Status | $rp_status |" \
            "| Title | $rp_title |" \
            'Synthetic test fixture. Generated by tests/data/test-archive.sh.' \
            > "$rp_md"
        # The output directory is mounted at /data and must be writable
        # by the in-container UID; --userns=keep-id keeps that the host
        # user.
        rp_ok=1
        podman run --rm --userns=keep-id \
            -v "$(dirname "$rp_out")":/data:Z \
            "$PANDOC_IMAGE" "/data/$(basename "$rp_md")" \
            -o "/data/$(basename "$rp_out")" >/dev/null 2>&1 || rp_ok=0
        rm -f "$rp_md"
        if [ "$rp_ok" = 1 ]; then
            return 0
        fi
        # Pandoc failed (image missing? network blocked?) — fall through
        # to the minimal writer.
    fi
    render_pdf_minimal "$rp_track" "$rp_rev" "$rp_status" "$rp_title" "$rp_out"
}
# render_pdf_minimal <tracking> <rev> <status> <title> <outpath>
# Hand-rolled minimal valid PDF — opens in any reader, displays the
# metadata block. ~600 bytes. Used only when pandoc isn't reachable.
# Requires python3 on PATH: the PDF bytes (objects, xref table with
# byte offsets, trailer) are assembled by the embedded script below.
render_pdf_minimal() {
    tracking="$1"; rev="$2"; status="$3"; title="$4"; out="$5"
    # PDF strings escape (, ), \ — the lorem-ipsum titles never include
    # these so a basic substitution is enough for our fixture.
    safe() { printf '%s' "$1" | sed 's/[()\\]/_/g'; }
    t=$(safe "$tracking"); r=$(safe "$rev"); s=$(safe "$status"); ti=$(safe "$title")
    # The heredoc delimiter is quoted so the shell passes the Python
    # source through untouched; values travel as argv instead.
    python3 - "$out" "$t" "$r" "$s" "$ti" <<'PY'
import sys

out, t, r, s, ti = sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5]

# Page content stream: one text-showing operation per metadata line.
text = (
    f"BT /F1 14 Tf 72 720 Td ({t}) Tj ET\n"
    f"BT /F1 12 Tf 72 700 Td (Revision: {r}) Tj ET\n"
    f"BT /F1 12 Tf 72 685 Td (Status: {s}) Tj ET\n"
    f"BT /F1 12 Tf 72 670 Td (Title: {ti}) Tj ET\n"
    f"BT /F1 10 Tf 72 640 Td (Synthetic test fixture - tests/data/test-archive.sh) Tj ET\n"
)
text_b = text.encode("latin-1", errors="replace")

# Objects 1..5: catalog, page tree, page, content stream, font.
objs = [
    b"<</Type/Catalog/Pages 2 0 R>>",
    b"<</Type/Pages/Kids[3 0 R]/Count 1>>",
    b"<</Type/Page/Parent 2 0 R/MediaBox[0 0 612 792]/Contents 4 0 R/Resources<</Font<</F1 5 0 R>>>>>>",
    b"<</Length " + str(len(text_b)).encode() + b">>stream\n" + text_b + b"endstream",
    b"<</Type/Font/Subtype/Type1/BaseFont/Helvetica>>",
]

buf = bytearray(b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n")
offsets = []
for i, obj in enumerate(objs, 1):
    # Record where each object starts; the xref table needs the offsets.
    offsets.append(len(buf))
    buf += f"{i} 0 obj\n".encode() + obj + b"\nendobj\n"

xref = len(buf)
buf += f"xref\n0 {len(objs)+1}\n".encode()
buf += b"0000000000 65535 f \n"
for o in offsets:
    buf += f"{o:010d} 00000 n \n".encode()
buf += f"trailer <</Size {len(objs)+1}/Root 1 0 R>>\nstartxref\n{xref}\n%%EOF\n".encode()

with open(out, "wb") as fh:
    fh.write(buf)
PY
}
# render_zip <tracking> <rev> <status> <title> <outpath>
# Render a .zip containing a .md, .yaml, and .html with the same
# metadata so unzipping shows three views of the same record. POSIX
# sh has no function-local scope, so nested render_file calls would
# clobber $out — copy to z_out first.
# Returns 1 (after removing its temp dir) if the archive can't be made.
render_zip() {
    z_track="$1"; z_rev="$2"; z_status="$3"; z_title="$4"; z_out="$5"
    # zip is run from inside the temp dir, so a relative output path
    # would be created there and deleted along with it. Anchor relative
    # paths to the caller's working directory first.
    case "$z_out" in
        /*) ;;
        *) z_out="$PWD/$z_out" ;;
    esac
    tmpdir=$(mktemp -d)
    render_file md "$z_track" "$z_rev" "$z_status" "$z_title" "$tmpdir/$z_track.md"
    render_file yaml "$z_track" "$z_rev" "$z_status" "$z_title" "$tmpdir/$z_track.yaml"
    render_file html "$z_track" "$z_rev" "$z_status" "$z_title" "$tmpdir/$z_track.html"
    # cd in a subshell so member names inside the archive are bare
    # file names and the caller's working directory is untouched.
    if ! (cd "$tmpdir" && zip -q "$z_out" ./*); then
        rm -rf "$tmpdir"
        echo "render_zip: failed to create $z_out" >&2
        return 1
    fi
    rm -rf "$tmpdir"
}
# write_zddc_config <outpath> [role] [title] [extra_yaml]
# Write a per-directory .zddc ACL config. Synthetic emails only.
#   role        default | party | project (anything else = default)
#   title       project/party roles only: human-friendly title; when
#               empty a generic "Synthetic <role> ACL" title is used
#   extra_yaml  project/party roles only: YAML text appended verbatim
#               at top level (e.g. a display: block exercising the
#               canonical-folder display-override feature)
# The permission entries are written nested under acl: → permissions:
# (two-space indentation); emitted flat they would be top-level keys
# and acl: itself would be empty.
write_zddc_config() {
    out="$1"
    role="${2:-default}" # default | party | project
    title="${3:-}"
    extra_yaml="${4:-}"
    case "$role" in
        project|party)
            if [ -z "$title" ]; then
                title="Synthetic ${role} ACL — test fixture"
            fi
            {
                cat <<EOF
title: "$title"
admins:
  - $ADMIN_EMAIL
acl:
  permissions:
    "$ADMIN_EMAIL": rwcda
    "alice@example.com": rwcd
    "bob@example.com": rw
    "carol@example.com": r
EOF
                if [ -n "$extra_yaml" ]; then
                    printf '%s\n' "$extra_yaml"
                fi
            } > "$out"
            ;;
        *)
            cat > "$out" <<EOF
title: "ZDDC test fixture — synthetic root"
admins:
  - $ADMIN_EMAIL
acl:
  permissions:
    "$ADMIN_EMAIL": rwcda
    "*@example.com": r
EOF
            ;;
    esac
}
# project_title <project> — print the display title for a project
# directory name. The titles are fixed strings, so they stay stable
# across rebuilds and the archive app's dropdown shows recognisable,
# human-friendly names. Unknown names get a generic "Synthetic
# project — <name>" title.
project_title() {
    pt_label="Synthetic project — $1"
    case "$1" in
        Project-1) pt_label="Wabash Industrial Refit — Phase 1" ;;
        Project-2) pt_label="North Avenue Transit Spur" ;;
        Project-3) pt_label="Lincoln Square Substation Upgrade" ;;
    esac
    echo "$pt_label"
}
# project_extra_yaml <project> — print extra YAML for a project's
# .zddc, or nothing for projects that have none. Only Project-3 has
# any: a display: mapping whose entries are indented beneath it so
# they parse as its children rather than as separate top-level keys.
project_extra_yaml() {
    case "$1" in
        Project-3)
            # Exercise the display-override feature on canonical
            # project-root folders. The on-disk names stay lowercase
            # (canonical); the UI shows the friendly label.
            cat <<'EOF'
display:
  archive: "Records"
  working: "In-Progress"
  staging: "Outbox"
  reviewing: "Pending Responses"
EOF
            ;;
    esac
}
# ---------------------------------------------------------------------
# build
# ---------------------------------------------------------------------
# cmd_build — generate the synthetic archive under $TARGET.
# Refuses (exit 1) if $TARGET already exists. SMALL=1 selects the
# *_SMALL dimensions, otherwise the *_FULL ones. Layout produced:
#   $TARGET/.zddc
#   $TARGET/<project>/.zddc
#   $TARGET/<project>/archive/<party>/{.zddc,incoming,issued,received}
# with dated transmittal folders alternating between issued/ and
# received/, plus one transmittal per party delivered as a single .zip
# in received/. Prints a summary line with the number of transmittal
# files rendered (incoming seeds and .zddc configs are not counted).
cmd_build() {
    if [ -e "$TARGET" ]; then
        echo "$TARGET already exists. Run '$0 clear' first." >&2
        exit 1
    fi
    if [ "$SMALL" = 1 ]; then
        projects="$PROJECTS_SMALL"
        parties="$PARTIES_SMALL"
        per_party=$TRANSMITTALS_PER_PARTY_SMALL
        per_trans=$FILES_PER_TRANSMITTAL_SMALL
        echo "building SMALL fixture at $TARGET"
    else
        projects="$PROJECTS_FULL"
        parties="$PARTIES_FULL"
        per_party=$TRANSMITTALS_PER_PARTY_FULL
        per_trans=$FILES_PER_TRANSMITTAL_FULL
        echo "building FULL fixture at $TARGET"
    fi
    # 0777 on the archive dir lets the rootless-podman pandoc container
    # write PDF output regardless of UID-namespace mapping. We're in
    # $HOME so the parent dir is already access-controlled by user.
    mkdir -p "$TARGET"
    chmod 0777 "$TARGET"
    # Work with an absolute path from here on. The zip steps run after
    # cd-ing into a temp dir, so a relative TEST_ARCHIVE_DIR would make
    # them write the archive inside that temp dir (and lose it); podman
    # likewise needs an absolute host path for its -v bind mount.
    TARGET=$(cd "$TARGET" && pwd)
    # Root .zddc — admins + read-only-for-anyone-with-an-example.com-email.
    write_zddc_config "$TARGET/.zddc" default
    file_count=0
    pdf_count=0
    for project in $projects; do
        proj_dir="$TARGET/$project"
        mkdir -p "$proj_dir"
        chmod 0777 "$proj_dir"
        write_zddc_config "$proj_dir/.zddc" project \
            "$(project_title "$project")" "$(project_extra_yaml "$project")"
        for party in $parties; do
            party_dir="$proj_dir/archive/$party"
            mkdir -p "$party_dir/received" "$party_dir/issued" "$party_dir/incoming"
            chmod 0777 "$party_dir" "$party_dir/received" "$party_dir/issued" "$party_dir/incoming"
            write_zddc_config "$party_dir/.zddc" party
            # Seed incoming/ with a couple of unclassified files (no
            # transmittal envelope yet) so the grid view has data to
            # exercise. Stays small — incoming is the staging surface
            # for files dropped by counterparties; classifier turns
            # them into named transmittal folders.
            incoming_seed_count=3
            j=0
            while [ "$j" -lt "$incoming_seed_count" ]; do
                j=$((j + 1))
                seed_ext=$(pick_word "$EXTENSIONS_WEIGHTED")
                seed_title="$(random_title)"
                seed_name="incoming-${j}-${seed_title}.${seed_ext}"
                seed_path="$party_dir/incoming/$seed_name"
                # Use the per-transmittal renderer with dummy
                # tracking/rev/status. classifier reads the bytes,
                # not the envelope, when renaming.
                render_file "$seed_ext" "drop-${j}" "_A" "(IFR)" "$seed_title" "$seed_path"
            done
            i=0
            while [ "$i" -lt "$per_party" ]; do
                i=$((i + 1))
                # Alternate received / issued.
                if [ $((i % 2)) = 0 ]; then
                    bucket="received"
                else
                    bucket="issued"
                fi
                # Transmittal envelope: <date>_<tracking> (<status>) - <title>
                t_track=$(make_tracking "$party")
                t_status=$(pick_word "$STATUSES")
                t_title=$(random_title)
                t_date=$(random_date)
                t_dir="$party_dir/$bucket/${t_date}_${t_track} (${t_status}) - ${t_title}"
                mkdir -p "$t_dir"
                chmod 0777 "$t_dir"
                # Build the per-transmittal extension list: every
                # extension in EXTENSIONS_GUARANTEED at least once,
                # then weighted-random extras to reach per_trans total.
                file_exts="$EXTENSIONS_GUARANTEED"
                guaranteed_count=$(echo "$file_exts" | wc -w)
                extras=$((per_trans - guaranteed_count))
                if [ "$extras" -gt 0 ]; then
                    k=0
                    while [ "$k" -lt "$extras" ]; do
                        file_exts="$file_exts $(pick_word "$EXTENSIONS_WEIGHTED")"
                        k=$((k + 1))
                    done
                fi
                for f_ext in $file_exts; do
                    f_track=$(make_tracking "$party")
                    f_rev=$(pick_word "$REVISIONS")
                    f_status=$(pick_word "$STATUSES")
                    f_title=$(random_title)
                    # Filename per ZDDC convention.
                    f_name="${f_track}_${f_rev} (${f_status}) - ${f_title}.${f_ext}"
                    f_path="$t_dir/$f_name"
                    render_file "$f_ext" "$f_track" "$f_rev" "$f_status" "$f_title" "$f_path"
                    file_count=$((file_count + 1))
                    if [ "$f_ext" = "pdf" ]; then
                        pdf_count=$((pdf_count + 1))
                    fi
                done
            done
            # One transmittal delivered as a single .zip (rather than an
            # uncompressed folder): archive treats it like a normal
            # transmittal folder, and zddc-server serves "<…>.zip/" as a
            # virtual directory whose members are extracted on demand.
            # NOTE: the z_* names must be assigned here, after the loop
            # above — render_zip (reached via render_file for .zip
            # members) reuses z_track/z_status/z_title as its own
            # globals.
            z_track=$(make_tracking "$party")
            z_status=$(pick_word "$STATUSES")
            z_title=$(random_title)
            z_date=$(random_date)
            z_dest="$party_dir/received/${z_date}_${z_track} (${z_status}) - ${z_title}.zip"
            z_tmp=$(mktemp -d)
            for z_ext in pdf md yaml; do
                m_track=$(make_tracking "$party")
                m_rev=$(pick_word "$REVISIONS")
                m_status=$(pick_word "$STATUSES")
                m_title=$(random_title)
                m_name="${m_track}_${m_rev} (${m_status}) - ${m_title}.${z_ext}"
                render_file "$z_ext" "$m_track" "$m_rev" "$m_status" "$m_title" "$z_tmp/$m_name"
                file_count=$((file_count + 1))
                if [ "$z_ext" = "pdf" ]; then
                    pdf_count=$((pdf_count + 1))
                fi
            done
            # z_dest is absolute (TARGET was resolved above), so it is
            # safe to run zip from inside the temp dir.
            ( cd "$z_tmp" && zip -q "$z_dest" ./* )
            rm -rf "$z_tmp"
            # Best effort: the archive is still usable if chmod fails.
            chmod 0666 "$z_dest" 2>/dev/null || true
        done
    done
    echo "built: $file_count files ($pdf_count PDFs) at $TARGET"
    echo "info: $0 info"
}
# ---------------------------------------------------------------------
# clear
# ---------------------------------------------------------------------
# cmd_clear — delete the fixture directory at $TARGET.
# A missing $TARGET is not an error (a note is printed, status 0).
# Exits 1 without deleting anything when $TARGET has no .zddc file.
cmd_clear() {
    [ -e "$TARGET" ] || {
        echo "$TARGET does not exist; nothing to clear"
        return 0
    }
    # Defense in depth: only a directory carrying the root .zddc marker
    # is treated as a test archive and removed.
    [ -f "$TARGET/.zddc" ] || {
        echo "$TARGET does not contain a .zddc — refusing to rm" >&2
        echo "(set TEST_ARCHIVE_DIR explicitly if your fixture lives elsewhere)" >&2
        exit 1
    }
    rm -rf "$TARGET"
    echo "cleared $TARGET"
}
# ---------------------------------------------------------------------
# info
# ---------------------------------------------------------------------
# cmd_info — summarise the fixture at $TARGET on stdout: file count,
# total size (only when `du -sb` is supported), the most common file
# extensions, and up to 20 directories from the top three levels shown
# relative to $TARGET. A missing $TARGET prints a hint and returns 0.
cmd_info() {
    if [ ! -e "$TARGET" ]; then
        echo "$TARGET does not exist (run '$0 build' first)"
        return 0
    fi
    echo "fixture: $TARGET"
    # Arithmetic expansion strips the padding some wc implementations
    # put in front of the count.
    files=$(( $(find "$TARGET" -type f | wc -l) ))
    # du -sb is a GNU extension; elsewhere it fails, bytes stays empty
    # and the size line is skipped.
    bytes=$(du -sb "$TARGET" 2>/dev/null | awk '{print $1}')
    echo "files: $files"
    if [ -n "$bytes" ]; then
        # Format bytes as KB/MB.
        awk -v b="$bytes" 'BEGIN {
            if (b < 1024) printf "size: %d B\n", b
            else if (b < 1048576) printf "size: %.1f KB\n", b / 1024
            else printf "size: %.1f MB\n", b / 1048576
        }'
    fi
    echo "by extension:"
    find "$TARGET" -type f -name '*.*' | sed -E 's/.*\.([a-z]+)$/\1/' | sort | uniq -c | sort -rn | head | awk '{printf " %5d %s\n", $1, $2}'
    echo "top-level layout:"
    # Strip the $TARGET prefix as a literal string. Splicing $TARGET
    # into a sed pattern treats it as a regex, so a path containing
    # characters such as [ ] * or | was left unstripped or broke sed.
    find "$TARGET" -maxdepth 3 -mindepth 1 -type d |
        while IFS= read -r ci_dir; do
            printf ' .%s\n' "${ci_dir#"$TARGET"}"
        done | head -20
}
# Entry point: run the handler for the requested subcommand. Anything
# unrecognised is reported on stderr, followed by the usage text, and
# the script exits with status 2.
case "$cmd" in
    help | -h | --help)
        usage
        ;;
    build)
        cmd_build
        ;;
    clear)
        cmd_clear
        ;;
    info)
        cmd_info
        ;;
    *)
        echo "unknown subcommand: $cmd" >&2
        usage
        exit 2
        ;;
esac