#!/bin/sh
# test-archive.sh — build/clear a synthetic ZDDC archive for end-to-end
# testing of master + cache + mirror.
#
# The fixture mimics the SHAPE of a real ZDDC archive (project →
# Archive → party → Received|Issued → dated transmittal folder →
# tracking-number-named files) but contains zero identifying data.
# Every file's content is a 4-line metadata block:
#
#     Tracking Number: FAC1-EL-CAL-0020
#     Revision:        A
#     Status:          IFI
#     Title:           <synthetic lorem-ipsum title>
#
# rendered into the appropriate format per extension. Open any file
# and you can verify it's the right one. Tracking-number / revision /
# status / extension distributions are derived from a real archive
# CSV (~/archive-export*.csv) but the script never reads that CSV
# at runtime — distributions are baked in here as constants.
#
# Output lives at $TEST_ARCHIVE_DIR (default ~/zddc-test-data),
# OUTSIDE the repo. There is also a defensive .gitignore entry
# matching common in-repo paths in case someone redirects.
#
# PDF generation uses docker.io/pandoc/latex via podman with
# --userns=keep-id so output is owned by the host user. If podman
# isn't available, PDF generation falls back to a plaintext PDF
# (a hand-rolled minimal valid PDF — opens, but has no formatting).
#
# Usage: test-archive.sh <build [--small] | clear | info | help>
#
# NOTE(review): this file was recovered from a whitespace-collapsed copy
# in which every "<...>" span had been stripped. Heredoc bodies, the
# podman command line, the PDF dictionaries, the .zddc ACL schema and the
# pre-flight check in cmd_build are reconstructions — each is marked with
# a NOTE(review) below and should be confirmed against the original.

set -eu

# ---------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------

# ${HOME:-} keeps `help` usable when HOME is unset under `set -u`;
# require_target rejects the resulting path before any filesystem work.
TARGET="${TEST_ARCHIVE_DIR:-${HOME:-}/zddc-test-data}"
SMALL=0

PROJECTS_FULL="Project-1 Project-2 Project-3"
PROJECTS_SMALL="Project-1"

PARTIES_FULL="PartyA PartyB PartyC"
PARTIES_SMALL="PartyA"

TRANSMITTALS_PER_PARTY_FULL=6
TRANSMITTALS_PER_PARTY_SMALL=2

# Each transmittal contains at least one of every extension in
# EXTENSIONS_GUARANTEED (6 of them), plus extras up to this total.
FILES_PER_TRANSMITTAL_FULL=10
FILES_PER_TRANSMITTAL_SMALL=6

PANDOC_IMAGE="docker.io/pandoc/latex:latest"

# Status / revision / extension / discipline distributions, derived
# from a 773-row sample of a real archive. Format-preserving — these
# are public ZDDC convention vocabularies, no identifying data.
STATUSES="IFR IFR IFR IFR IFR IFR IFI IFI IFU IFU IFA RSB"  # weighted
REVISIONS="A B 0 0A 0B C D"

# The full extension set per the test plan. Each transmittal gets one
# of each (so every fixture exercises every extension), then EXTRAS
# are sampled from the weighted distribution.
EXTENSIONS_GUARANTEED="md yaml pdf html zddc zip"
EXTENSIONS_WEIGHTED="pdf pdf pdf pdf md md yaml html zip zddc"

DISCIPLINES="EL PM CAL CPT TRN INT MEC SPC"
DOC_TYPES="CAL CPT TRN SPC DRW LST RPT MDL"

# Lorem-ipsum-style title fragments. No real-world references.
TITLE_WORDS="lorem ipsum dolor sit amet consectetur adipiscing elit \
sed eiusmod tempor incididunt labore magna aliqua veniam nostrud \
exercitation ullamco laboris nisi commodo duis aute irure dolore"

# Synthetic admin emails for .zddc ACLs. example.com is reserved
# (RFC 2606), guaranteed not to belong to anyone real.
ADMIN_EMAIL="admin@example.com"
USER_EMAILS="alice@example.com bob@example.com carol@example.com"

# ---------------------------------------------------------------------
# Random-number helper. dash (POSIX /bin/sh) has no $RANDOM, so we read
# 2 bytes from /dev/urandom each call and decode to a 16-bit unsigned
# int. No awk per call, and properly random across runs.
# ---------------------------------------------------------------------
_rand() {
    od -An -N2 -tu2 /dev/urandom | tr -d ' \n'
}

# Print a message on stderr and abort with status 1.
die() {
    printf '%s\n' "$*" >&2
    exit 1
}

# Abort unless TARGET was derived from a real TEST_ARCHIVE_DIR or HOME.
# Without this, an unset HOME would silently point TARGET at
# /zddc-test-data.
require_target() {
    [ -n "${TEST_ARCHIVE_DIR:-}${HOME:-}" ] ||
        die "neither TEST_ARCHIVE_DIR nor HOME is set; cannot locate the fixture"
}

# ---------------------------------------------------------------------
# Subcommand dispatch (argument parsing; the actual dispatch `case` is
# at the bottom of the file, after all functions are defined).
# ---------------------------------------------------------------------

# Print the help text on stdout.
# NOTE(review): the "Usage:" line was partially lost; "<subcommand>" is
# reconstructed.
usage() {
    cat <<EOF
Usage: $0 <subcommand> [--small]

Subcommands:
  build [--small]   Generate the synthetic archive (small = ~10x fewer files).
  clear             Remove the archive directory entirely.
  info              Show what's there (file count, total size, top-level layout).
  help              Print this message.

Configuration:
  TEST_ARCHIVE_DIR  Output directory (default: ~/zddc-test-data).

PDF generation:
  Uses $PANDOC_IMAGE via podman (with --userns=keep-id so
  output is owned by the host user). Falls back to plaintext PDF when
  podman is unavailable.
EOF
}

cmd="${1:-help}"
# `shift` is a special builtin: with no positional parameters it is an
# error that makes a non-interactive POSIX shell (dash) exit, and neither
# `2>/dev/null` nor `|| true` can prevent that. Guard on $# instead so a
# bare invocation still reaches `help`.
if [ "$#" -gt 0 ]; then
    shift
fi
for arg in "$@"; do
    case "$arg" in
        --small) SMALL=1 ;;
        *) echo "unknown flag: $arg" >&2; exit 2 ;;
    esac
done

# ---------------------------------------------------------------------
# Helpers
#
# POSIX sh has no function-local scope. Helpers that produce a value are
# always called through $(...) (a subshell), so their scratch variables
# cannot leak; helpers called directly use prefixed names.
# ---------------------------------------------------------------------

# pick_word LIST — print one random word from a whitespace-separated
# list. Uses shell word splitting for both counting and selecting, so
# repeated blanks or newlines in LIST never yield an empty pick.
# Returns 1 if LIST has no words.
pick_word() {
    # shellcheck disable=SC2086 — word splitting is the point here.
    set -- $1
    [ "$#" -gt 0 ] || return 1
    shift $(( $(_rand) % $# ))
    printf '%s\n' "$1"
}

# random_int MIN MAX — print a uniformly-ish random integer in
# [MIN, MAX]. Draws 32 bits from /dev/urandom; the previous
# awk 'srand(); rand()' form reseeded from the wall clock (1-second
# resolution) on every call, so all calls within one second returned
# the same value.
random_int() {
    ri_span=$(( $2 - $1 + 1 ))
    if [ "$ri_span" -le 0 ]; then
        echo "random_int: empty range $1..$2" >&2
        return 2
    fi
    ri_raw=$(od -An -N4 -tu4 /dev/urandom | tr -d ' \n')
    printf '%d\n' $(( $1 + ri_raw % ri_span ))
}

# Print a date string YYYY-MM-DD chosen uniformly from the last 730
# days (today inclusive). Works from the epoch so it does not depend on
# GNU date's free-form "N days ago" parser: `-d @SECS` covers GNU and
# busybox, `-r SECS` covers BSD/macOS.
random_date() {
    rd_days=$(random_int 0 729)
    rd_epoch=$(( $(date +%s) - rd_days * 86400 ))
    date -d "@$rd_epoch" +%Y-%m-%d 2>/dev/null ||
        date -r "$rd_epoch" +%Y-%m-%d
}

# Build a 3-6-word lorem title; only the first word is capitalized.
random_title() {
    rt_n=$(random_int 3 6)
    rt_out=""
    rt_i=0
    while [ "$rt_i" -lt "$rt_n" ]; do
        rt_w=$(pick_word "$TITLE_WORDS")
        # Capitalize first letter for the first word.
        if [ "$rt_i" = 0 ]; then
            rt_first=$(printf '%s' "$rt_w" | cut -c1 | tr 'a-z' 'A-Z')
            rt_rest=$(printf '%s' "$rt_w" | cut -c2-)
            rt_w="${rt_first}${rt_rest}"
        fi
        rt_out="${rt_out}${rt_out:+ }${rt_w}"
        rt_i=$((rt_i + 1))
    done
    printf '%s' "$rt_out"
}

# Synthetic tracking number:
#   <party>-<facility>-<discipline>-<doctype>-NNNN
# Args: party directory name (e.g. PartyA).
make_tracking() {
    party="$1"
    # PartyA → A
    party_short=$(printf '%s' "$party" | sed 's/^Party//')
    facility="FAC$(random_int 1 4)"
    discipline=$(pick_word "$DISCIPLINES")
    doctype=$(pick_word "$DOC_TYPES")
    seq=$(printf '%04d' "$(random_int 1 999)")
    printf '%s-%s-%s-%s-%s' "$party_short" "$facility" "$discipline" "$doctype" "$seq"
}

# Render the metadata block in the right format for an extension.
# Args: ext, tracking, rev, status, title, outpath
# Exits 1 on an unknown extension.
# NOTE(review): the md / yaml / zddc bodies are reconstructed from the
# 4-line metadata block described in the file header; the html body is
# reconstructed from its surviving table rows.
render_file() {
    ext="$1"; tracking="$2"; rev="$3"; status="$4"; title="$5"; out="$6"
    case "$ext" in
        md)
            cat > "$out" <<EOF
# $tracking

- **Tracking Number:** $tracking
- **Revision:** $rev
- **Status:** $status
- **Title:** $title

Synthetic test fixture. Generated by tests/data/test-archive.sh.
EOF
            ;;
        yaml|zddc)
            # *.zddc here is a data file (not the special config file).
            # YAML-shape since .zddc files ARE YAML.
            cat > "$out" <<EOF
tracking_number: "$tracking"
revision: "$rev"
status: "$status"
title: "$title"
EOF
            ;;
        html)
            cat > "$out" <<EOF
<!DOCTYPE html>
<html>
<head><meta charset="utf-8"><title>$tracking</title></head>
<body>
<h1>$tracking</h1>
<table>
<tr><th>Tracking Number</th><td>$tracking</td></tr>
<tr><th>Revision</th><td>$rev</td></tr>
<tr><th>Status</th><td>$status</td></tr>
<tr><th>Title</th><td>$title</td></tr>
</table>
<p>Synthetic test fixture. Generated by tests/data/test-archive.sh.</p>
</body>
</html>
EOF
            ;;
        pdf)
            render_pdf "$tracking" "$rev" "$status" "$title" "$out"
            ;;
        zip)
            render_zip "$tracking" "$rev" "$status" "$title" "$out"
            ;;
        *)
            echo "unknown extension: $ext" >&2
            exit 1
            ;;
    esac
}

# Render PDF via pandoc/latex if podman is available; fall back to a
# hand-rolled minimal PDF otherwise (or when PDF_BACKEND=minimal).
# Args: tracking, rev, status, title, outpath
# NOTE(review): the temp-markdown body and the exact podman command line
# were lost; this follows the pandoc image's documented "mount the
# working directory at /data" usage plus the --userns=keep-id flag named
# in the file header.
render_pdf() {
    tracking="$1"; rev="$2"; status="$3"; title="$4"; out="$5"
    if [ "${PDF_BACKEND:-pandoc}" = "minimal" ] || ! command -v podman >/dev/null 2>&1; then
        render_pdf_minimal "$tracking" "$rev" "$status" "$title" "$out"
        return
    fi
    # Build a temp .md alongside, render, drop the .md.
    tmp_md="${out%.pdf}.tmp.md"
    cat > "$tmp_md" <<EOF
# $tracking

- **Tracking Number:** $tracking
- **Revision:** $rev
- **Status:** $status
- **Title:** $title

Synthetic test fixture. Generated by tests/data/test-archive.sh.
EOF
    if ! podman run --rm --userns=keep-id \
            -v "$(dirname "$out"):/data" \
            "$PANDOC_IMAGE" \
            "$(basename "$tmp_md")" -o "$(basename "$out")" >/dev/null 2>&1; then
        # Pandoc failed (image missing? network blocked?). Fall back.
        rm -f "$tmp_md"
        render_pdf_minimal "$tracking" "$rev" "$status" "$title" "$out"
        return
    fi
    rm -f "$tmp_md"
}

# PDF literal strings treat (, ) and \ specially — the lorem-ipsum
# titles never include these, so replacing them with "_" is enough for
# our fixture.
_pdf_safe() {
    printf '%s' "$1" | sed 's/[()\\]/_/g'
}

# Hand-rolled minimal valid PDF — opens in any reader, displays the
# metadata block. ~600 bytes. Used only when pandoc isn't reachable.
# Args: tracking, rev, status, title, outpath. Requires python3.
# NOTE(review): the object dictionaries were stripped from the recovered
# copy; these are the standard catalog / pages / page / stream / font
# objects for a one-page Helvetica document.
render_pdf_minimal() {
    tracking="$1"; rev="$2"; status="$3"; title="$4"; out="$5"
    t=$(_pdf_safe "$tracking"); r=$(_pdf_safe "$rev")
    s=$(_pdf_safe "$status"); ti=$(_pdf_safe "$title")
    python3 - "$out" "$t" "$r" "$s" "$ti" <<'PY'
import sys

out, t, r, s, ti = sys.argv[1:6]
text = (
    f"BT /F1 14 Tf 72 720 Td ({t}) Tj ET\n"
    f"BT /F1 12 Tf 72 700 Td (Revision: {r}) Tj ET\n"
    f"BT /F1 12 Tf 72 685 Td (Status: {s}) Tj ET\n"
    f"BT /F1 12 Tf 72 670 Td (Title: {ti}) Tj ET\n"
    f"BT /F1 10 Tf 72 640 Td (Synthetic test fixture - tests/data/test-archive.sh) Tj ET\n"
)
text_b = text.encode("latin-1", errors="replace")
objs = [
    b"<< /Type /Catalog /Pages 2 0 R >>",
    b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>",
    b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] "
    b"/Contents 4 0 R /Resources << /Font << /F1 5 0 R >> >> >>",
    b"<< /Length " + str(len(text_b)).encode() + b" >>\nstream\n"
    + text_b + b"endstream",
    b"<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>",
]
buf = bytearray(b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n")
offsets = []
for i, obj in enumerate(objs, 1):
    offsets.append(len(buf))
    buf += f"{i} 0 obj\n".encode() + obj + b"\nendobj\n"
xref = len(buf)
buf += f"xref\n0 {len(objs) + 1}\n".encode()
# Each xref entry must be exactly 20 bytes including the line ending.
buf += b"0000000000 65535 f \n"
for o in offsets:
    buf += f"{o:010d} 00000 n \n".encode()
buf += (
    f"trailer\n<< /Size {len(objs) + 1} /Root 1 0 R >>\n"
    f"startxref\n{xref}\n%%EOF\n"
).encode()
with open(out, "wb") as fh:
    fh.write(buf)
PY
}

# Render a .zip containing a .md, .yaml, and .html with the same
# metadata so unzipping shows three views of the same record. POSIX
# sh has no function-local scope, so nested render_file calls would
# clobber $out — everything here lives in z_* variables instead.
# Args: tracking, rev, status, title, outpath
render_zip() {
    z_track="$1"; z_rev="$2"; z_status="$3"; z_title="$4"; z_out="$5"
    # zip runs after a cd into the staging dir, so a relative output
    # path must be anchored to the caller's directory first.
    case "$z_out" in
        /*) ;;
        *) z_out="$PWD/$z_out" ;;
    esac
    z_tmp=$(mktemp -d)
    render_file md   "$z_track" "$z_rev" "$z_status" "$z_title" "$z_tmp/$z_track.md"
    render_file yaml "$z_track" "$z_rev" "$z_status" "$z_title" "$z_tmp/$z_track.yaml"
    render_file html "$z_track" "$z_rev" "$z_status" "$z_title" "$z_tmp/$z_track.html"
    # Capture the status so the staging dir is removed even when zip
    # fails (a bare failing command would abort under `set -e` first).
    z_rc=0
    (cd "$z_tmp" && zip -q "$z_out" ./*) || z_rc=$?
    rm -rf -- "$z_tmp"
    return "$z_rc"
}

# Write a per-directory .zddc ACL config. Synthetic emails only.
# Args: outpath, role (default | party | project; defaults to "default")
# NOTE(review): both heredoc bodies were lost. The key names below
# (admins / writers / readers) are a guess at the schema — replace with
# the real .zddc ACL layout before relying on this for ACL tests. The
# root config is meant to be "admins + read-only for anyone with an
# example.com email".
write_zddc_config() {
    zc_out="$1"
    zc_role="${2:-default}"   # default | party | project
    case "$zc_role" in
        project|party)
            {
                printf 'admins:\n  - %s\n' "$ADMIN_EMAIL"
                printf 'writers:\n'
                for zc_user in $USER_EMAILS; do
                    printf '  - %s\n' "$zc_user"
                done
                printf 'readers:\n  - "*@example.com"\n'
            } > "$zc_out"
            ;;
        *)
            {
                printf 'admins:\n  - %s\n' "$ADMIN_EMAIL"
                printf 'readers:\n  - "*@example.com"\n'
            } > "$zc_out"
            ;;
    esac
}

# ---------------------------------------------------------------------
# build
# ---------------------------------------------------------------------

# Generate the fixture tree under $TARGET, honouring $SMALL.
cmd_build() {
    require_target
    # NOTE(review): the original pre-flight condition and message were
    # lost (only ">&2; exit 1; fi" survived). Refusing to build over an
    # existing fixture is the reconstruction — confirm.
    if [ -e "$TARGET" ]; then
        echo "$TARGET already exists — run '$0 clear' first" >&2
        exit 1
    fi

    if [ "$SMALL" = 1 ]; then
        projects="$PROJECTS_SMALL"
        parties="$PARTIES_SMALL"
        per_party=$TRANSMITTALS_PER_PARTY_SMALL
        per_trans=$FILES_PER_TRANSMITTAL_SMALL
        echo "building SMALL fixture at $TARGET"
    else
        projects="$PROJECTS_FULL"
        parties="$PARTIES_FULL"
        per_party=$TRANSMITTALS_PER_PARTY_FULL
        per_trans=$FILES_PER_TRANSMITTAL_FULL
        echo "building FULL fixture at $TARGET"
    fi

    # 0777 on the archive dir lets the rootless-podman pandoc container
    # write PDF output regardless of UID-namespace mapping. We're in
    # $HOME so the parent dir is already access-controlled by user.
    mkdir -p "$TARGET"
    chmod 0777 "$TARGET"

    # Root .zddc — admins + read-only-for-anyone-with-an-example.com-email.
    write_zddc_config "$TARGET/.zddc" default

    file_count=0
    pdf_count=0

    for project in $projects; do
        proj_dir="$TARGET/$project"
        mkdir -p "$proj_dir"
        chmod 0777 "$proj_dir"
        write_zddc_config "$proj_dir/.zddc" project

        for party in $parties; do
            party_dir="$proj_dir/Archive/$party"
            mkdir -p "$party_dir/Received" "$party_dir/Issued"
            chmod 0777 "$party_dir" "$party_dir/Received" "$party_dir/Issued"
            write_zddc_config "$party_dir/.zddc" party

            i=0
            while [ "$i" -lt "$per_party" ]; do
                i=$((i + 1))
                # Alternate Received / Issued.
                if [ $((i % 2)) = 0 ]; then
                    bucket="Received"
                else
                    bucket="Issued"
                fi

                # Transmittal envelope:
                #   <date>_<tracking> (<status>) - <title>
                t_track=$(make_tracking "$party")
                t_status=$(pick_word "$STATUSES")
                t_title=$(random_title)
                t_date=$(random_date)
                t_dir="$party_dir/$bucket/${t_date}_${t_track} (${t_status}) - ${t_title}"
                mkdir -p "$t_dir"
                chmod 0777 "$t_dir"

                # Build the per-transmittal extension list: every
                # extension in EXTENSIONS_GUARANTEED at least once,
                # then weighted-random extras to reach per_trans total.
                file_exts="$EXTENSIONS_GUARANTEED"
                guaranteed_count=$(echo "$file_exts" | wc -w)
                extras=$((per_trans - guaranteed_count))
                if [ "$extras" -gt 0 ]; then
                    k=0
                    while [ "$k" -lt "$extras" ]; do
                        file_exts="$file_exts $(pick_word "$EXTENSIONS_WEIGHTED")"
                        k=$((k + 1))
                    done
                fi

                # Unquoted on purpose: file_exts is a word list.
                for f_ext in $file_exts; do
                    f_track=$(make_tracking "$party")
                    f_rev=$(pick_word "$REVISIONS")
                    f_status=$(pick_word "$STATUSES")
                    f_title=$(random_title)
                    # Filename per ZDDC convention.
                    f_name="${f_track}_${f_rev} (${f_status}) - ${f_title}.${f_ext}"
                    f_path="$t_dir/$f_name"

                    render_file "$f_ext" "$f_track" "$f_rev" "$f_status" "$f_title" "$f_path"
                    file_count=$((file_count + 1))
                    if [ "$f_ext" = "pdf" ]; then
                        pdf_count=$((pdf_count + 1))
                    fi
                done
            done
        done
    done

    echo "built: $file_count files ($pdf_count PDFs) at $TARGET"
    echo "info:  $0 info"
}

# ---------------------------------------------------------------------
# clear
# ---------------------------------------------------------------------

# Remove $TARGET, but only if it looks like a fixture this script built.
cmd_clear() {
    require_target
    if [ ! -e "$TARGET" ]; then
        echo "$TARGET does not exist; nothing to clear"
        return 0
    fi
    # Defense in depth: refuse to rm anything that doesn't look like
    # a test-archive directory.
    if [ ! -f "$TARGET/.zddc" ]; then
        echo "$TARGET does not contain a .zddc — refusing to rm" >&2
        echo "(set TEST_ARCHIVE_DIR explicitly if your fixture lives elsewhere)" >&2
        exit 1
    fi
    # ${TARGET:?} aborts rather than expanding to an empty path; `--`
    # stops a leading "-" in the path being read as an option.
    rm -rf -- "${TARGET:?}"
    echo "cleared $TARGET"
}

# ---------------------------------------------------------------------
# info
# ---------------------------------------------------------------------

# Summarize the fixture: file count, total size, per-extension counts
# and the first three directory levels.
cmd_info() {
    require_target
    if [ ! -e "$TARGET" ]; then
        echo "$TARGET does not exist (run '$0 build' first)"
        return 0
    fi
    echo "fixture: $TARGET"
    files=$(find "$TARGET" -type f | wc -l)
    # du -b is GNU-only; on other platforms bytes stays empty and the
    # size line is simply skipped.
    bytes=$(du -sb "$TARGET" 2>/dev/null | awk '{print $1}')
    echo "files:   $files"
    if [ -n "$bytes" ]; then
        # Format bytes as KB/MB.
        awk -v b="$bytes" 'BEGIN {
            if (b < 1024) printf "size:    %d B\n", b
            else if (b < 1048576) printf "size:    %.1f KB\n", b / 1024
            else printf "size:    %.1f MB\n", b / 1048576
        }'
    fi
    echo "by extension:"
    find "$TARGET" -type f -name '*.*' \
        | sed -E 's/.*\.([a-z]+)$/\1/' \
        | sort | uniq -c | sort -rn | head \
        | awk '{printf "  %5d  %s\n", $1, $2}'
    echo "top-level layout:"
    # Strip the $TARGET prefix as a literal string (passed through the
    # environment, not spliced into a regex) so paths containing regex
    # metacharacters or "|" are handled correctly.
    find "$TARGET" -maxdepth 3 -mindepth 1 -type d \
        | ZDDC_ROOT="$TARGET" awk '{
            root = ENVIRON["ZDDC_ROOT"]
            if (index($0, root) == 1) print "  ." substr($0, length(root) + 1)
            else print
        }' \
        | head -20
}

case "$cmd" in
    build) cmd_build ;;
    clear) cmd_clear ;;
    info)  cmd_info ;;
    help|-h|--help) usage ;;
    *) echo "unknown subcommand: $cmd" >&2; usage; exit 2 ;;
esac