diff --git a/.gitignore b/.gitignore index ed34e3f..18d2592 100644 --- a/.gitignore +++ b/.gitignore @@ -43,3 +43,11 @@ package-lock.json zddc-knowledge*.json zddc-knowledge*.md zddc-knowledge*.html + +# tests/data/test-archive.sh fixture output. Default is ~/zddc-test-data +# (outside the repo); these patterns catch in-repo redirects via +# TEST_ARCHIVE_DIR. Defense in depth — the real-archive CSV reference +# at ~/archive-export*.csv must NEVER end up in the repo. +/zddc-test-data/ +/tests/data/output/ +/archive-export*.csv diff --git a/tests/data/test-archive.sh b/tests/data/test-archive.sh new file mode 100755 index 0000000..d7ef1ec --- /dev/null +++ b/tests/data/test-archive.sh @@ -0,0 +1,516 @@ +#!/bin/sh +# test-archive.sh — build/clear a synthetic ZDDC archive for end-to-end +# testing of master + cache + mirror. +# +# The fixture mimics the SHAPE of a real ZDDC archive (project → +# Archive → party → Received|Issued → dated transmittal folder → +# tracking-number-named files) but contains zero identifying data. +# Every file's content is a 4-line metadata block: +# +# Tracking Number: FAC1-EL-CAL-0020 +# Revision: A +# Status: IFI +# Title: +# +# rendered into the appropriate format per extension. Open any file +# and you can verify it's the right one. Tracking-number / revision / +# status / extension distributions are derived from a real archive +# CSV (~/archive-export*.csv) but the script never reads that CSV +# at runtime — distributions are baked in here as constants. +# +# Output lives at $TEST_ARCHIVE_DIR (default ~/zddc-test-data), +# OUTSIDE the repo. There is also a defensive .gitignore entry +# matching common in-repo paths in case someone redirects. +# +# PDF generation uses docker.io/pandoc/latex via podman with +# --userns=keep-id so output is owned by the host user. If podman +# isn't available, PDF generation falls back to plaintext PDF +# (a hand-rolled minimal valid PDF — opens but no formatting). 
set -eu

# ---------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------

# _abs_path PATH — print PATH unchanged when it is already absolute,
# otherwise print it anchored at the current working directory.
_abs_path() {
    case "$1" in
        /*) printf '%s\n' "$1" ;;
        *)  printf '%s\n' "${PWD%/}/$1" ;;
    esac
}

# Output root. Normalised to an absolute path once, here: render_zip
# writes its archive from inside a temporary directory (after a cd),
# so a relative TEST_ARCHIVE_DIR would otherwise resolve against that
# temporary directory instead of where the user invoked the script.
TARGET=$(_abs_path "${TEST_ARCHIVE_DIR:-$HOME/zddc-test-data}")

# Set to 1 by the --small flag.
SMALL=0

# Fixture dimensions: full build vs. --small build.
PROJECTS_FULL="Project-1 Project-2 Project-3"
PROJECTS_SMALL="Project-1"
PARTIES_FULL="PartyA PartyB PartyC"
PARTIES_SMALL="PartyA"
TRANSMITTALS_PER_PARTY_FULL=6
TRANSMITTALS_PER_PARTY_SMALL=2
# Each transmittal contains at least one of every extension in
# EXTENSIONS_GUARANTEED (6 of them), plus extras up to this total.
# The small value equals the guaranteed count, i.e. no extras.
FILES_PER_TRANSMITTAL_FULL=10
FILES_PER_TRANSMITTAL_SMALL=6

# Container image used to render PDFs.
PANDOC_IMAGE="docker.io/pandoc/latex:latest"

# Status / revision / extension / discipline distributions, derived
# from a 773-row sample of a real archive. Format-preserving — these
# are public ZDDC convention vocabularies, no identifying data.
# Weighting is expressed by repeating a word in the list.
STATUSES="IFR IFR IFR IFR IFR IFR IFI IFI IFU IFU IFA RSB"   # weighted
REVISIONS="A B 0 0A 0B C D"
# The full extension set per the test plan. Each transmittal gets one
# of each (so every fixture exercises every extension), then EXTRAS
# are sampled from the weighted distribution.
EXTENSIONS_GUARANTEED="md yaml pdf html zddc zip"
EXTENSIONS_WEIGHTED="pdf pdf pdf pdf md md yaml html zip zddc"
DISCIPLINES="EL PM CAL CPT TRN INT MEC SPC"
DOC_TYPES="CAL CPT TRN SPC DRW LST RPT MDL"

# Lorem-ipsum-style title fragments. No real-world references.
# Continuation lines start at column 0 so the words stay separated by
# exactly one space.
TITLE_WORDS="lorem ipsum dolor sit amet consectetur adipiscing elit \
sed eiusmod tempor incididunt labore magna aliqua veniam nostrud \
exercitation ullamco laboris nisi commodo duis aute irure dolore"

# Synthetic admin emails for .zddc ACLs. example.com is reserved
# (RFC 2606), guaranteed not to belong to anyone real.
ADMIN_EMAIL="admin@example.com"
USER_EMAILS="alice@example.com bob@example.com carol@example.com"

# ---------------------------------------------------------------------
# Random-number helper. dash (POSIX /bin/sh) has no $RANDOM, so each
# call reads 2 bytes from /dev/urandom and prints them as one unsigned
# 16-bit decimal (0-65535). This forks od and tr per call, but unlike
# a clock-seeded generator it yields fresh values on back-to-back
# calls and across runs.
# ---------------------------------------------------------------------
_rand() {
    od -An -N2 -tu2 /dev/urandom | tr -d ' \n'
}

# ---------------------------------------------------------------------
# Subcommand dispatch
# ---------------------------------------------------------------------

# usage — print the help text on stdout.
usage() {
    cat <<EOF
Usage: $0 <subcommand> [--small]

Subcommands:
  build [--small]   Generate the synthetic archive (small = ~10x fewer files).
  clear             Remove the archive directory entirely.
  info              Show what's there (file count, total size, top-level layout).
  help              Print this message.

Configuration:
  TEST_ARCHIVE_DIR  Output directory (default: ~/zddc-test-data).

PDF generation:
  Uses $PANDOC_IMAGE via podman (with --userns=keep-id so output is
  owned by the host user). Falls back to plaintext PDF when podman
  is unavailable.
EOF
}

# The first positional argument is the subcommand (default: help); any
# remaining arguments are flags. shift is a special built-in: calling
# it with nothing to shift is a fatal error in dash that neither
# "2>/dev/null" nor "|| true" can intercept, so only shift when there
# is an argument to drop.
cmd="${1:-help}"
if [ "$#" -gt 0 ]; then
    shift
fi
for arg in "$@"; do
    case "$arg" in
        --small) SMALL=1 ;;
        *) echo "unknown flag: $arg" >&2; exit 2 ;;
    esac
done

# ---------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------

# pick_word LIST — print one randomly chosen word from a
# whitespace-separated list. Any run of blanks/newlines separates
# words. Returns 1 (with a message on stderr) when LIST has no words.
# Uses the function's own positional parameters, so no global
# variables are touched.
pick_word() {
    # Intentional unquoted expansion: let the shell split LIST into
    # words. The lists in this script contain no glob characters.
    # shellcheck disable=SC2086
    set -- $1
    if [ "$#" -eq 0 ]; then
        echo "pick_word: empty list" >&2
        return 1
    fi
    # Drop a random number (0 .. count-1) of leading words; the pick
    # is whatever is left in $1.
    shift $(( $(_rand) % $# ))
    printf '%s\n' "$1"
}

# random_int MIN MAX — print a random integer in [MIN, MAX] inclusive.
# Draws from _rand instead of awk's srand()/rand(): srand() with no
# argument seeds from the time of day, so on awks that seed at
# one-second resolution every call made within the same second returns
# the same number. Returns 1 (with a message on stderr) when MAX < MIN.
random_int() {
    _ri_span=$(( $2 - $1 + 1 ))
    if [ "$_ri_span" -lt 1 ]; then
        echo "random_int: empty range $1..$2" >&2
        return 1
    fi
    if [ "$_ri_span" -le 65536 ]; then
        _ri_draw=$(_rand)
    else
        # Ranges wider than 16 bits: combine two reads into a 30-bit
        # value (stays within a 32-bit signed long).
        _ri_draw=$(( ($(_rand) % 16384) * 65536 + $(_rand) ))
    fi
    echo $(( $1 + _ri_draw % _ri_span ))
}

# Pick a date string YYYY-MM-DD from the last 730 days (0-729 days ago).
+random_date() { + days_back=$(awk 'BEGIN { srand(); printf "%d\n", int(rand() * 730) }') + date -d "$days_back days ago" +%Y-%m-%d +} + +# Build a 3-6-word lorem title. +random_title() { + n=$(random_int 3 6) + out="" + i=0 + while [ "$i" -lt "$n" ]; do + w=$(pick_word "$TITLE_WORDS") + # Capitalize first letter for the first word. + if [ "$i" = 0 ]; then + first=$(printf '%s' "$w" | cut -c1 | tr 'a-z' 'A-Z') + rest=$(printf '%s' "$w" | cut -c2-) + w="${first}${rest}" + fi + out="${out}${out:+ }${w}" + i=$((i + 1)) + done + printf '%s' "$out" +} + +# Synthetic tracking number: ----NNNN +make_tracking() { + party="$1" # PartyA → A + party_short=$(printf '%s' "$party" | sed 's/^Party//') + facility="FAC$(random_int 1 4)" + discipline=$(pick_word "$DISCIPLINES") + doctype=$(pick_word "$DOC_TYPES") + seq=$(printf '%04d' "$(random_int 1 999)") + printf '%s-%s-%s-%s-%s' "$party_short" "$facility" "$discipline" "$doctype" "$seq" +} + +# Render the metadata block in the right format for an extension. +# Args: ext, tracking, rev, status, title, outpath +render_file() { + ext="$1"; tracking="$2"; rev="$3"; status="$4"; title="$5"; out="$6" + case "$ext" in + md) + cat > "$out" < "$out" < "$out" < +$tracking + +

$tracking

+ + + + + +
Tracking Number$tracking
Revision$rev
Status$status
Title$title
+

Synthetic test fixture. Generated by tests/data/test-archive.sh.

+ +EOF + ;; + zddc) + # *.zddc as a data file (not the special config file). + # YAML-shape since .zddc files ARE YAML. + cat > "$out" <&2 + exit 1 + ;; + esac +} + +# Render PDF via pandoc/latex if podman is available; fall back to a +# hand-rolled minimal PDF otherwise. +render_pdf() { + tracking="$1"; rev="$2"; status="$3"; title="$4"; out="$5" + if [ "${PDF_BACKEND:-pandoc}" = "minimal" ] || ! command -v podman >/dev/null 2>&1; then + render_pdf_minimal "$tracking" "$rev" "$status" "$title" "$out" + return + fi + # Build a temp .md alongside, render, drop the .md. + tmp_md="${out%.pdf}.tmp.md" + cat > "$tmp_md" </dev/null 2>&1; then + # Pandoc failed (image missing? network blocked?). Fall back. + rm -f "$tmp_md" + render_pdf_minimal "$tracking" "$rev" "$status" "$title" "$out" + return + fi + rm -f "$tmp_md" +} + +# Hand-rolled minimal valid PDF — opens in any reader, displays the +# metadata block. ~600 bytes. Used only when pandoc isn't reachable. +render_pdf_minimal() { + tracking="$1"; rev="$2"; status="$3"; title="$4"; out="$5" + # PDF strings escape (, ), \ — the lorem-ipsum titles never include + # these so a basic substitution is enough for our fixture. 
+ safe() { printf '%s' "$1" | sed 's/[()\\]/_/g'; } + t=$(safe "$tracking"); r=$(safe "$rev"); s=$(safe "$status"); ti=$(safe "$title") + python3 - "$out" "$t" "$r" "$s" "$ti" <<'PY' +import sys +out, t, r, s, ti = sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5] +text = ( + f"BT /F1 14 Tf 72 720 Td ({t}) Tj ET\n" + f"BT /F1 12 Tf 72 700 Td (Revision: {r}) Tj ET\n" + f"BT /F1 12 Tf 72 685 Td (Status: {s}) Tj ET\n" + f"BT /F1 12 Tf 72 670 Td (Title: {ti}) Tj ET\n" + f"BT /F1 10 Tf 72 640 Td (Synthetic test fixture - tests/data/test-archive.sh) Tj ET\n" +) +text_b = text.encode("latin-1", errors="replace") +objs = [ + b"<>", + b"<>", + b"<>>>>>", + b"<>stream\n" + text_b + b"endstream", + b"<>", +] +buf = bytearray(b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n") +offsets = [] +for i, obj in enumerate(objs, 1): + offsets.append(len(buf)) + buf += f"{i} 0 obj\n".encode() + obj + b"\nendobj\n" +xref = len(buf) +buf += f"xref\n0 {len(objs)+1}\n".encode() +buf += b"0000000000 65535 f \n" +for o in offsets: + buf += f"{o:010d} 00000 n \n".encode() +buf += f"trailer <>\nstartxref\n{xref}\n%%EOF\n".encode() +open(out, "wb").write(buf) +PY +} + +# Render a .zip containing a .md, .yaml, and .html with the same +# metadata so unzipping shows three views of the same record. POSIX +# sh has no function-local scope, so nested render_file calls would +# clobber $out — copy to z_out first. +render_zip() { + z_track="$1"; z_rev="$2"; z_status="$3"; z_title="$4"; z_out="$5" + tmpdir=$(mktemp -d) + render_file md "$z_track" "$z_rev" "$z_status" "$z_title" "$tmpdir/$z_track.md" + render_file yaml "$z_track" "$z_rev" "$z_status" "$z_title" "$tmpdir/$z_track.yaml" + render_file html "$z_track" "$z_rev" "$z_status" "$z_title" "$tmpdir/$z_track.html" + (cd "$tmpdir" && zip -q "$z_out" ./*) + rm -rf "$tmpdir" +} + +# Write a per-directory .zddc ACL config. Synthetic emails only. 
+write_zddc_config() { + out="$1" + role="${2:-default}" # default | party | project + case "$role" in + project|party) + cat > "$out" < "$out" <&2 + exit 1 + fi + if [ "$SMALL" = 1 ]; then + projects="$PROJECTS_SMALL" + parties="$PARTIES_SMALL" + per_party=$TRANSMITTALS_PER_PARTY_SMALL + per_trans=$FILES_PER_TRANSMITTAL_SMALL + echo "building SMALL fixture at $TARGET" + else + projects="$PROJECTS_FULL" + parties="$PARTIES_FULL" + per_party=$TRANSMITTALS_PER_PARTY_FULL + per_trans=$FILES_PER_TRANSMITTAL_FULL + echo "building FULL fixture at $TARGET" + fi + + # 0777 on the archive dir lets the rootless-podman pandoc container + # write PDF output regardless of UID-namespace mapping. We're in + # $HOME so the parent dir is already access-controlled by user. + mkdir -p "$TARGET" + chmod 0777 "$TARGET" + + # Root .zddc — admins + read-only-for-anyone-with-an-example.com-email. + write_zddc_config "$TARGET/.zddc" default + + file_count=0 + pdf_count=0 + for project in $projects; do + proj_dir="$TARGET/$project" + mkdir -p "$proj_dir" + chmod 0777 "$proj_dir" + write_zddc_config "$proj_dir/.zddc" project + + for party in $parties; do + party_dir="$proj_dir/Archive/$party" + mkdir -p "$party_dir/Received" "$party_dir/Issued" + chmod 0777 "$party_dir" "$party_dir/Received" "$party_dir/Issued" + write_zddc_config "$party_dir/.zddc" party + + i=0 + while [ "$i" -lt "$per_party" ]; do + i=$((i + 1)) + # Alternate Received / Issued. + if [ $((i % 2)) = 0 ]; then + bucket="Received" + else + bucket="Issued" + fi + # Transmittal envelope: _ () - + t_track=$(make_tracking "$party") + t_status=$(pick_word "$STATUSES") + t_title=$(random_title) + t_date=$(random_date) + t_dir="$party_dir/$bucket/${t_date}_${t_track} (${t_status}) - ${t_title}" + mkdir -p "$t_dir" + chmod 0777 "$t_dir" + + # Build the per-transmittal extension list: every + # extension in EXTENSIONS_GUARANTEED at least once, + # then weighted-random extras to reach per_trans total. 
+ file_exts="$EXTENSIONS_GUARANTEED" + guaranteed_count=$(echo "$file_exts" | wc -w) + extras=$((per_trans - guaranteed_count)) + if [ "$extras" -gt 0 ]; then + k=0 + while [ "$k" -lt "$extras" ]; do + file_exts="$file_exts $(pick_word "$EXTENSIONS_WEIGHTED")" + k=$((k + 1)) + done + fi + + for f_ext in $file_exts; do + f_track=$(make_tracking "$party") + f_rev=$(pick_word "$REVISIONS") + f_status=$(pick_word "$STATUSES") + f_title=$(random_title) + # Filename per ZDDC convention. + f_name="${f_track}_${f_rev} (${f_status}) - ${f_title}.${f_ext}" + f_path="$t_dir/$f_name" + render_file "$f_ext" "$f_track" "$f_rev" "$f_status" "$f_title" "$f_path" + file_count=$((file_count + 1)) + if [ "$f_ext" = "pdf" ]; then + pdf_count=$((pdf_count + 1)) + fi + done + done + done + done + + echo "built: $file_count files ($pdf_count PDFs) at $TARGET" + echo "info: $0 info" +} + +# --------------------------------------------------------------------- +# clear +# --------------------------------------------------------------------- + +cmd_clear() { + if [ ! -e "$TARGET" ]; then + echo "$TARGET does not exist; nothing to clear" + return 0 + fi + # Defense in depth: refuse to rm anything that doesn't look like + # a test-archive directory. + if [ ! -f "$TARGET/.zddc" ]; then + echo "$TARGET does not contain a .zddc — refusing to rm" >&2 + echo "(set TEST_ARCHIVE_DIR explicitly if your fixture lives elsewhere)" >&2 + exit 1 + fi + rm -rf "$TARGET" + echo "cleared $TARGET" +} + +# --------------------------------------------------------------------- +# info +# --------------------------------------------------------------------- + +cmd_info() { + if [ ! -e "$TARGET" ]; then + echo "$TARGET does not exist (run '$0 build' first)" + return 0 + fi + echo "fixture: $TARGET" + files=$(find "$TARGET" -type f | wc -l) + bytes=$(du -sb "$TARGET" 2>/dev/null | awk '{print $1}') + echo "files: $files" + if [ -n "$bytes" ]; then + # Format bytes as KB/MB. 
+ awk -v b="$bytes" 'BEGIN { + if (b < 1024) printf "size: %d B\n", b + else if (b < 1048576) printf "size: %.1f KB\n", b / 1024 + else printf "size: %.1f MB\n", b / 1048576 + }' + fi + echo "by extension:" + find "$TARGET" -type f -name '*.*' | sed -E 's/.*\.([a-z]+)$/\1/' | sort | uniq -c | sort -rn | head | awk '{printf " %5d %s\n", $1, $2}' + echo "top-level layout:" + find "$TARGET" -maxdepth 3 -mindepth 1 -type d | sed "s|^$TARGET| .|" | head -20 +} + +case "$cmd" in + build) cmd_build ;; + clear) cmd_clear ;; + info) cmd_info ;; + help|-h|--help) usage ;; + *) echo "unknown subcommand: $cmd" >&2; usage; exit 2 ;; +esac