ZDDC/training-data/validate.sh
ZDDC ea385b5366 Initial commit
ZDDC — Zero Day Document Control. A file-naming convention plus five
single-file HTML tools (archive, transmittal, classifier, mdedit,
landing) and an optional Go HTTP server (zddc-server) with ACL and a
virtual archive index. Self-contained, offline-capable, dependency-free.

See README.md for an overview, AGENTS.md and ARCHITECTURE.md for the
build/release/architecture detail, bootstrap/README.md for the
two-level deployment install pattern, and zddc/README.md for the
HTTP server.
2026-04-27 11:05:47 -05:00

114 lines
2.9 KiB
Bash
Executable file

#!/usr/bin/env bash
#
# ZDDC training-data validator: prints a PASS/WARN/FAIL report about the
# data files that live next to this script.
#
# Usage: validate.sh [DOMAIN]
#   DOMAIN  name of a single domain to check; defaults to "all".

# Abort on command failure, unset variables, and failing pipeline stages.
set -e -u -o pipefail

# Work from the directory that holds this script so the relative paths used
# below resolve regardless of the caller's working directory.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
cd "$SCRIPT_DIR"

# Domain selector taken from the first positional argument.
DOMAIN=${1:-all}

# Running totals for the final summary.
PASS=0 WARN=0 FAIL=0
# Reporting helpers. Each prints its arguments as a single line on stdout and
# bumps the matching global counter (PASS, WARN or FAIL). Assignment-style
# arithmetic always yields status 0, so the helpers are safe under `set -e`.
ok() {
  echo "$*"
  PASS=$((PASS + 1))
}

warn() {
  echo "$*"
  WARN=$((WARN + 1))
}

fail() {
  echo "$*"
  FAIL=$((FAIL + 1))
}
# Report banner.
echo "=== ZDDC Training Data Validator ==="
echo "Domain: $DOMAIN"
echo ""

# Raw data: report how many non-empty lines the raw interactions file holds.
echo "[ Raw Data ]"
RAW_FILE="raw/interactions.jsonl"
if [ ! -f "$RAW_FILE" ]; then
  warn "No raw interactions file found. Collect interactions first."
else
  # `grep -c` prints the count itself and exits 1 when that count is 0, so a
  # `|| echo 0` fallback would append a second "0" to the captured value.
  # Ignore the exit status and default only when grep printed nothing at all.
  RAW_COUNT=$(grep -c . "$RAW_FILE" 2>/dev/null) || true
  RAW_COUNT=${RAW_COUNT:-0}
  ok "Raw interactions: $RAW_COUNT"
fi
echo ""
echo "[ JSONL Validity ]"

#######################################
# Report the number of non-empty lines in a JSONL file.
# Only existence, non-emptiness and the line count are checked; the lines
# are not parsed as JSON here.
# Arguments: $1 - path of the file to inspect
# Outputs:   one report line on stdout, emitted through ok or warn
# Returns:   the status of the ok/warn helper it calls
#######################################
check_jsonl() {
  local file="$1"
  if [ ! -f "$file" ]; then
    warn "File not found: $file"
    return
  fi
  if [ ! -s "$file" ]; then
    warn "File is empty: $file"
    return
  fi
  local count
  # `grep -c` already prints "0" (and exits 1) when nothing matches, e.g. for
  # a file of blank lines; a `|| echo 0` fallback would yield "0<newline>0".
  count=$(grep -c . "$file" 2>/dev/null) || true
  count=${count:-0}
  # Separate path and count so the line reads "<file>: <n> lines".
  ok "$file: $count lines"
}
# With DOMAIN "all", check every existing processed/*.jsonl and
# validation/*.jsonl file; otherwise check only processed/<DOMAIN>.jsonl.
case "$DOMAIN" in
  all)
    for candidate in processed/*.jsonl validation/*.jsonl; do
      # An unmatched glob is left as the literal pattern, which is not a
      # regular file and is therefore skipped.
      if [ -f "$candidate" ]; then
        check_jsonl "$candidate"
      fi
    done
    ;;
  *)
    check_jsonl "processed/${DOMAIN}.jsonl"
    ;;
esac
echo ""
echo "[ Domain Balance ]"
# Tally examples per metadata.domain in processed/all.jsonl. A domain with
# fewer than 200 examples is labelled LOW, otherwise OK.
if [ ! -f "processed/all.jsonl" ] || [ ! -s "processed/all.jsonl" ]; then
  warn "processed/all.jsonl not found — run bash process.sh first"
elif ! command -v python3 >/dev/null 2>&1; then
  # Without this guard a missing interpreter aborts the whole report under
  # `set -e` before the summary is printed.
  warn "python3 not found — skipping domain balance"
else
  # A quoted heredoc keeps the shell from interpreting anything inside the
  # Python program.
  python3 - <<'PY'
import json
from collections import Counter

counts = Counter()
skipped = 0

# JSON text is UTF-8; do not depend on the locale's default encoding, and do
# not let a stray undecodable byte abort the whole tally.
with open('processed/all.jsonl', encoding='utf-8', errors='replace') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
            domain = obj.get('metadata', {}).get('domain', 'unknown')
            counts[domain] += 1
        except Exception:
            # Best effort: keep tallying, but remember the line was unusable
            # so the report can say the total is incomplete.
            skipped += 1

if not counts:
    print(' no domain data found')
else:
    print(f' Total: {sum(counts.values())} examples')
    # Largest domains first.
    for domain, count in sorted(counts.items(), key=lambda x: -x[1]):
        status = 'OK' if count >= 200 else 'LOW'
        print(f' [{status}] {domain}: {count}')
if skipped:
    print(f' Skipped: {skipped} malformed line(s)')
PY
fi
echo ""
echo "[ Train/Val/Test Split ]"
# Each split is looked up at validation/<split>.jsonl and reported with its
# number of non-empty lines.
for split in train val test; do
  f="validation/${split}.jsonl"
  if [ ! -f "$f" ] || [ ! -s "$f" ]; then
    warn "$split split missing or empty"
  else
    # `grep -c` prints "0" and exits 1 when no line matches (a file holding
    # only blank lines); a `|| echo 0` fallback would then produce two zeros.
    count=$(grep -c . "$f" 2>/dev/null) || true
    count=${count:-0}
    ok "$split: $count examples"
  fi
done
echo ""
echo "[ Existing Adapters ]"
# Every sub-directory of adapters/ counts as one adapter; it is reported as
# trained when it contains adapter_config.json and as incomplete otherwise.
adapter_count=0
for adapter_dir in adapters/*/; do
  # When adapters/ is missing, or holds only plain files or hidden entries,
  # the glob stays as the literal pattern. Skip it instead of reporting a
  # bogus adapter named "*".
  [ -d "$adapter_dir" ] || continue
  adapter_count=$((adapter_count + 1))
  name=$(basename "$adapter_dir")
  if [ -f "${adapter_dir}adapter_config.json" ]; then
    ok "$name (trained)"
  else
    warn "$name (incomplete)"
  fi
done
if [ "$adapter_count" -eq 0 ]; then
  echo " (no adapters yet)"
fi
# Final summary: print the counters, derive the overall status (FAIL takes
# precedence over WARN, which takes precedence over PASS) and exit non-zero
# only when at least one failure was recorded.
echo
echo "================================="
printf 'PASS: %s WARN: %s FAIL: %s\n' "$PASS" "$WARN" "$FAIL"

if [ "$FAIL" -gt 0 ]; then
  overall="FAIL"
elif [ "$WARN" -gt 0 ]; then
  overall="WARN"
else
  overall="PASS"
fi
echo "Status: $overall"

if [ "$overall" = "FAIL" ]; then
  exit 1
fi