ZDDC/training-data/process.sh
ZDDC ea385b5366 Initial commit
ZDDC — Zero Day Document Control. A file-naming convention plus five
single-file HTML tools (archive, transmittal, classifier, mdedit,
landing) and an optional Go HTTP server (zddc-server) with ACL and a
virtual archive index. Self-contained, offline-capable, dependency-free.

See README.md for an overview, AGENTS.md and ARCHITECTURE.md for the
build/release/architecture detail, bootstrap/README.md for the
two-level deployment install pattern, and zddc/README.md for the
HTTP server.
2026-04-27 11:05:47 -05:00

94 lines
2.9 KiB
Bash

#!/bin/bash
#
# ZDDC training-data processor.
# Runs from the directory that contains this script and walks through the
# numbered steps below (combine, deduplicate, categorize, snapshot, summary).
# Uses jq for all JSON handling.

# Strict mode: abort on errors, on unset variables and on failures anywhere
# inside a pipeline.
set -e -u -o pipefail

# Resolve the script's own directory so every relative path used later
# (raw/, processed/, validation/, snapshots/) is anchored to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

printf '%s\n' "=== ZDDC Training Data Processor ==="
printf 'Timestamp: %s\n' "$(date +%Y-%m-%d_%H-%M-%S)"
# Step 1: Combine raw interactions
# Concatenates every raw/*.jsonl file into raw/all-interactions.jsonl.
echo "Step 1: Combining raw interactions..."
raw_inputs=()
for raw_file in raw/*.jsonl; do
  # Two things are filtered out here:
  #  - the literal pattern "raw/*.jsonl", which the shell leaves in place
  #    when nothing matches;
  #  - the combined output itself, which also matches the glob on every
  #    run after the first and must not be fed back into cat.
  if [ -f "$raw_file" ] && [ "$raw_file" != "raw/all-interactions.jsonl" ]; then
    raw_inputs+=("$raw_file")
  fi
done
if [ "${#raw_inputs[@]}" -gt 0 ]; then
  cat "${raw_inputs[@]}" > raw/all-interactions.jsonl
else
  # Drop any stale combined file so Step 2's existence check reflects
  # "nothing to process" instead of picking up an empty or outdated file.
  rm -f raw/all-interactions.jsonl
  echo "No raw interactions yet"
fi
# Step 2: Deduplicate by conversation content
# Two records are duplicates when their .messages values serialize to the
# same JSON text.
echo "Step 2: Deduplicating..."
if [ -f raw/all-interactions.jsonl ]; then
  mkdir -p processed
  # -s slurps all records into one array so unique_by can compare them.
  # The trailing `.[]` together with -c writes the survivors back out as one
  # compact object per line, so processed/all.jsonl is real JSONL: the later
  # per-record jq filters and the `wc -l` example count both depend on that.
  # Note: unique_by orders its result by the comparison key, not input order.
  jq -c -s 'unique_by(.messages | tojson) | .[]' raw/all-interactions.jsonl > processed/all.jsonl
else
  echo "Warning: No raw interactions to deduplicate"
fi
# Step 3: Categorize by domain
# Writes processed/<domain>.jsonl for every distinct .metadata.domain found
# in processed/all.jsonl, one compact JSON object per line.
echo "Step 3: Categorizing by domain..."
if [ -f processed/all.jsonl ]; then
  # One domain per output line. `// empty` drops records whose domain is
  # missing or null, so no bogus "null" category is produced.
  domains=$(jq -r '.metadata.domain // empty' processed/all.jsonl | sort -u)
  # Read line by line (not `for d in $domains`) so a domain containing
  # spaces stays a single value.
  while IFS= read -r domain; do
    [ -n "$domain" ] || continue
    case "$domain" in
      */* | all | multi-domain)
        # '/' would point outside processed/; "all" would truncate the input
        # file while jq is still reading it; "multi-domain" is the file name
        # Step 4 writes.
        echo "Warning: skipping domain with unusable file name: $domain" >&2
        continue
        ;;
    esac
    echo " Processing domain: $domain"
    # --arg passes the domain as data instead of splicing it into the jq
    # program text, so quotes or backslashes in a domain cannot break the filter.
    jq -c --arg domain "$domain" 'select(.metadata.domain == $domain)' \
      processed/all.jsonl > "processed/${domain}.jsonl"
  done <<< "$domains"
else
  echo "Warning: No processed data found"
fi
# Step 4: Create multi-domain dataset
# The multi-domain dataset is a plain copy of the combined file; when the
# combined file does not exist this step does nothing.
echo "Step 4: Creating multi-domain dataset..."
combined_dataset="processed/all.jsonl"
[ ! -f "$combined_dataset" ] || cp "$combined_dataset" processed/multi-domain.jsonl
# Step 5: Initialize validation split (empty until we have data)
# No actual splitting happens here: the step only guarantees that the three
# split files exist, leaving any file that is already present untouched.
echo "Step 5: Splitting into train/val/test..."
mkdir -p validation
for split_name in train val test; do
  split_file="validation/${split_name}.jsonl"
  [ -f "$split_file" ] || touch "$split_file"
done
# Step 6: Create snapshot
# Snapshots are named after the current date (vYYYY-MM-DD). The directory is
# always created; files are copied into it only when a combined dataset exists.
echo "Step 6: Creating snapshot..."
SNAPSHOT_NAME="v$(date +%Y-%m-%d)"
snapshot_dir="snapshots/${SNAPSHOT_NAME}"
mkdir -p "$snapshot_dir"
if [ -f processed/all.jsonl ]; then
  cp processed/all.jsonl "${snapshot_dir}/dataset.jsonl"
  cp validation/*.jsonl "${snapshot_dir}/"
fi
echo ""
echo "=== Processing Complete ==="
# Display summary: example counts plus a per-domain breakdown of the
# combined dataset, or a hint when there is nothing to summarize yet.
if [ -f processed/all.jsonl ]; then
  TOTAL=$(wc -l < processed/all.jsonl)
  # The validation split is kept under validation/ (Step 5 creates
  # validation/val.jsonl), not under processed/. Test for the file first so a
  # missing file yields 0 without a redirect error on the terminal.
  if [ -f validation/val.jsonl ]; then
    VALIDATION=$(wc -l < validation/val.jsonl)
  else
    VALIDATION=0
  fi
  echo "Total examples: $TOTAL"
  echo "Validation examples: $VALIDATION"
  echo ""
  echo "Domain breakdown:"
  # -s slurps the records so they can be grouped; -r prints each line as
  # plain text instead of a JSON-quoted string.
  jq -r -s 'group_by(.metadata.domain) | .[] | " \(.[0].metadata.domain): \(length)"' processed/all.jsonl
else
  echo "No data processed yet. Collect some interactions first."
fi
# Closing hint: a blank separator line followed by the three-step workflow.
printf '%s\n' \
  "" \
  "Next steps:" \
  ' 1. Collect interactions: node collect-interaction.js --query "..." --qwen "..." --expert "..."' \
  " 2. Process: bash process.sh" \
  " 3. Train: bash train.sh [domain]"