ZDDC — Zero Day Document Control. A file-naming convention plus five single-file HTML tools (archive, transmittal, classifier, mdedit, landing) and an optional Go HTTP server (zddc-server) with ACL and a virtual archive index. Self-contained, offline-capable, dependency-free. See README.md for an overview, AGENTS.md and ARCHITECTURE.md for the build/release/architecture detail, bootstrap/README.md for the two-level deployment install pattern, and zddc/README.md for the HTTP server.
94 lines
2.9 KiB
Bash
94 lines
2.9 KiB
Bash
#!/bin/bash
#
# ZDDC training data processor.
# Runs a fixed sequence of steps over the raw/, processed/, validation/ and
# snapshots/ directories that sit next to this script.
# Usage (per the script's own "Next steps" hint): bash process.sh

# Abort on a failing command, an unset variable, or a failing pipeline stage.
set -euo pipefail

# Work from the directory containing this script so that every relative path
# used further down resolves the same way regardless of the caller's cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"

echo "=== ZDDC Training Data Processor ==="
echo "Timestamp: $(date +%Y-%m-%d_%H-%M-%S)"

# Step 1: Combine raw interactions

#######################################
# Concatenate every raw/*.jsonl file into raw/all-interactions.jsonl.
# The combined file itself is never used as an input, so re-running the
# script does not feed the previous result back into the new one.
# Outputs: prints "No raw interactions yet" when there is nothing to combine.
# Returns: 0 on success or when there is no input; non-zero if cat/mv fail.
#######################################
combine_raw_interactions() {
  local combined=raw/all-interactions.jsonl
  local -a inputs=()
  local f

  # The output lives in the same directory and matches the same glob, so it
  # has to be filtered out by hand. The -f test also discards the literal
  # pattern that the shell leaves behind when nothing matches.
  for f in raw/*.jsonl; do
    [[ -f "$f" && "$f" != "$combined" ]] || continue
    inputs+=("$f")
  done

  if (( ${#inputs[@]} == 0 )); then
    echo "No raw interactions yet"
    # Drop a stale combined file so later steps do not treat it as input.
    rm -f -- "$combined"
    return 0
  fi

  # Write to a temporary name first so a failed cat never leaves a
  # half-written combined file behind.
  cat -- "${inputs[@]}" > "${combined}.tmp"
  mv -- "${combined}.tmp" "$combined"
}

echo "Step 1: Combining raw interactions..."
combine_raw_interactions

# Step 2: Deduplicate by conversation content

#######################################
# Deduplicate raw/all-interactions.jsonl into processed/all.jsonl.
# Two records are duplicates when their .messages values serialise to the
# same JSON text. The result is written as JSON Lines (one compact record per
# line), not as a single array, so that line-oriented consumers such as
# `wc -l` and per-record jq filters keep working on it.
# Outputs: prints a warning when the combined raw file does not exist.
# Returns: 0 on success or when there is no input; 1 if jq fails.
#######################################
deduplicate_interactions() {
  local src=raw/all-interactions.jsonl
  local dest=processed/all.jsonl

  if [[ ! -f "$src" ]]; then
    echo "Warning: No raw interactions to deduplicate"
    return 0
  fi

  mkdir -p processed

  # -s slurps all records into one array for unique_by; `.[]` then unpacks
  # the array again and -c prints each record on its own line.
  # Writing to a temporary file keeps the previous result intact if jq fails.
  if ! jq -c -s 'unique_by(.messages | tojson) | .[]' "$src" > "${dest}.tmp"; then
    rm -f -- "${dest}.tmp"
    return 1
  fi
  mv -- "${dest}.tmp" "$dest"
}

echo "Step 2: Deduplicating..."
deduplicate_interactions

# Step 3: Categorize by domain

#######################################
# Split processed/all.jsonl into one processed/<domain>.jsonl per distinct
# string value of .metadata.domain. Each output is JSON Lines.
# The input may hold one record per line or a single top-level array (the
# shape `jq -s` emits); both are read as a stream of records.
# Records whose domain is missing or not a string are not categorised.
# Outputs: progress lines on stdout; skipped-domain warnings on stderr.
# Returns: 0 on success or when there is no input; non-zero if jq fails.
#######################################
categorize_by_domain() {
  local src=processed/all.jsonl
  local domain domain_list

  # Check if we have data to process
  if [[ ! -f "$src" ]]; then
    echo "Warning: No processed data found"
    return 0
  fi

  # Extract unique domains
  domain_list=$(
    jq -r '
      def rows: if type == "array" then .[] else . end;
      rows | .metadata.domain | strings
    ' "$src" | sort -u
  )

  # Read line by line so a domain containing spaces stays one item.
  while IFS= read -r domain; do
    [[ -n "$domain" ]] || continue

    # The domain becomes a file name inside processed/: refuse values that
    # would leave that directory or overwrite the files this script reads
    # and writes itself.
    case "$domain" in
      */* | all | multi-domain)
        echo "Warning: skipping domain with unusable file name: $domain" >&2
        continue
        ;;
    esac

    echo " Processing domain: $domain"
    # --arg passes the domain as data instead of splicing it into the jq
    # program text, so quotes or backslashes in it cannot alter the filter.
    jq -c --arg d "$domain" '
      def rows: if type == "array" then .[] else . end;
      rows | select(.metadata.domain == $d)
    ' "$src" > "processed/${domain}.jsonl"
  done <<< "$domain_list"
}

echo "Step 3: Categorizing by domain..."
categorize_by_domain

# Step 4: Create multi-domain dataset

#######################################
# Copy processed/all.jsonl to processed/multi-domain.jsonl.
# Does nothing when processed/all.jsonl does not exist.
# Returns: 0 on success or when there is no input; non-zero if cp fails.
#######################################
create_multi_domain_dataset() {
  local src=processed/all.jsonl

  [[ -f "$src" ]] || return 0
  cp -- "$src" processed/multi-domain.jsonl
}

echo "Step 4: Creating multi-domain dataset..."
create_multi_domain_dataset

# Step 5: Initialize validation split (empty until we have data)

#######################################
# Ensure validation/train.jsonl, validation/val.jsonl and
# validation/test.jsonl exist.
# Despite the progress message, no data is split here: missing files are
# created empty and existing files are left untouched.
# Returns: 0 on success; non-zero if mkdir or touch fail.
#######################################
init_validation_split() {
  local split

  mkdir -p validation

  # Create empty placeholders if no data
  for split in train val test; do
    [[ -f "validation/${split}.jsonl" ]] || touch "validation/${split}.jsonl"
  done
}

echo "Step 5: Splitting into train/val/test..."
init_validation_split

# Step 6: Create snapshot

#######################################
# Store the current dataset under snapshots/$SNAPSHOT_NAME.
# The snapshot directory is always created. When processed/all.jsonl exists
# it is copied in as dataset.jsonl, together with every validation/*.jsonl
# file that is present.
# Globals:  SNAPSHOT_NAME (read)
# Returns:  0 on success or when there is no dataset; non-zero if mkdir/cp fail.
#######################################
create_snapshot() {
  local snapshot_dir="snapshots/${SNAPSHOT_NAME}"
  local -a split_files=()
  local f

  mkdir -p -- "$snapshot_dir"
  [[ -f processed/all.jsonl ]] || return 0

  cp -- processed/all.jsonl "${snapshot_dir}/dataset.jsonl"

  # Gather the split files explicitly: an unmatched glob would otherwise be
  # handed to cp as a literal path and make it fail.
  for f in validation/*.jsonl; do
    [[ -f "$f" ]] || continue
    split_files+=("$f")
  done
  if (( ${#split_files[@]} > 0 )); then
    cp -- "${split_files[@]}" "${snapshot_dir}/"
  fi
}

echo "Step 6: Creating snapshot..."
SNAPSHOT_NAME="v$(date +%Y-%m-%d)"
create_snapshot

echo ""
echo "=== Processing Complete ==="

# Display summary

#######################################
# Print the number of records in processed/all.jsonl and a per-domain count,
# or a hint to collect data first when that file does not exist.
# The file may hold one record per line or a single top-level array (the
# shape `jq -s` emits); records are counted with jq rather than `wc -l` so
# both shapes give the number of examples.
# Outputs: summary text on stdout.
# Returns: 0 on success or when there is no data; non-zero if jq fails.
#######################################
print_summary() {
  local src=processed/all.jsonl
  local total

  if [[ ! -f "$src" ]]; then
    echo "No data processed yet. Collect some interactions first."
    return 0
  fi

  total=$(
    jq -s '
      def rows: if type == "array" then .[] else . end;
      [.[] | rows] | length
    ' "$src"
  )
  echo "Total examples: $total"

  echo ""
  echo "Domain breakdown:"
  # -r prints the interpolated strings as plain text rather than as quoted
  # JSON string literals.
  jq -r -s '
    def rows: if type == "array" then .[] else . end;
    [.[] | rows]
    | group_by(.metadata.domain)[]
    | " \(.[0].metadata.domain): \(length)"
  ' "$src"
}

print_summary

#######################################
# Print the closing hint listing the commands for collecting, processing and
# training. The text starts with an empty line.
# Outputs: the hint on stdout.
#######################################
print_next_steps() {
  # Quoted delimiter: the text is emitted literally, with no expansion.
  cat <<'EOF'

Next steps:
 1. Collect interactions: node collect-interaction.js --query "..." --qwen "..." --expert "..."
 2. Process: bash process.sh
 3. Train: bash train.sh [domain]
EOF
}

print_next_steps