From 1da25eff3ff217de757d53ff477754cf880f6d6e Mon Sep 17 00:00:00 2001 From: ZDDC Date: Mon, 27 Apr 2026 21:45:35 -0500 Subject: [PATCH] chore: remove training-data/ This directory (interaction-log scripts and tooling for AI training data) was included by mistake when the repo was migrated. It has no relationship to ZDDC the project; remove from the repo and the matching section from AGENTS.md. Co-Authored-By: Claude Opus 4.7 (1M context) --- AGENTS.md | 7 - training-data/.gitignore | 6 - training-data/README.md | 167 --------------- training-data/build-interaction-collector.sh | 55 ----- training-data/collect-interaction.js | 197 ------------------ training-data/deploy.sh | 97 --------- training-data/package.json | 16 -- training-data/process.sh | 94 --------- training-data/train.sh | 205 ------------------- training-data/validate.sh | 114 ----------- 10 files changed, 958 deletions(-) delete mode 100644 training-data/.gitignore delete mode 100644 training-data/README.md delete mode 100755 training-data/build-interaction-collector.sh delete mode 100755 training-data/collect-interaction.js delete mode 100755 training-data/deploy.sh delete mode 100644 training-data/package.json delete mode 100644 training-data/process.sh delete mode 100755 training-data/train.sh delete mode 100755 training-data/validate.sh diff --git a/AGENTS.md b/AGENTS.md index 26e8117..19831d9 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -242,13 +242,6 @@ Use `git worktree` to run multiple agents on separate branches simultaneously wi - Toast UI Editor v3.2.2 is bundled in `vendor/`; `template.html` loads it from CDN for dev convenience - `.jsonl # Per-domain splits (auto-generated) -├── validation/ -│ ├── train.jsonl # 80% split -│ ├── val.jsonl # 10% split -│ └── test.jsonl # 10% split (never used during training) -├── adapters/ -│ └── -lora-v1/ # Trained LoRA adapter -│ └── -lora-v1-merged/ # Merged standalone model -├── snapshots/ -│ └── v/ # Versioned dataset snapshots -├── collect-interaction.js # Capture a weak-spot interaction -├── process.sh # Cluster raw data into domain splits -├── validate.sh # Check data quality before training -├── train.sh # Train a LoRA adapter -└── deploy.sh # Merge adapter into standalone model -``` - ---- - -## Workflow - -### Step 1 — Collect a weak-spot interaction - -When Qwen gets stuck or you ask it to consult Sonnet/Opus: - -```bash -node collect-interaction.js \ - --query "How do I parse ZDDC filenames?" \ - --qwen "[Qwen's suboptimal answer]" \ - --expert "[Sonnet's correct answer]" -``` - -Optionally specify domain explicitly (otherwise auto-detected): - -```bash -node collect-interaction.js \ - --query "..." \ - --qwen "..." \ - --expert "..." \ - --domain zddc-naming -``` - -Raw interaction is appended to `raw/interactions.jsonl`. - -### Step 2 — Process (after ~50 new interactions) - -```bash -bash process.sh -``` - -Deduplicates, clusters by domain, creates train/val/test splits. - -### Step 3 — Validate - -```bash -bash validate.sh -``` - -Checks JSONL validity, domain balance, and split sizes. - -### Step 4 — Train - -```bash -bash train.sh # train multi-domain adapter -bash train.sh zddc-naming # train domain-specific adapter -``` - -Outputs LoRA adapter to `adapters/-lora-v1/`. - -### Step 5 — Deploy (optional) - -```bash -bash deploy.sh # merge multi-domain adapter -bash deploy.sh zddc-naming # merge specific adapter -``` - -Merges the LoRA weights into the base model and saves a standalone model. - ---- - -## Training Data Format - -Each line in a `.jsonl` file is one training example: - -```json -{ - "messages": [ - {"role": "user", "content": "Query that exposed weakness"}, - {"role": "assistant", "content": "Qwen's original response"}, - {"role": "user", "content": "consult Sonnet"}, - {"role": "assistant", "content": "Expert's correct response"} - ], - "metadata": { - "domain": "zddc-naming", - "adapter": "lora-v1-zddc_naming", - "timestamp": "2025-10-31T14:30:00.000Z", - "interaction_id": "int_1735648200000_abc123", - "source": "manual-expert-consultation" - } -} -``` - ---- - -## Auto-Detected Domains - -| Domain | Trigger keywords | -|--------|----------------| -| `zddc-naming` | zddc, trackingnumber, revision, status code | -| `html-architecture` | html, spa, single-file, es module, vanilla js | -| `build-system` | build.sh, dist/, template.html | -| `coding-debugging` | debug, error, fix, console | -| `reasoning-architecture` | reason, analyze, architecture, design | -| `general-coding` | (default) | - ---- - -## LoRA Configuration - -| Parameter | Value | Notes | -|-----------|-------|-------| -| Base model | `Qwen/Qwen2.5-7B-Instruct` | Replace with Qwen3 when available on HF | -| Rank | 64 | Increase to 128 if underfitting | -| Alpha | 64 | 1:1 with rank | -| Target modules | q_proj, v_proj, k_proj, o_proj | All attention projections | -| Dropout | 0.05 | Light regularisation | -| Learning rate | 1e-4 | Cosine decay with 10% warmup | -| Epochs | 3 | Monitor val loss to catch overfitting | -| Batch size | 8 effective | 4 per-device × 2 gradient accumulation | -| Precision | bfloat16 | Requires Ampere GPU or newer | - ---- - -## Hardware Requirements - -| Setup | Min VRAM | Method | Notes | -|-------|----------|--------|-------| -| Qwen-7B LoRA | 24 GB | LoRA bf16 | Recommended | -| Qwen-7B QLoRA | 16 GB | QLoRA 4-bit | Add `--load_in_4bit` flag | -| Qwen-14B LoRA | 48 GB | LoRA bf16 | Better quality | - -Your system has 96 GB VRAM — full LoRA on Qwen-14B is feasible. - ---- - -## When to Retrain - -- Every **50–100 new interactions** collected -- When a new domain accumulates **200+ examples** -- After a major project phase where Qwen struggled repeatedly diff --git a/training-data/build-interaction-collector.sh b/training-data/build-interaction-collector.sh deleted file mode 100755 index d29bea1..0000000 --- a/training-data/build-interaction-collector.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash -# build-interaction-collector.sh -# Install dependencies for the interaction collector - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cd "$SCRIPT_DIR" - -echo "=== Installing Node.js Dependencies ===" - -# Check if Node.js is available -if ! command -v node &> /dev/null; then - echo "Error: Node.js is not installed" - echo "Please install Node.js 18+ from https://nodejs.org/" - exit 1 -fi - -NODE_VERSION=$(node --version) -echo "Node.js version: $NODE_VERSION" - -# Check minimal version -NODE_MAJOR=$(echo $NODE_VERSION | cut -c2- | cut -d. -f1) -if [ "$NODE_MAJOR" -lt 18 ]; then - echo "Error: Node.js 18+ is required" - exit 1 -fi - -echo "✓ Node.js version check passed" - -# Create package.json if it doesn't exist -if [ ! -f package.json ]; then - cat > package.json << 'EOF' -{ - "name": "zddc-training-data", - "version": "1.0.0", - "description": "Training data collection for ZDDC fine-tuning", - "type": "module", - "main": "collect-interaction.js", - "scripts": { - "collect": "node collect-interaction.js" - }, - "keywords": ["zddc", "training", "lora"], - "license": "MIT" -} -EOF -fi - -echo "✓ Created/verified package.json" -echo "" -echo "=== Ready to use ===" -echo "Usage: node collect-interaction.js --query \"...\" --qwen \"...\" --expert \"...\"" -echo " node collect-interaction.js --query \"How do I...\" --qwen \"[Qwen answer]\" --expert \"[Expert answer]\"" -echo "" -echo "To batch process interactions, use process.sh" diff --git a/training-data/collect-interaction.js b/training-data/collect-interaction.js deleted file mode 100755 index 40b1f42..0000000 --- a/training-data/collect-interaction.js +++ /dev/null @@ -1,197 +0,0 @@ -#!/usr/bin/env node -/** - * Training Data Collector for Qwen3-Coder-Next - * - * Captures conversations where Qwen needs expert (Sonnet/Opus) assistance - * and stores them in a format suitable for LoRA fine-tuning - * - * Usage: - * node collect-interaction.js --query "..." --qwen "..." --expert "..." --domain "domain-name" - */ - -import fs from 'fs'; -import path from 'path'; -import { fileURLToPath } from 'url'; - -const __filename = fileURLToPath(import.meta.url); -const __dirname = path.dirname(__filename); - -// Get script directory -const SCRIPT_DIR = __dirname; -const RAW_FILE = path.join(SCRIPT_DIR, 'raw', 'interactions.jsonl'); - -/** - * Generate unique interaction ID - */ -function generateInteractionId() { - const timestamp = Date.now(); - const random = Math.random().toString(36).substring(2, 10); - return `int_${timestamp}_${random}`; -} - -/** - * Detect domain from conversation content - */ -function detectDomain(messages) { - const text = JSON.stringify(messages).toLowerCase(); - - // ZDDC naming patterns - if (text.includes('zddc') || - text.includes('trackingnumber') || - text.includes('revision') || - text.includes('_a (ifr)') || - text.includes('status code')) { - return 'zddc-naming'; - } - - // HTML SPA patterns - if (text.includes('html') || - text.includes('spa') || - text.includes('single-file') || - text.includes('es module') || - text.includes('vanilla js')) { - return 'html-architecture'; - } - - // Build system patterns - if (text.includes('build') || - text.includes('build.sh') || - text.includes('dist/') || - text.includes('template.html')) { - return 'build-system'; - } - - // Debugging patterns - if (text.includes('debug') || - text.includes('error') || - text.includes('fix') || - text.includes('console')) { - return 'coding-debugging'; - } - - // Reasoning patterns - if (text.includes('reason') || - text.includes('analyze') || - text.includes('architecture') || - text.includes('design')) { - return 'reasoning-architecture'; - } - - // Default to general coding - return 'general-coding'; -} - -/** - * Create training example object - */ -function createTrainingExample(userQuery, qwenResponse, expertResponse, options = {}) { - const domain = options.domain || detectDomain([userQuery, qwenResponse, expertResponse]); - - return { - messages: [ - { role: 'user', content: userQuery }, - { role: 'assistant', content: qwenResponse }, - { role: 'user', content: 'consult Sonnet' }, - { role: 'assistant', content: expertResponse } - ], - metadata: { - domain: domain, - adapter: `lora-v1-${domain.replace(/-/g, '_')}`, - timestamp: new Date().toISOString(), - interaction_id: generateInteractionId(), - source: 'manual-expert-consultation', - ...options.metadata - } - }; -} - -/** - * Append to JSONL file - */ -function appendToJSONL(filePath, data) { - const jsonLine = JSON.stringify(data); - fs.appendFileSync(filePath, jsonLine + '\n'); -} - -/** - * Format domain name from detected or provided - */ -function formatDomainName(domain) { - // Convert hyphens to underscores for adapter name - return domain.replace(/-/g, '_'); -} - -/** - * Collect a training example - */ -export function collect({ - userQuery, - qwenResponse, - expertResponse, - domain = null, - metadata = {} -}) { - if (!userQuery || !qwenResponse || !expertResponse) { - console.error('Error: Missing required parameters'); - console.error('Usage: node collect-interaction.js --query "..." --qwen "..." --expert "..."'); - process.exit(1); - } - - const trainingExample = createTrainingExample( - userQuery, - qwenResponse, - expertResponse, - { domain, metadata } - ); - - // Ensure raw directory exists - fs.mkdirSync(path.dirname(RAW_FILE), { recursive: true }); - - // Append to raw file - appendToJSONL(RAW_FILE, trainingExample); - - console.log('\n=== Training Example Captured ==='); - console.log(`Domain: ${trainingExample.metadata.domain}`); - console.log(`Adapter: ${trainingExample.metadata.adapter}`); - console.log(`Interaction ID: ${trainingExample.metadata.interaction_id}`); - console.log(`Timestamp: ${trainingExample.metadata.timestamp}`); - console.log(`Raw file: ${RAW_FILE}`); - console.log('=================================\n'); - - return trainingExample; -} - -/** - * CLI interface - */ -function main() { - const args = process.argv.slice(2); - - // Parse arguments - const queryIdx = args.findIndex(arg => arg === '--query'); - const qwenIdx = args.findIndex(arg => arg === '--qwen'); - const expertIdx = args.findIndex(arg => arg === '--expert'); - const domainIdx = args.findIndex(arg => arg === '--domain'); - - if (queryIdx === -1 || qwenIdx === -1 || expertIdx === -1) { - console.error('Error: Missing required arguments'); - console.error('Usage: node collect-interaction.js --query "..." --qwen "..." --expert "..." [--domain "domain"]'); - console.error(' node collect-interaction.js --query "Query" --qwen "Qwen answer" --expert "Expert answer"'); - process.exit(1); - } - - const userQuery = args[queryIdx + 1]; - const qwenResponse = args[qwenIdx + 1]; - const expertResponse = args[expertIdx + 1]; - const domain = domainIdx !== -1 ? args[domainIdx + 1] : null; - - collect({ userQuery, qwenResponse, expertResponse, domain }); -} - -// Export for module usage -export default { collect, createTrainingExample, detectDomain }; - -// Run if executed directly -if (import.meta.url === `file://${process.argv[1]}`) { - main(); -} diff --git a/training-data/deploy.sh b/training-data/deploy.sh deleted file mode 100755 index 46d3490..0000000 --- a/training-data/deploy.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -# deploy.sh — Merge a trained LoRA adapter into a standalone model -# Usage: -# bash deploy.sh # deploy multi-domain adapter -# bash deploy.sh zddc-naming # deploy specific domain adapter -# -# Output: adapters/-lora-v1-merged/ - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cd "$SCRIPT_DIR" - -DOMAIN="${1:-multi-domain}" -BASE_MODEL="Qwen/Qwen2.5-7B-Instruct" -ADAPTER_DIR="adapters/${DOMAIN}-lora-v1" -MERGED_DIR="adapters/${DOMAIN}-lora-v1-merged" - -echo "=== ZDDC LoRA Deployment ===" -echo "Domain: $DOMAIN" -echo "Adapter: $ADAPTER_DIR" -echo "Output: $MERGED_DIR" -echo "" - -if [ ! -d "$ADAPTER_DIR" ]; then - echo "Error: adapter not found: $ADAPTER_DIR" - echo "Run: bash train.sh $DOMAIN" - exit 1 -fi -if [ ! -f "$ADAPTER_DIR/adapter_config.json" ]; then - echo "Error: adapter incomplete (missing adapter_config.json)" - exit 1 -fi - -command -v python3 &>/dev/null || { echo "Error: python3 required"; exit 1; } -python3 -c "import torch, transformers, peft" 2>/dev/null || \ - pip install torch transformers peft --quiet - -mkdir -p "$MERGED_DIR" - -DEPLOY_PY=$(mktemp /tmp/deploy_lora_XXXXXX.py) -trap 'rm -f "$DEPLOY_PY"' EXIT - -cat > "$DEPLOY_PY" << 'PYEOF' -import sys, os, torch -from transformers import AutoTokenizer, AutoModelForCausalLM -from peft import PeftModel - -base_model_name = sys.argv[1] -adapter_dir = sys.argv[2] -merged_dir = sys.argv[3] - -print(f"Loading base model: {base_model_name}") -tok = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True) -model = AutoModelForCausalLM.from_pretrained( - base_model_name, torch_dtype=torch.bfloat16, - device_map="auto", trust_remote_code=True) - -print(f"Loading adapter: {adapter_dir}") -model = PeftModel.from_pretrained(model, adapter_dir) - -print("Merging weights into base model...") -model = model.merge_and_unload() - -print(f"Saving merged model to {merged_dir} ...") -model.save_pretrained(merged_dir, safe_serialization=True) -tok.save_pretrained(merged_dir) - -print("\nRunning test inference...") -prompt = "<|im_start|>user\nWhat is the ZDDC file naming convention?<|im_end|>\n<|im_start|>assistant\n" -inputs = tok(prompt, return_tensors="pt").to(model.device) -with torch.no_grad(): - out = model.generate( - **inputs, max_new_tokens=128, temperature=0.7, - do_sample=True, pad_token_id=tok.eos_token_id) -response = tok.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True) -print(f"Test prompt: What is the ZDDC file naming convention?") -print(f"Model response: {response}") - -size = sum( - os.path.getsize(os.path.join(merged_dir, f)) - for f in os.listdir(merged_dir) - if os.path.isfile(os.path.join(merged_dir, f))) -print(f"\nMerged model size: {size/(1024**3):.2f} GB") -print(f"Saved to: {merged_dir}") -PYEOF - -python3 "$DEPLOY_PY" "$BASE_MODEL" "$ADAPTER_DIR" "$MERGED_DIR" - -echo "" -echo "=== Deployment Complete ===" -echo "Merged model: $MERGED_DIR" -echo "" -echo "To use:" -echo " from transformers import AutoTokenizer, AutoModelForCausalLM" -echo " model = AutoModelForCausalLM.from_pretrained('$MERGED_DIR')" -echo " tokenizer = AutoTokenizer.from_pretrained('$MERGED_DIR')" diff --git a/training-data/package.json b/training-data/package.json deleted file mode 100644 index ae67840..0000000 --- a/training-data/package.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "name": "zddc-training-data", - "version": "1.0.0", - "description": "Training data collection and LoRA fine-tuning pipeline for Qwen3-Coder-Next", - "type": "module", - "main": "collect-interaction.js", - "scripts": { - "collect": "node collect-interaction.js", - "process": "bash process.sh", - "validate": "bash validate.sh", - "train": "bash train.sh", - "deploy": "bash deploy.sh" - }, - "keywords": ["zddc", "training", "lora", "fine-tuning", "qwen"], - "license": "AGPL-3.0" -} diff --git a/training-data/process.sh b/training-data/process.sh deleted file mode 100644 index a38e74d..0000000 --- a/training-data/process.sh +++ /dev/null @@ -1,94 +0,0 @@ -#!/bin/bash -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cd "$SCRIPT_DIR" - -echo "=== ZDDC Training Data Processor ===" -echo "Timestamp: $(date +%Y-%m-%d_%H-%M-%S)" - -# Step 1: Combine raw interactions -echo "Step 1: Combining raw interactions..." -cat raw/*.jsonl > raw/all-interactions.jsonl 2>/dev/null || echo "No raw interactions yet" - -# Step 2: Deduplicate by conversation content -echo "Step 2: Deduplicating..." -if [ -f raw/all-interactions.jsonl ]; then - jq -s 'unique_by(.messages | tojson)' raw/all-interactions.jsonl > processed/all.jsonl -else - echo "Warning: No raw interactions to deduplicate" -fi - -# Step 3: Categorize by domain -echo "Step 3: Categorizing by domain..." - -# Check if we have data to process -if [ -f processed/all.jsonl ]; then - # Extract unique domains - domains=$(jq -r '.metadata.domain' processed/all.jsonl | sort -u) - - for domain in $domains; do - if [ -n "$domain" ]; then - echo " Processing domain: $domain" - jq -s "grep(.metadata.domain == \"$domain\")" processed/all.jsonl > "processed/${domain}.jsonl" 2>/dev/null || \ - jq -s 'map(select(.metadata.domain == "'"$domain"'"))' processed/all.jsonl > "processed/${domain}.jsonl" - fi - done -else - echo "Warning: No processed data found" -fi - -# Step 4: Create multi-domain dataset -echo "Step 4: Creating multi-domain dataset..." -if [ -f processed/all.jsonl ]; then - cp processed/all.jsonl processed/multi-domain.jsonl -fi - -# Step 5: Initialize validation split (empty until we have data) -echo "Step 5: Splitting into train/val/test..." -mkdir -p validation - -# Create empty placeholders if no data -if [ ! -f validation/train.jsonl ]; then - touch validation/train.jsonl -fi -if [ ! -f validation/val.jsonl ]; then - touch validation/val.jsonl -fi -if [ ! -f validation/test.jsonl ]; then - touch validation/test.jsonl -fi - -# Step 6: Create snapshot -echo "Step 6: Creating snapshot..." -SNAPSHOT_NAME="v$(date +%Y-%m-%d)" -mkdir -p snapshots/$SNAPSHOT_NAME - -if [ -f processed/all.jsonl ]; then - cp processed/all.jsonl snapshots/$SNAPSHOT_NAME/dataset.jsonl - cp validation/*.jsonl snapshots/$SNAPSHOT_NAME/ -fi - -echo "" -echo "=== Processing Complete ===" - -# Display summary -if [ -f processed/all.jsonl ]; then - TOTAL=$(wc -l < processed/all.jsonl) - VALIDATION=$(wc -l < processed/val.jsonl 2>/dev/null || echo 0) - echo "Total examples: $TOTAL" - - echo "" - echo "Domain breakdown:" - if [ -f processed/all.jsonl ]; then - jq -s 'group_by(.metadata.domain) | map({domain: .[0].metadata.domain, count: length})' processed/all.jsonl | jq '.[] | " \(.domain): \(.count)"' - fi -else - echo "No data processed yet. Collect some interactions first." -fi - -echo "" -echo "Next steps:" -echo " 1. Collect interactions: node collect-interaction.js --query \"...\" --qwen \"...\" --expert \"...\"" -echo " 2. Process: bash process.sh" -echo " 3. Train: bash train.sh [domain]" diff --git a/training-data/train.sh b/training-data/train.sh deleted file mode 100755 index 85094ce..0000000 --- a/training-data/train.sh +++ /dev/null @@ -1,205 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -# train.sh — Train a LoRA adapter for Qwen3-Coder-Next -# Usage: -# bash train.sh # train on all domains (multi-domain) -# bash train.sh zddc-naming # train on a specific domain -# -# Output: adapters/-lora-v1/ - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cd "$SCRIPT_DIR" - -DOMAIN="${1:-all}" -BASE_MODEL="Qwen/Qwen2.5-7B-Instruct" -LORA_RANK=64 -LORA_ALPHA=64 -LEARNING_RATE="1e-4" -NUM_EPOCHS=3 -MAX_SEQ_LENGTH=2048 -WARMUP_RATIO=0.1 -WEIGHT_DECAY=0.01 - -if [ "$DOMAIN" = "all" ]; then - TRAIN_FILE="validation/train.jsonl" - OUTPUT_DIR="adapters/multi-domain-lora-v1" -else - TRAIN_FILE="processed/${DOMAIN}.jsonl" - OUTPUT_DIR="adapters/${DOMAIN}-lora-v1" -fi -VAL_FILE="validation/val.jsonl" - -echo "=== ZDDC LoRA Fine-Tuning ===" -echo "Domain: $DOMAIN" -echo "Base model: $BASE_MODEL" -echo "Train file: $TRAIN_FILE" -echo "Output dir: $OUTPUT_DIR" -echo "LoRA rank: $LORA_RANK / alpha: $LORA_ALPHA" -echo "Learning rate: $LEARNING_RATE Epochs: $NUM_EPOCHS" -echo "" - -if [ ! -f "$TRAIN_FILE" ]; then - echo "Error: training file not found: $TRAIN_FILE" - echo "Run bash process.sh first." - exit 1 -fi -TRAIN_COUNT=$(grep -c . "$TRAIN_FILE" 2>/dev/null || echo 0) -if [ "$TRAIN_COUNT" -eq 0 ]; then - echo "Error: training file is empty. Collect at least 50 interactions first." - exit 1 -fi -echo "Training examples: $TRAIN_COUNT" - -NO_EVAL=0 -if [ ! -f "$VAL_FILE" ] || [ "$(grep -c . "$VAL_FILE" 2>/dev/null || echo 0)" -eq 0 ]; then - echo "Warning: no validation data found. Skipping eval." - NO_EVAL=1 -fi - -command -v python3 &>/dev/null || { echo "Error: python3 required"; exit 1; } - -echo "Checking Python dependencies..." -python3 -c "import torch, transformers, peft, trl, datasets, accelerate" 2>/dev/null || { - echo "Installing required packages..." - pip install torch transformers peft trl datasets accelerate --quiet -} -echo "Dependencies OK" -echo "" - -mkdir -p "$OUTPUT_DIR" - -TRAIN_PY=$(mktemp /tmp/train_lora_XXXXXX.py) -trap 'rm -f "$TRAIN_PY"' EXIT - -cat > "$TRAIN_PY" << 'PYEOF' -import json, sys, argparse, torch -from pathlib import Path -from datasets import Dataset -from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments -from peft import LoraConfig, get_peft_model, TaskType -from trl import SFTTrainer - -def load_jsonl(path): - out = [] - with open(path) as f: - for line in f: - line = line.strip() - if line: - out.append(json.loads(line)) - return out - -def fmt(ex): - text = "" - for m in ex["messages"]: - text += f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n" - return {"text": text} - -p = argparse.ArgumentParser() -p.add_argument("--model_name", default="Qwen/Qwen2.5-7B-Instruct") -p.add_argument("--train_file", required=True) -p.add_argument("--val_file") -p.add_argument("--output_dir", required=True) -p.add_argument("--lora_rank", type=int, default=64) -p.add_argument("--lora_alpha", type=int, default=64) -p.add_argument("--learning_rate", type=float, default=1e-4) -p.add_argument("--num_epochs", type=int, default=3) -p.add_argument("--max_seq_length", type=int, default=2048) -p.add_argument("--warmup_ratio", type=float, default=0.1) -p.add_argument("--weight_decay", type=float, default=0.01) -p.add_argument("--no_eval", action="store_true") -args = p.parse_args() - -print(f"Loading tokenizer: {args.model_name}") -tok = AutoTokenizer.from_pretrained( - args.model_name, model_max_length=args.max_seq_length, - padding_side="right", trust_remote_code=True) -if tok.pad_token is None: - tok.pad_token = tok.eos_token - -print("Loading base model...") -model = AutoModelForCausalLM.from_pretrained( - args.model_name, torch_dtype=torch.bfloat16, - device_map="auto", trust_remote_code=True) -model.gradient_checkpointing_enable() - -print("Applying LoRA...") -lora_cfg = LoraConfig( - r=args.lora_rank, lora_alpha=args.lora_alpha, - target_modules=["q_proj", "v_proj", "k_proj", "o_proj"], - lora_dropout=0.05, bias="none", task_type=TaskType.CAUSAL_LM) -model = get_peft_model(model, lora_cfg) -model.print_trainable_parameters() - -train_ds = Dataset.from_list([fmt(d) for d in load_jsonl(args.train_file)]) -print(f"Training examples: {len(train_ds)}") - -eval_ds = None -if not args.no_eval and args.val_file: - vp = Path(args.val_file) - if vp.exists() and vp.stat().st_size > 0: - vdata = load_jsonl(args.val_file) - if vdata: - eval_ds = Dataset.from_list([fmt(d) for d in vdata]) - print(f"Validation examples: {len(eval_ds)}") - -ta = TrainingArguments( - output_dir=args.output_dir, - per_device_train_batch_size=4, - per_device_eval_batch_size=4, - gradient_accumulation_steps=2, - learning_rate=args.learning_rate, - num_train_epochs=args.num_epochs, - warmup_ratio=args.warmup_ratio, - weight_decay=args.weight_decay, - lr_scheduler_type="cosine", - bf16=True, fp16=False, - logging_steps=10, - save_steps=100, - eval_steps=50 if eval_ds else None, - evaluation_strategy="steps" if eval_ds else "no", - save_strategy="steps", - save_total_limit=3, - load_best_model_at_end=(eval_ds is not None), - report_to=[], - seed=42, -) - -trainer = SFTTrainer( - model=model, args=ta, - train_dataset=train_ds, eval_dataset=eval_ds, - dataset_text_field="text", - max_seq_length=args.max_seq_length, - tokenizer=tok) - -print("\nStarting training...") -trainer.train() - -print(f"\nSaving adapter to {args.output_dir} ...") -trainer.model.save_pretrained(args.output_dir) -tok.save_pretrained(args.output_dir) -print(f"Done. Adapter saved to: {args.output_dir}") -PYEOF - -CMD="python3 $TRAIN_PY \ - --model_name $BASE_MODEL \ - --train_file $TRAIN_FILE \ - --val_file $VAL_FILE \ - --output_dir $OUTPUT_DIR \ - --lora_rank $LORA_RANK \ - --lora_alpha $LORA_ALPHA \ - --learning_rate $LEARNING_RATE \ - --num_epochs $NUM_EPOCHS \ - --max_seq_length $MAX_SEQ_LENGTH \ - --warmup_ratio $WARMUP_RATIO \ - --weight_decay $WEIGHT_DECAY" - -[ "$NO_EVAL" -eq 1 ] && CMD="$CMD --no_eval" - -echo "Launching training..." -eval "$CMD" - -echo "" -echo "=== Training Complete ===" -echo "Adapter: $OUTPUT_DIR" -echo "Next: bash test.sh $DOMAIN OR bash deploy.sh $DOMAIN" diff --git a/training-data/validate.sh b/training-data/validate.sh deleted file mode 100755 index f29795a..0000000 --- a/training-data/validate.sh +++ /dev/null @@ -1,114 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cd "$SCRIPT_DIR" - -DOMAIN="${1:-all}" -PASS=0 -WARN=0 -FAIL=0 - -ok() { echo " ✓ $*"; ((PASS+=1)) || true; } -warn() { echo " ⚠ $*"; ((WARN+=1)) || true; } -fail() { echo " ✗ $*"; ((FAIL+=1)) || true; } - -echo "=== ZDDC Training Data Validator ===" -echo "Domain: $DOMAIN" -echo "" - -echo "[ Raw Data ]" -RAW_FILE="raw/interactions.jsonl" -if [ ! -f "$RAW_FILE" ]; then - warn "No raw interactions file found. Collect interactions first." -else - RAW_COUNT=$(grep -c . "$RAW_FILE" 2>/dev/null || echo 0) - ok "Raw interactions: $RAW_COUNT" -fi - -echo "" -echo "[ JSONL Validity ]" -check_jsonl() { - local file="$1" - if [ ! -f "$file" ]; then warn "File not found: $file"; return; fi - if [ ! -s "$file" ]; then warn "File is empty: $file"; return; fi - local count - count=$(grep -c . "$file" 2>/dev/null || echo 0) - ok "$file — $count lines" -} - -if [ "$DOMAIN" = "all" ]; then - for f in processed/*.jsonl validation/*.jsonl; do - [ -f "$f" ] && check_jsonl "$f" - done -else - check_jsonl "processed/${DOMAIN}.jsonl" -fi - -echo "" -echo "[ Domain Balance ]" -if [ -f "processed/all.jsonl" ] && [ -s "processed/all.jsonl" ]; then - python3 -c " -import json -from collections import Counter -counts = Counter() -with open('processed/all.jsonl') as f: - for line in f: - line = line.strip() - if line: - try: - obj = json.loads(line) - domain = obj.get('metadata', {}).get('domain', 'unknown') - counts[domain] += 1 - except Exception: - pass -if not counts: - print(' no domain data found') -else: - print(f' Total: {sum(counts.values())} examples') - for domain, count in sorted(counts.items(), key=lambda x: -x[1]): - status = 'OK' if count >= 200 else 'LOW' - print(f' [{status}] {domain}: {count}') -" -else - warn "processed/all.jsonl not found — run bash process.sh first" -fi - -echo "" -echo "[ Train/Val/Test Split ]" -for split in train val test; do - f="validation/${split}.jsonl" - if [ ! -f "$f" ] || [ ! -s "$f" ]; then - warn "$split split missing or empty" - else - count=$(grep -c . "$f" 2>/dev/null || echo 0) - ok "$split: $count examples" - fi -done - -echo "" -echo "[ Existing Adapters ]" -if [ -d "adapters" ] && [ "$(ls -A adapters 2>/dev/null)" ]; then - for adapter_dir in adapters/*/; do - name=$(basename "$adapter_dir") - if [ -f "${adapter_dir}adapter_config.json" ]; then - ok "$name (trained)" - else - warn "$name (incomplete)" - fi - done -else - echo " (no adapters yet)" -fi - -echo "" -echo "=================================" -echo "PASS: $PASS WARN: $WARN FAIL: $FAIL" -if [ "$FAIL" -gt 0 ]; then - echo "Status: FAIL" - exit 1 -elif [ "$WARN" -gt 0 ]; then - echo "Status: WARN" -else - echo "Status: PASS" -fi