feat(classifier): hash-check same-name files; verify every copy; resume converges

Same tracking number + revision ⇒ same document, so the bytes must match:
- Pre-flight (and a new "Check" button) groups fully-classified files by their
  canonical name and SHA-256s any collisions. Identical bytes collapse to ONE
  copy (deduped); DIFFERENT bytes are a conflict — flagged ≠ in red in the
  By-tracking table (with a tooltip) and held back from the copy so the user
  fixes them first. Flags clear when a placement changes.
- Every file copied this run is VERIFIED: read the written target back, compare
  SHA-256 to the source. One re-copy attempt on mismatch; if it still fails, the
  bad target is removed so a re-run re-copies it — resume converges on a
  fully-correct archive (skip-if-exists stays the fast path for good files).

classify gains transient hash-conflict flags (setHashConflicts/hasHashConflict),
copy gains sourceSha (cached), writeTarget, verifyOne, removeTarget, resolvePlan
and audit(); copyTo runs the verify pass and reports verifyFailed.

Tests: identical pair dedups + differing pair conflicts/flags; a corrupting
write fails verification and is removed (54 green).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
ZDDC 2026-06-11 09:01:02 -05:00
parent 61c1b4f90d
commit d951c3a5e7
6 changed files with 231 additions and 49 deletions

View file

@ -182,7 +182,8 @@
modeClassifyBtn: document.getElementById('modeClassifyBtn'), modeClassifyBtn: document.getElementById('modeClassifyBtn'),
spreadsheetPane: document.getElementById('spreadsheetPane'), spreadsheetPane: document.getElementById('spreadsheetPane'),
targetPane: document.getElementById('targetPane'), targetPane: document.getElementById('targetPane'),
copyOutputBtn: document.getElementById('copyOutputBtn') copyOutputBtn: document.getElementById('copyOutputBtn'),
checkDuplicatesBtn: document.getElementById('checkDuplicatesBtn')
}; };
} }
@ -372,6 +373,7 @@
if (app.dom.modeRenameBtn) app.dom.modeRenameBtn.addEventListener('click', function () { setMode('rename'); }); if (app.dom.modeRenameBtn) app.dom.modeRenameBtn.addEventListener('click', function () { setMode('rename'); });
if (app.dom.modeClassifyBtn) app.dom.modeClassifyBtn.addEventListener('click', function () { setMode('classify'); }); if (app.dom.modeClassifyBtn) app.dom.modeClassifyBtn.addEventListener('click', function () { setMode('classify'); });
if (app.dom.copyOutputBtn) app.dom.copyOutputBtn.addEventListener('click', function () { app.modules.copy.run(); }); if (app.dom.copyOutputBtn) app.dom.copyOutputBtn.addEventListener('click', function () { app.modules.copy.run(); });
if (app.dom.checkDuplicatesBtn) app.dom.checkDuplicatesBtn.addEventListener('click', function () { app.modules.copy.audit(); });
// Live source-tree filter (matches file path + name; reveals the hierarchy). // Live source-tree filter (matches file path + name; reveals the hierarchy).
if (app.dom.treeFilterInput) app.dom.treeFilterInput.addEventListener('input', function () { if (app.dom.treeFilterInput) app.dom.treeFilterInput.addEventListener('input', function () {

View file

@ -62,6 +62,13 @@
// id -> { node, kind:'tracking'|'party'|'slot'|'transmittal', parent } // id -> { node, kind:'tracking'|'party'|'slot'|'transmittal', parent }
var nodeIndex = {}; var nodeIndex = {};
// Transient (not serialized): srcKeys flagged by the copy audit as a
// same-name/different-content conflict. Cleared whenever a placement changes.
var hashConflicts = {};

// Replace the whole flag map (a falsy argument resets it to empty) and tell
// subscribers so the UI can re-render.
function setHashConflicts(map) {
  hashConflicts = map || {};
  notify();
}

// True only when `key` was explicitly flagged. The own-property check matters:
// a plain {} inherits truthy members (constructor, toString, …), so a bare
// `!!hashConflicts[key]` would report those names as conflicts.
function hasHashConflict(key) {
  return Object.prototype.hasOwnProperty.call(hashConflicts, key) && !!hashConflicts[key];
}

// Drop every flag WITHOUT notifying — the callers shown here notify themselves
// right after clearing.
function clearHashConflicts() {
  hashConflicts = {};
}
// ── pub/sub ────────────────────────────────────────────────────────────── // ── pub/sub ──────────────────────────────────────────────────────────────
var listeners = []; var listeners = [];
function on(cb) { listeners.push(cb); return function () { listeners = listeners.filter(function (f) { return f !== cb; }); }; } function on(cb) { listeners.push(cb); return function () { listeners = listeners.filter(function (f) { return f !== cb; }); }; }
@ -139,6 +146,7 @@
a.excluded = false; // placing un-excludes a.excluded = false; // placing un-excludes
cleanAssignment(k); cleanAssignment(k);
}); });
clearHashConflicts(); // a placement changed → stale conflict flags
notify(); notify();
} }
function setExcluded(keys, excluded) { function setExcluded(keys, excluded) {
@ -148,6 +156,7 @@
if (excluded) { a.trackingNodeId = null; a.transmittalNodeId = null; } if (excluded) { a.trackingNodeId = null; a.transmittalNodeId = null; }
cleanAssignment(k); cleanAssignment(k);
}); });
clearHashConflicts();
notify(); notify();
} }
// Forget any assignment for these source keys (e.g. when a .zip flips // Forget any assignment for these source keys (e.g. when a .zip flips
@ -608,6 +617,7 @@
// assignments // assignments
assignmentFor: assignmentFor, getAssignment: getAssignment, assignmentFor: assignmentFor, getAssignment: getAssignment,
place: place, setExcluded: setExcluded, dropAssignments: dropAssignments, place: place, setExcluded: setExcluded, dropAssignments: dropAssignments,
setHashConflicts: setHashConflicts, hasHashConflict: hasHashConflict,
setTitleOverride: setTitleOverride, setTitleOverride: setTitleOverride,
// trees // trees
addTrackingNode: addTrackingNode, addParty: addParty, addTrackingNode: addTrackingNode, addParty: addParty,

View file

@ -124,6 +124,23 @@
// re-running after an interruption skips the work already done — no source // re-running after an interruption skips the work already done — no source
// read, no hashing. (Canonical ZDDC names ⇒ same name = same document, and // read, no hashing. (Canonical ZDDC names ⇒ same name = same document, and
// the server archive is WORM, so we never overwrite.) // the server archive is WORM, so we never overwrite.)
// Digest of a source file's bytes as produced by window.zddc.crypto.sha256File.
// The result is memoised on the file object itself (`sha256`), so the
// duplicate-conflict audit and the post-copy verify share one read + hash.
async function sourceSha(fileObj) {
  var known = fileObj.sha256;
  if (known) {
    return known;
  }
  var bytes = await readSource(fileObj);
  var digest = await window.zddc.crypto.sha256File(bytes);
  fileObj.sha256 = digest; // cache for later callers
  return digest;
}
// Write the source bytes of plan item `p` to its target location under `out`.
// Creates the folder chain via ensureDir, reads the source (never writes it),
// then streams it into p.d.filename inside p.d.outPath.
// Rejects with the original error if any step fails; when the write or close
// fails, the writable is aborted first (best effort) so a half-written stream
// is not left open.
async function writeTarget(out, p) {
  var dir = await ensureDir(out, p.d.outPath);
  var srcFile = await readSource(p.file); // READ source (never write it)
  var fh = await dir.getFileHandle(p.d.filename, { create: true });
  var w = await fh.createWritable();
  try {
    await w.write(srcFile);
    await w.close();
  } catch (err) {
    // Not every handle implementation is guaranteed to expose abort(); probe first.
    if (typeof w.abort === 'function') {
      try { await w.abort(); } catch (abortErr) { /* best effort — the write error is what matters */ }
    }
    throw err;
  }
}
async function copyOne(out, p) { async function copyOne(out, p) {
// Cheap existence probe: resolve the dir WITHOUT creating it (the HTTP // Cheap existence probe: resolve the dir WITHOUT creating it (the HTTP
// handle doesn't verify here, but getFileHandle below does a HEAD). // handle doesn't verify here, but getFileHandle below does a HEAD).
@ -132,58 +149,109 @@
try { await probe.getFileHandle(p.d.filename); return 'skipped'; } try { await probe.getFileHandle(p.d.filename); return 'skipped'; }
catch (e) { /* NotFound → write it below */ } catch (e) { /* NotFound → write it below */ }
} }
// Write path: create the folder chain (idempotent) then read + write. await writeTarget(out, p);
var dir = await ensureDir(out, p.d.outPath);
var srcFile = await readSource(p.file); // READ source (never write it)
var fh = await dir.getFileHandle(p.d.filename, { create: true });
var w = await fh.createWritable();
await w.write(srcFile);
await w.close();
return 'copied'; return 'copied';
} }
// Post-copy check for one plan item: hash the target that now sits under `out`
// and compare it with the (cached) source hash. Resolves to true on a match;
// false when the folder or file cannot be found or the hashes differ.
async function verifyOne(out, p) {
  var folder = await resolveDir(out, p.d.outPath, false);
  if (!folder) {
    return false;
  }
  var handle;
  try {
    handle = await folder.getFileHandle(p.d.filename);
  } catch (lookupError) {
    // Any lookup failure counts as "not verified".
    return false;
  }
  var targetHash = await window.zddc.crypto.sha256File(await handle.getFile());
  var sourceHash = await sourceSha(p.file);
  return targetHash === sourceHash;
}
// Best-effort delete of one plan item's target file under `out`. Does nothing
// when the folder cannot be resolved or the handle has no removeEntry; a
// failing removeEntry is deliberately ignored.
async function removeTarget(out, p) {
  var folder = await resolveDir(out, p.d.outPath, false);
  if (!folder || !folder.removeEntry) {
    return;
  }
  try {
    await folder.removeEntry(p.d.filename);
  } catch (ignored) {
    /* best effort */
  }
}
// Snapshot-loaded files have no live handle — re-grant read on the source
// (one click) before we read any bytes (hashing or copying).
// Resolves to true when every item already has a handle or permission on the
// workspace root is granted; otherwise shows an error toast and resolves to
// false.
async function ensureSourceReadable(items) {
  var needsGrant = items.some(function (p) { return !p.file.handle; });
  if (!needsGrant) return true;
  if (!window.app.rootHandle) {
    toast("The source directory isn't connected. Re-open the workspace to reconnect it.", 'error');
    return false;
  }
  // Second argument is passed as false — presumably "read-only"; confirm against persist.verifyPermission.
  var ok = await window.app.modules.persist.verifyPermission(window.app.rootHandle, false);
  if (!ok) {
    toast('Permission to read the source directory was denied.', 'error');
    return false;
  }
  return true;
}
// Group fully-classified files by their canonical output name. Files with the
// SAME tracking number + revision MUST have the same content: identical bytes
// collapse to a single copy; differing bytes are a CONFLICT the user must fix.
//
// items: plan entries exposing `outRel` (output path), `file` (hashed via
//        sourceSha) and `d.key` (source key used for UI flags).
// Resolves to { todo, conflicts, conflictKeys, dupeCount }:
//   todo         – one entry per output name that is safe to copy
//   conflicts    – output names whose members differ (or could not be hashed)
//   conflictKeys – plain object, d.key -> true for every member of a conflict
//   dupeCount    – number of identical extras that were collapsed
async function resolvePlan(items) {
  // A Map keeps arbitrary names safe as keys (a plain object chokes on
  // '__proto__' and reorders integer-like keys) and preserves first-seen order.
  var groups = new Map();
  items.forEach(function (p) {
    var bucket = groups.get(p.outRel);
    if (bucket) bucket.push(p);
    else groups.set(p.outRel, [p]);
  });

  var todo = [], conflicts = [], conflictKeys = {}, dupeCount = 0;
  for (var entry of groups) {
    var outName = entry[0], group = entry[1];
    if (group.length === 1) { todo.push(group[0]); continue; } // no collision → nothing to hash

    var distinct = new Set(), unreadable = false;
    for (var j = 0; j < group.length; j++) {
      // Keep hashing after a failure so the remaining members still get cached.
      try { distinct.add(await sourceSha(group[j].file)); } catch (e) { unreadable = true; }
    }

    if (!unreadable && distinct.size === 1) {
      todo.push(group[0]);
      dupeCount += group.length - 1; // identical → one copy
    } else {
      // Differing bytes, or a member we could not read: hold the whole group back.
      conflicts.push(outName);
      group.forEach(function (g) { conflictKeys[g.d.key] = true; });
    }
  }
  return { todo: todo, conflicts: conflicts, conflictKeys: conflictKeys, dupeCount: dupeCount };
}
// Shared pre-flight for Copy and the standalone "Check" button.
// Builds the plan, makes sure the source is readable, hashes colliding names,
// pushes the resulting conflict flags to the classifier, and toasts a summary.
// `verb` is spliced into the "Nothing … yet" message.
// Resolves to the resolvePlan result, or null when there is nothing to do or
// the source cannot be read.
async function preflight(verb) {
  var candidates = plan();
  if (!candidates.length) {
    toast('Nothing ' + verb + ' yet — no files are fully classified (need a tracking leaf AND a transmittal).', 'warning');
    return null;
  }

  var readable = await ensureSourceReadable(candidates);
  if (!readable) {
    return null;
  }

  setStatus('Checking for same-name/different-content conflicts…');
  var outcome = await resolvePlan(candidates);
  setStatus('');

  // Publish the flags before any toast.
  C().setHashConflicts(outcome.conflictKeys);

  var conflictCount = outcome.conflicts.length;
  if (conflictCount) {
    toast(conflictCount + ' same-name/different-content conflict(s) flagged (≠ in red): same tracking+revision, different bytes. Fix these before copying.', 'error');
  }
  if (outcome.dupeCount) {
    toast(outcome.dupeCount + ' exact duplicate(s) collapse to one copy.', 'info');
  }
  return outcome;
}
// The "Check" button: run the pre-flight only (no copying) and, when nothing
// conflicts, confirm how many files are ready. Resolves to whatever preflight
// resolved to (possibly null).
async function audit() {
  var outcome = await preflight('to check');
  if (!outcome || outcome.conflicts.length) {
    return outcome; // aborted, or conflicts already reported by preflight
  }
  var ready = outcome.todo.length;
  var plural = ready === 1 ? '' : 's';
  toast('No conflicts — ' + ready + ' file' + plural + ' ready to copy.', 'success');
  return outcome;
}
async function run() { async function run() {
if (!C().isEnabled()) return; if (!C().isEnabled()) return;
var items = plan(); var r = await preflight('to copy');
if (!items.length) { if (!r) return;
toast('Nothing to copy yet — no files are fully classified (need both a tracking leaf and a transmittal).', 'warning'); var todo = r.todo;
return; if (!todo.length) { if (r.conflicts.length) toast('Resolve the flagged conflicts, then copy.', 'warning'); return; }
}
var cf = conflictsIn(items);
var blocked = {};
cf.conflicts.forEach(function (path) { blocked[path] = true; });
var todo = items.filter(function (p) { return !blocked[p.outRel]; });
if (cf.conflicts.length) { // Where to file the canonical copies: the server archive (HTTP) or a local
toast(cf.conflicts.length + ' output-name collision(s) — two source files map to the same name. Skipped:\n' // folder. Both read the source, never write it, both resumable + verified.
+ cf.conflicts.join('\n'), 'error');
}
if (!todo.length) return;
// Where to file the canonical copies: the server archive (HTTP) or a
// local folder (File System Access). Both read the source, never write it,
// and both are resumable — already-present targets are skipped.
var dest = await chooseDestination(todo.length); var dest = await chooseDestination(todo.length);
if (!dest) return; if (!dest) return;
// Snapshot-loaded files have no live handle — re-grant read on the
// workspace source directory (one click) before reading.
if (todo.some(function (p) { return !p.file.handle; })) {
if (!window.app.rootHandle) {
toast('The source directory isnt connected. Re-open the workspace to reconnect it.', 'error');
return;
}
var srcOk = await window.app.modules.persist.verifyPermission(window.app.rootHandle, false);
if (!srcOk) { toast('Permission to read the source directory was denied.', 'error'); return; }
}
return dest === 'server' ? copyToServer(todo) : copyToLocal(todo); return dest === 'server' ? copyToServer(todo) : copyToLocal(todo);
} }
function summary(s, where) { function summary(s, where) {
var msg = 'Copy to ' + where + ' — ' + s.copied + ' copied, ' + s.skipped + ' already there' var msg = 'Copy to ' + where + ' — ' + s.copied + ' copied & verified, ' + s.skipped + ' already there'
+ (s.errors ? (', ' + s.errors + ' failed (retry to resume)') : '') + '.'; + (s.verifyFailed ? (', ' + s.verifyFailed + ' FAILED verification (bad copy removed — re-run)') : '')
toast(msg, s.errors ? 'warning' : 'success'); + (s.errors ? (', ' + s.errors + ' errored (retry to resume)') : '') + '.';
toast(msg, (s.errors || s.verifyFailed) ? 'warning' : 'success');
} }
async function copyToLocal(todo) { async function copyToLocal(todo) {
@ -302,11 +370,14 @@
// Resumable: copyOne skips targets that already exist, so a re-run after an // Resumable: copyOne skips targets that already exist, so a re-run after an
// interruption only does the remaining work. // interruption only does the remaining work.
async function copyTo(out, todo) { async function copyTo(out, todo) {
var s = { copied: 0, skipped: 0, errors: 0 }; var s = { copied: 0, skipped: 0, errors: 0, verifyFailed: 0 };
var copied = [];
for (var i = 0; i < todo.length; i++) { for (var i = 0; i < todo.length; i++) {
setStatus('Copying… ' + (i + 1) + '/' + todo.length + ' — ' + todo[i].d.filename); setStatus('Copying… ' + (i + 1) + '/' + todo.length + ' — ' + todo[i].d.filename);
try { try {
s[await copyOne(out, todo[i])]++; var r = await copyOne(out, todo[i]);
s[r]++;
if (r === 'copied') copied.push(todo[i]);
} catch (e) { } catch (e) {
s.errors++; s.errors++;
if (window.zddc && window.zddc.toast) { if (window.zddc && window.zddc.toast) {
@ -314,6 +385,28 @@
} }
} }
} }
// Verification pass over JUST the files copied this run: read each target
// back, compare SHA-256 to the source. One re-copy attempt on mismatch;
// if it still fails, remove the bad target so a re-run re-copies it — so
// resume converges on a fully-correct archive.
for (var k = 0; k < copied.length; k++) {
setStatus('Verifying… ' + (k + 1) + '/' + copied.length + ' — ' + copied[k].d.filename);
try {
if (await verifyOne(out, copied[k])) continue;
await writeTarget(out, copied[k]);
if (await verifyOne(out, copied[k])) continue;
s.verifyFailed++;
await removeTarget(out, copied[k]);
if (window.zddc && window.zddc.toast) {
window.zddc.toast('Verification failed for ' + copied[k].outRel + ' — removed the bad copy; re-run to retry.', 'error');
}
} catch (e) {
s.verifyFailed++;
if (window.zddc && window.zddc.toast) {
window.zddc.toast('Verify error for ' + copied[k].outRel + ' — ' + (e.message || e), 'error');
}
}
}
setStatus(''); setStatus('');
return s; return s;
} }
@ -322,11 +415,13 @@
window.app.modules.copy = { window.app.modules.copy = {
run: run, run: run,
audit: audit,
readyCount: readyCount, readyCount: readyCount,
chooseOutput: chooseOutput, chooseOutput: chooseOutput,
// test/advanced seams // test/advanced seams
plan: plan, plan: plan,
conflictsIn: conflictsIn, conflictsIn: conflictsIn,
resolvePlan: resolvePlan,
copyTo: copyTo, copyTo: copyTo,
}; };
})(); })();

View file

@ -276,17 +276,20 @@
// delegated preview + name-edit handlers apply. // delegated preview + name-edit handlers apply.
function fileCellContent(f) { function fileCellContent(f) {
var d = C().deriveTarget(f); var d = C().deriveTarget(f);
var row = el('div', 'tfile' + (d.errors.length ? ' tfile--err' : '')); var conflict = C().hasHashConflict(d.key); // same name, different bytes
var bad = d.errors.length || conflict;
var row = el('div', 'tfile' + (bad ? ' tfile--err' : ''));
row.dataset.key = d.key; row.dataset.key = d.key;
var orig = f.originalFilename + (f.extension ? '.' + f.extension : ''); var orig = f.originalFilename + (f.extension ? '.' + f.extension : '');
var name = el('input', 'tfile__name' + (d.errors.length ? ' tfile__name--err' : '')); var name = el('input', 'tfile__name' + (bad ? ' tfile__name--err' : ''));
name.type = 'text'; name.type = 'text';
name.value = d.filename || ''; name.value = d.filename || '';
name.placeholder = '(incomplete)'; name.placeholder = '(incomplete)';
name.title = (d.errors.length ? d.errors.join('; ') + ' · ' : '') + 'original: ' + orig; name.title = (conflict ? 'Same tracking+revision as another file but DIFFERENT content — fix before copying · ' : '')
+ (d.errors.length ? d.errors.join('; ') + ' · ' : '') + 'original: ' + orig;
row.appendChild(name); row.appendChild(name);
row.appendChild(el('span', 'tfile__badge' + (d.errors.length ? ' tfile__badge--err' : ' tfile__badge--ok'), row.appendChild(el('span', 'tfile__badge' + (bad ? ' tfile__badge--err' : ' tfile__badge--ok'),
d.errors.length ? '⚠' : '✓')); conflict ? '≠' : (d.errors.length ? '⚠' : '✓')));
return row; return row;
} }

View file

@ -170,7 +170,8 @@
<button id="importDatasetBtn" class="btn btn-secondary btn-sm" title="Load an edited classification JSON back in — replaces the current classifications. (To move a whole scanned workspace between browsers, use “Import workspace” on the welcome screen.)">Import edits</button> <button id="importDatasetBtn" class="btn btn-secondary btn-sm" title="Load an edited classification JSON back in — replaces the current classifications. (To move a whole scanned workspace between browsers, use “Import workspace” on the welcome screen.)">Import edits</button>
<input type="file" id="importDatasetInput" accept="application/json,.json" hidden> <input type="file" id="importDatasetInput" accept="application/json,.json" hidden>
<button id="resetDatasetBtn" class="btn btn-sm btn-danger" title="Discard all classifications and start over from the raw scanned input (does not touch your files)">Reset</button> <button id="resetDatasetBtn" class="btn btn-sm btn-danger" title="Discard all classifications and start over from the raw scanned input (does not touch your files)">Reset</button>
<button id="copyOutputBtn" class="btn btn-primary btn-sm" disabled title="Copy mapped files to an output directory (source untouched)">Copy…</button> <button id="checkDuplicatesBtn" class="btn btn-secondary btn-sm" title="Check for files with the same tracking number + revision but different content (flagged ≠ in red)">Check</button>
<button id="copyOutputBtn" class="btn btn-primary btn-sm" disabled title="Copy mapped files to the server archive or a local folder (source untouched, resumable, verified)">Copy…</button>
</div> </div>
</div> </div>
<div class="target-body"> <div class="target-body">

View file

@ -1062,3 +1062,74 @@ test('copy: PUTs into a server-style handle, then resumes by skipping existing',
expect(r.paths[0].startsWith('ClientCorp/received/')).toBe(true); expect(r.paths[0].startsWith('ClientCorp/received/')).toBe(true);
expect(r.paths[0].endsWith('ACME-MECH-0001_A (IFR) - foundation.pdf')).toBe(true); expect(r.paths[0].endsWith('ACME-MECH-0001_A (IFR) - foundation.pdf')).toBe(true);
}); });
// Exercises copy.resolvePlan on two colliding pairs: one pair with identical
// bytes (expected to dedupe) and one pair with different bytes (expected to be
// reported as a conflict and flagged by source key).
test('copy audit: same name+rev — identical content dedups, different content conflicts', async ({ page }) => {
await page.click('#modeClassifyBtn');
const r = await page.evaluate(async () => {
const c = window.app.modules.classify, copy = window.app.modules.copy;
c.reset();
// In-memory source file: every file is named doc.pdf and differs only by
// folder and content; the fake handle serves the content via getFile().
const mk = (folder, content) => ({
originalFilename: 'doc', extension: 'pdf', folderPath: 'R/' + folder,
handle: { getFile: async () => new File([content], 'doc.pdf') },
});
// s1/s2 share the same bytes; d1/d2 do not.
const s1 = mk('S1', 'SAME'), s2 = mk('S2', 'SAME'), d1 = mk('D1', 'AAA'), d2 = mk('D2', 'BBB');
window.app.folderTree = [{ name: 'R', path: 'R', files: [], children: [
{ name: 'S1', path: 'R/S1', files: [s1], children: [] },
{ name: 'S2', path: 'R/S2', files: [s2], children: [] },
{ name: 'D1', path: 'R/D1', files: [d1], children: [] },
{ name: 'D2', path: 'R/D2', files: [d2], children: [] },
] }];
// Two tracking leaves (ACME-0001 / ACME-0002, both revision "A (IFR)") and a
// single transmittal bin shared by all four files.
const L1 = c.addTrackingNode(c.addTrackingNode(null, 'ACME-0001'), 'A (IFR)');
const L2 = c.addTrackingNode(c.addTrackingNode(null, 'ACME-0002'), 'A (IFR)');
const T = c.addTransmittalBin(c.addParty('CC'), 'received', { date: '2026-03-14', type: 'TRN', seq: '0007' });
// Each pair lands on the same leaf + bin, so the two members of a pair are
// expected to derive the same output name (that is what the assertions rely on).
[[s1, L1], [s2, L1], [d1, L2], [d2, L2]].forEach(([f, leaf]) => {
c.place([c.srcKeyForFile(f)], leaf, 'tracking');
c.place([c.srcKeyForFile(f)], T, 'transmittal');
});
const res = await copy.resolvePlan(copy.plan());
// Reduce the result to plain values so it can cross the page.evaluate boundary.
return {
todo: res.todo.length, dupes: res.dupeCount, conflicts: res.conflicts.length,
s1Flagged: !!res.conflictKeys[c.srcKeyForFile(s1)],
d1Flagged: !!res.conflictKeys[c.srcKeyForFile(d1)],
};
});
expect(r.todo).toBe(1); // the identical pair collapses to one; the conflicting pair is excluded
expect(r.dupes).toBe(1); // one duplicate collapsed
expect(r.conflicts).toBe(1); // one same-name/different-content group
expect(r.s1Flagged).toBe(false);
expect(r.d1Flagged).toBe(true);
});
// Exercises copy.copyTo against a fake directory whose writes always store
// the wrong bytes: the copy itself succeeds, verification must fail, and the
// bad target must be removed so nothing is left behind.
test('copy: verifies copied bytes; a bad write fails verification and is removed', async ({ page }) => {
await page.click('#modeClassifyBtn');
const r = await page.evaluate(async () => {
const c = window.app.modules.classify, copy = window.app.modules.copy;
c.reset();
// One fully-classified source file whose real content is 'GOOD'.
const f = { originalFilename: 'doc', extension: 'pdf', folderPath: 'R',
handle: { getFile: async () => new File(['GOOD'], 'doc.pdf') } };
window.app.folderTree = [{ name: 'R', path: 'R', files: [f], children: [] }];
const leaf = c.addTrackingNode(c.addTrackingNode(null, 'ACME-0001'), 'A (IFR)');
const bin = c.addTransmittalBin(c.addParty('CC'), 'received', { date: '2026-03-14', type: 'TRN', seq: '0007' });
c.place([c.srcKeyForFile(f)], leaf, 'tracking'); c.place([c.srcKeyForFile(f)], bin, 'transmittal');
// A dir whose writes CORRUPT the content → verification must catch it.
// `store` maps full path -> stored content; `removed` records removeEntry calls.
const store = {}, removed = [];
const mkdir = (base) => ({
// Sub-directories always resolve; the path prefix is carried in `base`.
getDirectoryHandle: async (n) => mkdir(base + n + '/'),
getFileHandle: async (n, opts) => {
const full = base + n;
// Without { create: true } a missing file raises NotFoundError.
if ((!opts || !opts.create) && !(full in store)) { const e = new Error('NF'); e.name = 'NotFoundError'; throw e; }
return {
getFile: async () => new File([store[full]], n),
// write() ignores its input and stores 'CORRUPT' instead.
createWritable: async () => ({ write: async () => { store[full] = 'CORRUPT'; }, close: async () => {} }),
};
},
removeEntry: async (n) => { delete store[base + n]; removed.push(base + n); },
});
const s = await copy.copyTo(mkdir(''), copy.plan());
return { copied: s.copied, verifyFailed: s.verifyFailed, removed: removed.length, left: Object.keys(store).length };
});
expect(r.copied).toBe(1);
expect(r.verifyFailed).toBe(1); // SHA mismatch caught
expect(r.removed).toBe(1); // bad copy removed…
expect(r.left).toBe(0); // …so a re-run re-copies it
});