feat(classifier): persist & copy files inside .zip archives

Zip members were live-only: expandable while the source was connected, but the
workspace snapshot dropped the archive (.zip became a plain file), so a
classification made inside one vanished on reopen — and copy couldn't extract it
anyway (it tried to walk the archive path as a real directory).

Now zips are first-class:
- snapshotTree/loadSnapshot persist the scanned archive subtree — zip-root +
  virtual folders + members carry isVirtual/zipPath/zipEntryPath, so the tree
  rebuilds on reopen and assignments inside an archive survive. An archive that
  was never opened persists as a lazy 'zip' node that reopens on demand.
- scanner.ensureZipLoaded(rootHandle, zipPath) reloads an archive from the
  workspace root when the in-memory cache is cold (post-restore); scanZipNode
  falls back to it when a restored zip node has no live file object.
- copy.js reads a member via scanner.extractZipMember (Blob from the archive)
  instead of a non-existent file handle; preview.js reloads the archive for a
  restored member before opening it.

This also reconciles export/import with the snapshot: both now keep zip members,
so a round-trip no longer leaves dangling in-archive assignments.

Tests: zip subtree snapshot round-trip; copy extracts a member to the output (45).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
ZDDC 2026-06-10 13:27:00 -05:00
parent e1c479dba5
commit 203674ee4c
4 changed files with 161 additions and 18 deletions

View file

@ -90,7 +90,7 @@
async function sameContent(existingHandle, srcFileObj) {
var ef = await existingHandle.getFile();
var sf = await (await srcHandle(srcFileObj)).getFile();
var sf = await readSource(srcFileObj);
if (ef.size !== sf.size) return false;
var a = await window.zddc.crypto.sha256File(ef);
var b = await window.zddc.crypto.sha256File(sf);
@ -105,6 +105,16 @@
return window.app.modules.scanner.resolveFileHandle(window.app.rootHandle, fileObj);
}
// Obtain the bytes of a source file as a File or Blob, without ever writing
// to the source.
//   - zip member (fileObj.isVirtual): delegated to the scanner module's
//     extractZipMember, called with the workspace root handle and the file.
//   - anything else: the handle is resolved via srcHandle() and its File read.
// Returns a promise for the File/Blob; rejections from either path propagate.
async function readSource(fileObj) {
    if (!fileObj.isVirtual) {
        var handle = await srcHandle(fileObj);
        return handle.getFile();
    }
    var scanner = window.app.modules.scanner;
    return scanner.extractZipMember(window.app.rootHandle, fileObj);
}
// Copy one file. Returns 'copied' | 'skipped' (identical) | 'differ' (left alone).
async function copyOne(out, p) {
var dir = await ensureDir(out, p.d.outPath);
@ -113,7 +123,7 @@
if (existing) {
return (await sameContent(existing, p.file)) ? 'skipped' : 'differ';
}
var srcFile = await (await srcHandle(p.file)).getFile(); // READ source (never write it)
var srcFile = await readSource(p.file); // READ source (never write it)
var fh = await dir.getFileHandle(p.d.filename, { create: true });
var w = await fh.createWritable();
await w.write(srcFile);

View file

@ -526,12 +526,22 @@
// permission re-grant) before opening the preview window.
async function previewFile(file) {
try {
if (!file.handle && !file.isVirtual && window.app.rootHandle) {
const sc = window.app.modules.scanner;
if (file.isVirtual) {
// Snapshot-restored zip member — reload its archive from the root.
if (window.app.rootHandle && !sc.getZipCache(file.zipPath)) {
if (window.app.modules.persist && window.app.modules.persist.verifyPermission) {
const ok = await window.app.modules.persist.verifyPermission(window.app.rootHandle, false);
if (!ok) { if (window.zddc) window.zddc.toast('Permission to read the source directory was denied.', 'error'); return; }
}
await sc.ensureZipLoaded(window.app.rootHandle, file.zipPath);
}
} else if (!file.handle && window.app.rootHandle) {
if (window.app.modules.persist && window.app.modules.persist.verifyPermission) {
const ok = await window.app.modules.persist.verifyPermission(window.app.rootHandle, false);
if (!ok) { if (window.zddc) window.zddc.toast('Permission to read the source directory was denied.', 'error'); return; }
}
await window.app.modules.scanner.resolveFileHandle(window.app.rootHandle, file);
await sc.resolveFileHandle(window.app.rootHandle, file);
}
await openPreviewWindow(file);
} catch (e) {

View file

@ -369,11 +369,23 @@
// Read a lazy zip node's contents on demand (when opened), building its
// child nodes and folding its internal totals into ancestors.
async function scanZipNode(node) {
if (node.scanState !== 'zip-pending' || !node._zipFileObj) return;
if (node.scanState !== 'zip-pending') return;
var fileObj = node._zipFileObj;
if (!fileObj) {
// Restored from a snapshot — no live file object. Resolve the .zip
// from the workspace root by its path so it can be opened on demand.
if (!window.app.rootHandle || !node.zipPath) return;
try {
var dir = await resolveDirHandle(window.app.rootHandle, relFromRoot(parentPath(node.zipPath)));
fileObj = { handle: await dir.getFileHandle(baseName(node.zipPath)), folderHandle: dir };
} catch (e) {
reportScanError(node.path, e); node.scanState = 'done'; node.runFiles = 0; node.runDirs = 0; return;
}
}
node.scanState = 'scanning';
scheduleRender();
try {
await scanZipIntoNode(node, node._zipFileObj); // builds children, runFiles/runDirs, sets 'done'
await scanZipIntoNode(node, fileObj); // builds children, runFiles/runDirs, sets 'done'
} catch (e) {
reportScanError(node.path, e);
node.scanState = 'done';
@ -754,22 +766,30 @@
// ── Workspace snapshot (scan once, resume without re-walking the FS) ────
// Serialize the completed scan to compact JSON (short keys: large trees).
// Zip-root nodes are NOT preserved as expandable folders — the .zip stays a
// plain file in its parent (classifying inside archives is out of scope for
// a persisted workspace).
// Zip subtrees ARE preserved: a scanned archive keeps its virtual folders +
// members so classifications inside it survive reopen; copy/preview re-load
// the archive lazily from the root (ensureZipLoaded). An archive that was
// never opened persists as a lazy 'zip' node that reopens on demand.
function snapshotTree() {
function serFile(f) { return { o: f.originalFilename, e: f.extension, p: f.folderPath }; }
function serFile(f) {
var o = { o: f.originalFilename, e: f.extension, p: f.folderPath };
if (f.isVirtual) { o.z = f.zipPath; o.ze = f.zipEntryPath; } // zip member
return o;
}
function serNode(n) {
var o = { n: n.name, p: n.path };
if (n.isZipRoot) o.zr = 1; // archive root (zipPath === n.path)
else if (n.isVirtualDir) o.vd = n.zipPath; // folder inside an archive
if (n.files && n.files.length) o.f = n.files.map(serFile);
var realKids = (n.children || []).filter(function (c) { return !c.isZipRoot; });
if (realKids.length) o.c = realKids.map(serNode);
if (n.children && n.children.length) o.c = n.children.map(serNode);
// Record scan progress so an interrupted scan can resume: 'children'
// = direct entries fully read (kids may still be pending); anything
// unfinished (pending/scanning/zip) → 'pending' to re-read. 'done'
// is the default and omitted.
// unfinished → 'pending' to re-read. An unopened archive persists as
// 'zip' (reopen lazily, never a real dir re-walk). 'done' is the
// default and omitted.
var st = n.scanState;
if (st && st !== 'done') o.s = (st === 'children') ? 'children' : 'pending';
if (n.isZipRoot && st !== 'done') o.s = 'zip';
else if (st && st !== 'done') o.s = (st === 'children') ? 'children' : 'pending';
return o;
}
return (window.app.folderTree || []).map(serNode);
@ -780,7 +800,7 @@
// workspace root handle at copy/preview time.
function loadSnapshot(snap) {
function deFile(sf) {
return {
var fo = {
handle: null, folderHandle: null,
originalFilename: sf.o, extension: sf.e,
size: null, lastModified: null,
@ -788,11 +808,18 @@
isDirty: false, error: false, errorMessage: '', validation: null, sha256: null,
folderPath: sf.p,
};
if (sf.z) { fo.isVirtual = true; fo.zipPath = sf.z; fo.zipEntryPath = sf.ze; }
return fo;
}
function deNode(sn, parent) {
var node = makeNode({ name: sn.n, kind: 'directory' }, sn.p, parent);
var desc = { name: sn.n, kind: 'directory' };
if (sn.zr) { desc.isZipRoot = true; desc.zipPath = sn.p; }
else if (sn.vd) { desc.isVirtualDir = true; desc.zipPath = sn.vd; }
var node = makeNode(desc, sn.p, parent);
node.handle = null;
node.scanState = sn.s || 'done'; // 'pending'/'children' resume on reconnect
if (sn.zr || sn.vd) node.virtualPath = sn.p;
// 'zip' restores an unopened archive (reopen lazily); else resume marker.
node.scanState = sn.s === 'zip' ? 'zip-pending' : (sn.s || 'done');
node.expanded = false;
node.files = (sn.f || []).map(deFile);
node.children = (sn.c || []).map(function (c) { return deNode(c, node); });
@ -819,6 +846,29 @@
// ── Lazy handle resolution (snapshot files carry paths, not handles) ────
// Strip the leading root segment from a tree path: everything after the first
// '/' is returned; a path with no '/' (or a nullish/empty one) yields ''.
function relFromRoot(p) {
    var path = p || '';
    var cut = path.indexOf('/');
    if (cut < 0) {
        return '';
    }
    return path.slice(cut + 1);
}
// Containing-folder part of a tree path: everything before the last '/'.
// A path with no '/' (or a nullish/empty one) has no parent and yields ''.
function parentPath(p) {
    var path = p || '';
    var cut = path.lastIndexOf('/');
    if (cut < 0) {
        return '';
    }
    return path.slice(0, cut);
}
// Last segment of a tree path: everything after the final '/'. A path with no
// '/' is returned whole. A nullish/empty path yields '' — never the raw
// null/undefined — so the result is always a string, matching relFromRoot and
// parentPath, and a missing path cannot reach getFileHandle() as "null" or
// "undefined".
function baseName(p) {
    var path = p || '';
    var cut = path.lastIndexOf('/');
    return cut < 0 ? path : path.slice(cut + 1);
}
// Return the cache record for the archive at tree path `zipPath`, parsing the
// archive first if the cache has no parsed copy (e.g. after a snapshot
// restore, when the in-memory cache starts empty).
//   rootHandle - workspace root directory handle; only needed on a cache miss.
//   zipPath    - tree path of the .zip (first segment is the root's name).
// Resolves to { zip, fileHandle, folderHandle }. Throws
// 'source directory not connected' when a load is needed but there is no root;
// handle-resolution and parse failures propagate unchanged.
async function ensureZipLoaded(rootHandle, zipPath) {
    var hit = zipCache.get(zipPath);
    if (hit && hit.zip) {
        return hit;
    }
    if (!rootHandle) {
        throw new Error('source directory not connected');
    }
    // Walk from the root to the folder holding the archive, then open the file.
    var folderRel = relFromRoot(parentPath(zipPath));
    var folderHandle = await resolveDirHandle(rootHandle, folderRel);
    var fileHandle = await folderHandle.getFileHandle(baseName(zipPath));
    var archiveFile = await fileHandle.getFile();
    var bytes = await archiveFile.arrayBuffer();
    var parsed = await JSZip.loadAsync(bytes);
    var record = { zip: parsed, fileHandle: fileHandle, folderHandle: folderHandle };
    zipCache.set(zipPath, record);
    return record;
}
// Extract one archive member as a Blob. The archive named by fileObj.zipPath
// is obtained through ensureZipLoaded (which loads it if it is not cached),
// then the entry at fileObj.zipEntryPath is read out.
// Throws 'zip member not found: <entry path>' when the archive has no such
// entry; errors from loading the archive propagate unchanged.
async function extractZipMember(rootHandle, fileObj) {
    var archive = (await ensureZipLoaded(rootHandle, fileObj.zipPath)).zip;
    var member = archive.file(fileObj.zipEntryPath);
    if (member) {
        var blob = await member.async('blob');
        return blob;
    }
    throw new Error('zip member not found: ' + fileObj.zipEntryPath);
}
async function resolveDirHandle(rootHandle, relPath) {
var cur = rootHandle;
var parts = (relPath || '').split('/').filter(Boolean);
@ -886,6 +936,8 @@
loadSnapshot,
resolveFileHandle,
resolveDirHandle,
ensureZipLoaded,
extractZipMember,
resumeScan
};
})();

View file

@ -812,3 +812,74 @@ test('search opens only the branch with a hit, leaving siblings collapsed', asyn
expect(r.folders).toEqual(['Project', 'Project/Electrical']);
expect(r.files).toEqual(['Switchgear Spec.pdf']);
});
// A scanned archive (zip root + one virtual member) must survive
// snapshotTree() -> JSON -> loadSnapshot() with everything needed to
// re-extract the member later.
test('snapshot: a scanned zip subtree round-trips with its virtual members', async ({ page }) => {
    const restored = await page.evaluate(() => {
        const scanner = window.app.modules.scanner;

        // Fixture: Root/ containing one fully scanned archive with one member.
        const member = {
            originalFilename: 'spec', extension: 'pdf', folderPath: 'Root/docs.zip',
            isVirtual: true, zipPath: 'Root/docs.zip', zipEntryPath: 'spec.pdf',
        };
        const archive = {
            name: 'docs.zip', path: 'Root/docs.zip', isZipRoot: true, zipPath: 'Root/docs.zip',
            scanState: 'done', children: [], files: [member],
        };
        window.app.folderTree = [
            { name: 'Root', path: 'Root', scanState: 'done', files: [], children: [archive] },
        ];

        // Round-trip through a JSON string, clearing the live tree in between.
        const serialized = JSON.stringify(scanner.snapshotTree());
        window.app.folderTree = [];
        scanner.loadSnapshot(JSON.parse(serialized));

        const zipNode = window.app.folderTree[0].children[0];
        const file = zipNode.files[0];
        return {
            isZipRoot: zipNode.isZipRoot,
            zipPath: zipNode.zipPath,
            done: zipNode.scanState === 'done',
            virtual: file.isVirtual,
            mZip: file.zipPath,
            entry: file.zipEntryPath,
            handleNull: file.handle === null,
        };
    });

    // The archive node is still a zip root pointing at itself, already scanned.
    expect(restored.isZipRoot).toBe(true);
    expect(restored.zipPath).toBe('Root/docs.zip');
    expect(restored.done).toBe(true);
    // The member is still virtual and carries archive path + entry path, no handle.
    expect(restored.virtual).toBe(true);
    expect(restored.mZip).toBe('Root/docs.zip');
    expect(restored.entry).toBe('spec.pdf');
    expect(restored.handleNull).toBe(true);
});
// End-to-end check of the copy path for a zip member: the member has no file
// handle (handle: null, isVirtual: true), so its bytes must come from
// scanner.extractZipMember and end up in the output directory.
test('copy: a zip member is extracted from its archive and written out', async ({ page }) => {
// Click the classify-mode button before driving the classify module
// (presumably required for the classify/copy modules to be usable — confirm).
await page.click('#modeClassifyBtn');
const res = await page.evaluate(async () => {
const c = window.app.modules.classify, copy = window.app.modules.copy;
// The virtual file under test: lives at spec.pdf inside Root/docs.zip.
const f = {
originalFilename: 'spec', extension: 'pdf', folderPath: 'Root/docs.zip',
isVirtual: true, zipPath: 'Root/docs.zip', zipEntryPath: 'spec.pdf', handle: null,
};
// Minimal tree: Root/ holding one zip-root node whose only file is `f`.
window.app.folderTree = [{ name: 'Root', path: 'Root', files: [], children: [
{ name: 'docs.zip', path: 'Root/docs.zip', isZipRoot: true, files: [f], children: [] },
] }];
// Stub archive extraction: no real archive exists here, so replace the
// scanner's extractZipMember with one that returns fixed bytes as a File
// (a File is a Blob). rootHandle is an empty placeholder object.
window.app.rootHandle = {};
window.app.modules.scanner.extractZipMember = async () => new File(['ZIPBYTES'], 'spec.pdf');
// Build a tracking leaf and a transmittal bin, then place the file in both.
const leaf = c.addTrackingNode(c.addTrackingNode(null, 'ACME-MECH-0001'), 'A (IFR)');
const bin = c.addTransmittalBin(c.addParty('ClientCorp'), 'received', { date: '2026-03-14', type: 'TRN', seq: '0007' });
c.place([c.srcKeyForFile(f)], leaf, 'tracking'); c.place([c.srcKeyForFile(f)], bin, 'transmittal');
// In-memory stand-in for the output directory handle. outStore maps a
// slash-joined output path to the content written there.
const outStore = {};
const mkOut = (prefix) => ({
name: prefix || 'out',
// Subdirectories always "exist": return a handle rooted at the longer prefix.
getDirectoryHandle: async (n) => mkOut((prefix ? prefix + '/' : '') + n),
getFileHandle: async (n, opts) => {
const full = (prefix ? prefix + '/' : '') + n;
// Without { create: true }, a path that was never written throws an error
// named 'NotFoundError'.
if (!opts || !opts.create) { if (!(full in outStore)) { const e = new Error('NF'); e.name = 'NotFoundError'; throw e; } }
return {
getFile: async () => new File([outStore[full] != null ? outStore[full] : ''], n),
// write() stores the text of anything exposing .text() (Blob/File), otherwise
// the raw value, so the test can compare content as a string.
createWritable: async () => ({ write: async (d) => { outStore[full] = (d && d.text) ? await d.text() : d; }, close: async () => { } }),
};
},
});
// Run the copy against the fake output root using the module's own plan.
const s = await copy.copyTo(mkOut(''), copy.plan());
return { copied: s.copied, content: Object.values(outStore)[0], wrote: Object.keys(outStore).some((k) => k.endsWith('spec.pdf')) };
});
// One copy is reported, an output path ending in spec.pdf was written, and the
// first stored content equals the stubbed archive bytes.
expect(res.copied).toBe(1);
expect(res.wrote).toBe(true);
expect(res.content).toBe('ZIPBYTES');
});