feat(server): full md/docx/html conversion matrix + base64 image inlining
Generalize the conversion engine from markdown-source-only to a (from→to)
dispatcher, convert.Convert, supporting:
md → docx | html | pdf
docx → md | html
html → md | docx
- convertToMarkdown (docx→md, html→md): pandoc -t gfm --wrap=none with an
embedded inline-media.lua filter that base64-inlines mediabag images as data:
URIs, so the output .md is self-contained (markdown has no --embed-resources).
- convertToHTML now takes a source format: docx→html reuses the doctype template
and --embed-resources base64-inlines the docx's images automatically.
- convertToDocx takes a source format: html→docx embeds images natively.
- ToDocx/ToHTML/ToPDF are kept as the md-source entry points, delegating to the
shared internals. writeScratchFiles generalizes the old template-set writer.
Routing (converthandler.go):
- RecognizeVirtualConvert maps any target ext {md,docx,html,pdf} to the first
existing real sibling source by precedence (md←docx,html; docx←md,html;
html←md,docx; pdf←md). Real files still win (dispatcher stats first).
- ServeConverted accepts md; buildAndStore dispatches on (ext(src), format) via
convert.Convert; purgeConverted clears all derived siblings on any write.
Tests: per-direction command-shape assertions (convert) + recognizer matrix and
precedence (handler). Verified end-to-end with real pandoc (docx→md/html,
html→md/docx, base64 images). Full ./... suite green.
PDF stays markdown-only for now (docx/html→pdf would need a two-stage hop).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
894610d59e
commit
16d88010a6
7 changed files with 317 additions and 52 deletions
|
|
@ -154,12 +154,20 @@ func currentChromiumBinary() string {
|
||||||
// full file content (envelope + body); pandoc handles
|
// full file content (envelope + body); pandoc handles
|
||||||
// `markdown+yaml_metadata_block` natively.
|
// `markdown+yaml_metadata_block` natively.
|
||||||
func ToDocx(ctx context.Context, source []byte, m Metadata) ([]byte, error) {
|
func ToDocx(ctx context.Context, source []byte, m Metadata) ([]byte, error) {
|
||||||
|
return convertToDocx(ctx, "markdown+yaml_metadata_block", source, m)
|
||||||
|
}
|
||||||
|
|
||||||
|
// convertToDocx renders source (in pandoc input format fromFmt) to DOCX bytes
|
||||||
|
// via a single pandoc exec (stdin → stdout; no scratch dir). Images in the
|
||||||
|
// source's mediabag — present when fromFmt is "html" — are embedded into the
|
||||||
|
// .docx natively by pandoc's docx writer.
|
||||||
|
func convertToDocx(ctx context.Context, fromFmt string, source []byte, m Metadata) ([]byte, error) {
|
||||||
r := currentRunner()
|
r := currentRunner()
|
||||||
if r == nil {
|
if r == nil {
|
||||||
return nil, ErrUnavailable
|
return nil, ErrUnavailable
|
||||||
}
|
}
|
||||||
cmd := []string{
|
cmd := []string{
|
||||||
"--from=markdown+yaml_metadata_block",
|
"--from=" + fromFmt,
|
||||||
"--to=docx",
|
"--to=docx",
|
||||||
"--output=-",
|
"--output=-",
|
||||||
}
|
}
|
||||||
|
|
@ -168,6 +176,68 @@ func ToDocx(ctx context.Context, source []byte, m Metadata) ([]byte, error) {
|
||||||
return r.Run(ctx, currentPandocBinary(), source, "", cmd)
|
return r.Run(ctx, currentPandocBinary(), source, "", cmd)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// convertToMarkdown renders source (DOCX or HTML, per fromFmt) to GitHub-
|
||||||
|
// flavored markdown. Embedded images are inlined as base64 data: URIs via the
|
||||||
|
// inline-media.lua filter so the output .md is self-contained; --wrap=none keeps
|
||||||
|
// paragraphs on one line (no hard line breaks).
|
||||||
|
func convertToMarkdown(ctx context.Context, fromFmt string, source []byte) ([]byte, error) {
|
||||||
|
r := currentRunner()
|
||||||
|
if r == nil {
|
||||||
|
return nil, ErrUnavailable
|
||||||
|
}
|
||||||
|
scratch, err := writeScratchFiles(currentScratchDir(), map[string][]byte{"inline-media.lua": inlineMediaLua})
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("scratch: %w", err)
|
||||||
|
}
|
||||||
|
defer os.RemoveAll(scratch)
|
||||||
|
cmd := []string{
|
||||||
|
"--from=" + fromFmt,
|
||||||
|
"--to=gfm",
|
||||||
|
"--wrap=none",
|
||||||
|
"--lua-filter=" + filepath.Join(scratch, "inline-media.lua"),
|
||||||
|
"--output=-",
|
||||||
|
"-",
|
||||||
|
}
|
||||||
|
return r.Run(ctx, currentPandocBinary(), source, scratch, cmd)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert renders source from one document format to another. Supported pairs:
|
||||||
|
//
|
||||||
|
// md → docx | html | pdf
|
||||||
|
// docx → md | html
|
||||||
|
// html → md | docx
|
||||||
|
//
|
||||||
|
// ts is the resolved HTML template set, used only for the *→html and md→pdf
|
||||||
|
// directions and ignored otherwise. Unsupported pairs return an error.
|
||||||
|
func Convert(ctx context.Context, from, to string, source []byte, m Metadata, ts TemplateSet) ([]byte, error) {
|
||||||
|
switch from {
|
||||||
|
case "md", "markdown":
|
||||||
|
switch to {
|
||||||
|
case "docx":
|
||||||
|
return ToDocx(ctx, source, m)
|
||||||
|
case "html":
|
||||||
|
return ToHTML(ctx, source, m, ts)
|
||||||
|
case "pdf":
|
||||||
|
return ToPDF(ctx, source, m, ts)
|
||||||
|
}
|
||||||
|
case "docx":
|
||||||
|
switch to {
|
||||||
|
case "md":
|
||||||
|
return convertToMarkdown(ctx, "docx", source)
|
||||||
|
case "html":
|
||||||
|
return convertToHTML(ctx, "docx", source, m, ts)
|
||||||
|
}
|
||||||
|
case "html", "htm":
|
||||||
|
switch to {
|
||||||
|
case "md":
|
||||||
|
return convertToMarkdown(ctx, "html", source)
|
||||||
|
case "docx":
|
||||||
|
return convertToDocx(ctx, "html", source, m)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("unsupported conversion %s→%s", from, to)
|
||||||
|
}
|
||||||
|
|
||||||
// ToHTML renders source markdown to standalone HTML using the doctype
|
// ToHTML renders source markdown to standalone HTML using the doctype
|
||||||
// template in ts. Embeds CSS + images via --embed-resources. The
|
// template in ts. Embeds CSS + images via --embed-resources. The
|
||||||
// template + its partials live in a per-call scratch dir; the host path
|
// template + its partials live in a per-call scratch dir; the host path
|
||||||
|
|
@ -175,6 +245,14 @@ func ToDocx(ctx context.Context, source []byte, m Metadata) ([]byte, error) {
|
||||||
// sandbox at the same path. A zero-value ts falls back to the embedded
|
// sandbox at the same path. A zero-value ts falls back to the embedded
|
||||||
// default template.
|
// default template.
|
||||||
func ToHTML(ctx context.Context, source []byte, m Metadata, ts TemplateSet) ([]byte, error) {
|
func ToHTML(ctx context.Context, source []byte, m Metadata, ts TemplateSet) ([]byte, error) {
|
||||||
|
return convertToHTML(ctx, "markdown+yaml_metadata_block", source, m, ts)
|
||||||
|
}
|
||||||
|
|
||||||
|
// convertToHTML renders source (in pandoc input format fromFmt) to standalone
|
||||||
|
// HTML through the doctype template in ts. --embed-resources base64-inlines CSS
|
||||||
|
// and any mediabag images (so DOCX images survive docx→html with no extra
|
||||||
|
// filter). The template + partials are written to a per-call scratch dir.
|
||||||
|
func convertToHTML(ctx context.Context, fromFmt string, source []byte, m Metadata, ts TemplateSet) ([]byte, error) {
|
||||||
r := currentRunner()
|
r := currentRunner()
|
||||||
if r == nil {
|
if r == nil {
|
||||||
return nil, ErrUnavailable
|
return nil, ErrUnavailable
|
||||||
|
|
@ -182,7 +260,7 @@ func ToHTML(ctx context.Context, source []byte, m Metadata, ts TemplateSet) ([]b
|
||||||
if ts.Name == "" || len(ts.Files) == 0 {
|
if ts.Name == "" || len(ts.Files) == 0 {
|
||||||
ts = DefaultTemplateSet(DefaultTemplateName)
|
ts = DefaultTemplateSet(DefaultTemplateName)
|
||||||
}
|
}
|
||||||
scratch, err := writeTemplateSetToScratch(currentScratchDir(), ts)
|
scratch, err := writeScratchFiles(currentScratchDir(), ts.Files)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("scratch: %w", err)
|
return nil, fmt.Errorf("scratch: %w", err)
|
||||||
}
|
}
|
||||||
|
|
@ -190,7 +268,7 @@ func ToHTML(ctx context.Context, source []byte, m Metadata, ts TemplateSet) ([]b
|
||||||
|
|
||||||
tplPath := filepath.Join(scratch, ts.Name)
|
tplPath := filepath.Join(scratch, ts.Name)
|
||||||
cmd := []string{
|
cmd := []string{
|
||||||
"--from=markdown+yaml_metadata_block",
|
"--from=" + fromFmt,
|
||||||
"--to=html5",
|
"--to=html5",
|
||||||
"--standalone",
|
"--standalone",
|
||||||
"--embed-resources",
|
"--embed-resources",
|
||||||
|
|
|
||||||
|
|
@ -41,6 +41,77 @@ func (f *fakeRunner) lastCall() (string, []string) {
|
||||||
return f.binaries[len(f.binaries)-1], f.calls[len(f.calls)-1]
|
return f.binaries[len(f.binaries)-1], f.calls[len(f.calls)-1]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestConvert_Directions(t *testing.T) {
|
||||||
|
cases := []struct {
|
||||||
|
from, to string
|
||||||
|
wantArgs []string // substrings that must appear in the pandoc command
|
||||||
|
wantErr bool
|
||||||
|
}{
|
||||||
|
{"docx", "md", []string{"--from=docx", "--to=gfm", "--wrap=none"}, false},
|
||||||
|
{"html", "md", []string{"--from=html", "--to=gfm", "--wrap=none"}, false},
|
||||||
|
{"docx", "html", []string{"--from=docx", "--to=html5", "--embed-resources"}, false},
|
||||||
|
{"html", "docx", []string{"--from=html", "--to=docx"}, false},
|
||||||
|
{"md", "docx", []string{"--from=markdown+yaml_metadata_block", "--to=docx"}, false},
|
||||||
|
{"md", "html", []string{"--from=markdown+yaml_metadata_block", "--to=html5"}, false},
|
||||||
|
{"docx", "pdf", nil, true}, // pdf is markdown-only
|
||||||
|
{"docx", "docx", nil, true}, // same-format is unsupported
|
||||||
|
{"html", "html", nil, true},
|
||||||
|
}
|
||||||
|
for _, c := range cases {
|
||||||
|
t.Run(c.from+"_to_"+c.to, func(t *testing.T) {
|
||||||
|
f := &fakeRunner{resp: []byte("OUT")}
|
||||||
|
InstallRunner(f)
|
||||||
|
t.Cleanup(func() { InstallRunner(nil) })
|
||||||
|
SetBinaries("pandoc", "chromium-browser")
|
||||||
|
|
||||||
|
_, err := Convert(context.Background(), c.from, c.to, []byte("x"), Metadata{}, TemplateSet{})
|
||||||
|
if c.wantErr {
|
||||||
|
if err == nil {
|
||||||
|
t.Fatalf("Convert(%s→%s): expected error, got nil", c.from, c.to)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Convert(%s→%s): %v", c.from, c.to, err)
|
||||||
|
}
|
||||||
|
binary, call := f.lastCall()
|
||||||
|
if binary != "pandoc" {
|
||||||
|
t.Errorf("expected pandoc, got %q", binary)
|
||||||
|
}
|
||||||
|
for _, want := range c.wantArgs {
|
||||||
|
if !contains(call, want) {
|
||||||
|
t.Errorf("Convert(%s→%s) missing %q in %v", c.from, c.to, want, call)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// To-markdown directions inline images via the lua filter.
|
||||||
|
if c.to == "md" {
|
||||||
|
if !hasPrefArg(call, "--lua-filter=") || !hasSuffArg(call, "inline-media.lua") {
|
||||||
|
t.Errorf("Convert(%s→md) missing inline-media.lua filter: %v", c.from, call)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// hasPrefArg reports whether at least one element of args starts with prefix.
// An empty or nil args never matches.
func hasPrefArg(args []string, prefix string) bool {
	found := false
	for i := 0; i < len(args) && !found; i++ {
		found = strings.HasPrefix(args[i], prefix)
	}
	return found
}
|
||||||
|
|
||||||
|
// hasSuffArg reports whether at least one element of args ends with suffix.
// An empty or nil args never matches.
func hasSuffArg(args []string, suffix string) bool {
	for idx := range args {
		if arg := args[idx]; strings.HasSuffix(arg, suffix) {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
func TestToDocx_UsesPandocBinary(t *testing.T) {
|
func TestToDocx_UsesPandocBinary(t *testing.T) {
|
||||||
f := &fakeRunner{resp: []byte("FAKE-DOCX")}
|
f := &fakeRunner{resp: []byte("FAKE-DOCX")}
|
||||||
InstallRunner(f)
|
InstallRunner(f)
|
||||||
|
|
|
||||||
|
|
@ -32,6 +32,14 @@ import (
|
||||||
//go:embed all:templates
|
//go:embed all:templates
|
||||||
var templatesFS embed.FS
|
var templatesFS embed.FS
|
||||||
|
|
||||||
|
// inlineMediaLua is the pandoc filter that base64-inlines images into markdown
|
||||||
|
// output (docx→md / html→md), written to the per-call scratch dir alongside the
|
||||||
|
// conversion. Server-only — the CLI convert script extracts media to a folder
|
||||||
|
// instead.
|
||||||
|
//
|
||||||
|
//go:embed inline-media.lua
|
||||||
|
var inlineMediaLua []byte
|
||||||
|
|
||||||
// DefaultTemplateName is used when a document declares no `template:` field or
|
// DefaultTemplateName is used when a document declares no `template:` field or
|
||||||
// names one that doesn't resolve.
|
// names one that doesn't resolve.
|
||||||
const DefaultTemplateName = "report"
|
const DefaultTemplateName = "report"
|
||||||
|
|
|
||||||
31
zddc/internal/convert/inline-media.lua
Normal file
31
zddc/internal/convert/inline-media.lua
Normal file
|
|
@ -0,0 +1,31 @@
|
||||||
|
-- inline-media.lua — pandoc filter that rewrites every image to a self-contained
|
||||||
|
-- base64 data: URI, pulling the bytes from pandoc's mediabag (populated when
|
||||||
|
-- reading DOCX, or fetched for HTML). Used by the docx→md / html→md conversions
|
||||||
|
-- so the resulting markdown carries its images inline (markdown output has no
|
||||||
|
-- native --embed-resources equivalent).
|
||||||
|
|
||||||
|
-- Standard base64 alphabet; the character for 6-bit value v sits at index v + 1.
local alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'

-- base64 returns the padded base64 encoding of the byte string `data`.
--
-- The input is consumed three bytes at a time: each group is packed into one
-- 24-bit number and split into four 6-bit indices. Working on numbers (instead
-- of expanding every byte into an 8-character bit string) keeps memory use
-- proportional to the output, which matters for multi-megabyte images. Only
-- arithmetic available in every Lua 5.x is used (no bitwise operators).
local function base64(data)
  local chunks = {}
  for pos = 1, #data, 3 do
    -- b2 / b3 are nil when the final group is short; they then contribute
    -- zero bits and their output positions become '=' padding.
    local b1, b2, b3 = data:byte(pos, pos + 2)
    local n = b1 * 65536 + (b2 or 0) * 256 + (b3 or 0)
    local i1 = math.floor(n / 262144) + 1
    local i2 = math.floor(n / 4096) % 64 + 1
    local i3 = math.floor(n / 64) % 64 + 1
    local i4 = n % 64 + 1
    chunks[#chunks + 1] = alphabet:sub(i1, i1)
      .. alphabet:sub(i2, i2)
      .. (b2 and alphabet:sub(i3, i3) or '=')
      .. (b3 and alphabet:sub(i4, i4) or '=')
  end
  return table.concat(chunks)
end
|
||||||
|
|
||||||
|
-- Image is the pandoc filter entry point for image elements. When the image's
-- bytes can be obtained, its src is rewritten to a base64 data: URI; otherwise
-- the element is returned unchanged so the rest of the document still converts.
function Image(img)
  local mt, data = pandoc.mediabag.lookup(img.src)
  if not data then
    -- fetch can raise a Lua error when the resource cannot be retrieved
    -- (missing file, unreachable URL). pcall keeps a single bad image from
    -- aborting the whole conversion.
    local ok, fetched_mt, fetched_data = pcall(pandoc.mediabag.fetch, img.src)
    if ok then
      mt, data = fetched_mt, fetched_data
    end
  end
  if data then
    -- An absent or empty MIME type would yield "data:;base64,…", which
    -- consumers interpret as text/plain; fall back to a generic binary type.
    if not mt or mt == '' then
      mt = 'application/octet-stream'
    end
    img.src = 'data:' .. mt .. ';base64,' .. base64(data)
  end
  return img
end
|
||||||
|
|
@ -274,26 +274,26 @@ func (r *ringWriter) String() string {
|
||||||
return string(r.buf)
|
return string(r.buf)
|
||||||
}
|
}
|
||||||
|
|
||||||
// writeTemplateSetToScratch materialises a TemplateSet (the chosen doctype
|
// writeScratchFiles materialises a set of named byte buffers (template +
|
||||||
// template plus its partials) into a fresh scratch dir and returns the host
|
// partials, or a lua filter) into a fresh scratch dir and returns the host
|
||||||
// path. Caller is responsible for os.RemoveAll(dir) when done. Used by ToHTML,
|
// path. Caller is responsible for os.RemoveAll(dir) when done. pandoc resolves
|
||||||
// which needs the template + partials visible inside the sandbox (pandoc
|
// `$partial()$` includes and --lua-filter paths from this dir, so everything
|
||||||
// resolves `$partial()$` includes from the template's own directory).
|
// lands flat alongside the entry file.
|
||||||
//
|
//
|
||||||
// scratchRoot controls where the temp dir lands. Empty means "use $TMPDIR".
|
// scratchRoot controls where the temp dir lands. Empty means "use $TMPDIR".
|
||||||
//
|
//
|
||||||
// Files are written world-readable so the binary's default user can read them
|
// Files are written world-readable so the binary's default user can read them
|
||||||
// through the wrapper's bind mount regardless of the host's umask. File names
|
// through the wrapper's bind mount regardless of the host's umask. Keys are
|
||||||
// are base names only (no path separators) — they all land flat in the dir.
|
// reduced to base names only (no path separators).
|
||||||
func writeTemplateSetToScratch(scratchRoot string, ts TemplateSet) (string, error) {
|
func writeScratchFiles(scratchRoot string, files map[string][]byte) (string, error) {
|
||||||
dir, err := os.MkdirTemp(scratchRoot, "zddc-convert-")
|
dir, err := os.MkdirTemp(scratchRoot, "zddc-convert-")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", fmt.Errorf("scratch dir: %w", err)
|
return "", fmt.Errorf("scratch dir: %w", err)
|
||||||
}
|
}
|
||||||
for name, b := range ts.Files {
|
for name, b := range files {
|
||||||
if err := os.WriteFile(filepath.Join(dir, filepath.Base(name)), b, 0o644); err != nil {
|
if err := os.WriteFile(filepath.Join(dir, filepath.Base(name)), b, 0o644); err != nil {
|
||||||
os.RemoveAll(dir)
|
os.RemoveAll(dir)
|
||||||
return "", fmt.Errorf("write template %q: %w", name, err)
|
return "", fmt.Errorf("write scratch file %q: %w", name, err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if err := chmodTree(dir, 0o755, 0o644); err != nil {
|
if err := chmodTree(dir, 0o755, 0o644); err != nil {
|
||||||
|
|
|
||||||
|
|
@ -48,35 +48,50 @@ var convertSF singleflightGroup
|
||||||
// runner itself enforces a finer-grained timeout on the container.
|
// runner itself enforces a finer-grained timeout on the container.
|
||||||
const convertTimeout = 90 * time.Second
|
const convertTimeout = 90 * time.Second
|
||||||
|
|
||||||
|
// convertSourceExts maps a requested target extension to the candidate source
// extensions in precedence order — the first existing real sibling wins. The
// matrix: md↔docx↔html all directions, plus md→pdf (PDF stays markdown-only).
var convertSourceExts = map[string][]string{
	"md":   {"docx", "html"},
	"docx": {"md", "html"},
	"html": {"md", "docx"},
	"pdf":  {"md"},
}

// RecognizeVirtualConvert reports whether urlPath names a virtual
// "<file>.<format>" — a rendered form of a sibling source document in a
// different format.
//
// It returns (srcAbs, format, true) when the extension of urlPath (matched
// case-insensitively) is a key of convertSourceExts and one of that key's
// candidate sources exists under fsRoot as a regular (non-directory) entry;
// candidates are tried in convertSourceExts precedence order. format is the
// lower-case target extension without the dot. In every other case it returns
// ("", "", false): unknown extension, empty or directory-like stem, no
// candidate on disk, or a candidate that would resolve outside fsRoot.
//
// This function does not stat urlPath itself; the original comment states that
// the dispatcher only calls it after that stat has failed, so a real on-disk
// file always wins.
//
// A virtual file URL means `<a href="…/foo.docx">` works without any
// query-string handling, and a script's `curl -O …/foo.md` writes the expected
// filename.
func RecognizeVirtualConvert(fsRoot, urlPath string) (srcAbs, format string, ok bool) {
	// Look the extension up directly instead of ranging over the map: the
	// targets contain no '.', so the last dot delimits the only possible match.
	dot := strings.LastIndexByte(urlPath, '.')
	if dot < 0 {
		return "", "", false
	}
	target := strings.ToLower(urlPath[dot+1:])
	sources, convertible := convertSourceExts[target]
	if !convertible {
		return "", "", false
	}

	base := urlPath[:dot]
	if base == "" || strings.HasSuffix(base, "/") {
		return "", "", false // "/.md" or "dir/.md": there is no file stem
	}
	stem := strings.Trim(base, "/")

	// filepath.Join returns a cleaned path, so the containment prefix must be
	// built from a cleaned root as well. Otherwise a root such as "/srv/" or
	// "./data" could never be a prefix of its own children.
	root := filepath.Clean(fsRoot)
	sep := string(filepath.Separator)
	prefix := root
	if !strings.HasSuffix(prefix, sep) {
		prefix += sep
	}

	for _, srcExt := range sources {
		abs := filepath.Join(root, filepath.FromSlash(stem+"."+srcExt))
		// Path containment: reject anything that escapes root via "..".
		if abs != root && !strings.HasPrefix(abs, prefix) {
			continue
		}
		if info, err := os.Stat(abs); err == nil && !info.IsDir() {
			return abs, target, true
		}
	}
	return "", "", false
}
|
||||||
|
|
@ -87,9 +102,9 @@ func RecognizeVirtualConvert(fsRoot, urlPath string) (mdAbs, format string, ok b
|
||||||
func ServeConverted(cfg config.Config, w http.ResponseWriter, r *http.Request, srcAbs, format string, chain zddc.PolicyChain) {
|
func ServeConverted(cfg config.Config, w http.ResponseWriter, r *http.Request, srcAbs, format string, chain zddc.PolicyChain) {
|
||||||
format = strings.ToLower(strings.TrimSpace(format))
|
format = strings.ToLower(strings.TrimSpace(format))
|
||||||
switch format {
|
switch format {
|
||||||
case "docx", "html", "pdf":
|
case "md", "docx", "html", "pdf":
|
||||||
default:
|
default:
|
||||||
http.Error(w, "Bad Request — convert must be docx, html, or pdf", http.StatusBadRequest)
|
http.Error(w, "Bad Request — convert must be md, docx, html, or pdf", http.StatusBadRequest)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -159,17 +174,13 @@ func buildAndStore(ctx context.Context, fsRoot, srcAbs string, srcInfo os.FileIn
|
||||||
ctx, cancel := context.WithTimeout(ctx, convertTimeout)
|
ctx, cancel := context.WithTimeout(ctx, convertTimeout)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
var out []byte
|
// Source format is the on-disk extension; target is the requested format.
|
||||||
switch format {
|
from := strings.TrimPrefix(strings.ToLower(filepath.Ext(srcAbs)), ".")
|
||||||
case "docx":
|
var ts convert.TemplateSet
|
||||||
out, err = convert.ToDocx(ctx, source, meta)
|
if format == "html" || format == "pdf" {
|
||||||
case "html":
|
ts = resolveTemplateSet(fsRoot, filepath.Dir(srcAbs), source)
|
||||||
out, err = convert.ToHTML(ctx, source, meta, resolveTemplateSet(fsRoot, filepath.Dir(srcAbs), source))
|
|
||||||
case "pdf":
|
|
||||||
out, err = convert.ToPDF(ctx, source, meta, resolveTemplateSet(fsRoot, filepath.Dir(srcAbs), source))
|
|
||||||
default:
|
|
||||||
return fmt.Errorf("unsupported format %q", format)
|
|
||||||
}
|
}
|
||||||
|
out, err := convert.Convert(ctx, from, format, source, meta, ts)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
@ -290,20 +301,21 @@ func contentDispositionFor(format, base string) string {
|
||||||
return fmt.Sprintf(`inline; filename="%s.%s"`, base, format)
|
return fmt.Sprintf(`inline; filename="%s.%s"`, base, format)
|
||||||
}
|
}
|
||||||
|
|
||||||
// purgeConverted removes the cached .zddc.d/converted/<base>.{docx,html,pdf}
|
// purgeConverted removes the cached .zddc.d/converted/<base>.{md,docx,html,pdf}
|
||||||
// sidecars for an .md source. Called from the file API after a
|
// sidecars for a convertible source. Called from the file API after a successful
|
||||||
// successful PUT/DELETE/MOVE so the next GET ?convert= regenerates.
|
// PUT/DELETE/MOVE so the next virtual-convert GET regenerates. Best-effort:
|
||||||
// Best-effort: errors (including "directory doesn't exist") are
|
// errors (including "directory doesn't exist") are swallowed. Sources whose
|
||||||
// swallowed. Non-.md sources are a no-op so this is safe to call
|
// extension isn't convertible are a no-op, so this is safe to call
|
||||||
// unconditionally after any write.
|
// unconditionally after any write.
|
||||||
func purgeConverted(srcAbs string) {
|
func purgeConverted(srcAbs string) {
|
||||||
if !strings.HasSuffix(strings.ToLower(srcAbs), ".md") {
|
ext := strings.TrimPrefix(strings.ToLower(filepath.Ext(srcAbs)), ".")
|
||||||
|
if _, ok := convertSourceExts[ext]; !ok {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
dir := filepath.Dir(srcAbs)
|
dir := filepath.Dir(srcAbs)
|
||||||
base := strings.TrimSuffix(filepath.Base(srcAbs), filepath.Ext(srcAbs))
|
base := strings.TrimSuffix(filepath.Base(srcAbs), filepath.Ext(srcAbs))
|
||||||
for _, ext := range []string{".docx", ".html", ".pdf"} {
|
for target := range convertSourceExts {
|
||||||
_ = os.Remove(filepath.Join(dir, ReservedSidecar, "converted", base+ext))
|
_ = os.Remove(filepath.Join(dir, ReservedSidecar, "converted", base+"."+target))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
65
zddc/internal/handler/converthandler_test.go
Normal file
65
zddc/internal/handler/converthandler_test.go
Normal file
|
|
@ -0,0 +1,65 @@
|
||||||
|
package handler
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestRecognizeVirtualConvert_MatrixAndPrecedence(t *testing.T) {
|
||||||
|
root := t.TempDir()
|
||||||
|
write := func(rel string) {
|
||||||
|
p := filepath.Join(root, filepath.FromSlash(rel))
|
||||||
|
if err := os.MkdirAll(filepath.Dir(p), 0o755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(p, []byte("x"), 0o644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sources on disk: doc.md, only.docx, both.md + both.docx, page.html.
|
||||||
|
write("doc.md")
|
||||||
|
write("only.docx")
|
||||||
|
write("both.md")
|
||||||
|
write("both.docx")
|
||||||
|
write("page.html")
|
||||||
|
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
url string
|
||||||
|
wantOK bool
|
||||||
|
wantSrcExt string
|
||||||
|
wantFormat string
|
||||||
|
}{
|
||||||
|
{"md→docx", "/doc.docx", true, ".md", "docx"},
|
||||||
|
{"md→html", "/doc.html", true, ".md", "html"},
|
||||||
|
{"md→pdf", "/doc.pdf", true, ".md", "pdf"},
|
||||||
|
{"docx→md (only docx present)", "/only.md", true, ".docx", "md"},
|
||||||
|
{"docx→html (only docx present)", "/only.html", true, ".docx", "html"},
|
||||||
|
{"docx has no pdf source", "/only.pdf", false, "", ""},
|
||||||
|
{"both present, html prefers md source", "/both.html", true, ".md", "html"},
|
||||||
|
{"html→md", "/page.md", true, ".html", "md"},
|
||||||
|
{"html→docx", "/page.docx", true, ".html", "docx"},
|
||||||
|
{"no source at all", "/missing.html", false, "", ""},
|
||||||
|
{"directory url ignored", "/doc/", false, "", ""},
|
||||||
|
{"non-convertible target", "/doc.txt", false, "", ""},
|
||||||
|
}
|
||||||
|
for _, c := range cases {
|
||||||
|
t.Run(c.name, func(t *testing.T) {
|
||||||
|
src, format, ok := RecognizeVirtualConvert(root, c.url)
|
||||||
|
if ok != c.wantOK {
|
||||||
|
t.Fatalf("ok=%v want %v (src=%q format=%q)", ok, c.wantOK, src, format)
|
||||||
|
}
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if format != c.wantFormat {
|
||||||
|
t.Errorf("format=%q want %q", format, c.wantFormat)
|
||||||
|
}
|
||||||
|
if filepath.Ext(src) != c.wantSrcExt {
|
||||||
|
t.Errorf("source ext=%q want %q (src=%q)", filepath.Ext(src), c.wantSrcExt, src)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in a new issue