ZDDC/pandoc/convert
2026-06-11 13:32:31 -05:00

577 lines
19 KiB
Bash

#!/bin/bash
# Print the usage banner and the option summary to stdout.
# Globals:   none ($0 is interpolated into the usage line)
# Arguments: none
show_help() {
  printf '%s\n' \
    "Universal File Converter" \
    "Supported conversions: DOCX→MD, MD→HTML, HTML→MD, MD→DOCX, HTML→DOCX" \
    "Usage: $0 [-f] [-o outputdir] [-t format] [-T template] [--no-toc] input1.ext [input2.ext ...]" \
    " -f: Force overwrite existing output files" \
    " -o: Output directory (default: same as input)" \
    " -t: Target format (md, html, docx) - overrides auto-detection" \
    " -T: Template file path (default: templates/<template>.html, where <template>" \
    " comes from the doc's YAML front matter; falls back to templates/report.html)" \
    " --no-toc: Skip table of contents generation"
}
# Source one ZDDC config file, exporting every variable it assigns.
# Arguments: $1 - path of the candidate config file
# Outputs:   a "Loading" notice on stdout when the file exists
# Returns:   0 if the file exists (it is then sourced), 1 otherwise
source_config_file() {
  # The sourced file executes inside this function's scope, so it can see
  # this local by name.
  local config_file="$1"

  # Guard clause: nothing to do when the candidate is not a regular file.
  [ -f "$config_file" ] || return 1

  echo " → Loading ZDDC configuration from: $config_file"
  set -a # auto-export everything the config assigns
  . "$config_file"
  set +a # back to normal assignment semantics
  return 0
}
# Load the ZDDC configuration that lives in a directory, if any.
# Tries "zddc.conf" first and ".zddc.conf" second; the first one that
# source_config_file accepts wins and the other is not attempted.
# Arguments: $1 - directory to search
# Returns:   0 if a config file was loaded, 1 if neither exists (the caller
#            then simply continues with defaults)
load_zddc_config() {
  local search_dir="$1"

  source_config_file "$search_dir/zddc.conf" \
    || source_config_file "$search_dir/.zddc.conf" \
    || return 1
  return 0
}
# Source global ZDDC config from current working directory.
# This is called once at startup - do NOT call again inside convert functions.
load_zddc_config "$(pwd)"

# Abort with the usage text when a value-taking option has no value.
# Without this check, `shift 2` with a single remaining argument fails and
# shifts nothing, so the parsing loop below would spin forever.
# Arguments: $1 - the option being parsed, $2 - number of remaining arguments
require_option_value() {
  if [ "$2" -lt 2 ]; then
    echo "Error: Option $1 requires an argument"
    show_help
    exit 1
  fi
}

# Parse arguments. Options must precede the input files; parsing stops at
# the first word that does not start with '-'.
FORCE_OVERWRITE=false
OUTPUT_DIR=""
TARGET_FORMAT=""
CUSTOM_TEMPLATE=""
NO_TOC=false
while [ $# -gt 0 ]; do
  case "$1" in
    -h | --help)
      show_help
      exit 0
      ;;
    -f)
      FORCE_OVERWRITE=true
      echo "Force overwrite mode: ON"
      shift
      ;;
    -o)
      require_option_value "$1" "$#"
      OUTPUT_DIR="$2"
      echo "Output directory: $OUTPUT_DIR"
      shift 2
      ;;
    -t)
      require_option_value "$1" "$#"
      TARGET_FORMAT="$2"
      echo "Target format: $TARGET_FORMAT"
      shift 2
      ;;
    -T)
      require_option_value "$1" "$#"
      CUSTOM_TEMPLATE="$2"
      echo "Custom template: $CUSTOM_TEMPLATE"
      shift 2
      ;;
    --no-toc)
      NO_TOC=true
      echo "Table of contents: DISABLED"
      shift
      ;;
    -*)
      echo "Unknown option: $1"
      show_help
      exit 1
      ;;
    *)
      # First non-option word: everything from here on is an input file.
      break
      ;;
  esac
done

if [ "$FORCE_OVERWRITE" = "false" ]; then
  echo "Force overwrite mode: OFF (will skip existing output files)"
fi
if [ -z "$OUTPUT_DIR" ]; then
  echo "Output directory: same as input files"
fi
if [ $# -eq 0 ]; then
  echo "Error: No input files specified"
  show_help
  exit 1
fi

# Validate target format if specified (case-insensitive; the lowercase form
# is what the rest of the script compares against).
if [ -n "$TARGET_FORMAT" ]; then
  TARGET_FORMAT_LOWER=$(echo "$TARGET_FORMAT" | tr '[:upper:]' '[:lower:]')
  if [ "$TARGET_FORMAT_LOWER" != "md" ] && [ "$TARGET_FORMAT_LOWER" != "html" ] && [ "$TARGET_FORMAT_LOWER" != "docx" ]; then
    echo "Error: Invalid target format '$TARGET_FORMAT'. Supported: md, html, docx"
    exit 1
  fi
  echo "Target format override: $TARGET_FORMAT_LOWER"
fi

# Counters reported in the final summary.
echo "Processing $# files..."
TOTAL_FILES=$#
SUCCESSFUL=0
FAILED=0
SKIPPED=0
# Split a ZDDC filename stem (extension already removed) of the form
#   tracking_revision (status) - title
# into the globals ZDDC_TRACKING, ZDDC_REVISION, ZDDC_STATUS and ZDDC_TITLE.
# Every field is pulled out by its own sed back-reference instead of joining
# the fields with a delimiter and cutting them apart again, so a title that
# happens to contain the delimiter (e.g. '|') cannot corrupt the split.
# Arguments: $1 - filename stem
# Returns:   0 on a full match (globals are set), 1 otherwise (globals untouched)
parse_zddc_filename() {
  local stem="$1"
  local capture_re='s/^\([^_]*\)_\([^ ]*\) *(\([^)]*\)) *- *\(.*\)$'
  local -a parts=()
  local n

  # Gate on a strict full match first: the sed expression alone tolerates
  # empty fields, which would make "no match" and "empty field" ambiguous.
  if ! grep -Eq '^[^_]+_[^ ]+ *\([^)]*\) *- *.+$' <<<"$stem"; then
    return 1
  fi

  # One sed pass per capture group, in field order.
  for n in 1 2 3 4; do
    parts+=("$(sed -n "${capture_re}/\\${n}/p" <<<"$stem")")
  done

  ZDDC_TRACKING=${parts[0]}
  ZDDC_REVISION=${parts[1]}
  ZDDC_STATUS=${parts[2]}
  ZDDC_TITLE=${parts[3]}
  return 0
}
# Escape a string for embedding inside a double-quoted YAML scalar:
# backslashes first, then double quotes. Prints the result without a newline.
_yaml_dq_escape() {
  local value="$1"
  value=${value//\\/\\\\}
  value=${value//\"/\\\"}
  printf '%s' "$value"
}

# Convert a DOCX file to GitHub-flavoured Markdown.
# pandoc writes to a temp file first; if the filename stem follows the ZDDC
# pattern (trackingNumber_revision (status) - title) a YAML front-matter block
# is prepended, otherwise the temp file simply becomes the output.
# Globals:   client, project (read; optional, from zddc.conf)
# Arguments: $1 - input .docx path
#            $2 - output .md path
#            $3 - temp file path pandoc writes to
#            $4 - directory for extracted media
#            $5 - input basename (used in messages and as source_file)
#            $6 - input filename without extension (parsed for ZDDC metadata)
# Outputs:   progress messages on stdout
# Returns:   0 on success, 1 if pandoc or writing the output fails
convert_docx_to_md() {
  local INPUT="$1"
  local OUTPUT_FILE="$2"
  local TEMP_FILE="$3"
  local MEDIA_DIR="$4"
  local BASENAME="$5"
  local FILENAME_NO_EXT="$6"
  local TRACKING_NUMBER REVISION STATUS TITLE

  # Convert using pandoc with proper extension stripping to temp file first
  if ! pandoc -f docx -t gfm --markdown-headings=atx --extract-media="$MEDIA_DIR" --wrap=none "$INPUT" -o "$TEMP_FILE"; then
    echo " ✗ Failed to convert: $BASENAME (DOCX→MD)"
    # Clean up temp file on failure
    rm -f -- "$TEMP_FILE"
    return 1
  fi

  # Parse ZDDC filename pattern: trackingNumber_revision (status) - title.extension
  if parse_zddc_filename "$FILENAME_NO_EXT"; then
    TRACKING_NUMBER="$ZDDC_TRACKING"
    REVISION="$ZDDC_REVISION"
    STATUS="$ZDDC_STATUS"
    TITLE="$ZDDC_TITLE"
    echo " → ZDDC metadata detected:"
    echo " • Tracking: $TRACKING_NUMBER"
    echo " • Revision: $REVISION"
    echo " • Status: $STATUS"
    echo " • Title: $TITLE"

    # Create YAML front matter and combine with content. Every value is
    # escaped: filenames and config values may legally contain '"' or '\',
    # which would otherwise terminate or corrupt the quoted scalar.
    if ! {
      echo "---"
      printf 'client: "%s"\n' "$(_yaml_dq_escape "${client:-}")"
      printf 'project: "%s"\n' "$(_yaml_dq_escape "${project:-}")"
      printf 'tracking_number: "%s"\n' "$(_yaml_dq_escape "$TRACKING_NUMBER")"
      printf 'revision: "%s"\n' "$(_yaml_dq_escape "$REVISION")"
      printf 'status: "%s"\n' "$(_yaml_dq_escape "$STATUS")"
      printf 'title: "%s"\n' "$(_yaml_dq_escape "$TITLE")"
      printf 'source_file: "%s"\n' "$(_yaml_dq_escape "$BASENAME")"
      echo "created: \"$(date -u +%Y-%m-%d)\""
      echo "---"
      echo ""
      cat "$TEMP_FILE"
    } > "$OUTPUT_FILE"; then
      # Unwritable output or unreadable temp file: do not claim success.
      echo " ✗ Failed to convert: $BASENAME (DOCX→MD)"
      rm -f -- "$TEMP_FILE"
      return 1
    fi
    rm -f -- "$TEMP_FILE"
  else
    # No ZDDC pattern detected, just move temp file to final location
    if ! mv -- "$TEMP_FILE" "$OUTPUT_FILE"; then
      echo " ✗ Failed to convert: $BASENAME (DOCX→MD)"
      rm -f -- "$TEMP_FILE"
      return 1
    fi
  fi

  echo " ✓ Successfully converted: $BASENAME (DOCX→MD)"
  return 0
}
# Convert an HTML file to GitHub-flavoured Markdown with pandoc.
# Arguments: $1 - input HTML path
#            $2 - output Markdown path
#            $3 - input basename (used in messages only)
# Outputs:   one status line on stdout
# Returns:   0 if pandoc succeeded, 1 otherwise
convert_html_to_md() {
  local src="$1" dest="$2" label="$3"

  # Failure path first; success falls through.
  if ! pandoc "$src" -f html -t gfm --markdown-headings=atx --wrap=none -o "$dest"; then
    echo " ✗ Failed to convert: $label (HTML→MD)"
    return 1
  fi

  echo " ✓ Successfully converted: $label (HTML→MD)"
  return 0
}
# Convert a GitHub-flavoured Markdown file to DOCX with pandoc.
# Arguments: $1 - input Markdown path
#            $2 - output DOCX path
#            $3 - input basename (used in messages only)
# Outputs:   one status line on stdout
# Returns:   0 if pandoc succeeded, 1 otherwise
convert_md_to_docx() {
  local src="$1" dest="$2" label="$3"
  # Command held as an array so each word stays a separate argument.
  local -a cmd=(pandoc "$src" -f gfm -t docx -o "$dest")

  if "${cmd[@]}"; then
    echo " ✓ Successfully converted: $label (MD→DOCX)"
    return 0
  fi

  echo " ✗ Failed to convert: $label (MD→DOCX)"
  return 1
}
# Convert an HTML file to DOCX with pandoc.
# Arguments: $1 - input HTML path
#            $2 - output DOCX path
#            $3 - input basename (used in messages only)
# Outputs:   one status line on stdout
# Returns:   0 if pandoc succeeded, 1 otherwise (pandoc's own code is not
#            propagated)
convert_html_to_docx() {
  local src="$1" dest="$2" label="$3"

  pandoc "$src" -f html -t docx -o "$dest"
  case $? in
    0)
      echo " ✓ Successfully converted: $label (HTML→DOCX)"
      return 0
      ;;
    *)
      echo " ✗ Failed to convert: $label (HTML→DOCX)"
      return 1
      ;;
  esac
}
# Make a path absolute against the current directory (pwd-based, no symlink
# resolution, for POSIX compatibility) and store it in the variable named $1.
# Arguments: $1 - name of the variable to assign, $2 - path
_md2html_abspath() {
  case "$2" in
    /*) printf -v "$1" '%s' "$2" ;;
    *) printf -v "$1" '%s/%s' "$(pwd)" "$2" ;;
  esac
}

# Find the HTML template for a document and store its absolute path in the
# caller's `template_abs` variable (left empty when nothing is found).
# Named templates live in a templates/ dir (report.html, letter.html,
# specification.html, sharing _head/_doc/_scripts partials). The document
# selects one via a `template:` field in its YAML front matter; default report.
# Search order: input dir, script dir, symlink target dir, each a templates/
# subdir.
# Arguments: $1 - absolute path of the Markdown document
# Outputs:   the chosen template (or a warning) on stdout
_md2html_discover_template() {
  local doc_path="$1"
  local script_dir script_target template_name tdir
  local -a search_dirs=()

  search_dirs+=("$(dirname "$doc_path")/templates")

  # Script directory as an absolute path. Skipped if it cannot be resolved,
  # so an empty value never degrades into a lookup under "/templates".
  script_dir=$(cd "$(dirname "$0")" && pwd)
  if [ -n "$script_dir" ]; then
    search_dirs+=("$script_dir/templates")
  fi

  # If the script is a symlink, also look next to the link target.
  # readlink -f is available on Linux with GNU coreutils.
  if [ -L "$0" ]; then
    script_target=$(readlink -f "$0")
    if [ -n "$script_target" ]; then
      search_dirs+=("$(dirname "$script_target")/templates")
    fi
  fi

  # Template name from the doc's front matter (sanitized to a bare basename).
  template_name=$(sed -n '/^---[[:space:]]*$/,/^---[[:space:]]*$/ s/^template:[[:space:]]*"\{0,1\}\([A-Za-z0-9_-]\{1,\}\)"\{0,1\}[[:space:]]*$/\1/p' "$doc_path" | head -n 1)
  [ -n "$template_name" ] || template_name="report"

  template_abs=""
  for tdir in "${search_dirs[@]}"; do
    if [ -f "$tdir/$template_name.html" ]; then
      template_abs="$tdir/$template_name.html"
      echo " → Using template: $template_abs"
      break
    elif [ -f "$tdir/report.html" ]; then
      # NOTE(review): the first directory holding report.html wins, even if a
      # later directory has the named template. Kept as-is; confirm intent.
      template_abs="$tdir/report.html"
      echo " ⚠ Template '$template_name' not found; using $template_abs"
      break
    fi
  done
  if [ -z "$template_abs" ]; then
    echo " ⚠ Warning: templates/ not found, using pandoc default template"
  fi
}

# Convert a Markdown file to a standalone HTML document with pandoc.
# pandoc runs from the input directory so relative resources resolve, which
# is why every path handed to it is made absolute first.
# Globals:   CUSTOM_TEMPLATE (read; cleared when the file does not exist),
#            NO_TOC (read), client / project / contractor / project_number
#            (read; optional, already loaded from zddc.conf at startup)
# Arguments: $1 - input Markdown path
#            $2 - output HTML path
#            $3 - input basename (used in messages only)
#            $4 - directory of the input file
# Outputs:   progress messages on stdout
# Returns:   0 on success, 1 if the input directory cannot be entered or
#            pandoc fails
convert_md_to_html() {
  local INPUT="$1"
  local OUTPUT_FILE="$2"
  local BASENAME="$3"
  local INPUT_DIR="$4"
  local input_abs output_abs template_abs="" input_name stem generation_time
  local tracking_number revision status title
  local -a pandoc_args=()

  _md2html_abspath input_abs "$INPUT"
  _md2html_abspath output_abs "$OUTPUT_FILE"
  input_name=$(basename "$input_abs")

  # Determine template to use
  if [ -n "$CUSTOM_TEMPLATE" ]; then
    if [ -f "$CUSTOM_TEMPLATE" ]; then
      # Resolve now, while still in the invocation directory: a relative -T
      # path would otherwise be looked up inside the input directory.
      _md2html_abspath template_abs "$CUSTOM_TEMPLATE"
      echo " → Using custom template: $template_abs"
    else
      echo " ⚠ Warning: Custom template not found: $CUSTOM_TEMPLATE, using default discovery"
      CUSTOM_TEMPLATE=""
    fi
  fi
  # Default discovery if no custom template or custom template not found.
  if [ -z "$CUSTOM_TEMPLATE" ]; then
    _md2html_discover_template "$input_abs" # assigns template_abs
  fi

  # Build pandoc command as an argument array (safe form, no eval — each value
  # is a separate array element so it can't be re-split or injected by the shell).
  pandoc_args+=("--from" "markdown+yaml_metadata_block")
  pandoc_args+=("--standalone")
  pandoc_args+=("--embed-resources")
  pandoc_args+=("--section-divs")
  if [ "$NO_TOC" = "false" ]; then
    pandoc_args+=("--toc" "--toc-depth=6")
  fi
  if [ -n "$template_abs" ]; then
    pandoc_args+=("--template" "$template_abs")
  fi

  # Generate timestamp for conversion (force English locale)
  generation_time=$(LC_TIME=C date '+%B %d, %Y at %I:%M:%S %p %Z')

  # Strip the Markdown extension case-insensitively (inputs are accepted as
  # .md in any case) so ".MD" does not leak into the parsed title.
  case "$input_name" in
    *.[mM][dD]) stem=${input_name%.*} ;;
    *) stem=$input_name ;;
  esac

  # Extract ZDDC metadata from filename for template variables
  if parse_zddc_filename "$stem"; then
    tracking_number="$ZDDC_TRACKING"
    revision="$ZDDC_REVISION"
    status="$ZDDC_STATUS"
    title="$ZDDC_TITLE"
    # Pass ZDDC variables to template (each as separate args to avoid injection)
    pandoc_args+=("--variable" "tracking_number=$tracking_number")
    pandoc_args+=("--variable" "revision=$revision")
    pandoc_args+=("--variable" "status=$status")
    pandoc_args+=("--variable" "generation_time=$generation_time")
    pandoc_args+=("--variable" "title=$title")
    # A '~' anywhere in the revision marks the document as a draft.
    case "$revision" in
      *~*)
        pandoc_args+=("--variable" "is_draft=true")
        ;;
    esac
  else
    # Still pass generation time even if no ZDDC match
    pandoc_args+=("--variable" "generation_time=$generation_time")
  fi

  # Add ZDDC configuration variables from zddc.conf
  if [ -n "$client" ]; then
    pandoc_args+=("--variable" "client=$client")
  fi
  if [ -n "$project" ]; then
    pandoc_args+=("--variable" "project=$project")
  fi
  if [ -n "$contractor" ]; then
    pandoc_args+=("--variable" "contractor=$contractor")
  fi
  if [ -n "$project_number" ]; then
    pandoc_args+=("--variable" "project_number=$project_number")
  fi

  # Pass TOC status to template
  if [ "$NO_TOC" = "true" ]; then
    pandoc_args+=("--variable" "no-toc=true")
  fi
  pandoc_args+=("--html-q-tags")

  # Run pandoc from the input directory inside a subshell: the caller's
  # working directory is never changed, and a failed cd prevents pandoc from
  # running against a same-named file in the wrong directory.
  if (cd "$INPUT_DIR" && pandoc "$input_name" -o "$output_abs" "${pandoc_args[@]}"); then
    echo " ✓ Successfully converted: $BASENAME (MD→HTML)"
    return 0
  else
    echo " ✗ Failed to convert: $BASENAME (MD→HTML)"
    return 1
  fi
}
# Main loop: classify each input, work out where its output goes, honour the
# overwrite policy, run the matching converter and keep the tallies.
for INPUT in "$@"; do
  echo ""
  echo "Processing: $INPUT"

  # Validate input file exists
  if [ ! -f "$INPUT" ]; then
    echo " ✗ Input file not found: $INPUT"
    FAILED=$((FAILED + 1))
    continue
  fi

  # Extract filename without path and extension
  BASENAME=$(basename "$INPUT")
  FILENAME_NO_EXT="${BASENAME%.*}"
  EXTENSION="${BASENAME##*.}"
  INPUT_DIR=$(dirname "$INPUT")
  # Convert extension to lowercase for comparison
  EXTENSION_LOWER=$(echo "$EXTENSION" | tr '[:upper:]' '[:lower:]')

  # Determine conversion type based on target format override or auto-detection
  if [ -n "$TARGET_FORMAT" ]; then
    # Target format specified - determine conversion type
    TARGET_EXT="$TARGET_FORMAT_LOWER"
    case "$EXTENSION_LOWER" in
      docx)
        if [ "$TARGET_EXT" = "md" ]; then
          CONVERSION_TYPE="docx2md"
        elif [ "$TARGET_EXT" = "html" ]; then
          echo " ✗ Direct DOCX→HTML conversion not supported. Convert to MD first."
          FAILED=$((FAILED + 1))
          continue
        elif [ "$TARGET_EXT" = "docx" ]; then
          echo " ⚠ Skipping: $BASENAME (already DOCX format)"
          SKIPPED=$((SKIPPED + 1))
          continue
        fi
        ;;
      md)
        if [ "$TARGET_EXT" = "html" ]; then
          CONVERSION_TYPE="md2html"
        elif [ "$TARGET_EXT" = "docx" ]; then
          CONVERSION_TYPE="md2docx"
        elif [ "$TARGET_EXT" = "md" ]; then
          echo " ⚠ Skipping: $BASENAME (already MD format)"
          SKIPPED=$((SKIPPED + 1))
          continue
        fi
        ;;
      html | htm)
        if [ "$TARGET_EXT" = "md" ]; then
          CONVERSION_TYPE="html2md"
        elif [ "$TARGET_EXT" = "docx" ]; then
          CONVERSION_TYPE="html2docx"
        elif [ "$TARGET_EXT" = "html" ]; then
          echo " ⚠ Skipping: $BASENAME (already HTML format)"
          SKIPPED=$((SKIPPED + 1))
          continue
        fi
        ;;
      *)
        echo " ✗ Unsupported input file type: .$EXTENSION (supported: .docx, .md, .html, .htm)"
        FAILED=$((FAILED + 1))
        continue
        ;;
    esac
  else
    # Auto-detect conversion type based on input extension
    case "$EXTENSION_LOWER" in
      docx)
        CONVERSION_TYPE="docx2md"
        TARGET_EXT="md"
        ;;
      md)
        CONVERSION_TYPE="md2html"
        TARGET_EXT="html"
        ;;
      html | htm)
        CONVERSION_TYPE="html2md"
        TARGET_EXT="md"
        ;;
      *)
        echo " ✗ Unsupported file type: .$EXTENSION (supported: .docx, .md, .html, .htm)"
        FAILED=$((FAILED + 1))
        continue
        ;;
    esac
  fi

  # Determine output location. DOCX→MD additionally needs a temp file and a
  # media directory next to the output.
  if [ -n "$OUTPUT_DIR" ]; then
    OUTPUT_FILE="$OUTPUT_DIR/$FILENAME_NO_EXT.$TARGET_EXT"
    if [ "$CONVERSION_TYPE" = "docx2md" ]; then
      TEMP_FILE="$OUTPUT_DIR/temp_$FILENAME_NO_EXT.md"
      MEDIA_DIR="$OUTPUT_DIR/$FILENAME_NO_EXT"
    fi
  else
    OUTPUT_FILE="$INPUT_DIR/$FILENAME_NO_EXT.$TARGET_EXT"
    if [ "$CONVERSION_TYPE" = "docx2md" ]; then
      TEMP_FILE="$INPUT_DIR/temp_$FILENAME_NO_EXT.md"
      MEDIA_DIR="$INPUT_DIR/$FILENAME_NO_EXT"
    fi
  fi
  echo " → Output file: $OUTPUT_FILE"
  if [ "$CONVERSION_TYPE" = "docx2md" ]; then
    echo " → Media dir: $MEDIA_DIR/"
  fi

  # Create output directory if needed. A failure here is counted against
  # this file instead of letting the converter fail later with a less
  # helpful message.
  OUTPUT_FILE_DIR=$(dirname "$OUTPUT_FILE")
  if [ ! -d "$OUTPUT_FILE_DIR" ] && ! mkdir -p -- "$OUTPUT_FILE_DIR"; then
    echo " ✗ Could not create output directory: $OUTPUT_FILE_DIR"
    FAILED=$((FAILED + 1))
    continue
  fi

  # Check if output file exists and handle accordingly
  if [ -f "$OUTPUT_FILE" ] && [ "$FORCE_OVERWRITE" = "false" ]; then
    echo " ⚠ Skipped (file exists): $BASENAME"
    SKIPPED=$((SKIPPED + 1))
    continue
  fi

  # Perform conversion based on type; each converter prints its own
  # success/failure line and reports the outcome through its exit status.
  CONVERTED=false
  case "$CONVERSION_TYPE" in
    docx2md)
      convert_docx_to_md "$INPUT" "$OUTPUT_FILE" "$TEMP_FILE" "$MEDIA_DIR" "$BASENAME" "$FILENAME_NO_EXT" && CONVERTED=true
      ;;
    md2html)
      convert_md_to_html "$INPUT" "$OUTPUT_FILE" "$BASENAME" "$INPUT_DIR" && CONVERTED=true
      ;;
    html2md)
      convert_html_to_md "$INPUT" "$OUTPUT_FILE" "$BASENAME" && CONVERTED=true
      ;;
    md2docx)
      convert_md_to_docx "$INPUT" "$OUTPUT_FILE" "$BASENAME" && CONVERTED=true
      ;;
    html2docx)
      convert_html_to_docx "$INPUT" "$OUTPUT_FILE" "$BASENAME" && CONVERTED=true
      ;;
    *)
      echo " ✗ Unknown conversion type: $CONVERSION_TYPE"
      ;;
  esac
  if [ "$CONVERTED" = "true" ]; then
    SUCCESSFUL=$((SUCCESSFUL + 1))
  else
    FAILED=$((FAILED + 1))
  fi
done

echo ""
echo "=========================================="
echo "CONVERSION SUMMARY"
echo "=========================================="
echo "Total files processed: $TOTAL_FILES"
echo "Successful conversions: $SUCCESSFUL"
echo "Failed conversions: $FAILED"
echo "Skipped (existing files): $SKIPPED"
echo "=========================================="

# Report failure to the caller (make, CI, `&&` chains): previously the script
# always exited 0 because the last command was an echo.
if [ "${FAILED:-0}" -gt 0 ]; then
  exit 1
fi