diff --git a/scripts/archive-worker-entrypoint.sh b/scripts/archive-worker-entrypoint.sh
new file mode 100755
index 0000000..541d88a
--- /dev/null
+++ b/scripts/archive-worker-entrypoint.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+# Worker container entrypoint: optionally starts Xvfb and x11vnc, then runs
+# the given command (a bare `archive` or `help` is routed to src/cli.mjs).
+set -euo pipefail
+
+DISPLAY="${DISPLAY:-:99}"
+ARCHIVE_WORKER_VIEWPORT="${ARCHIVE_WORKER_VIEWPORT:-1366x768x24}"
+ARCHIVE_WORKER_VNC_PORT="${ARCHIVE_WORKER_VNC_PORT:-5900}"
+export DISPLAY
+
+xvfb_pid=""
+vnc_pid=""
+child_pid=""
+
+# Stop whichever helper processes are still running, then exit with the
+# status that was current when the trap fired.
+cleanup() {
+  local status=$?
+  if [[ -n "${child_pid}" ]] && kill -0 "${child_pid}" 2>/dev/null; then
+    kill "${child_pid}" 2>/dev/null || true
+  fi
+  if [[ -n "${vnc_pid}" ]] && kill -0 "${vnc_pid}" 2>/dev/null; then
+    kill "${vnc_pid}" 2>/dev/null || true
+  fi
+  if [[ -n "${xvfb_pid}" ]] && kill -0 "${xvfb_pid}" 2>/dev/null; then
+    kill "${xvfb_pid}" 2>/dev/null || true
+  fi
+  exit "${status}"
+}
+
+trap cleanup EXIT INT TERM
+
+if [[ "${ARCHIVE_WORKER_XVFB:-1}" != "0" ]]; then
+  rm -f "/tmp/.X${DISPLAY#:}-lock"
+  Xvfb "${DISPLAY}" -screen 0 "${ARCHIVE_WORKER_VIEWPORT}" -nolisten tcp >/tmp/archive-worker-xvfb.log 2>&1 &
+  xvfb_pid=$!
+  sleep "${ARCHIVE_WORKER_XVFB_DELAY:-0.5}"
+fi
+
+if [[ "${ARCHIVE_WORKER_VNC:-0}" == "1" ]]; then
+  x11vnc \
+    -display "${DISPLAY}" \
+    -nopw \
+    -forever \
+    -shared \
+    -rfbport "${ARCHIVE_WORKER_VNC_PORT}" \
+    >/tmp/archive-worker-x11vnc.log 2>&1 &
+  vnc_pid=$!
+fi
+
+if [[ "$#" -eq 0 ]]; then
+  set -- help
+fi
+
+case "$1" in
+  archive|help)
+    set -- node src/cli.mjs "$@"
+    ;;
+esac
+
+"$@" &
+child_pid=$!
+wait "${child_pid}"
diff --git a/src/cli.mjs b/src/cli.mjs
index fb0bede..5d20e3a 100644
--- a/src/cli.mjs
+++ b/src/cli.mjs
@@ -34,7 +34,8 @@ function usage() {
 Options:
   --archive-path Output directory. Defaults to ARCHIVE_PATH or ${defaultArchivePath()}
   --id Output id/file stem
-  --headful Run browser in headful mode (requires display)`);
+  --headful Run browser in headful mode (requires display)
+  --json Print a machine-readable JSON result`);
 }
 
 async function main() {
@@ -59,6 +60,11 @@ async function main() {
     headless: args.headful === true ? false : undefined
   });
 
+  if (args.json === true) {
+    console.log(JSON.stringify({ ok: true, ...result }));
+    return;
+  }
+
   console.log(`Archived: ${result.sourceUrl}`);
   console.log(`Output: ${result.filePath}`);
   if (result.externalAssets.length) {
@@ -75,9 +81,15 @@ async function main() {
       console.log(`  ${warning}`);
     }
   }
+  return result;
 }
 
+const args = parseArgs(process.argv);
 main().catch((error) => {
-  console.error(error.message);
+  if (args.json === true) {
+    console.log(JSON.stringify({ ok: false, error: error.message }));
+  } else {
+    console.error(error.message);
+  }
   process.exitCode = 1;
 });
diff --git a/src/container-runner.mjs b/src/container-runner.mjs
new file mode 100755
index 0000000..77e53f4
--- /dev/null
+++ b/src/container-runner.mjs
@@ -0,0 +1,417 @@
+#!/usr/bin/env node
+import fs from "node:fs/promises";
+import path from "node:path";
+import { spawn } from "node:child_process";
+import { pathToFileURL } from "node:url";
+import { defaultArchivePath } from "./asset-inliner.mjs";
+
+const CONTAINER_ARCHIVE_PATH = "/archives";
+const CONTAINER_INPUT_PATH = "/input";
+const DEFAULT_IMAGE = process.env.ARCHIVE_WORKER_IMAGE || "local-page-archiver:latest";
+const DEFAULT_SHM_SIZE = "1g";
+
+/**
+ * Archive a URL or local HTML file by running the worker image in a container.
+ *
+ * @param {string} input URL or path to a local HTML file.
+ * @param {object} [options] Optional runtime, image, archivePath, build,
+ *   context, shmSize, network, name, vnc, vncPort and id settings.
+ * @returns {Promise} Resolves to the worker result with host-side paths and a
+ *   `container` section holding the runtime, image and in-container paths.
+ * @throws {Error} When input is missing, the worker fails or reports
+ *   `ok: false`, or its output contains no JSON result.
+ */
+export async function archiveWithContainer(input, options = {}) {
+  if (!input) {
+    throw new Error("Missing archive input URL or HTML file path");
+  }
+
+  const runtime = options.runtime || await detectContainerRuntime();
+  const image = options.image || DEFAULT_IMAGE;
+  const archivePath = path.resolve(options.archivePath || defaultArchivePath());
+  await fs.mkdir(archivePath, { recursive: true });
+
+  if (options.build === true) {
+    await buildWorkerImage({ runtime, image, context: options.context || process.cwd() });
+  }
+
+  const { containerInput, inputMount } = await resolveContainerInput(input);
+  const runArgs = [
+    "run",
+    "--rm",
+    "--shm-size",
+    options.shmSize || DEFAULT_SHM_SIZE,
+    "-e",
+    `ARCHIVE_PATH=${CONTAINER_ARCHIVE_PATH}`,
+    "-e",
+    "ARCHIVE_WORKER_XVFB=1",
+    "-v",
+    `${archivePath}:${CONTAINER_ARCHIVE_PATH}`
+  ];
+
+  if (inputMount) {
+    runArgs.push("-v", `${inputMount.host}:${inputMount.container}:ro`);
+  }
+  if (options.network) {
+    runArgs.push("--network", options.network);
+  }
+  if (options.name) {
+    runArgs.push("--name", options.name);
+  }
+  if (options.vnc === true) {
+    const hostPort = String(options.vncPort || 5901);
+    runArgs.push(
+      "-e",
+      "ARCHIVE_WORKER_VNC=1",
+      "-p",
+      `${hostPort}:5900`
+    );
+  }
+
+  runArgs.push(image, "archive", containerInput, "--json");
+  if (options.id) {
+    runArgs.push("--id", options.id);
+  }
+
+  const worker = await runCapture(runtime, runArgs, { reject: false });
+  let parsed;
+  try {
+    parsed = parseWorkerJson(worker.stdout);
+  } catch (error) {
+    if (worker.code !== 0) {
+      const message = worker.stderr.trim() || error.message;
+      const workerError = new Error(message);
+      workerError.worker = worker;
+      throw workerError;
+    }
+    throw error;
+  }
+  if (worker.code !== 0 || parsed.ok === false) {
+    const message = parsed.error || worker.stderr.trim() || `Archive worker exited with ${worker.code}`;
+    const error = new Error(message);
+    error.worker = worker;
+    error.result = parsed;
+    throw error;
+  }
+
+  return mapContainerResult(parsed, {
+    runtime,
+    image,
+    archivePath
+  });
+}
+
+/**
+ * Pick the first available container runtime, preferring podman over docker.
+ *
+ * @returns {Promise} Resolves to "podman" or "docker".
+ * @throws {Error} When neither responds to `--version`.
+ */
+export async function detectContainerRuntime() {
+  for (const runtime of ["podman", "docker"]) {
+    if (await commandExists(runtime)) {
+      return runtime;
+    }
+  }
+  throw new Error("Neither podman nor docker is available on PATH");
+}
+
+/**
+ * Check whether an image is known to the given runtime.
+ *
+ * @param {string} runtime "podman" or "docker".
+ * @param {string} image Image reference.
+ * @returns {Promise} Resolves to true when the lookup command exits with 0.
+ */
+export async function imageExists(runtime, image) {
+  const args = runtime === "podman"
+    ? ["image", "exists", image]
+    : ["image", "inspect", image];
+  const result = await runCapture(runtime, args, { reject: false });
+  return result.code === 0;
+}
+
+/**
+ * Build the worker image by running `build -t image context` on the runtime.
+ *
+ * @param {object} [options]
+ * @param {string} [options.runtime] Defaults to the detected runtime.
+ * @param {string} [options.image] Tag to build; defaults to DEFAULT_IMAGE.
+ * @param {string} [options.context] Build context; defaults to the cwd.
+ * @returns {Promise} Rejects when the build exits non-zero.
+ */
+export async function buildWorkerImage({ runtime, image = DEFAULT_IMAGE, context = process.cwd() } = {}) {
+  const selectedRuntime = runtime || await detectContainerRuntime();
+  await runInherited(selectedRuntime, ["build", "-t", image, context]);
+}
+
+/**
+ * Translate the worker's in-container file path to the host archive directory.
+ * The original container paths are kept under `container`.
+ */
+function mapContainerResult(result, { runtime, image, archivePath }) {
+  const containerFilePath = result.filePath;
+  let filePath = containerFilePath;
+  if (containerFilePath?.startsWith(`${CONTAINER_ARCHIVE_PATH}/`)) {
+    // Container paths are POSIX regardless of the host platform.
+    filePath = path.join(archivePath, path.posix.relative(CONTAINER_ARCHIVE_PATH, containerFilePath));
+  }
+
+  return {
+    ...result,
+    filePath,
+    archivePath,
+    container: {
+      runtime,
+      image,
+      filePath: containerFilePath,
+      archivePath: CONTAINER_ARCHIVE_PATH
+    }
+  };
+}
+
+/**
+ * Decide how the input reaches the container: URLs and non-file inputs are
+ * passed through; an existing file has its directory mounted at /input.
+ */
+async function resolveContainerInput(input) {
+  if (isUrlLike(input)) {
+    return { containerInput: input, inputMount: null };
+  }
+
+  const absolute = path.resolve(input);
+  const stat = await fs.stat(absolute).catch(() => null);
+  if (!stat?.isFile()) {
+    return { containerInput: input, inputMount: null };
+  }
+
+  return {
+    containerInput: path.posix.join(CONTAINER_INPUT_PATH, path.basename(absolute)),
+    inputMount: {
+      host: path.dirname(absolute),
+      container: CONTAINER_INPUT_PATH
+    }
+  };
+}
+
+/** True when the value starts with a URI scheme such as `https:`. */
+function isUrlLike(value) {
+  return /^[a-z][a-z0-9+.-]*:/i.test(value);
+}
+
+/** True when `command --version` can be spawned and exits with 0. */
+async function commandExists(command) {
+  const result = await runCapture(command, ["--version"], { reject: false });
+  return result.code === 0;
+}
+
+/**
+ * Extract the worker's JSON result: the last stdout line that starts with
+ * `{` and parses as JSON. Throws when stdout is empty or has no such line.
+ */
+function parseWorkerJson(stdout) {
+  const trimmed = stdout.trim();
+  if (!trimmed) {
+    throw new Error("Archive worker produced no JSON output");
+  }
+
+  const lines = trimmed.split(/\r?\n/).reverse();
+  for (const line of lines) {
+    const candidate = line.trim();
+    if (!candidate.startsWith("{")) {
+      continue;
+    }
+    try {
+      return JSON.parse(candidate);
+    } catch {
+      // Keep looking; earlier lines may contain log output.
+    }
+  }
+
+  throw new Error(`Archive worker output did not include JSON: ${trimmed.slice(0, 500)}`);
+}
+
+/**
+ * Spawn a command and collect its stdout/stderr as strings.
+ * With `options.reject === false` the promise always resolves with
+ * `{ code, stdout, stderr }` (code 127 when the spawn itself fails);
+ * otherwise a spawn error or non-zero exit rejects.
+ */
+function runCapture(command, args, options = {}) {
+  return new Promise((resolve, reject) => {
+    const child = spawn(command, args, {
+      cwd: options.cwd,
+      env: options.env || process.env,
+      stdio: ["ignore", "pipe", "pipe"]
+    });
+    let stdout = "";
+    let stderr = "";
+
+    // Decode as UTF-8 streams so multi-byte characters split across chunks survive.
+    child.stdout.setEncoding("utf8");
+    child.stderr.setEncoding("utf8");
+    child.stdout.on("data", (chunk) => {
+      stdout += chunk;
+    });
+    child.stderr.on("data", (chunk) => {
+      stderr += chunk;
+    });
+    child.on("error", (error) => {
+      if (options.reject === false) {
+        resolve({ code: 127, stdout, stderr: error.message });
+      } else {
+        reject(error);
+      }
+    });
+    child.on("close", (code) => {
+      const result = { code, stdout, stderr };
+      if (code !== 0 && options.reject !== false) {
+        const error = new Error(stderr.trim() || `${command} exited with ${code}`);
+        error.result = result;
+        reject(error);
+      } else {
+        resolve(result);
+      }
+    });
+  });
+}
+
+/** Spawn a command with inherited stdio; resolves on exit code 0, rejects otherwise. */
+function runInherited(command, args, options = {}) {
+  return new Promise((resolve, reject) => {
+    const child = spawn(command, args, {
+      cwd: options.cwd,
+      env: options.env || process.env,
+      stdio: "inherit"
+    });
+    child.on("error", reject);
+    child.on("close", (code) => {
+      if (code === 0) {
+        resolve();
+      } else {
+        reject(new Error(`${command} exited with ${code}`));
+      }
+    });
+  });
+}
+
+/**
+ * Parse `argv` into `{ command, positional, ...flags }`.
+ * Supports `--flag`, `--flag value`, `--flag=value` and `--no-flag`.
+ */
+function parseArgs(argv) {
+  const args = {
+    command: argv[2],
+    positional: []
+  };
+  for (let i = 3; i < argv.length; i += 1) {
+    const arg = argv[i];
+    if (!arg.startsWith("--")) {
+      args.positional.push(arg);
+      continue;
+    }
+    // Split on the first "=" only; split("=", 2) would drop the rest of a value containing "=".
+    const eq = arg.indexOf("=");
+    const key = eq === -1 ? arg.slice(2) : arg.slice(2, eq);
+    const inlineValue = eq === -1 ? undefined : arg.slice(eq + 1);
+    if (key.startsWith("no-")) {
+      args[key.slice(3)] = false;
+    } else if (inlineValue !== undefined) {
+      args[key] = inlineValue;
+    } else if (i + 1 < argv.length && !argv[i + 1].startsWith("--")) {
+      args[key] = argv[++i];
+    } else {
+      args[key] = true;
+    }
+  }
+  return args;
+}
+
+/** Print command-line help. */
+function usage() {
+  console.log(`Usage:
+  node src/container-runner.mjs archive [options]
+  node src/container-runner.mjs build [options]
+
+Options:
+  --runtime        Container runtime. Defaults to podman, then docker
+  --image          Worker image. Defaults to ${DEFAULT_IMAGE}
+  --archive-path   Host output directory. Defaults to ARCHIVE_PATH or ${defaultArchivePath()}
+  --id             Output id/file stem
+  --build          Build the worker image before archiving
+  --vnc            Expose x11vnc from the worker for debugging
+  --vnc-port       Host VNC port. Defaults to 5901
+  --network        Optional runtime network to attach
+  --json           Print machine-readable JSON`);
+}
+
+/** CLI entry point: dispatches the `build` and `archive` commands. */
+async function main() {
+  const args = parseArgs(process.argv);
+  if (!args.command || args.command === "help" || args.help) {
+    usage();
+    return;
+  }
+
+  const runtime = args.runtime || await detectContainerRuntime();
+  const image = args.image || DEFAULT_IMAGE;
+
+  if (args.command === "build") {
+    await buildWorkerImage({ runtime, image });
+    return;
+  }
+
+  if (args.command !== "archive") {
+    throw new Error(`Unknown command: ${args.command}`);
+  }
+
+  const input = args.positional[0];
+  if (!input) {
+    usage();
+    process.exitCode = 1;
+    return;
+  }
+
+  const result = await archiveWithContainer(input, {
+    runtime,
+    image,
+    archivePath: args["archive-path"],
+    id: args.id,
+    build: args.build === true,
+    vnc: args.vnc === true,
+    vncPort: args["vnc-port"],
+    network: args.network
+  });
+
+  if (args.json === true) {
+    console.log(JSON.stringify(result));
+    return;
+  }
+
+  console.log(`Archived: ${result.sourceUrl}`);
+  console.log(`Output: ${result.filePath}`);
+  console.log(`Worker: ${result.container.runtime} ${result.container.image}`);
+  if (result.externalAssets.length) {
+    console.log(`External asset references remaining: ${result.externalAssets.length}`);
+    for (const ref of result.externalAssets.slice(0, 20)) {
+      console.log(`  ${ref}`);
+    }
+  } else {
+    console.log("External asset references remaining: 0");
+  }
+  if (result.warnings.length) {
+    console.log(`Warnings: ${result.warnings.length}`);
+    for (const warning of result.warnings.slice(0, 20)) {
+      console.log(`  ${warning}`);
+    }
+  }
+}
+
+// Run the CLI only when executed directly; pathToFileURL handles paths that
+// need escaping (spaces, "#", Windows drives), unlike a "file://" prefix.
+if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
+  main().catch((error) => {
+    console.error(error.message);
+    process.exitCode = 1;
+  });
+}