diff --git a/scripts/archive-worker-entrypoint.sh b/scripts/archive-worker-entrypoint.sh
new file mode 100755
index 0000000..541d88a
--- /dev/null
+++ b/scripts/archive-worker-entrypoint.sh
@@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+# Entrypoint for the archive worker: optionally starts Xvfb and x11vnc, then
+# runs the requested command as a background child and waits for it.
+set -euo pipefail
+
+# Runtime configuration; each value can be overridden through the environment.
+# DISPLAY is exported so processes started below inherit it.
+DISPLAY="${DISPLAY:-:99}"
+ARCHIVE_WORKER_VIEWPORT="${ARCHIVE_WORKER_VIEWPORT:-1366x768x24}"
+ARCHIVE_WORKER_VNC_PORT="${ARCHIVE_WORKER_VNC_PORT:-5900}"
+export DISPLAY
+
+# PIDs of the background processes; each stays empty until that process starts.
+xvfb_pid=""
+vnc_pid=""
+child_pid=""
+
+# Stop every background process this script started, then exit with the
+# status that was current when the trap fired.
+cleanup() {
+  local status=$?
+  # Disarm the traps first: the `exit` below would otherwise fire the EXIT
+  # trap and run this handler a second time after an INT/TERM.
+  trap - EXIT INT TERM
+  local pid
+  # Shutdown order: worker command, then the VNC server, then the X server.
+  for pid in "${child_pid}" "${vnc_pid}" "${xvfb_pid}"; do
+    # Skip processes that were never started or have already exited.
+    if [[ -n "${pid}" ]] && kill -0 "${pid}" 2>/dev/null; then
+      kill "${pid}" 2>/dev/null || true
+    fi
+  done
+  exit "${status}"
+}
+
+# Run cleanup on normal exit as well as on SIGINT/SIGTERM.
+trap cleanup EXIT INT TERM
+
+# Start the virtual X server unless ARCHIVE_WORKER_XVFB is set to "0".
+if [[ "${ARCHIVE_WORKER_XVFB:-1}" != "0" ]]; then
+ # Remove any existing lock file for this display number before starting Xvfb.
+ rm -f "/tmp/.X${DISPLAY#:}-lock"
+ Xvfb "${DISPLAY}" -screen 0 "${ARCHIVE_WORKER_VIEWPORT}" -nolisten tcp >/tmp/archive-worker-xvfb.log 2>&1 &
+ xvfb_pid=$!
+ # Fixed pause after launching Xvfb (ARCHIVE_WORKER_XVFB_DELAY, default 0.5).
+ # NOTE(review): presumably gives the server time to come up; there is no readiness check.
+ sleep "${ARCHIVE_WORKER_XVFB_DELAY:-0.5}"
+fi
+
+# Start x11vnc on the same display only when ARCHIVE_WORKER_VNC is exactly "1".
+# -nopw means no VNC password is configured.
+if [[ "${ARCHIVE_WORKER_VNC:-0}" == "1" ]]; then
+ x11vnc \
+ -display "${DISPLAY}" \
+ -nopw \
+ -forever \
+ -shared \
+ -rfbport "${ARCHIVE_WORKER_VNC_PORT}" \
+ >/tmp/archive-worker-x11vnc.log 2>&1 &
+ vnc_pid=$!
+fi
+
+# With no arguments, default to the "help" command.
+if [[ "$#" -eq 0 ]]; then
+ set -- help
+fi
+
+# "archive" and "help" are routed to the Node CLI; any other first argument
+# is executed as a command exactly as given.
+case "$1" in
+ archive|help)
+ set -- node src/cli.mjs "$@"
+ ;;
+esac
+
+# Run the command in the background and wait on it, recording its PID so
+# cleanup can terminate it.
+"$@" &
+child_pid=$!
+wait "${child_pid}"
diff --git a/src/cli.mjs b/src/cli.mjs
index fb0bede..5d20e3a 100644
--- a/src/cli.mjs
+++ b/src/cli.mjs
@@ -34,7 +34,8 @@ function usage() {
Options:
--archive-path
Output directory. Defaults to ARCHIVE_PATH or ${defaultArchivePath()}
--id Output id/file stem
- --headful Run browser in headful mode (requires display)`);
+ --headful Run browser in headful mode (requires display)
+ --json Print a machine-readable JSON result`);
}
async function main() {
@@ -59,6 +60,11 @@ async function main() {
headless: args.headful === true ? false : undefined
});
+ if (args.json === true) {
+ console.log(JSON.stringify({ ok: true, ...result }));
+ return;
+ }
+
console.log(`Archived: ${result.sourceUrl}`);
console.log(`Output: ${result.filePath}`);
if (result.externalAssets.length) {
@@ -75,9 +81,15 @@ async function main() {
console.log(` ${warning}`);
}
}
+ return result;
}
+const args = parseArgs(process.argv);
main().catch((error) => {
- console.error(error.message);
+ if (args.json === true) {
+ console.log(JSON.stringify({ ok: false, error: error.message }));
+ } else {
+ console.error(error.message);
+ }
process.exitCode = 1;
});
diff --git a/src/container-runner.mjs b/src/container-runner.mjs
new file mode 100755
index 0000000..77e53f4
--- /dev/null
+++ b/src/container-runner.mjs
@@ -0,0 +1,348 @@
+#!/usr/bin/env node
+import fs from "node:fs/promises";
+import path from "node:path";
+import { spawn } from "node:child_process";
+import { pathToFileURL } from "node:url";
+import { defaultArchivePath } from "./asset-inliner.mjs";
+
+// Path inside the container where the host archive directory is mounted.
+const CONTAINER_ARCHIVE_PATH = "/archives";
+// Path inside the container where the directory of a local input file is mounted read-only.
+const CONTAINER_INPUT_PATH = "/input";
+// Image used when the caller names none; overridable through ARCHIVE_WORKER_IMAGE.
+const DEFAULT_IMAGE = process.env.ARCHIVE_WORKER_IMAGE || "local-page-archiver:latest";
+// Value passed to --shm-size when options.shmSize is not set.
+const DEFAULT_SHM_SIZE = "1g";
+
+/**
+ * Archive a URL or a local HTML file by running the worker image in a container.
+ *
+ * @param {string} input URL, or path to a file on the host.
+ * @param {object} [options]
+ * @param {string} [options.runtime] Container CLI to invoke; detected when omitted.
+ * @param {string} [options.image] Worker image; defaults to DEFAULT_IMAGE.
+ * @param {string} [options.archivePath] Host output directory; defaults to defaultArchivePath().
+ * @param {boolean} [options.build] When exactly true, build the image before running.
+ * @param {string} [options.context] Build context; defaults to process.cwd().
+ * @param {string} [options.shmSize] Value for --shm-size; defaults to DEFAULT_SHM_SIZE.
+ * @param {string} [options.network] Passed to --network when set.
+ * @param {string} [options.name] Passed to --name when set.
+ * @param {boolean} [options.vnc] When exactly true, enable VNC in the worker and publish container port 5900.
+ * @param {number|string} [options.vncPort] Host port mapped to the VNC port; defaults to 5901.
+ * @param {string} [options.id] Forwarded to the worker as --id.
+ * @returns {Promise<object>} The worker's JSON result as mapped by mapContainerResult.
+ * @throws {Error} When input is missing, when stdout contains no parseable JSON line,
+ *   when the worker exits non-zero, or when it reports `ok === false`. Errors raised
+ *   after the worker ran carry `worker` ({ code, stdout, stderr }); the last two
+ *   cases also carry `result` (the parsed JSON).
+ */
+export async function archiveWithContainer(input, options = {}) {
+ if (!input) {
+ throw new Error("Missing archive input URL or HTML file path");
+ }
+
+ const runtime = options.runtime || await detectContainerRuntime();
+ const image = options.image || DEFAULT_IMAGE;
+ const archivePath = path.resolve(options.archivePath || defaultArchivePath());
+ // Create the host output directory up front so it exists when it is mounted.
+ await fs.mkdir(archivePath, { recursive: true });
+
+ if (options.build === true) {
+ await buildWorkerImage({ runtime, image, context: options.context || process.cwd() });
+ }
+
+ // A local file is addressed through a read-only mount of its parent directory;
+ // URLs and anything that is not an existing file are passed through unchanged.
+ const { containerInput, inputMount } = await resolveContainerInput(input);
+ const runArgs = [
+ "run",
+ "--rm",
+ "--shm-size",
+ options.shmSize || DEFAULT_SHM_SIZE,
+ "-e",
+ `ARCHIVE_PATH=${CONTAINER_ARCHIVE_PATH}`,
+ "-e",
+ "ARCHIVE_WORKER_XVFB=1",
+ "-v",
+ `${archivePath}:${CONTAINER_ARCHIVE_PATH}`
+ ];
+
+ if (inputMount) {
+ runArgs.push("-v", `${inputMount.host}:${inputMount.container}:ro`);
+ }
+ if (options.network) {
+ runArgs.push("--network", options.network);
+ }
+ if (options.name) {
+ runArgs.push("--name", options.name);
+ }
+ if (options.vnc === true) {
+ const hostPort = String(options.vncPort || 5901);
+ runArgs.push(
+ "-e",
+ "ARCHIVE_WORKER_VNC=1",
+ "-p",
+ `${hostPort}:5900`
+ );
+ }
+
+ // Runtime flags must precede the image name; what follows it is the command
+ // handed to the container.
+ runArgs.push(image, "archive", containerInput, "--json");
+ if (options.id) {
+ runArgs.push("--id", options.id);
+ }
+
+ // reject: false so a non-zero exit is inspected here instead of being thrown by runCapture.
+ const worker = await runCapture(runtime, runArgs, { reject: false });
+ let parsed;
+ try {
+ parsed = parseWorkerJson(worker.stdout);
+ } catch (error) {
+ // No JSON on stdout: when the worker also failed, prefer its stderr text.
+ if (worker.code !== 0) {
+ const message = worker.stderr.trim() || error.message;
+ const workerError = new Error(message);
+ workerError.worker = worker;
+ throw workerError;
+ }
+ throw error;
+ }
+ // JSON was produced but the run failed: message precedence is the worker's
+ // own error field, then stderr, then a generic exit-code message.
+ if (worker.code !== 0 || parsed.ok === false) {
+ const message = parsed.error || worker.stderr.trim() || `Archive worker exited with ${worker.code}`;
+ const error = new Error(message);
+ error.worker = worker;
+ error.result = parsed;
+ throw error;
+ }
+
+ return mapContainerResult(parsed, {
+ runtime,
+ image,
+ archivePath
+ });
+}
+
+/**
+ * Choose the container CLI to use, preferring podman over docker.
+ *
+ * @returns {Promise<string>} "podman" or "docker".
+ * @throws {Error} When neither command is available.
+ */
+export async function detectContainerRuntime() {
+  // Probed one after the other so docker is only checked when podman is absent.
+  if (await commandExists("podman")) {
+    return "podman";
+  }
+  if (await commandExists("docker")) {
+    return "docker";
+  }
+  throw new Error("Neither podman nor docker is available on PATH");
+}
+
+/**
+ * Report whether the given image is known to the container runtime.
+ *
+ * @param {string} runtime Container CLI; "podman" uses `image exists`, anything else `image inspect`.
+ * @param {string} image Image reference to look up.
+ * @returns {Promise<boolean>} True when the lookup command exits with code 0.
+ */
+export async function imageExists(runtime, image) {
+  const subcommand = runtime === "podman" ? "exists" : "inspect";
+  const { code } = await runCapture(runtime, ["image", subcommand, image], { reject: false });
+  return code === 0;
+}
+
+/**
+ * Build the worker image, streaming the runtime's output to this process's stdio.
+ *
+ * @param {object} [params]
+ * @param {string} [params.runtime] Container CLI; detected when omitted.
+ * @param {string} [params.image] Tag for the built image; defaults to DEFAULT_IMAGE.
+ * @param {string} [params.context] Build context directory; defaults to process.cwd().
+ * @returns {Promise<void>} Resolves when the build exits with code 0; rejects otherwise.
+ */
+export async function buildWorkerImage({ runtime, image = DEFAULT_IMAGE, context = process.cwd() } = {}) {
+  let selectedRuntime = runtime;
+  if (!selectedRuntime) {
+    selectedRuntime = await detectContainerRuntime();
+  }
+  const buildArgs = ["build", "-t", image, context];
+  await runInherited(selectedRuntime, buildArgs);
+}
+
+/**
+ * Translate a worker result from container paths to host paths.
+ *
+ * @param {object} result Parsed worker JSON; `filePath` is the path inside the container.
+ * @param {object} context
+ * @param {string} context.runtime Container CLI that ran the worker.
+ * @param {string} context.image Image that ran the worker.
+ * @param {string} context.archivePath Host directory mounted at CONTAINER_ARCHIVE_PATH.
+ * @returns {object} The result with `filePath`/`archivePath` on the host and the
+ *   container-side values kept under `container`.
+ */
+function mapContainerResult(result, { runtime, image, archivePath }) {
+  const containerFilePath = result.filePath;
+  // Only paths below the mounted archive directory are rewritten; anything
+  // else (including a missing filePath) is passed through untouched.
+  const insideArchive = containerFilePath?.startsWith(`${CONTAINER_ARCHIVE_PATH}/`);
+  const filePath = insideArchive
+    ? path.join(archivePath, path.relative(CONTAINER_ARCHIVE_PATH, containerFilePath))
+    : containerFilePath;
+
+  const container = {
+    runtime,
+    image,
+    filePath: containerFilePath,
+    archivePath: CONTAINER_ARCHIVE_PATH
+  };
+
+  return { ...result, filePath, archivePath, container };
+}
+
+/**
+ * Work out how the worker should address the input and whether a mount is needed.
+ *
+ * @param {string} input URL or host path given by the caller.
+ * @returns {Promise<{containerInput: string, inputMount: ({host: string, container: string}|null)}>}
+ *   For an existing regular file: its path under CONTAINER_INPUT_PATH plus a mount
+ *   of its parent directory. Otherwise the input unchanged and no mount.
+ */
+async function resolveContainerInput(input) {
+  if (isUrlLike(input)) {
+    return { containerInput: input, inputMount: null };
+  }
+
+  const hostFile = path.resolve(input);
+  let stat = null;
+  try {
+    stat = await fs.stat(hostFile);
+  } catch {
+    // Any stat failure is treated as "not a local file".
+    stat = null;
+  }
+  if (!stat?.isFile()) {
+    return { containerInput: input, inputMount: null };
+  }
+
+  const inputMount = {
+    host: path.dirname(hostFile),
+    container: CONTAINER_INPUT_PATH
+  };
+  // Container paths always use forward slashes, whatever the host platform.
+  const containerInput = path.posix.join(CONTAINER_INPUT_PATH, path.basename(hostFile));
+  return { containerInput, inputMount };
+}
+
+/**
+ * Report whether a value starts with a URL scheme such as `https:` or `file:`.
+ *
+ * The scheme must be at least two characters long so that Windows
+ * drive-letter paths (`C:\page.html`, `c:/page.html`) are treated as file
+ * paths rather than URLs.
+ *
+ * @param {string} value Candidate input.
+ * @returns {boolean} True when the value begins with `<scheme>:`.
+ */
+function isUrlLike(value) {
+  return /^[a-z][a-z0-9+.-]+:/i.test(value);
+}
+
+/**
+ * Check whether a command can be run by invoking it with `--version`.
+ *
+ * @param {string} command Executable name to probe.
+ * @returns {Promise<boolean>} True when the probe exits with code 0.
+ */
+async function commandExists(command) {
+  const { code } = await runCapture(command, ["--version"], { reject: false });
+  return code === 0;
+}
+
+/**
+ * Extract the worker's JSON result from its stdout.
+ *
+ * Lines are scanned from last to first; the first line that starts with `{`
+ * and parses as JSON wins, so log lines around the result are tolerated.
+ *
+ * @param {string} stdout Captured worker stdout.
+ * @returns {object} The parsed JSON value.
+ * @throws {Error} When stdout is blank or no line parses as JSON.
+ */
+function parseWorkerJson(stdout) {
+  const output = stdout.trim();
+  if (output.length === 0) {
+    throw new Error("Archive worker produced no JSON output");
+  }
+
+  const rows = output.split(/\r?\n/);
+  for (let index = rows.length - 1; index >= 0; index -= 1) {
+    const text = rows[index].trim();
+    if (text.startsWith("{")) {
+      try {
+        return JSON.parse(text);
+      } catch {
+        // Not valid JSON; an earlier line may still hold the result.
+      }
+    }
+  }
+
+  throw new Error(`Archive worker output did not include JSON: ${output.slice(0, 500)}`);
+}
+
+/**
+ * Run a command and capture its stdout and stderr as UTF-8 text.
+ *
+ * @param {string} command Executable to spawn.
+ * @param {string[]} args Arguments for the executable.
+ * @param {object} [options]
+ * @param {string} [options.cwd] Working directory for the child.
+ * @param {object} [options.env] Environment for the child; defaults to process.env.
+ * @param {boolean} [options.reject] When exactly false, failures resolve instead of rejecting.
+ * @returns {Promise<{code: (number|null), signal?: (string|null), stdout: string, stderr: string}>}
+ *   `code` is 127 with the spawn error message in `stderr` when the process could not
+ *   be started and `reject` is false.
+ * @throws {Error} (as a rejection) on spawn failure or non-zero exit unless `reject`
+ *   is false; exit failures carry the capture in `error.result`.
+ */
+function runCapture(command, args, options = {}) {
+  return new Promise((resolve, reject) => {
+    const child = spawn(command, args, {
+      cwd: options.cwd,
+      env: options.env || process.env,
+      stdio: ["ignore", "pipe", "pipe"]
+    });
+    let stdout = "";
+    let stderr = "";
+
+    // Decode through the stream's own decoder so a multi-byte character split
+    // across two chunks is reassembled instead of turning into U+FFFD.
+    child.stdout.setEncoding("utf8");
+    child.stderr.setEncoding("utf8");
+
+    child.stdout.on("data", (chunk) => {
+      stdout += chunk;
+    });
+    child.stderr.on("data", (chunk) => {
+      stderr += chunk;
+    });
+    child.on("error", (error) => {
+      if (options.reject === false) {
+        resolve({ code: 127, stdout, stderr: error.message });
+      } else {
+        reject(error);
+      }
+    });
+    // A later "close" after "error" is harmless: a promise settles only once.
+    child.on("close", (code, signal) => {
+      const result = { code, signal, stdout, stderr };
+      if (code !== 0 && options.reject !== false) {
+        const error = new Error(stderr.trim() || `${command} exited with ${code}`);
+        error.result = result;
+        reject(error);
+      } else {
+        resolve(result);
+      }
+    });
+  });
+}
+
+/**
+ * Run a command with stdio inherited from this process.
+ *
+ * @param {string} command Executable to spawn.
+ * @param {string[]} args Arguments for the executable.
+ * @param {object} [options]
+ * @param {string} [options.cwd] Working directory for the child.
+ * @param {object} [options.env] Environment for the child; defaults to process.env.
+ * @returns {Promise<void>} Resolves on exit code 0; rejects on spawn failure or any other exit code.
+ */
+function runInherited(command, args, options = {}) {
+  return new Promise((resolve, reject) => {
+    const spawnOptions = {
+      cwd: options.cwd,
+      env: options.env || process.env,
+      stdio: "inherit"
+    };
+    const child = spawn(command, args, spawnOptions);
+    child.on("error", reject);
+    child.on("close", (code) => {
+      if (code !== 0) {
+        reject(new Error(`${command} exited with ${code}`));
+        return;
+      }
+      resolve();
+    });
+  });
+}
+
+/**
+ * Parse `process.argv`-style arguments into a command, positionals and flags.
+ *
+ * Supported forms: `--key value`, `--key=value` (the value may itself contain
+ * `=`), `--flag` (true) and `--no-flag` (sets `flag` to false).
+ *
+ * @param {string[]} argv Full argv; index 2 is the command, parsing starts at index 3.
+ * @returns {{command: (string|undefined), positional: string[]}} Plus one property per parsed flag.
+ */
+function parseArgs(argv) {
+  // Flags that never take a value. Without this, `--json <input>` would
+  // swallow the input as the flag's value and leave no positional argument.
+  const booleanFlags = new Set(["build", "vnc", "json", "help"]);
+  const args = {
+    command: argv[2],
+    positional: []
+  };
+  for (let i = 3; i < argv.length; i += 1) {
+    const arg = argv[i];
+    if (!arg.startsWith("--")) {
+      args.positional.push(arg);
+      continue;
+    }
+    // Split on the FIRST "=" only; String.prototype.split with a limit
+    // would discard everything after a second "=".
+    const separator = arg.indexOf("=");
+    const key = separator === -1 ? arg.slice(2) : arg.slice(2, separator);
+    const inlineValue = separator === -1 ? undefined : arg.slice(separator + 1);
+    if (key.startsWith("no-")) {
+      args[key.slice(3)] = false;
+    } else if (inlineValue !== undefined) {
+      args[key] = inlineValue;
+    } else if (!booleanFlags.has(key) && i + 1 < argv.length && !argv[i + 1].startsWith("--")) {
+      // Value flag: consume the next token as its value.
+      args[key] = argv[++i];
+    } else {
+      args[key] = true;
+    }
+  }
+  return args;
+}
+
+/**
+ * Print the command-line help for the container runner to stdout.
+ * The text interpolates DEFAULT_IMAGE and defaultArchivePath() so the
+ * displayed defaults reflect the current environment.
+ */
+function usage() {
+ console.log(`Usage:
+ node src/container-runner.mjs archive [options]
+ node src/container-runner.mjs build [options]
+
+Options:
+ --runtime Container runtime. Defaults to podman, then docker
+ --image Worker image. Defaults to ${DEFAULT_IMAGE}
+ --archive-path Host output directory. Defaults to ARCHIVE_PATH or ${defaultArchivePath()}
+ --id Output id/file stem
+ --build Build the worker image before archiving
+ --vnc Expose x11vnc from the worker for debugging
+ --vnc-port Host VNC port. Defaults to 5901
+ --network Optional runtime network to attach
+ --json Print machine-readable JSON`);
+}
+
+/**
+ * CLI entry point: dispatches the `build` and `archive` commands.
+ *
+ * Prints usage for a missing command, `help`, or `--help`. For `archive`
+ * without an input it prints usage and sets exit code 1. Any other command
+ * throws. On success it prints either the JSON result (`--json`) or a
+ * human-readable summary.
+ *
+ * @returns {Promise<void>}
+ * @throws {Error} For an unknown command, or whatever runtime detection,
+ *   the build, or archiveWithContainer throws.
+ */
+async function main() {
+ const args = parseArgs(process.argv);
+ if (!args.command || args.command === "help" || args.help) {
+ usage();
+ return;
+ }
+
+ // Resolved before the command is validated, so on a host without
+ // podman/docker an unknown command reports the missing runtime first.
+ const runtime = args.runtime || await detectContainerRuntime();
+ const image = args.image || DEFAULT_IMAGE;
+
+ if (args.command === "build") {
+ await buildWorkerImage({ runtime, image });
+ return;
+ }
+
+ if (args.command !== "archive") {
+ throw new Error(`Unknown command: ${args.command}`);
+ }
+
+ const input = args.positional[0];
+ if (!input) {
+ usage();
+ process.exitCode = 1;
+ return;
+ }
+
+ // Boolean options are only honoured when parsed as exactly true, i.e. the
+ // flag was given without a value.
+ const result = await archiveWithContainer(input, {
+ runtime,
+ image,
+ archivePath: args["archive-path"],
+ id: args.id,
+ build: args.build === true,
+ vnc: args.vnc === true,
+ vncPort: args["vnc-port"],
+ network: args.network
+ });
+
+ if (args.json === true) {
+ console.log(JSON.stringify(result));
+ return;
+ }
+
+ console.log(`Archived: ${result.sourceUrl}`);
+ console.log(`Output: ${result.filePath}`);
+ console.log(`Worker: ${result.container.runtime} ${result.container.image}`);
+ // Lists are capped at 20 entries; the printed count is the full total.
+ if (result.externalAssets.length) {
+ console.log(`External asset references remaining: ${result.externalAssets.length}`);
+ for (const ref of result.externalAssets.slice(0, 20)) {
+ console.log(` ${ref}`);
+ }
+ } else {
+ console.log("External asset references remaining: 0");
+ }
+ if (result.warnings.length) {
+ console.log(`Warnings: ${result.warnings.length}`);
+ for (const warning of result.warnings.slice(0, 20)) {
+ console.log(` ${warning}`);
+ }
+ }
+}
+
+// Run the CLI only when this file is the script Node was started with, not
+// when it is imported as a library. pathToFileURL resolves the script path
+// and applies percent-encoding and platform path rules, which a hand-built
+// `file://${path}` string gets wrong for paths containing spaces or other
+// reserved characters, relative paths, and Windows paths.
+if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
+  main().catch((error) => {
+    console.error(error.message);
+    process.exitCode = 1;
+  });
+}