diff --git a/.dockerignore b/.dockerignore
index 98c743f..0b092dd 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -3,5 +3,8 @@ npm-debug.log
.git
.gitignore
README.md
+AGENTS.md
archives
*.html
+test
+test-*.mjs
diff --git a/.gitignore b/.gitignore
index d570088..0420d38 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,2 @@
node_modules/
-
+archives/
diff --git a/AGENTS.md b/AGENTS.md
index 6239381..1901a19 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -122,35 +122,51 @@ The current init script is minimal and safe — it only overrides the getter via
### Dockerfile
-- Base: `mcr.microsoft.com/playwright:v1.60.0` (must stay in sync with the `playwright` npm version)
-- Installs Node 22 (the base image may ship an older Node)
-- Runs `npx playwright install chromium` so the browser binary is baked into the image
+- Base: `mcr.microsoft.com/playwright:v1.60.0-noble` (must stay in sync with the `playwright` npm version)
+- Installs only the worker runtime helpers that are not part of the Playwright image: `dumb-init`, `xvfb`, and `x11vnc`
+- Uses `/app/scripts/archive-worker-entrypoint.sh` as the entrypoint. The entrypoint starts Xvfb on `$DISPLAY` and then runs `node src/cli.mjs ...` for `archive`/`help` commands.
+- The worker is intended to be ephemeral: one container per archive job, with `/archives` mounted from the host.
+
+### Host-to-worker contract
+
+`src/container-runner.mjs` is the host/backend-facing boundary. It:
+
+1. Picks `podman` or `docker`.
+2. Starts `local-page-archiver:latest` with `/archives` mounted from the host.
+3. Calls the in-container CLI as `archive --json`.
+4. Parses the JSON result and rewrites `/archives/...` paths back to host paths.
+
+This is the integration point a future backend should use instead of shelling out to `podman run` directly.
### `podman-run.sh`
-Helper for local runs. Two modes:
+Helper for local Podman runs. It delegates to `src/container-runner.mjs`.
-1. **`./podman-run.sh archive `** — headless, mounts `./archives`
-2. **`./podman-run.sh headful-archive `** — headful with internal VNC
+1. **`./podman-run.sh build`** — build `local-page-archiver:latest`
+2. **`./podman-run.sh archive <url>`** — run one ephemeral Xvfb/Chromium worker and write to `./archives`
+3. **`./podman-run.sh vnc-archive <url>`** — same worker with x11vnc exposed on `vnc://localhost:5901`
-**Headful mode details:**
-The container's `ENTRYPOINT` is `node src/cli.mjs`. To run a shell command inside the container (setting up Xvfb + x11vnc) we must override the entrypoint:
+The helper builds the image if it is missing. Override with:
-```bash
-podman run --rm --entrypoint sh -c "...setup Xvfb... && node src/cli.mjs archive "
+```sh
+ARCHIVE_WORKER_IMAGE=local-page-archiver:dev ARCHIVE_DIR=/tmp/archives ./podman-run.sh archive https://example.com
```
-Port `5900` inside the container maps to `5901` on the host to avoid conflicts with macOS's built-in VNC.
-
### `docker-compose.yml`
-Includes a `headful` profile that can be run with:
+Compose is mainly a direct worker smoke test. It runs the same image and command shape as the host runner:
```bash
-URL=https://example.com docker compose --profile headful up archiver-headful
+URL=https://example.com docker compose up --build archive-worker
```
-Unlike `podman-run.sh`, Compose currently maps VNC to host port `5900`.
+For visual debugging:
+
+```bash
+URL=https://example.com docker compose --profile debug up --build archive-worker-vnc
+```
+
+Unlike `podman-run.sh`, Compose maps VNC to host port `5900`.
## Known limitations
@@ -201,10 +217,13 @@ node src/cli.mjs archive https://example.com
# Archive a page (headful on macOS)
node src/cli.mjs archive https://example.com --headful
-# Archive inside container (headless)
+# Build worker image
+./podman-run.sh build
+
+# Archive inside an ephemeral Xvfb/Chromium worker
./podman-run.sh archive https://example.com
-# Archive inside container (headful + VNC)
-./podman-run.sh headful-archive https://example.com
+# Archive inside worker + expose VNC for debugging
+./podman-run.sh vnc-archive https://example.com
# Then open vnc://localhost:5901
```
diff --git a/Dockerfile b/Dockerfile
index 8c6c12c..cc49052 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,23 +1,27 @@
-FROM mcr.microsoft.com/playwright:v1.60.0
+FROM mcr.microsoft.com/playwright:v1.60.0-noble
WORKDIR /app
-# Install Node 22+ if not present (Playwright image may have an older Node)
-RUN apt-get update && apt-get install -y curl && \
- curl -fsSL https://deb.nodesource.com/setup_22.x | bash - && \
- apt-get install -y nodejs && \
- apt-get clean && rm -rf /var/lib/apt/lists/*
-
-COPY package.json package-lock.json ./
-RUN npm ci
-
-COPY . .
-RUN npx playwright install chromium
-
-# Default to headless; override with --env HEADFUL=1 and mount X11 socket or use VNC
ENV NODE_ENV=production
ENV ARCHIVE_PATH=/archives
+ENV DISPLAY=:99
+ENV ARCHIVE_WORKER_XVFB=1
+ENV ARCHIVE_WORKER_VIEWPORT=1366x768x24
+
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends \
+ dumb-init \
+ x11vnc \
+ xvfb && \
+ rm -rf /var/lib/apt/lists/*
+
+COPY package.json package-lock.json ./
+RUN npm ci --omit=dev
+
+COPY . .
+RUN mkdir -p /archives && chmod 0777 /archives
VOLUME ["/archives"]
-ENTRYPOINT ["node", "src/cli.mjs"]
+ENTRYPOINT ["dumb-init", "--", "/app/scripts/archive-worker-entrypoint.sh"]
+CMD ["help"]
diff --git a/README.md b/README.md
index a59eb64..ac0fc2b 100644
--- a/README.md
+++ b/README.md
@@ -17,3 +17,37 @@ node src/cli.mjs archive ./page.html
```
Archives are written to `ARCHIVE_PATH`, or to a development directory under the system temp directory when `ARCHIVE_PATH` is not set.
+
+## Ephemeral container worker
+
+The host-facing container boundary is `src/container-runner.mjs`. It starts a short-lived Docker/Podman worker container, mounts the host archive directory at `/archives`, sends one archive request, reads a JSON result, and exits.
+
+Build the worker image:
+
+```sh
+podman build -t local-page-archiver:latest .
+```
+
+Archive through the worker on macOS with Podman:
+
+```sh
+node src/container-runner.mjs archive "https://example.com/article" \
+ --runtime podman \
+ --image local-page-archiver:latest \
+ --archive-path ./archives
+```
+
+The convenience wrapper does the same thing and builds the image if missing:
+
+```sh
+./podman-run.sh archive "https://example.com/article"
+```
+
+For visual debugging, expose VNC from the worker:
+
+```sh
+./podman-run.sh vnc-archive "https://example.com/article"
+# Then open vnc://localhost:5901
+```
+
+The worker image starts Xvfb internally, so callers do not need to mount the host X11 socket or override the entrypoint.
diff --git a/docker-compose.yml b/docker-compose.yml
index d26f8f9..b311e59 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,33 +1,26 @@
-version: "3.8"
-
services:
- archiver:
+ archive-worker:
build: .
+ image: local-page-archiver:latest
environment:
- - ARCHIVE_PATH=/archives
- - DISPLAY=:99
+ ARCHIVE_PATH: /archives
+ ARCHIVE_WORKER_XVFB: "1"
volumes:
- ./archives:/archives
- # For headful testing, uncomment the line below and run with --profile=headful
- # - /tmp/.X11-unix:/tmp/.X11-unix:rw
- command: ["archive", "--help"]
+ shm_size: 1gb
+ command: ["archive", "${URL:?Set URL=https://example.com}", "--json"]
- # Headful profile: runs a VNC server so you can watch the browser
- archiver-headful:
- profiles: ["headful"]
+ archive-worker-vnc:
+ profiles: ["debug"]
build: .
+ image: local-page-archiver:latest
environment:
- - ARCHIVE_PATH=/archives
- - DISPLAY=:99
+ ARCHIVE_PATH: /archives
+ ARCHIVE_WORKER_XVFB: "1"
+ ARCHIVE_WORKER_VNC: "1"
volumes:
- ./archives:/archives
ports:
- "5900:5900"
- command: >
- sh -c "
- apt-get update && apt-get install -y x11vnc xvfb &&
- Xvfb :99 -screen 0 1366x768x24 &
- x11vnc -display :99 -nopw -forever &
- sleep 2 &&
- node src/cli.mjs archive $$URL
- "
+ shm_size: 1gb
+ command: ["archive", "${URL:?Set URL=https://example.com}", "--json"]
diff --git a/package.json b/package.json
index 643bde5..6002cfd 100644
--- a/package.json
+++ b/package.json
@@ -5,10 +5,13 @@
"type": "module",
"description": "Render and save self-contained HTML archives.",
"bin": {
- "archive-page": "./src/cli.mjs"
+ "archive-page": "./src/cli.mjs",
+ "archive-page-container": "./src/container-runner.mjs"
},
"scripts": {
"archive": "node src/cli.mjs archive",
+ "container:archive": "node src/container-runner.mjs archive",
+ "container:build": "node src/container-runner.mjs build",
"test": "node --test test/*.test.mjs",
"install-browsers": "playwright install chromium"
},
diff --git a/podman-run.sh b/podman-run.sh
index f792cbe..923bf9a 100755
--- a/podman-run.sh
+++ b/podman-run.sh
@@ -1,61 +1,66 @@
-#!/bin/bash
+#!/usr/bin/env bash
set -euo pipefail
-# Podman helper for local-page-archiver with headful Chromium support.
+# Podman helper for local-page-archiver worker-container runs.
# Usage:
-# ./podman-run.sh archive [options]
-# ./podman-run.sh headful-archive [options]
+# ./podman-run.sh build
+# ./podman-run.sh archive <url> [archive CLI options]
+# ./podman-run.sh vnc-archive <url> [archive CLI options]
-IMAGE_NAME="local-page-archiver"
+IMAGE_NAME="${ARCHIVE_WORKER_IMAGE:-local-page-archiver:latest}"
ARCHIVE_DIR="${ARCHIVE_DIR:-$(pwd)/archives}"
build_image() {
- echo "Building ${IMAGE_NAME}..."
podman build -t "${IMAGE_NAME}" .
}
-run_headless() {
- mkdir -p "${ARCHIVE_DIR}"
- podman run --rm \
- -e "ARCHIVE_PATH=/archives" \
- -v "${ARCHIVE_DIR}:/archives:Z" \
- "${IMAGE_NAME}" \
- "$@"
+ensure_image() { # Build the worker image only when podman reports that ${IMAGE_NAME} does not exist yet.
+ if ! podman image exists "${IMAGE_NAME}"; then
+ build_image
+ fi
+}
-run_headful() {
- mkdir -p "${ARCHIVE_DIR}"
- podman run --rm \
- --entrypoint sh \
- -e "ARCHIVE_PATH=/archives" \
- -e "DISPLAY=:99" \
- -v "${ARCHIVE_DIR}:/archives:Z" \
- -p "5901:5900" \
- "${IMAGE_NAME}" \
- -c "
- apt-get update -qq && apt-get install -y -qq x11vnc xvfb >/dev/null 2>&1 &&
- Xvfb :99 -screen 0 1366x768x24 >/dev/null 2>&1 &
- x11vnc -display :99 -nopw -forever >/dev/null 2>&1 &
- sleep 2 &&
- node src/cli.mjs $(printf '%q ' "$@")
- "
-}
-
-if ! podman image exists "${IMAGE_NAME}"; then
- build_image
-fi
-
case "${1:-}" in
- headful-archive)
- shift
- # Prepend 'archive' so the user doesn't have to type it twice
- set -- archive "$@"
- run_headful "$@"
+ build)
+ build_image
;;
- archive|help)
- run_headless "$@"
+ archive)
+ shift
+ ensure_image
+ node src/container-runner.mjs archive \
+ --runtime podman \
+ --image "${IMAGE_NAME}" \
+ --archive-path "${ARCHIVE_DIR}" \
+ "$@"
+ ;;
+ vnc-archive|headful-archive)
+ shift
+ ensure_image
+ node src/container-runner.mjs archive \
+ --runtime podman \
+ --image "${IMAGE_NAME}" \
+ --archive-path "${ARCHIVE_DIR}" \
+ --vnc \
+ "$@"
+ ;;
+ help|"")
+ cat <<EOF
+Usage:
+ ./podman-run.sh build
+ ./podman-run.sh archive <url> [archive CLI options]
+ ./podman-run.sh vnc-archive <url> [archive CLI options]
+
+Environment:
+ ARCHIVE_WORKER_IMAGE=${IMAGE_NAME}
+ ARCHIVE_DIR=${ARCHIVE_DIR}
+EOF
;;
*)
- run_headless "$@"
+ ensure_image
+ node src/container-runner.mjs archive \
+ --runtime podman \
+ --image "${IMAGE_NAME}" \
+ --archive-path "${ARCHIVE_DIR}" \
+ "$@"
;;
esac
diff --git a/src/archiver.mjs b/src/archiver.mjs
index 9290fcc..5592c84 100644
--- a/src/archiver.mjs
+++ b/src/archiver.mjs
@@ -9,6 +9,7 @@ import {
findEffectiveBase,
inputToUrl,
isHttpUrl,
+ splitSrcset,
slugForUrl
} from "./asset-inliner.mjs";
@@ -762,6 +763,7 @@ export async function renderPage(sourceUrl, options = {}) {
await waitForNetworkIdle(page);
await snapshotLoadedResourceUrls(page);
+ await snapshotRuntimeStyles(page);
return await page.content();
} finally {
@@ -807,6 +809,44 @@ async function snapshotLoadedResourceUrls(page) {
});
}
+async function snapshotRuntimeStyles(page) { // Write stylesheet rule text held in the CSSOM back into DOM <style> elements of the page.
+ await page.evaluate(() => { // the callback below uses browser globals (document, HTMLStyleElement)
+ const serializeRules = (sheet) => { // returns the sheet's rules as newline-joined cssText, or "" on failure
+ try {
+ return Array.from(sheet.cssRules || [])
+ .map((rule) => rule.cssText)
+ .join("\n");
+ } catch { // reading cssRules threw (presumably a cross-origin sheet - TODO confirm); treat the sheet as empty
+ return "";
+ }
+ };
+
+ for (const sheet of Array.from(document.styleSheets)) {
+ const css = serializeRules(sheet);
+ if (!css.trim()) { // no serializable rules: nothing to write back
+ continue;
+ }
+
+ const owner = sheet.ownerNode;
+ if (owner instanceof HTMLStyleElement && !owner.textContent.trim()) { // only fill <style> elements whose own text is blank; others are left untouched
+ owner.textContent = css;
+ }
+ }
+
+ const adoptedStyleSheets = Array.from(document.adoptedStyleSheets || []); // empty list when the property is missing
+ adoptedStyleSheets.forEach((sheet, index) => { // each non-empty adopted sheet becomes a new <style> appended to <head>
+ const css = serializeRules(sheet);
+ if (!css.trim()) {
+ return;
+ }
+ const style = document.createElement("style");
+ style.setAttribute("data-archiver-adopted-stylesheet", String(index)); // marker attribute records the sheet's index in adoptedStyleSheets
+ style.textContent = css;
+ document.head.appendChild(style);
+ });
+ });
+}
+
function addArchiveComment(html, sourceUrl) {
const safeSource = String(sourceUrl).replaceAll("--", "- -");
const comment = ``;
@@ -823,7 +863,14 @@ export function findExternalAssetRefs(html) {
const tag = match[0];
for (const attr of ["src", "srcset", "poster", "data"]) {
const value = readAttribute(tag, attr);
- if (!value || isSelfContainedAssetRef(value)) {
+ if (!value) {
+ continue;
+ }
+ if (attr === "srcset") {
+ addSrcsetRefs(refs, value);
+ continue;
+ }
+ if (isSelfContainedAssetRef(value)) {
continue;
}
for (const part of value.split(",")) {
@@ -846,6 +893,10 @@ export function findExternalAssetRefs(html) {
if (href && !isSelfContainedAssetRef(href)) {
refs.add(href);
}
+ const imageSrcset = readAttribute(tag, "imagesrcset");
+ if (imageSrcset) {
+ addSrcsetRefs(refs, imageSrcset);
+ }
}
const cssUrlPattern = /url\(\s*(["']?)([^"')]+)\1\s*\)/gi;
@@ -859,6 +910,15 @@ export function findExternalAssetRefs(html) {
return Array.from(refs).sort();
}
+function addSrcsetRefs(refs, srcset) { // Add the URL of every srcset candidate to refs (via refs.add) unless it is self-contained.
+ for (const part of splitSrcset(srcset)) { // one iteration per candidate returned by splitSrcset
+ const candidate = part.trim().split(/\s+/)[0];
+ if (candidate && !isSelfContainedAssetRef(candidate)) { // candidate is the entry's first whitespace-separated token; skip empty ones and those isSelfContainedAssetRef accepts
+ refs.add(candidate);
+ }
+ }
+}
+
function isSelfContainedAssetRef(value) {
const trimmed = cleanCssUrl(value);
return (
diff --git a/src/asset-inliner.mjs b/src/asset-inliner.mjs
index 98da69a..a9eefbb 100644
--- a/src/asset-inliner.mjs
+++ b/src/asset-inliner.mjs
@@ -91,6 +91,7 @@ export function resolveUrl(rawUrl, baseUrl) {
const trimmed = htmlDecode(rawUrl.trim());
if (
!trimmed ||
+ /^(?:undefined|null)$/i.test(trimmed) ||
trimmed.startsWith("#") ||
/^%23/i.test(trimmed) ||
/^(?:about|blob|data|javascript|mailto|tel):/i.test(trimmed)
@@ -233,11 +234,19 @@ export class AssetInliner {
if (/\bpreload\b/i.test(rel) && /^style$/i.test(asValue)) {
return "";
}
+ let output = tag;
+ if (/\bpreload\b/i.test(rel) && /^image$/i.test(asValue)) {
+ const imageSrcset = getAttribute(output, "imagesrcset");
+ if (imageSrcset) {
+ output = setAttribute(output, "imagesrcset", await this.inlineSrcset(imageSrcset, baseUrl));
+ }
+ }
+
const dataUri = await this.toDataUri(href, baseUrl);
if (!dataUri) {
return "";
}
- return setAttribute(tag, "href", dataUri);
+ return setAttribute(output, "href", dataUri);
}
async rewriteMediaAttributes(tag, baseUrl) {
@@ -293,8 +302,7 @@ export class AssetInliner {
}
async inlineSrcset(value, baseUrl) {
- const candidates = value
- .split(",")
+ const candidates = splitSrcset(value)
.map((part) => part.trim())
.filter(Boolean);
const rewritten = [];
@@ -425,6 +433,75 @@ export class AssetInliner {
}
}
+export function splitSrcset(value) { // Split a srcset string into trimmed, non-empty candidate strings; commas inside quotes or parentheses never split.
+ const candidates = [];
+ let current = ""; // text of the candidate being accumulated
+ let quote = ""; // the open quote character, or "" when not inside quotes
+ let parenDepth = 0; // number of currently unclosed "(" seen outside quotes
+
+ for (let index = 0; index < value.length; index += 1) {
+ const ch = value[index];
+
+ if (quote) { // inside quotes: copy verbatim until the same quote character closes it
+ current += ch;
+ if (ch === quote) {
+ quote = "";
+ }
+ continue;
+ }
+
+ if (ch === '"' || ch === "'") {
+ quote = ch;
+ current += ch;
+ continue;
+ }
+
+ if (ch === "(") {
+ parenDepth += 1;
+ current += ch;
+ continue;
+ }
+
+ if (ch === ")" && parenDepth > 0) { // a ")" seen at depth 0 skips this branch and is appended as ordinary text below
+ parenDepth -= 1;
+ current += ch;
+ continue;
+ }
+
+ if (ch === "," && parenDepth === 0 && isSrcsetCandidateSeparator(value, index, current)) { // other commas are kept as part of the current candidate
+ if (current.trim()) {
+ candidates.push(current.trim());
+ }
+ current = ""; // start the next candidate; the loop below skips whitespace right after the comma
+ while (index + 1 < value.length && /\s/.test(value[index + 1])) {
+ index += 1;
+ }
+ continue;
+ }
+
+ current += ch;
+ }
+
+ if (current.trim()) { // flush the final candidate, if any
+ candidates.push(current.trim());
+ }
+ return candidates;
+}
+
+function isSrcsetCandidateSeparator(value, commaIndex, currentCandidate) { // Decide whether the comma at commaIndex ends the current candidate (true) or is kept as text (false).
+ const after = value.slice(commaIndex + 1); // everything following the comma
+ if (!after.trim()) { // only whitespace (or nothing) follows: treat as a separator
+ return true;
+ }
+ if (/^\s/.test(after)) {
+ return true; // a comma directly followed by whitespace is a separator
+ }
+
+ const tokens = currentCandidate.trim().split(/\s+/);
+ const descriptor = tokens.at(-1) || ""; // last whitespace-separated token so far; the result is true only if it is a number followed by w, x or h (e.g. 640w, 1.5x)
+ return /^(?:\d+(?:\.\d+)?[wxh]|\d+(?:\.\d+)?x)$/i.test(descriptor);
+}
+
function mimeFromUrl(rawUrl) {
let pathname = rawUrl;
try {