From 40c63dc4e22182147437def3dcd5a7a7171245ba Mon Sep 17 00:00:00 2001 From: James Magahern Date: Sat, 16 May 2026 16:05:32 -0700 Subject: [PATCH] Fixes for inline css (CSSOM) --- .dockerignore | 3 ++ .gitignore | 2 +- AGENTS.md | 55 +++++++++++++++++--------- Dockerfile | 34 +++++++++------- README.md | 34 ++++++++++++++++ docker-compose.yml | 35 +++++++---------- package.json | 5 ++- podman-run.sh | 91 +++++++++++++++++++++++-------------------- src/archiver.mjs | 62 ++++++++++++++++++++++++++++- src/asset-inliner.mjs | 83 +++++++++++++++++++++++++++++++++++++-- 10 files changed, 301 insertions(+), 103 deletions(-) diff --git a/.dockerignore b/.dockerignore index 98c743f..0b092dd 100644 --- a/.dockerignore +++ b/.dockerignore @@ -3,5 +3,8 @@ npm-debug.log .git .gitignore README.md +AGENTS.md archives *.html +test +test-*.mjs diff --git a/.gitignore b/.gitignore index d570088..0420d38 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,2 @@ node_modules/ - +archives/ diff --git a/AGENTS.md b/AGENTS.md index 6239381..1901a19 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -122,35 +122,51 @@ The current init script is minimal and safe — it only overrides the getter via ### Dockerfile -- Base: `mcr.microsoft.com/playwright:v1.60.0` (must stay in sync with the `playwright` npm version) -- Installs Node 22 (the base image may ship an older Node) -- Runs `npx playwright install chromium` so the browser binary is baked into the image +- Base: `mcr.microsoft.com/playwright:v1.60.0-noble` (must stay in sync with the `playwright` npm version) +- Installs only the worker runtime helpers that are not part of the Playwright image: `dumb-init`, `xvfb`, and `x11vnc` +- Uses `/app/scripts/archive-worker-entrypoint.sh` as the entrypoint. The entrypoint starts Xvfb on `$DISPLAY` and then runs `node src/cli.mjs ...` for `archive`/`help` commands. +- The worker is intended to be ephemeral: one container per archive job, with `/archives` mounted from the host. + +### Host-to-worker contract + +`src/container-runner.mjs` is the host/backend-facing boundary. It: + +1. Picks `podman` or `docker`. +2. Starts `local-page-archiver:latest` with `/archives` mounted from the host. +3. Calls the in-container CLI as `archive --json`. +4. Parses the JSON result and rewrites `/archives/...` paths back to host paths. + +This is the integration point a future backend should use instead of shelling out to `podman run` directly. ### `podman-run.sh` -Helper for local runs. Two modes: +Helper for local Podman runs. It delegates to `src/container-runner.mjs`. -1. **`./podman-run.sh archive `** — headless, mounts `./archives` -2. **`./podman-run.sh headful-archive `** — headful with internal VNC +1. **`./podman-run.sh build`** — build `local-page-archiver:latest` +2. **`./podman-run.sh archive `** — run one ephemeral Xvfb/Chromium worker and write to `./archives` +3. **`./podman-run.sh vnc-archive `** — same worker with x11vnc exposed on `vnc://localhost:5901` -**Headful mode details:** -The container's `ENTRYPOINT` is `node src/cli.mjs`. To run a shell command inside the container (setting up Xvfb + x11vnc) we must override the entrypoint: +The helper builds the image if it is missing. Override with: -```bash -podman run --rm --entrypoint sh -c "...setup Xvfb... && node src/cli.mjs archive " +```sh +ARCHIVE_WORKER_IMAGE=local-page-archiver:dev ARCHIVE_DIR=/tmp/archives ./podman-run.sh archive https://example.com ``` -Port `5900` inside the container maps to `5901` on the host to avoid conflicts with macOS's built-in VNC. - ### `docker-compose.yml` -Includes a `headful` profile that can be run with: +Compose is mainly a direct worker smoke test. It runs the same image and command shape as the host runner: ```bash -URL=https://example.com docker compose --profile headful up archiver-headful +URL=https://example.com docker compose up --build archive-worker ``` -Unlike `podman-run.sh`, Compose currently maps VNC to host port `5900`. +For visual debugging: + +```bash +URL=https://example.com docker compose --profile debug up --build archive-worker-vnc +``` + +Unlike `podman-run.sh`, Compose maps VNC to host port `5900`. ## Known limitations @@ -201,10 +217,13 @@ node src/cli.mjs archive https://example.com # Archive a page (headful on macOS) node src/cli.mjs archive https://example.com --headful -# Archive inside container (headless) +# Build worker image +./podman-run.sh build + +# Archive inside an ephemeral Xvfb/Chromium worker ./podman-run.sh archive https://example.com -# Archive inside container (headful + VNC) -./podman-run.sh headful-archive https://example.com +# Archive inside worker + expose VNC for debugging +./podman-run.sh vnc-archive https://example.com # Then open vnc://localhost:5901 ``` diff --git a/Dockerfile b/Dockerfile index 8c6c12c..cc49052 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,23 +1,27 @@ -FROM mcr.microsoft.com/playwright:v1.60.0 +FROM mcr.microsoft.com/playwright:v1.60.0-noble WORKDIR /app -# Install Node 22+ if not present (Playwright image may have an older Node) -RUN apt-get update && apt-get install -y curl && \ - curl -fsSL https://deb.nodesource.com/setup_22.x | bash - && \ - apt-get install -y nodejs && \ - apt-get clean && rm -rf /var/lib/apt/lists/* - -COPY package.json package-lock.json ./ -RUN npm ci - -COPY . . -RUN npx playwright install chromium - -# Default to headless; override with --env HEADFUL=1 and mount X11 socket or use VNC ENV NODE_ENV=production ENV ARCHIVE_PATH=/archives +ENV DISPLAY=:99 +ENV ARCHIVE_WORKER_XVFB=1 +ENV ARCHIVE_WORKER_VIEWPORT=1366x768x24 + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + dumb-init \ + x11vnc \ + xvfb && \ + rm -rf /var/lib/apt/lists/* + +COPY package.json package-lock.json ./ +RUN npm ci --omit=dev + +COPY . . +RUN mkdir -p /archives && chmod 0777 /archives VOLUME ["/archives"] -ENTRYPOINT ["node", "src/cli.mjs"] +ENTRYPOINT ["dumb-init", "--", "/app/scripts/archive-worker-entrypoint.sh"] +CMD ["help"] diff --git a/README.md b/README.md index a59eb64..ac0fc2b 100644 --- a/README.md +++ b/README.md @@ -17,3 +17,37 @@ node src/cli.mjs archive ./page.html ``` Archives are written to `ARCHIVE_PATH`, or to a development directory under the system temp directory when `ARCHIVE_PATH` is not set. + +## Ephemeral container worker + +The host-facing container boundary is `src/container-runner.mjs`. It starts a short-lived Docker/Podman worker container, mounts the host archive directory at `/archives`, sends one archive request, reads a JSON result, and exits. + +Build the worker image: + +```sh +podman build -t local-page-archiver:latest . +``` + +Archive through the worker on macOS with Podman: + +```sh +node src/container-runner.mjs archive "https://example.com/article" \ + --runtime podman \ + --image local-page-archiver:latest \ + --archive-path ./archives +``` + +The convenience wrapper does the same thing and builds the image if missing: + +```sh +./podman-run.sh archive "https://example.com/article" +``` + +For visual debugging, expose VNC from the worker: + +```sh +./podman-run.sh vnc-archive "https://example.com/article" +# Then open vnc://localhost:5901 +``` + +The worker image starts Xvfb internally, so callers do not need to mount the host X11 socket or override the entrypoint. diff --git a/docker-compose.yml b/docker-compose.yml index d26f8f9..b311e59 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,33 +1,26 @@ -version: "3.8" - services: - archiver: + archive-worker: build: . + image: local-page-archiver:latest environment: - - ARCHIVE_PATH=/archives - - DISPLAY=:99 + ARCHIVE_PATH: /archives + ARCHIVE_WORKER_XVFB: "1" volumes: - ./archives:/archives - # For headful testing, uncomment the line below and run with --profile=headful - # - /tmp/.X11-unix:/tmp/.X11-unix:rw - command: ["archive", "--help"] + shm_size: 1gb + command: ["archive", "${URL:?Set URL=https://example.com}", "--json"] - # Headful profile: runs a VNC server so you can watch the browser - archiver-headful: - profiles: ["headful"] + archive-worker-vnc: + profiles: ["debug"] build: . + image: local-page-archiver:latest environment: - - ARCHIVE_PATH=/archives - - DISPLAY=:99 + ARCHIVE_PATH: /archives + ARCHIVE_WORKER_XVFB: "1" + ARCHIVE_WORKER_VNC: "1" volumes: - ./archives:/archives ports: - "5900:5900" - command: > - sh -c " - apt-get update && apt-get install -y x11vnc xvfb && - Xvfb :99 -screen 0 1366x768x24 & - x11vnc -display :99 -nopw -forever & - sleep 2 && - node src/cli.mjs archive $$URL - " + shm_size: 1gb + command: ["archive", "${URL:?Set URL=https://example.com}", "--json"] diff --git a/package.json b/package.json index 643bde5..6002cfd 100644 --- a/package.json +++ b/package.json @@ -5,10 +5,13 @@ "type": "module", "description": "Render and save self-contained HTML archives.", "bin": { - "archive-page": "./src/cli.mjs" + "archive-page": "./src/cli.mjs", + "archive-page-container": "./src/container-runner.mjs" }, "scripts": { "archive": "node src/cli.mjs archive", + "container:archive": "node src/container-runner.mjs archive", + "container:build": "node src/container-runner.mjs build", "test": "node --test test/*.test.mjs", "install-browsers": "playwright install chromium" }, diff --git a/podman-run.sh b/podman-run.sh index f792cbe..923bf9a 100755 --- a/podman-run.sh +++ b/podman-run.sh @@ -1,61 +1,66 @@ -#!/bin/bash +#!/usr/bin/env bash set -euo pipefail -# Podman helper for local-page-archiver with headful Chromium support. +# Podman helper for local-page-archiver worker-container runs. # Usage: -# ./podman-run.sh archive [options] -# ./podman-run.sh headful-archive [options] +# ./podman-run.sh build +# ./podman-run.sh archive [archive CLI options] +# ./podman-run.sh vnc-archive [archive CLI options] -IMAGE_NAME="local-page-archiver" +IMAGE_NAME="${ARCHIVE_WORKER_IMAGE:-local-page-archiver:latest}" ARCHIVE_DIR="${ARCHIVE_DIR:-$(pwd)/archives}" build_image() { - echo "Building ${IMAGE_NAME}..." podman build -t "${IMAGE_NAME}" . } -run_headless() { - mkdir -p "${ARCHIVE_DIR}" - podman run --rm \ - -e "ARCHIVE_PATH=/archives" \ - -v "${ARCHIVE_DIR}:/archives:Z" \ - "${IMAGE_NAME}" \ - "$@" +ensure_image() { + if ! podman image exists "${IMAGE_NAME}"; then + build_image + fi } -run_headful() { - mkdir -p "${ARCHIVE_DIR}" - podman run --rm \ - --entrypoint sh \ - -e "ARCHIVE_PATH=/archives" \ - -e "DISPLAY=:99" \ - -v "${ARCHIVE_DIR}:/archives:Z" \ - -p "5901:5900" \ - "${IMAGE_NAME}" \ - -c " - apt-get update -qq && apt-get install -y -qq x11vnc xvfb >/dev/null 2>&1 && - Xvfb :99 -screen 0 1366x768x24 >/dev/null 2>&1 & - x11vnc -display :99 -nopw -forever >/dev/null 2>&1 & - sleep 2 && - node src/cli.mjs $(printf '%q ' "$@") - " -} - -if ! podman image exists "${IMAGE_NAME}"; then - build_image -fi - case "${1:-}" in - headful-archive) - shift - # Prepend 'archive' so the user doesn't have to type it twice - set -- archive "$@" - run_headful "$@" + build) + build_image ;; - archive|help) - run_headless "$@" + archive) + shift + ensure_image + node src/container-runner.mjs archive \ + --runtime podman \ + --image "${IMAGE_NAME}" \ + --archive-path "${ARCHIVE_DIR}" \ + "$@" + ;; + vnc-archive|headful-archive) + shift + ensure_image + node src/container-runner.mjs archive \ + --runtime podman \ + --image "${IMAGE_NAME}" \ + --archive-path "${ARCHIVE_DIR}" \ + --vnc \ + "$@" + ;; + help|"") + cat < [archive CLI options] + ./podman-run.sh vnc-archive [archive CLI options] + +Environment: + ARCHIVE_WORKER_IMAGE=${IMAGE_NAME} + ARCHIVE_DIR=${ARCHIVE_DIR} +EOF ;; *) - run_headless "$@" + ensure_image + node src/container-runner.mjs archive \ + --runtime podman \ + --image "${IMAGE_NAME}" \ + --archive-path "${ARCHIVE_DIR}" \ + "$@" ;; esac diff --git a/src/archiver.mjs b/src/archiver.mjs index 9290fcc..5592c84 100644 --- a/src/archiver.mjs +++ b/src/archiver.mjs @@ -9,6 +9,7 @@ import { findEffectiveBase, inputToUrl, isHttpUrl, + splitSrcset, slugForUrl } from "./asset-inliner.mjs"; @@ -762,6 +763,7 @@ export async function renderPage(sourceUrl, options = {}) { await waitForNetworkIdle(page); await snapshotLoadedResourceUrls(page); + await snapshotRuntimeStyles(page); return await page.content(); } finally { @@ -807,6 +809,44 @@ async function snapshotLoadedResourceUrls(page) { }); } +async function snapshotRuntimeStyles(page) { + await page.evaluate(() => { + const serializeRules = (sheet) => { + try { + return Array.from(sheet.cssRules || []) + .map((rule) => rule.cssText) + .join("\n"); + } catch { + return ""; + } + }; + + for (const sheet of Array.from(document.styleSheets)) { + const css = serializeRules(sheet); + if (!css.trim()) { + continue; + } + + const owner = sheet.ownerNode; + if (owner instanceof HTMLStyleElement && !owner.textContent.trim()) { + owner.textContent = css; + } + } + + const adoptedStyleSheets = Array.from(document.adoptedStyleSheets || []); + adoptedStyleSheets.forEach((sheet, index) => { + const css = serializeRules(sheet); + if (!css.trim()) { + return; + } + const style = document.createElement("style"); + style.setAttribute("data-archiver-adopted-stylesheet", String(index)); + style.textContent = css; + document.head.appendChild(style); + }); + }); +} + function addArchiveComment(html, sourceUrl) { const safeSource = String(sourceUrl).replaceAll("--", "- -"); const comment = ``; @@ -823,7 +863,14 @@ export function findExternalAssetRefs(html) { const tag = match[0]; for (const attr of ["src", "srcset", "poster", "data"]) { const value = readAttribute(tag, attr); - if (!value || isSelfContainedAssetRef(value)) { + if (!value) { + continue; + } + if (attr === "srcset") { + addSrcsetRefs(refs, value); + continue; + } + if (isSelfContainedAssetRef(value)) { continue; } for (const part of value.split(",")) { @@ -846,6 +893,10 @@ export function findExternalAssetRefs(html) { if (href && !isSelfContainedAssetRef(href)) { refs.add(href); } + const imageSrcset = readAttribute(tag, "imagesrcset"); + if (imageSrcset) { + addSrcsetRefs(refs, imageSrcset); + } } const cssUrlPattern = /url\(\s*(["']?)([^"')]+)\1\s*\)/gi; @@ -859,6 +910,15 @@ export function findExternalAssetRefs(html) { return Array.from(refs).sort(); } +function addSrcsetRefs(refs, srcset) { + for (const part of splitSrcset(srcset)) { + const candidate = part.trim().split(/\s+/)[0]; + if (candidate && !isSelfContainedAssetRef(candidate)) { + refs.add(candidate); + } + } +} + function isSelfContainedAssetRef(value) { const trimmed = cleanCssUrl(value); return ( diff --git a/src/asset-inliner.mjs b/src/asset-inliner.mjs index 98da69a..a9eefbb 100644 --- a/src/asset-inliner.mjs +++ b/src/asset-inliner.mjs @@ -91,6 +91,7 @@ export function resolveUrl(rawUrl, baseUrl) { const trimmed = htmlDecode(rawUrl.trim()); if ( !trimmed || + /^(?:undefined|null)$/i.test(trimmed) || trimmed.startsWith("#") || /^%23/i.test(trimmed) || /^(?:about|blob|data|javascript|mailto|tel):/i.test(trimmed) @@ -233,11 +234,19 @@ export class AssetInliner { if (/\bpreload\b/i.test(rel) && /^style$/i.test(asValue)) { return ""; } + let output = tag; + if (/\bpreload\b/i.test(rel) && /^image$/i.test(asValue)) { + const imageSrcset = getAttribute(output, "imagesrcset"); + if (imageSrcset) { + output = setAttribute(output, "imagesrcset", await this.inlineSrcset(imageSrcset, baseUrl)); + } + } + const dataUri = await this.toDataUri(href, baseUrl); if (!dataUri) { return ""; } - return setAttribute(tag, "href", dataUri); + return setAttribute(output, "href", dataUri); } async rewriteMediaAttributes(tag, baseUrl) { @@ -293,8 +302,7 @@ export class AssetInliner { } async inlineSrcset(value, baseUrl) { - const candidates = value - .split(",") + const candidates = splitSrcset(value) .map((part) => part.trim()) .filter(Boolean); const rewritten = []; @@ -425,6 +433,75 @@ export class AssetInliner { } } +export function splitSrcset(value) { + const candidates = []; + let current = ""; + let quote = ""; + let parenDepth = 0; + + for (let index = 0; index < value.length; index += 1) { + const ch = value[index]; + + if (quote) { + current += ch; + if (ch === quote) { + quote = ""; + } + continue; + } + + if (ch === '"' || ch === "'") { + quote = ch; + current += ch; + continue; + } + + if (ch === "(") { + parenDepth += 1; + current += ch; + continue; + } + + if (ch === ")" && parenDepth > 0) { + parenDepth -= 1; + current += ch; + continue; + } + + if (ch === "," && parenDepth === 0 && isSrcsetCandidateSeparator(value, index, current)) { + if (current.trim()) { + candidates.push(current.trim()); + } + current = ""; + while (index + 1 < value.length && /\s/.test(value[index + 1])) { + index += 1; + } + continue; + } + + current += ch; + } + + if (current.trim()) { + candidates.push(current.trim()); + } + return candidates; +} + +function isSrcsetCandidateSeparator(value, commaIndex, currentCandidate) { + const after = value.slice(commaIndex + 1); + if (!after.trim()) { + return true; + } + if (/^\s/.test(after)) { + return true; + } + + const tokens = currentCandidate.trim().split(/\s+/); + const descriptor = tokens.at(-1) || ""; + return /^(?:\d+(?:\.\d+)?[wxh]|\d+(?:\.\d+)?x)$/i.test(descriptor); +} + function mimeFromUrl(rawUrl) { let pathname = rawUrl; try {