Fixes for inline CSS (CSSOM)
This commit is contained in:
@@ -3,5 +3,8 @@ npm-debug.log
|
||||
.git
|
||||
.gitignore
|
||||
README.md
|
||||
AGENTS.md
|
||||
archives
|
||||
*.html
|
||||
test
|
||||
test-*.mjs
|
||||
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1,2 +1,2 @@
|
||||
node_modules/
|
||||
|
||||
archives/
|
||||
|
||||
55
AGENTS.md
55
AGENTS.md
@@ -122,35 +122,51 @@ The current init script is minimal and safe — it only overrides the getter via
|
||||
|
||||
### Dockerfile
|
||||
|
||||
- Base: `mcr.microsoft.com/playwright:v1.60.0` (must stay in sync with the `playwright` npm version)
|
||||
- Installs Node 22 (the base image may ship an older Node)
|
||||
- Runs `npx playwright install chromium` so the browser binary is baked into the image
|
||||
- Base: `mcr.microsoft.com/playwright:v1.60.0-noble` (must stay in sync with the `playwright` npm version)
|
||||
- Installs only the worker runtime helpers that are not part of the Playwright image: `dumb-init`, `xvfb`, and `x11vnc`
|
||||
- Uses `/app/scripts/archive-worker-entrypoint.sh` as the entrypoint. The entrypoint starts Xvfb on `$DISPLAY` and then runs `node src/cli.mjs ...` for `archive`/`help` commands.
|
||||
- The worker is intended to be ephemeral: one container per archive job, with `/archives` mounted from the host.
|
||||
|
||||
### Host-to-worker contract
|
||||
|
||||
`src/container-runner.mjs` is the host/backend-facing boundary. It:
|
||||
|
||||
1. Picks `podman` or `docker`.
|
||||
2. Starts `local-page-archiver:latest` with `/archives` mounted from the host.
|
||||
3. Calls the in-container CLI as `archive <input> --json`.
|
||||
4. Parses the JSON result and rewrites `/archives/...` paths back to host paths.
|
||||
|
||||
This is the integration point a future backend should use instead of shelling out to `podman run` directly.
|
||||
|
||||
### `podman-run.sh`
|
||||
|
||||
Helper for local runs. Two modes:
|
||||
Helper for local Podman runs. It delegates to `src/container-runner.mjs`.
|
||||
|
||||
1. **`./podman-run.sh archive <URL>`** — headless, mounts `./archives`
|
||||
2. **`./podman-run.sh headful-archive <URL>`** — headful with internal VNC
|
||||
1. **`./podman-run.sh build`** — build `local-page-archiver:latest`
|
||||
2. **`./podman-run.sh archive <URL>`** — run one ephemeral Xvfb/Chromium worker and write to `./archives`
|
||||
3. **`./podman-run.sh vnc-archive <URL>`** — same worker with x11vnc exposed on `vnc://localhost:5901`
|
||||
|
||||
**Headful mode details:**
|
||||
The container's `ENTRYPOINT` is `node src/cli.mjs`. To run a shell command inside the container (setting up Xvfb + x11vnc) we must override the entrypoint:
|
||||
The helper builds the image if it is missing. Override with:
|
||||
|
||||
```bash
|
||||
podman run --rm --entrypoint sh <image> -c "...setup Xvfb... && node src/cli.mjs archive <URL>"
|
||||
```sh
|
||||
ARCHIVE_WORKER_IMAGE=local-page-archiver:dev ARCHIVE_DIR=/tmp/archives ./podman-run.sh archive https://example.com
|
||||
```
|
||||
|
||||
Port `5900` inside the container maps to `5901` on the host to avoid conflicts with macOS's built-in VNC.
|
||||
|
||||
### `docker-compose.yml`
|
||||
|
||||
Includes a `headful` profile that can be run with:
|
||||
Compose is mainly a direct worker smoke test. It runs the same image and command shape as the host runner:
|
||||
|
||||
```bash
|
||||
URL=https://example.com docker compose --profile headful up archiver-headful
|
||||
URL=https://example.com docker compose up --build archive-worker
|
||||
```
|
||||
|
||||
Unlike `podman-run.sh`, Compose currently maps VNC to host port `5900`.
|
||||
For visual debugging:
|
||||
|
||||
```bash
|
||||
URL=https://example.com docker compose --profile debug up --build archive-worker-vnc
|
||||
```
|
||||
|
||||
Unlike `podman-run.sh`, Compose maps VNC to host port `5900`.
|
||||
|
||||
## Known limitations
|
||||
|
||||
@@ -201,10 +217,13 @@ node src/cli.mjs archive https://example.com
|
||||
# Archive a page (headful on macOS)
|
||||
node src/cli.mjs archive https://example.com --headful
|
||||
|
||||
# Archive inside container (headless)
|
||||
# Build worker image
|
||||
./podman-run.sh build
|
||||
|
||||
# Archive inside an ephemeral Xvfb/Chromium worker
|
||||
./podman-run.sh archive https://example.com
|
||||
|
||||
# Archive inside container (headful + VNC)
|
||||
./podman-run.sh headful-archive https://example.com
|
||||
# Archive inside worker + expose VNC for debugging
|
||||
./podman-run.sh vnc-archive https://example.com
|
||||
# Then open vnc://localhost:5901
|
||||
```
|
||||
|
||||
34
Dockerfile
34
Dockerfile
@@ -1,23 +1,27 @@
|
||||
FROM mcr.microsoft.com/playwright:v1.60.0
|
||||
FROM mcr.microsoft.com/playwright:v1.60.0-noble
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Install Node 22+ if not present (Playwright image may have an older Node)
|
||||
RUN apt-get update && apt-get install -y curl && \
|
||||
curl -fsSL https://deb.nodesource.com/setup_22.x | bash - && \
|
||||
apt-get install -y nodejs && \
|
||||
apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY package.json package-lock.json ./
|
||||
RUN npm ci
|
||||
|
||||
COPY . .
|
||||
RUN npx playwright install chromium
|
||||
|
||||
# Default to headless; override with --env HEADFUL=1 and mount X11 socket or use VNC
|
||||
ENV NODE_ENV=production
|
||||
ENV ARCHIVE_PATH=/archives
|
||||
ENV DISPLAY=:99
|
||||
ENV ARCHIVE_WORKER_XVFB=1
|
||||
ENV ARCHIVE_WORKER_VIEWPORT=1366x768x24
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
dumb-init \
|
||||
x11vnc \
|
||||
xvfb && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY package.json package-lock.json ./
|
||||
RUN npm ci --omit=dev
|
||||
|
||||
COPY . .
|
||||
RUN mkdir -p /archives && chmod 0777 /archives
|
||||
|
||||
VOLUME ["/archives"]
|
||||
|
||||
ENTRYPOINT ["node", "src/cli.mjs"]
|
||||
ENTRYPOINT ["dumb-init", "--", "/app/scripts/archive-worker-entrypoint.sh"]
|
||||
CMD ["help"]
|
||||
|
||||
34
README.md
34
README.md
@@ -17,3 +17,37 @@ node src/cli.mjs archive ./page.html
|
||||
```
|
||||
|
||||
Archives are written to `ARCHIVE_PATH`, or to a development directory under the system temp directory when `ARCHIVE_PATH` is not set.
|
||||
|
||||
## Ephemeral container worker
|
||||
|
||||
The host-facing container boundary is `src/container-runner.mjs`. It starts a short-lived Docker/Podman worker container, mounts the host archive directory at `/archives`, sends one archive request, reads a JSON result, and exits.
|
||||
|
||||
Build the worker image:
|
||||
|
||||
```sh
|
||||
podman build -t local-page-archiver:latest .
|
||||
```
|
||||
|
||||
Archive through the worker on macOS with Podman:
|
||||
|
||||
```sh
|
||||
node src/container-runner.mjs archive "https://example.com/article" \
|
||||
--runtime podman \
|
||||
--image local-page-archiver:latest \
|
||||
--archive-path ./archives
|
||||
```
|
||||
|
||||
The convenience wrapper does the same thing and builds the image if missing:
|
||||
|
||||
```sh
|
||||
./podman-run.sh archive "https://example.com/article"
|
||||
```
|
||||
|
||||
For visual debugging, expose VNC from the worker:
|
||||
|
||||
```sh
|
||||
./podman-run.sh vnc-archive "https://example.com/article"
|
||||
# Then open vnc://localhost:5901
|
||||
```
|
||||
|
||||
The worker image starts Xvfb internally, so callers do not need to mount the host X11 socket or override the entrypoint.
|
||||
|
||||
@@ -1,33 +1,26 @@
|
||||
version: "3.8"
|
||||
|
||||
services:
|
||||
archiver:
|
||||
archive-worker:
|
||||
build: .
|
||||
image: local-page-archiver:latest
|
||||
environment:
|
||||
- ARCHIVE_PATH=/archives
|
||||
- DISPLAY=:99
|
||||
ARCHIVE_PATH: /archives
|
||||
ARCHIVE_WORKER_XVFB: "1"
|
||||
volumes:
|
||||
- ./archives:/archives
|
||||
# For headful testing, uncomment the line below and run with --profile=headful
|
||||
# - /tmp/.X11-unix:/tmp/.X11-unix:rw
|
||||
command: ["archive", "--help"]
|
||||
shm_size: 1gb
|
||||
command: ["archive", "${URL:?Set URL=https://example.com}", "--json"]
|
||||
|
||||
# Headful profile: runs a VNC server so you can watch the browser
|
||||
archiver-headful:
|
||||
profiles: ["headful"]
|
||||
archive-worker-vnc:
|
||||
profiles: ["debug"]
|
||||
build: .
|
||||
image: local-page-archiver:latest
|
||||
environment:
|
||||
- ARCHIVE_PATH=/archives
|
||||
- DISPLAY=:99
|
||||
ARCHIVE_PATH: /archives
|
||||
ARCHIVE_WORKER_XVFB: "1"
|
||||
ARCHIVE_WORKER_VNC: "1"
|
||||
volumes:
|
||||
- ./archives:/archives
|
||||
ports:
|
||||
- "5900:5900"
|
||||
command: >
|
||||
sh -c "
|
||||
apt-get update && apt-get install -y x11vnc xvfb &&
|
||||
Xvfb :99 -screen 0 1366x768x24 &
|
||||
x11vnc -display :99 -nopw -forever &
|
||||
sleep 2 &&
|
||||
node src/cli.mjs archive $$URL
|
||||
"
|
||||
shm_size: 1gb
|
||||
command: ["archive", "${URL:?Set URL=https://example.com}", "--json"]
|
||||
|
||||
@@ -5,10 +5,13 @@
|
||||
"type": "module",
|
||||
"description": "Render and save self-contained HTML archives.",
|
||||
"bin": {
|
||||
"archive-page": "./src/cli.mjs"
|
||||
"archive-page": "./src/cli.mjs",
|
||||
"archive-page-container": "./src/container-runner.mjs"
|
||||
},
|
||||
"scripts": {
|
||||
"archive": "node src/cli.mjs archive",
|
||||
"container:archive": "node src/container-runner.mjs archive",
|
||||
"container:build": "node src/container-runner.mjs build",
|
||||
"test": "node --test test/*.test.mjs",
|
||||
"install-browsers": "playwright install chromium"
|
||||
},
|
||||
|
||||
@@ -1,61 +1,66 @@
|
||||
#!/bin/bash
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# Podman helper for local-page-archiver with headful Chromium support.
|
||||
# Podman helper for local-page-archiver worker-container runs.
|
||||
# Usage:
|
||||
# ./podman-run.sh archive <URL> [options]
|
||||
# ./podman-run.sh headful-archive <URL> [options]
|
||||
# ./podman-run.sh build
|
||||
# ./podman-run.sh archive <URL> [archive CLI options]
|
||||
# ./podman-run.sh vnc-archive <URL> [archive CLI options]
|
||||
|
||||
IMAGE_NAME="local-page-archiver"
|
||||
IMAGE_NAME="${ARCHIVE_WORKER_IMAGE:-local-page-archiver:latest}"
|
||||
ARCHIVE_DIR="${ARCHIVE_DIR:-$(pwd)/archives}"
|
||||
|
||||
build_image() {
|
||||
echo "Building ${IMAGE_NAME}..."
|
||||
podman build -t "${IMAGE_NAME}" .
|
||||
}
|
||||
|
||||
run_headless() {
|
||||
mkdir -p "${ARCHIVE_DIR}"
|
||||
podman run --rm \
|
||||
-e "ARCHIVE_PATH=/archives" \
|
||||
-v "${ARCHIVE_DIR}:/archives:Z" \
|
||||
"${IMAGE_NAME}" \
|
||||
"$@"
|
||||
}
|
||||
|
||||
run_headful() {
|
||||
mkdir -p "${ARCHIVE_DIR}"
|
||||
podman run --rm \
|
||||
--entrypoint sh \
|
||||
-e "ARCHIVE_PATH=/archives" \
|
||||
-e "DISPLAY=:99" \
|
||||
-v "${ARCHIVE_DIR}:/archives:Z" \
|
||||
-p "5901:5900" \
|
||||
"${IMAGE_NAME}" \
|
||||
-c "
|
||||
apt-get update -qq && apt-get install -y -qq x11vnc xvfb >/dev/null 2>&1 &&
|
||||
Xvfb :99 -screen 0 1366x768x24 >/dev/null 2>&1 &
|
||||
x11vnc -display :99 -nopw -forever >/dev/null 2>&1 &
|
||||
sleep 2 &&
|
||||
node src/cli.mjs $(printf '%q ' "$@")
|
||||
"
|
||||
}
|
||||
|
||||
if ! podman image exists "${IMAGE_NAME}"; then
|
||||
ensure_image() {
|
||||
if ! podman image exists "${IMAGE_NAME}"; then
|
||||
build_image
|
||||
fi
|
||||
fi
|
||||
}
|
||||
|
||||
case "${1:-}" in
|
||||
headful-archive)
|
||||
shift
|
||||
# Prepend 'archive' so the user doesn't have to type it twice
|
||||
set -- archive "$@"
|
||||
run_headful "$@"
|
||||
build)
|
||||
build_image
|
||||
;;
|
||||
archive|help)
|
||||
run_headless "$@"
|
||||
archive)
|
||||
shift
|
||||
ensure_image
|
||||
node src/container-runner.mjs archive \
|
||||
--runtime podman \
|
||||
--image "${IMAGE_NAME}" \
|
||||
--archive-path "${ARCHIVE_DIR}" \
|
||||
"$@"
|
||||
;;
|
||||
vnc-archive|headful-archive)
|
||||
shift
|
||||
ensure_image
|
||||
node src/container-runner.mjs archive \
|
||||
--runtime podman \
|
||||
--image "${IMAGE_NAME}" \
|
||||
--archive-path "${ARCHIVE_DIR}" \
|
||||
--vnc \
|
||||
"$@"
|
||||
;;
|
||||
help|"")
|
||||
cat <<EOF
|
||||
Usage:
|
||||
./podman-run.sh build
|
||||
./podman-run.sh archive <URL> [archive CLI options]
|
||||
./podman-run.sh vnc-archive <URL> [archive CLI options]
|
||||
|
||||
Environment:
|
||||
ARCHIVE_WORKER_IMAGE=${IMAGE_NAME}
|
||||
ARCHIVE_DIR=${ARCHIVE_DIR}
|
||||
EOF
|
||||
;;
|
||||
*)
|
||||
run_headless "$@"
|
||||
ensure_image
|
||||
node src/container-runner.mjs archive \
|
||||
--runtime podman \
|
||||
--image "${IMAGE_NAME}" \
|
||||
--archive-path "${ARCHIVE_DIR}" \
|
||||
"$@"
|
||||
;;
|
||||
esac
|
||||
|
||||
@@ -9,6 +9,7 @@ import {
|
||||
findEffectiveBase,
|
||||
inputToUrl,
|
||||
isHttpUrl,
|
||||
splitSrcset,
|
||||
slugForUrl
|
||||
} from "./asset-inliner.mjs";
|
||||
|
||||
@@ -762,6 +763,7 @@ export async function renderPage(sourceUrl, options = {}) {
|
||||
|
||||
await waitForNetworkIdle(page);
|
||||
await snapshotLoadedResourceUrls(page);
|
||||
await snapshotRuntimeStyles(page);
|
||||
|
||||
return await page.content();
|
||||
} finally {
|
||||
@@ -807,6 +809,44 @@ async function snapshotLoadedResourceUrls(page) {
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Copy styles that exist only in the CSSOM into the DOM so that they are part
 * of the markup when the page is serialized afterwards.
 *
 * Two cases are handled inside the page context:
 *  - `<style>` elements whose text is blank but whose sheet has rules get the
 *    serialized rules written into their `textContent`;
 *  - every non-empty sheet in `document.adoptedStyleSheets` is appended to
 *    `<head>` as a new `<style data-archiver-adopted-stylesheet="<index>">`.
 *
 * @param {object} page - Object exposing `evaluate(fn)`; presumably a
 *   Playwright `Page` (confirm against the caller).
 * @returns {Promise<void>} Resolves once the in-page callback has run.
 */
async function snapshotRuntimeStyles(page) {
  await page.evaluate(() => {
    // Reading `cssRules` may throw (presumably for sheets the page is not
    // allowed to inspect); such sheets are treated as having no rules.
    const readCssText = (styleSheet) => {
      try {
        const ruleTexts = [];
        for (const rule of Array.from(styleSheet.cssRules || [])) {
          ruleTexts.push(rule.cssText);
        }
        return ruleTexts.join("\n");
      } catch {
        return "";
      }
    };

    // Pass 1: fill in <style> elements that were populated via the CSSOM only.
    Array.from(document.styleSheets).forEach((styleSheet) => {
      const cssText = readCssText(styleSheet);
      if (cssText.trim() === "") {
        return;
      }

      const ownerNode = styleSheet.ownerNode;
      const isBlankStyleElement =
        ownerNode instanceof HTMLStyleElement && ownerNode.textContent.trim() === "";
      if (isBlankStyleElement) {
        ownerNode.textContent = cssText;
      }
    });

    // Pass 2: materialize constructable stylesheets adopted by the document.
    const adopted = Array.from(document.adoptedStyleSheets || []);
    for (const [position, styleSheet] of adopted.entries()) {
      const cssText = readCssText(styleSheet);
      if (cssText.trim() === "") {
        continue;
      }

      const styleElement = document.createElement("style");
      styleElement.setAttribute("data-archiver-adopted-stylesheet", String(position));
      styleElement.textContent = cssText;
      document.head.appendChild(styleElement);
    }
  });
}
|
||||
|
||||
function addArchiveComment(html, sourceUrl) {
|
||||
const safeSource = String(sourceUrl).replaceAll("--", "- -");
|
||||
const comment = `<!-- Archived locally. Source: ${safeSource}. Created: ${new Date().toISOString()}. -->`;
|
||||
@@ -823,7 +863,14 @@ export function findExternalAssetRefs(html) {
|
||||
const tag = match[0];
|
||||
for (const attr of ["src", "srcset", "poster", "data"]) {
|
||||
const value = readAttribute(tag, attr);
|
||||
if (!value || isSelfContainedAssetRef(value)) {
|
||||
if (!value) {
|
||||
continue;
|
||||
}
|
||||
if (attr === "srcset") {
|
||||
addSrcsetRefs(refs, value);
|
||||
continue;
|
||||
}
|
||||
if (isSelfContainedAssetRef(value)) {
|
||||
continue;
|
||||
}
|
||||
for (const part of value.split(",")) {
|
||||
@@ -846,6 +893,10 @@ export function findExternalAssetRefs(html) {
|
||||
if (href && !isSelfContainedAssetRef(href)) {
|
||||
refs.add(href);
|
||||
}
|
||||
const imageSrcset = readAttribute(tag, "imagesrcset");
|
||||
if (imageSrcset) {
|
||||
addSrcsetRefs(refs, imageSrcset);
|
||||
}
|
||||
}
|
||||
|
||||
const cssUrlPattern = /url\(\s*(["']?)([^"')]+)\1\s*\)/gi;
|
||||
@@ -859,6 +910,15 @@ export function findExternalAssetRefs(html) {
|
||||
return Array.from(refs).sort();
|
||||
}
|
||||
|
||||
/**
 * Add the URL of every candidate in a `srcset`-style value to `refs`,
 * skipping candidates that `isSelfContainedAssetRef` reports as self-contained.
 *
 * @param {Set<string>} refs - Collection to add to (mutated in place).
 * @param {string} srcset - Raw `srcset` / `imagesrcset` attribute value.
 * @returns {void}
 */
function addSrcsetRefs(refs, srcset) {
  for (const entry of splitSrcset(srcset)) {
    // The URL is the first whitespace-delimited token; descriptors follow it.
    const [url] = entry.trim().split(/\s+/);
    if (!url || isSelfContainedAssetRef(url)) {
      continue;
    }
    refs.add(url);
  }
}
|
||||
|
||||
function isSelfContainedAssetRef(value) {
|
||||
const trimmed = cleanCssUrl(value);
|
||||
return (
|
||||
|
||||
@@ -91,6 +91,7 @@ export function resolveUrl(rawUrl, baseUrl) {
|
||||
const trimmed = htmlDecode(rawUrl.trim());
|
||||
if (
|
||||
!trimmed ||
|
||||
/^(?:undefined|null)$/i.test(trimmed) ||
|
||||
trimmed.startsWith("#") ||
|
||||
/^%23/i.test(trimmed) ||
|
||||
/^(?:about|blob|data|javascript|mailto|tel):/i.test(trimmed)
|
||||
@@ -233,11 +234,19 @@ export class AssetInliner {
|
||||
if (/\bpreload\b/i.test(rel) && /^style$/i.test(asValue)) {
|
||||
return "";
|
||||
}
|
||||
let output = tag;
|
||||
if (/\bpreload\b/i.test(rel) && /^image$/i.test(asValue)) {
|
||||
const imageSrcset = getAttribute(output, "imagesrcset");
|
||||
if (imageSrcset) {
|
||||
output = setAttribute(output, "imagesrcset", await this.inlineSrcset(imageSrcset, baseUrl));
|
||||
}
|
||||
}
|
||||
|
||||
const dataUri = await this.toDataUri(href, baseUrl);
|
||||
if (!dataUri) {
|
||||
return "";
|
||||
}
|
||||
return setAttribute(tag, "href", dataUri);
|
||||
return setAttribute(output, "href", dataUri);
|
||||
}
|
||||
|
||||
async rewriteMediaAttributes(tag, baseUrl) {
|
||||
@@ -293,8 +302,7 @@ export class AssetInliner {
|
||||
}
|
||||
|
||||
async inlineSrcset(value, baseUrl) {
|
||||
const candidates = value
|
||||
.split(",")
|
||||
const candidates = splitSrcset(value)
|
||||
.map((part) => part.trim())
|
||||
.filter(Boolean);
|
||||
const rewritten = [];
|
||||
@@ -425,6 +433,75 @@ export class AssetInliner {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Split a `srcset`-style attribute value into its image candidates.
 *
 * A plain `split(",")` is not enough because URLs may themselves contain
 * commas (data URIs, CDN transformation paths). A comma is only treated as a
 * candidate separator when it is outside quotes and parentheses and
 * `isSrcsetCandidateSeparator` accepts it.
 *
 * A quote character only opens a quoted region when a matching closing quote
 * exists later in the value. A stray apostrophe in a file name is therefore
 * kept as an ordinary character instead of swallowing every following
 * candidate.
 *
 * @param {string} value - Raw attribute value, e.g. `"a.jpg 1x, b.jpg 2x"`.
 * @returns {string[]} Trimmed, non-empty candidates in source order.
 */
export function splitSrcset(value) {
  const candidates = [];
  let current = "";
  let quote = "";
  let parenDepth = 0;

  // Store the candidate collected so far (if any) and start a new one.
  const flush = () => {
    const trimmed = current.trim();
    if (trimmed) {
      candidates.push(trimmed);
    }
    current = "";
  };

  for (let index = 0; index < value.length; index += 1) {
    const ch = value[index];

    // Inside quotes everything is copied verbatim until the closing quote.
    if (quote) {
      current += ch;
      if (ch === quote) {
        quote = "";
      }
      continue;
    }

    // Only enter quote mode when the quote is actually closed further on.
    if ((ch === '"' || ch === "'") && value.includes(ch, index + 1)) {
      quote = ch;
      current += ch;
      continue;
    }

    if (ch === "(") {
      parenDepth += 1;
      current += ch;
      continue;
    }

    if (ch === ")" && parenDepth > 0) {
      parenDepth -= 1;
      current += ch;
      continue;
    }

    if (ch === "," && parenDepth === 0 && isSrcsetCandidateSeparator(value, index, current)) {
      flush();
      // Skip the whitespace that follows the separator.
      while (index + 1 < value.length && /\s/.test(value[index + 1])) {
        index += 1;
      }
      continue;
    }

    current += ch;
  }

  flush();
  return candidates;
}

/**
 * Decide whether the comma at `commaIndex` ends the current candidate.
 *
 * The comma is a separator when nothing but whitespace follows it, when it is
 * directly followed by whitespace, or when the text collected so far ends in
 * a width / density / height descriptor (`400w`, `1.5x`, `.5x`, `1e1x`, …).
 * Any other comma is assumed to belong to the URL.
 *
 * @param {string} value - The complete attribute value.
 * @param {number} commaIndex - Index of the comma within `value`.
 * @param {string} currentCandidate - Text collected since the last separator.
 * @returns {boolean} `true` when the comma separates two candidates.
 */
function isSrcsetCandidateSeparator(value, commaIndex, currentCandidate) {
  const after = value.slice(commaIndex + 1);
  if (!after.trim()) {
    return true;
  }
  if (/^\s/.test(after)) {
    return true;
  }

  const tokens = currentCandidate.trim().split(/\s+/);
  const descriptor = tokens.at(-1) || "";
  // Number (optional fraction, leading-dot form, optional exponent) + unit.
  return /^(?:\d+(?:\.\d+)?|\.\d+)(?:e[+-]?\d+)?[wxh]$/i.test(descriptor);
}
|
||||
|
||||
function mimeFromUrl(rawUrl) {
|
||||
let pathname = rawUrl;
|
||||
try {
|
||||
|
||||
Reference in New Issue
Block a user