adds frontend

This commit is contained in:
2026-05-16 16:36:51 -07:00
parent 40c63dc4e2
commit c00913ec35
17 changed files with 1473 additions and 0 deletions

View File

@@ -22,6 +22,7 @@ COPY . .
RUN mkdir -p /archives && chmod 0777 /archives RUN mkdir -p /archives && chmod 0777 /archives
VOLUME ["/archives"] VOLUME ["/archives"]
EXPOSE 5733
ENTRYPOINT ["dumb-init", "--", "/app/scripts/archive-worker-entrypoint.sh"] ENTRYPOINT ["dumb-init", "--", "/app/scripts/archive-worker-entrypoint.sh"]
CMD ["help"] CMD ["help"]

16
Dockerfile.web Normal file
View File

@@ -0,0 +1,16 @@
# Runtime image used by both the frontend and the backend services of the
# example Compose stack; it contains the Node sources and static assets only.
FROM node:22-slim
WORKDIR /app
ENV NODE_ENV=production
# NOTE(review): presumably stops the Playwright package from fetching browser
# binaries during install, since this image never drives a browser — confirm.
ENV PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1
# Copy the manifests before the sources so the dependency layer stays cached
# when only application code changes.
COPY package.json package-lock.json ./
RUN npm ci --omit=dev
COPY src ./src
COPY public ./public
# 5731 is the frontend port and 5732 the backend port in the Compose file.
EXPOSE 5731 5732
# Default to the backend; the frontend service overrides this command.
CMD ["node", "src/backend-server.mjs"]

View File

@@ -51,3 +51,23 @@ For visual debugging, expose VNC from the worker:
``` ```
The worker image starts Xvfb internally, so callers do not need to mount the host X11 socket or override the entrypoint. The worker image starts Xvfb internally, so callers do not need to mount the host X11 socket or override the entrypoint.
## Web UI
The web stack is split into three processes:
- `src/frontend-server.mjs` serves the static UI and proxies `/api/*` and `/archives/*` to the backend.
- `src/backend-server.mjs` manages archive lookup, job state, and the archive index.
- `src/worker-server.mjs` runs inside the browser worker container and wraps `archivePage()` over HTTP.
Run the full stack with:
```sh
docker compose -f docker-compose.example.yml up --build
```
Then open `http://localhost:5731`. Direct path archival is supported, for example:
```text
http://localhost:5731/https://example.com
```

1
Tiltfile Normal file
View File

@@ -0,0 +1 @@
# Hand the example Compose file to Tilt so it manages the services it defines.
docker_compose("docker-compose.example.yml")

View File

@@ -0,0 +1,66 @@
---
# Example stack: static frontend -> API backend -> headless-browser worker.
# Only the frontend publishes a host port; backend and browser are reached
# over the Compose network by service name.
services:
  frontend:
    image: local-page-archiver-web:latest
    build:
      context: .
      dockerfile: Dockerfile.web
    command:
      - "node"
      - "src/frontend-server.mjs"
    environment:
      PORT: "5731"
      BACKEND_URL: "http://backend:5732"
    ports:
      - "5731:5731"
    depends_on:
      backend:
        condition: service_healthy
    healthcheck:
      test:
        - "CMD"
        - "node"
        - "-e"
        - "fetch('http://127.0.0.1:5731/healthz').then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))"
      interval: 10s
      timeout: 3s
      retries: 5

  backend:
    image: local-page-archiver-web:latest
    build:
      context: .
      dockerfile: Dockerfile.web
    command:
      - "node"
      - "src/backend-server.mjs"
    environment:
      PORT: "5732"
      ARCHIVE_PATH: /archives
      ARCHIVE_WORKER_URL: "http://browser:5733"
      PUBLIC_ARCHIVES_PATH: /archives
    volumes:
      - archives:/archives
    depends_on:
      browser:
        condition: service_healthy
    healthcheck:
      test:
        - "CMD"
        - "node"
        - "-e"
        - "fetch('http://127.0.0.1:5732/healthz').then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))"
      interval: 10s
      timeout: 3s
      retries: 5

  browser:
    image: local-page-archiver-browser:latest
    build:
      context: .
      dockerfile: Dockerfile
    command:
      - "serve-worker"
    environment:
      PORT: "5733"
      ARCHIVE_PATH: /archives
      ARCHIVE_WORKER_XVFB: "1"
    volumes:
      - archives:/archives
    expose:
      - "5733"
    shm_size: 1gb
    healthcheck:
      test:
        - "CMD"
        - "node"
        - "-e"
        - "fetch('http://127.0.0.1:5733/healthz').then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))"
      interval: 10s
      timeout: 3s
      retries: 10

# Shared by backend (index and serving) and browser (writes archives).
volumes:
  archives:

View File

@@ -12,6 +12,9 @@
"archive": "node src/cli.mjs archive", "archive": "node src/cli.mjs archive",
"container:archive": "node src/container-runner.mjs archive", "container:archive": "node src/container-runner.mjs archive",
"container:build": "node src/container-runner.mjs build", "container:build": "node src/container-runner.mjs build",
"start:backend": "node src/backend-server.mjs",
"start:frontend": "node src/frontend-server.mjs",
"start:worker": "node src/worker-server.mjs",
"test": "node --test test/*.test.mjs", "test": "node --test test/*.test.mjs",
"install-browsers": "playwright install chromium" "install-browsers": "playwright install chromium"
}, },

153
public/assets/app.css Normal file
View File

@@ -0,0 +1,153 @@
/* Design tokens shared by every rule below. */
:root {
  color-scheme: light;
  --bg: #f6f5f1;
  --surface: #ffffff;
  --ink: #161616;
  --muted: #696963;
  --line: #d8d6ce;
  --accent: #2f7664;
  --accent-strong: #245d50;
  --danger: #a43d32;
  font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
}

* {
  box-sizing: border-box;
}

html,
body {
  min-height: 100%;
}

body {
  margin: 0;
  background: var(--bg);
  color: var(--ink);
}

/* Form controls do not inherit the page font by default. */
button,
input {
  font: inherit;
  letter-spacing: 0;
}

/* Full-viewport wrapper that centres the archive form. */
.shell {
  min-height: 100vh;
  display: grid;
  place-items: center;
  padding: 24px;
}

.archive-box {
  width: min(680px, 100%);
  background: var(--surface);
  border: 1px solid var(--line);
  border-radius: 8px;
  padding: 12px;
  box-shadow: 0 18px 42px rgba(28, 25, 19, 0.08);
}

.input-row {
  display: grid;
  grid-template-columns: 1fr auto;
  gap: 10px;
}

input {
  /* min-width: 0 lets the field shrink inside its grid column. */
  min-width: 0;
  width: 100%;
  height: 48px;
  border: 1px solid var(--line);
  border-radius: 6px;
  color: var(--ink);
  background: #fbfaf7;
  padding: 0 14px;
  /* A transparent outline is invisible normally but is repainted in
     forced-colors (high-contrast) mode, where the box-shadow focus ring
     below is dropped. `outline: none` would leave no focus indicator there. */
  outline: 2px solid transparent;
}

input:focus {
  border-color: var(--accent);
  box-shadow: 0 0 0 3px rgba(47, 118, 100, 0.16);
}

button {
  height: 48px;
  min-width: 112px;
  border: 0;
  border-radius: 6px;
  color: #ffffff;
  background: var(--accent);
  padding: 0 18px;
  cursor: pointer;
}

/* Only enabled buttons react to hover; a busy (disabled) button must not
   look clickable. */
button:not(:disabled):hover {
  background: var(--accent-strong);
}

button:disabled {
  cursor: wait;
  opacity: 0.72;
}

.progress-wrap {
  padding-top: 12px;
}

.progress-track {
  height: 6px;
  overflow: hidden;
  border-radius: 999px;
  background: #e7e4dc;
}

/* Width is driven from app.js; the transition smooths each update. */
.progress-bar {
  width: 0%;
  height: 100%;
  border-radius: inherit;
  background: var(--accent);
  transition: width 220ms ease;
}

/* Fixed min-height keeps the box from jumping when the text changes. */
.status-line {
  min-height: 22px;
  margin-top: 8px;
  color: var(--muted);
  font-size: 14px;
  line-height: 22px;
}

.status-line.error {
  color: var(--danger);
}

/* Visually hidden but still exposed to assistive technology. */
.sr-only {
  position: absolute;
  width: 1px;
  height: 1px;
  overflow: hidden;
  clip: rect(0, 0, 0, 0);
  white-space: nowrap;
  clip-path: inset(50%);
}

/* Narrow screens: stack the button under the input and pin the form
   towards the top instead of centring it vertically. */
@media (max-width: 560px) {
  .shell {
    align-items: start;
    padding: 16px;
    padding-top: 20vh;
  }

  .archive-box {
    padding: 10px;
  }

  .input-row {
    grid-template-columns: 1fr;
  }

  button {
    width: 100%;
  }
}

169
public/assets/app.js Normal file
View File

@@ -0,0 +1,169 @@
// Cached references to the elements declared in index.html.
const form = document.querySelector("#archive-form");
const input = document.querySelector("#archive-url");
const button = document.querySelector("#archive-submit");
const progressWrap = document.querySelector("#progress-wrap");
const progressBar = document.querySelector("#progress-bar");
const statusLine = document.querySelector("#status-line");
// Interval handles for job polling and progress-bar animation; null when idle.
let pollTimer = null;
let visualTimer = null;
// Epoch milliseconds the current job is considered to have started at; feeds
// the elapsed-seconds label and the optimistic progress estimate.
let startedAt = Date.now();
// Submit through fetch instead of a full-page form post.
form.addEventListener("submit", (event) => {
event.preventDefault();
submitArchive(input.value);
});
// Direct-path mode: when the page path itself carries an http(s) URL, archive
// it immediately; otherwise just focus the input.
const pathUrl = urlFromPath();
if (pathUrl) {
input.value = pathUrl;
submitArchive(pathUrl);
} else {
input.focus();
}
/**
 * Ask the backend to archive `rawUrl`.
 *
 * If the response already carries an archive, the browser navigates to it.
 * If it carries a job, polling starts. Any failure is shown in the status
 * line and the form is unlocked again.
 *
 * @param {string} rawUrl URL exactly as typed or taken from the page path.
 */
async function submitArchive(rawUrl) {
  stopTimers();
  setBusy(true);
  setStatus("Checking", 8);

  try {
    const request = {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({ url: rawUrl })
    };
    const data = await readApiResponse(await fetch("/api/archives", request));

    const readyUrl = data.archive?.archiveUrl;
    if (readyUrl) {
      openArchive(readyUrl);
      return;
    }

    if (data.job?.id) {
      watchJob(data.job);
      return;
    }

    throw new Error(data.error || "Archive did not start");
  } catch (failure) {
    setError(failure.message || "Archive failed");
    setBusy(false);
  }
}
/**
 * Follow a job until it reaches a terminal state.
 *
 * Starts the progress animation and polls `/api/jobs/:id` every 850 ms.
 * Each response is handed to `updateFromJob`, which stops the timers once
 * the job is done or failed.
 *
 * @param {{id: string, status: string, startedAt?: string, createdAt?: string}} job
 *   Job record as returned by the API.
 */
function watchJob(job) {
  startedAt = Date.parse(job.startedAt || job.createdAt) || Date.now();
  updateFromJob(job);

  // updateFromJob has already handled a job that arrived terminal; starting
  // timers now would poll a finished job forever.
  const alreadySettled =
    job.status === "failed" || (job.status === "done" && Boolean(job.archive?.archiveUrl));
  if (alreadySettled) {
    return;
  }

  visualTimer = window.setInterval(updateVisualProgress, 250);

  let requestInFlight = false;
  const timer = window.setInterval(async () => {
    // setInterval does not wait for the async callback, so skip a tick while
    // the previous request is still pending instead of stacking requests.
    if (requestInFlight) {
      return;
    }
    requestInFlight = true;
    try {
      const response = await fetch(`/api/jobs/${encodeURIComponent(job.id)}`);
      const data = await readApiResponse(response);
      // Timers were stopped or replaced while this request was pending;
      // its result is stale and must not touch the UI.
      if (pollTimer !== timer) {
        return;
      }
      updateFromJob(data.job);
    } catch (error) {
      if (pollTimer !== timer) {
        return;
      }
      stopTimers();
      setError(error.message || "Archive failed");
      setBusy(false);
    } finally {
      requestInFlight = false;
    }
  }, 850);
  pollTimer = timer;
}
/**
 * Reflect a job record in the UI.
 *
 * A finished job with an archive URL navigates to it, a failed job shows its
 * error and unlocks the form. Any other state refreshes the status label and
 * the progress bar.
 *
 * @param {object} job Job record as returned by the API.
 */
function updateFromJob(job) {
  const { status } = job;
  const archiveUrl = job.archive?.archiveUrl;

  if (status === "done" && archiveUrl) {
    stopTimers();
    setStatus("Opening", 100);
    openArchive(archiveUrl);
    return;
  }

  if (status === "failed") {
    stopTimers();
    setError(job.error || "Archive failed");
    setBusy(false);
    return;
  }

  // Keep the previous start time when the record carries no parseable one.
  startedAt = Date.parse(job.startedAt || job.createdAt) || startedAt;
  const elapsedSeconds = Math.max(0, Math.round((Date.now() - startedAt) / 1000));
  setStatus(status === "queued" ? "Queued" : `Archiving ${elapsedSeconds}s`, optimisticProgress());
}
/** Animation tick: move the bar to the current estimate while it is visible. */
function updateVisualProgress() {
  if (progressWrap.hidden) {
    return;
  }
  progressBar.style.width = `${optimisticProgress()}%`;
}
/**
 * Estimate progress purely from elapsed time since `startedAt`.
 *
 * @returns {number} 12 during the first second, then a fast ramp capped at 88
 *   until 12 s, then a slow crawl capped at 96.
 */
function optimisticProgress() {
  const seconds = Math.max(0, (Date.now() - startedAt) / 1000);
  if (seconds < 1) {
    return 12;
  }
  return seconds < 12
    ? Math.min(88, 12 + seconds * 6.3)
    : Math.min(96, 88 + (seconds - 12) * 0.6);
}
/**
 * Decode a JSON API response, turning every failure into a thrown Error.
 *
 * @param {Response} response Result of a `fetch` call against the API.
 * @returns {Promise<object>} The parsed JSON payload.
 * @throws {Error} When the status is not 2xx, the payload says `ok: false`,
 *   or the body is not a JSON object.
 */
async function readApiResponse(response) {
  const data = await response.json().catch(() => null);
  if (!response.ok || data?.ok === false) {
    throw new Error(data?.error || `Request failed with ${response.status}`);
  }
  // A 2xx response whose body is not a JSON object (an HTML error page from an
  // intermediary, for example) used to be returned as-is, so callers crashed
  // with a confusing TypeError as soon as they read a property from it.
  if (data === null || typeof data !== "object") {
    throw new Error("Server returned an invalid response");
  }
  return data;
}
/**
 * Lock or unlock the form while a request is running.
 *
 * @param {boolean} isBusy True to disable the button and make the input read-only.
 */
function setBusy(isBusy) {
  input.readOnly = isBusy;
  button.disabled = isBusy;
}
/**
 * Show a normal (non-error) status message and set the progress bar.
 *
 * @param {string} text Message for the status line.
 * @param {number} progress Percentage; clamped to the range 0-100.
 */
function setStatus(text, progress) {
  const clamped = Math.max(0, Math.min(100, progress));
  progressWrap.hidden = false;
  statusLine.classList.remove("error");
  statusLine.textContent = text;
  progressBar.style.width = `${clamped}%`;
}
/**
 * Show an error message in the status line and fill the progress bar.
 *
 * @param {string} text Message for the status line.
 */
function setError(text) {
  progressWrap.hidden = false;
  progressBar.style.width = "100%";
  statusLine.textContent = text;
  statusLine.classList.add("error");
}
/** Cancel polling and the progress animation, and forget both handles. */
function stopTimers() {
  // clearInterval ignores null handles, so no guard is needed.
  window.clearInterval(pollTimer);
  window.clearInterval(visualTimer);
  pollTimer = null;
  visualTimer = null;
}
/**
 * Navigate the current tab to a finished archive.
 *
 * @param {string} archiveUrl URL of the archive as reported by the API.
 */
function openArchive(archiveUrl) {
  const { location } = window;
  location.assign(archiveUrl);
}
/**
 * Extract a URL embedded in the page path (`/https://example.com/...`).
 *
 * @returns {string} The decoded http(s) URL with the page's query string and
 *   fragment appended. Returns "" when the path is empty, belongs to the
 *   app's own routes, cannot be decoded, or is not an http(s) URL.
 */
function urlFromPath() {
  const { pathname, search, hash } = window.location;
  const candidate = pathname.replace(/^\/+/, "");
  const ownRoutes = ["assets/", "api/", "archives/"];

  if (!candidate || ownRoutes.some((prefix) => candidate.startsWith(prefix))) {
    return "";
  }

  let target;
  try {
    target = decodeURIComponent(candidate);
  } catch {
    return "";
  }

  return /^https?:\/\//i.test(target) ? `${target}${search}${hash}` : "";
}

27
public/index.html Normal file
View File

@@ -0,0 +1,27 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Archive</title>
<link rel="stylesheet" href="/assets/app.css">
<!-- Deferred so the elements below exist when app.js queries them by id. -->
<script src="/assets/app.js" defer></script>
</head>
<body>
<main class="shell">
<!-- app.js intercepts the submit event and posts the URL to /api/archives. -->
<form class="archive-box" id="archive-form" autocomplete="off">
<label class="sr-only" for="archive-url">URL</label>
<div class="input-row">
<input id="archive-url" name="url" type="url" inputmode="url" spellcheck="false" autocomplete="url" placeholder="https://example.com" required>
<button id="archive-submit" type="submit">Archive</button>
</div>
<!-- Hidden until app.js reports the first status. -->
<div class="progress-wrap" id="progress-wrap" hidden>
<div class="progress-track" aria-hidden="true">
<div class="progress-bar" id="progress-bar"></div>
</div>
<div class="status-line" id="status-line" aria-live="polite"></div>
</div>
</form>
</main>
</body>
</html>

View File

@@ -52,6 +52,10 @@ case "$1" in
archive|help) archive|help)
set -- node src/cli.mjs "$@" set -- node src/cli.mjs "$@"
;; ;;
serve-worker)
shift
set -- node src/worker-server.mjs "$@"
;;
esac esac
"$@" & "$@" &

250
src/archive-catalog.mjs Normal file
View File

@@ -0,0 +1,250 @@
import crypto from "node:crypto";
import fs from "node:fs/promises";
import path from "node:path";
import { defaultArchivePath } from "./asset-inliner.mjs";
// File name of the JSON index kept inside the archive directory.
const INDEX_FILE = ".archive-index.json";
// Only an index file carrying this version number is loaded.
const INDEX_VERSION = 1;
// Matches the provenance comment at the top of an archive file.
// Group 1 is the source URL, group 2 the creation timestamp (ends in "Z").
const COMMENT_RE = /<!--\s*Archived locally\. Source: ([\s\S]*?)\. Created: ([^.]*(?:\.[0-9]+)?Z)\.\s*-->/;
/**
 * Validate a user-supplied URL and return its canonical form.
 *
 * @param {unknown} rawUrl Value to validate; coerced to a trimmed string.
 * @returns {string} `URL#href` of the parsed URL.
 * @throws {Error} When the value is empty, unparseable, or not http/https.
 */
export function normalizeArchiveUrl(rawUrl) {
  const candidate = String(rawUrl || "").trim();
  if (candidate === "") {
    throw new Error("URL is required");
  }

  let parsed = null;
  try {
    parsed = new URL(candidate);
  } catch {
    // Reported below with a friendlier message.
  }
  if (parsed === null) {
    throw new Error("Enter a valid URL");
  }

  if (!["http:", "https:"].includes(parsed.protocol)) {
    throw new Error("Only http and https URLs can be archived");
  }
  return parsed.href;
}
/**
 * Derive a stable, file-name-safe identifier for a source URL.
 *
 * The id is a readable stem built from host and path (at most 72 characters,
 * "archive" when nothing usable remains) followed by the first 16 hex digits
 * of the SHA-256 of the normalized URL.
 *
 * @param {string} sourceUrl URL to identify; normalized first.
 * @returns {string} `<stem>-<digest>`.
 * @throws {Error} When the URL fails normalization.
 */
export function archiveIdForUrl(sourceUrl) {
  const parsed = new URL(normalizeArchiveUrl(sourceUrl));

  const hostAndPath = `${parsed.hostname}${parsed.pathname}`;
  const slug = hostAndPath
    .replace(/\/+$/, "")
    .replace(/[^a-z0-9]+/gi, "-")
    .replace(/^-+|-+$/g, "")
    .slice(0, 72);
  const stem = slug || "archive";

  const hash = crypto.createHash("sha256");
  hash.update(parsed.href);
  const digest = hash.digest("hex").slice(0, 16);

  return `${stem}-${digest}`;
}
/**
 * File name under which the archive of `sourceUrl` is stored.
 *
 * @param {string} sourceUrl URL to identify.
 * @returns {string} The archive id with an `.html` extension.
 */
export function archiveFileNameForUrl(sourceUrl) {
  const id = archiveIdForUrl(sourceUrl);
  return `${id}.html`;
}
/**
 * Index of archive files in a single directory, keyed by normalized source URL.
 *
 * The index lives in memory and is persisted as JSON next to the archives.
 * It is loaded lazily on first use and reconciled with the directory contents
 * at load time.
 */
export class ArchiveCatalog {
  /**
   * @param {object} [options]
   * @param {string} [options.archivePath] Directory holding the archives;
   *   defaults to `defaultArchivePath()`.
   * @param {string} [options.publicBasePath] URL prefix used when building
   *   `archiveUrl`; defaults to "/archives".
   */
  constructor(options = {}) {
    this.archivePath = path.resolve(options.archivePath || defaultArchivePath());
    this.publicBasePath = options.publicBasePath || "/archives";
    this.indexPath = path.join(this.archivePath, INDEX_FILE);
    this.index = {
      version: INDEX_VERSION,
      archives: {}
    };
    this.loadPromise = null;
    this.savePromise = Promise.resolve();
  }

  /**
   * Look up the archive for a URL.
   *
   * @param {string} rawUrl URL to look up; normalized first.
   * @returns {Promise<object|null>} Public record including `archiveUrl`, or
   *   null when no archive file exists for the URL.
   * @throws {Error} When the URL fails normalization.
   */
  async findByUrl(rawUrl) {
    const sourceUrl = normalizeArchiveUrl(rawUrl);
    await this.ensureLoaded();

    const indexed = this.index.archives[sourceUrl];
    if (indexed && await this.hasArchiveFile(indexed.fileName)) {
      return this.toPublicRecord(indexed);
    }

    // The index points at a file that no longer exists: forget the entry.
    if (indexed) {
      delete this.index.archives[sourceUrl];
      await this.saveIndex();
    }

    // Fall back to the deterministic file name in case the file exists but
    // was never indexed.
    const stableFileName = archiveFileNameForUrl(sourceUrl);
    if (await this.hasArchiveFile(stableFileName)) {
      const record = this.upsertRecord(sourceUrl, {
        id: path.basename(stableFileName, ".html"),
        fileName: stableFileName
      });
      await this.saveIndex();
      return this.toPublicRecord(record);
    }

    return null;
  }

  /**
   * Store the outcome of an archive run in the index.
   *
   * @param {string} rawUrl URL that was archived; normalized first.
   * @param {object} result Worker result; `filePath`, `id`, `warnings` and
   *   `externalAssets` are read.
   * @returns {Promise<object>} Public record including `archiveUrl`.
   */
  async recordResult(rawUrl, result) {
    const sourceUrl = normalizeArchiveUrl(rawUrl);
    await this.ensureLoaded();

    const fileName = path.basename(result.filePath || `${result.id}.html`);
    const id = result.id || path.basename(fileName, ".html");
    const record = this.upsertRecord(sourceUrl, {
      id,
      fileName,
      warningsCount: Array.isArray(result.warnings) ? result.warnings.length : 0,
      externalAssetsCount: Array.isArray(result.externalAssets) ? result.externalAssets.length : 0
    });

    await this.saveIndex();
    return this.toPublicRecord(record);
  }

  /**
   * Load the index once; concurrent callers share the same load.
   *
   * A failed load is not cached. Otherwise one transient filesystem error
   * would make every later call reject until the process restarts.
   */
  async ensureLoaded() {
    if (!this.loadPromise) {
      this.loadPromise = this.loadIndex().catch((error) => {
        this.loadPromise = null;
        throw error;
      });
    }
    await this.loadPromise;
  }

  /**
   * Read the index file (when present and usable) and reconcile it with the
   * files actually in the archive directory.
   */
  async loadIndex() {
    await fs.mkdir(this.archivePath, { recursive: true });

    try {
      const data = JSON.parse(await fs.readFile(this.indexPath, "utf8"));
      const usable =
        data &&
        data.version === INDEX_VERSION &&
        data.archives &&
        typeof data.archives === "object" &&
        !Array.isArray(data.archives);
      if (usable) {
        this.index = data;
      }
    } catch (error) {
      // A missing index is normal on first run. A corrupt one (invalid JSON)
      // is recoverable: the directory scan below rebuilds the entries.
      const recoverable = error.code === "ENOENT" || error instanceof SyntaxError;
      if (!recoverable) {
        throw error;
      }
    }

    if (await this.scanArchiveFiles()) {
      await this.saveIndex();
    }
  }

  /**
   * Add archives found on disk that the index does not know, and drop index
   * entries whose file is gone.
   *
   * @returns {Promise<boolean>} True when the in-memory index was modified.
   */
  async scanArchiveFiles() {
    let changed = false;
    const entries = await fs.readdir(this.archivePath, { withFileTypes: true }).catch((error) => {
      if (error.code === "ENOENT") {
        return [];
      }
      throw error;
    });

    for (const entry of entries) {
      if (!entry.isFile() || !entry.name.endsWith(".html")) {
        continue;
      }

      const filePath = path.join(this.archivePath, entry.name);
      const metadata = await readArchiveMetadata(filePath);
      if (!metadata?.sourceUrl) {
        continue;
      }

      let sourceUrl;
      try {
        sourceUrl = normalizeArchiveUrl(metadata.sourceUrl);
      } catch {
        // The embedded source is not an archivable URL; leave it unindexed.
        continue;
      }

      const current = this.index.archives[sourceUrl];
      if (current?.fileName === entry.name) {
        continue;
      }

      this.index.archives[sourceUrl] = {
        id: path.basename(entry.name, ".html"),
        fileName: entry.name,
        sourceUrl,
        createdAt: metadata.createdAt || new Date().toISOString(),
        updatedAt: new Date().toISOString()
      };
      changed = true;
    }

    for (const [sourceUrl, record] of Object.entries(this.index.archives)) {
      if (!record?.fileName || !await this.hasArchiveFile(record.fileName)) {
        delete this.index.archives[sourceUrl];
        changed = true;
      }
    }

    return changed;
  }

  /**
   * Create or replace the in-memory record for a URL (does not persist).
   *
   * `createdAt` and any counts missing from `values` are carried over from
   * the previous record.
   *
   * @param {string} sourceUrl Normalized URL used as the index key.
   * @param {object} values `id`, `fileName` and optional counts.
   * @returns {object} The stored record.
   */
  upsertRecord(sourceUrl, values) {
    const previous = this.index.archives[sourceUrl];
    const now = new Date().toISOString();
    const record = {
      id: values.id,
      fileName: values.fileName,
      sourceUrl,
      createdAt: previous?.createdAt || now,
      updatedAt: now,
      warningsCount: values.warningsCount ?? previous?.warningsCount ?? 0,
      externalAssetsCount: values.externalAssetsCount ?? previous?.externalAssetsCount ?? 0
    };
    this.index.archives[sourceUrl] = record;
    return record;
  }

  /**
   * @param {string} fileName Candidate archive file name.
   * @returns {Promise<boolean>} True when the name is safe and a regular file
   *   of that name exists in the archive directory.
   */
  async hasArchiveFile(fileName) {
    if (!isSafeArchiveFileName(fileName)) {
      return false;
    }
    const stat = await fs.stat(path.join(this.archivePath, fileName)).catch(() => null);
    return !!stat?.isFile();
  }

  /**
   * @param {object} record Index record.
   * @returns {object} Copy of the record with `archiveUrl` added.
   */
  toPublicRecord(record) {
    return {
      ...record,
      archiveUrl: `${this.publicBasePath}/${encodeURIComponent(record.fileName)}`
    };
  }

  /**
   * Persist the index. Writes are serialized and go through a temp file plus
   * rename, so the index file is replaced in one step.
   *
   * @returns {Promise<void>} Settles with the outcome of this write.
   */
  async saveIndex() {
    const writeIndex = async () => {
      await fs.mkdir(this.archivePath, { recursive: true });
      const tmpPath = `${this.indexPath}.${process.pid}.tmp`;
      await fs.writeFile(tmpPath, `${JSON.stringify(this.index, null, 2)}\n`, "utf8");
      await fs.rename(tmpPath, this.indexPath);
    };
    // Queue behind the previous write whether it succeeded or failed. With a
    // plain `.then(writeIndex)` one failed write would leave the chain
    // rejected and every later save would be skipped with the stale error.
    this.savePromise = this.savePromise.then(writeIndex, writeIndex);
    return this.savePromise;
  }
}
/**
 * Decide whether a value may be used as an archive file name.
 *
 * @param {unknown} fileName Candidate name.
 * @returns {boolean} True only for a string with no directory component that
 *   ends in ".html" and does not start with a dot.
 */
export function isSafeArchiveFileName(fileName) {
  if (typeof fileName !== "string") {
    return false;
  }
  if (path.basename(fileName) !== fileName) {
    return false;
  }
  if (!fileName.endsWith(".html")) {
    return false;
  }
  return !fileName.startsWith(".");
}
/**
 * Read the provenance comment from the first 4096 bytes of an archive file.
 *
 * @param {string} filePath Path of the archive file.
 * @returns {Promise<{sourceUrl: string, createdAt: string}|null>} Parsed
 *   metadata, or null when the file cannot be opened or carries no comment.
 */
async function readArchiveMetadata(filePath) {
  let handle;
  try {
    handle = await fs.open(filePath, "r");
  } catch {
    handle = null;
  }
  if (!handle) {
    return null;
  }

  try {
    const scratch = Buffer.alloc(4096);
    const { bytesRead } = await handle.read(scratch, 0, scratch.length, 0);
    const found = COMMENT_RE.exec(scratch.subarray(0, bytesRead).toString("utf8"));
    if (found === null) {
      return null;
    }
    const [, rawSource, createdAt] = found;
    // "- -" is turned back into "--" in the captured source URL.
    return {
      sourceUrl: rawSource.replaceAll("- -", "--"),
      createdAt
    };
  } finally {
    await handle.close();
  }
}

345
src/backend-server.mjs Normal file
View File

@@ -0,0 +1,345 @@
#!/usr/bin/env node
import { createReadStream } from "node:fs";
import fs from "node:fs/promises";
import http from "node:http";
import path from "node:path";
import { randomUUID } from "node:crypto";
import { ArchiveCatalog, archiveIdForUrl, isSafeArchiveFileName, normalizeArchiveUrl } from "./archive-catalog.mjs";
import { defaultArchivePath } from "./asset-inliner.mjs";
// Runtime configuration, all overridable through environment variables.
const PORT = Number(process.env.PORT || 5732);
const ARCHIVE_PATH = path.resolve(process.env.ARCHIVE_PATH || defaultArchivePath());
const ARCHIVE_WORKER_URL = process.env.ARCHIVE_WORKER_URL || "http://127.0.0.1:5733";
// URL prefix under which archive files are served and linked.
const PUBLIC_ARCHIVES_PATH = process.env.PUBLIC_ARCHIVES_PATH || "/archives";
// Upper bound for a single worker request before it is aborted.
const JOB_TIMEOUT_MS = Number(process.env.ARCHIVE_JOB_TIMEOUT_MS || 120000);
const MAX_BODY_BYTES = 64 * 1024;
const catalog = new ArchiveCatalog({
archivePath: ARCHIVE_PATH,
publicBasePath: PUBLIC_ARCHIVES_PATH
});
// In-memory job state: every known job by id, and the id of the job currently
// handling each source URL.
const jobs = new Map();
const activeJobByUrl = new Map();
// Promise chain that runs queued jobs one after another.
let workerQueue = Promise.resolve();
// Any error thrown while routing is reported as JSON, using the error's
// statusCode when it has one and 500 otherwise.
const server = http.createServer(async (req, res) => {
try {
await route(req, res);
} catch (error) {
sendJson(res, error.statusCode || 500, {
ok: false,
error: error.message || "Unexpected error"
});
}
});
server.listen(PORT, () => {
console.log(`archive backend listening on ${PORT}`);
console.log(`archive path: ${ARCHIVE_PATH}`);
console.log(`archive worker: ${ARCHIVE_WORKER_URL}`);
});
// Prune finished jobs every ten minutes; unref so the timer alone does not
// keep the process alive.
const cleanupTimer = setInterval(cleanupJobs, 10 * 60 * 1000);
cleanupTimer.unref?.();
/**
 * Dispatch one backend request.
 *
 * Routes: `GET /healthz`, `GET /api/archives/lookup?url=`,
 * `POST /api/archives`, `GET /api/jobs/:id` and
 * `GET <PUBLIC_ARCHIVES_PATH>/<file>`. Anything else gets a JSON 404.
 *
 * @param {import("node:http").IncomingMessage} req
 * @param {import("node:http").ServerResponse} res
 * @throws {Error} With `statusCode` 400 when the supplied URL is invalid.
 */
async function route(req, res) {
  const requestUrl = new URL(req.url, `http://${req.headers.host || "localhost"}`);
  const { pathname } = requestUrl;
  const isGet = req.method === "GET";

  // normalizeArchiveUrl throws plain Errors, which the server-level handler
  // would report as 500. A bad URL is the client's mistake, so tag it as 400.
  const parseSourceUrl = (raw) => {
    try {
      return normalizeArchiveUrl(raw);
    } catch (error) {
      throw httpError(400, error.message);
    }
  };

  if (isGet && pathname === "/healthz") {
    sendJson(res, 200, { ok: true });
    return;
  }

  if (isGet && pathname === "/api/archives/lookup") {
    const sourceUrl = parseSourceUrl(requestUrl.searchParams.get("url"));
    const archive = await catalog.findByUrl(sourceUrl);
    sendJson(res, 200, {
      ok: true,
      exists: !!archive,
      archive
    });
    return;
  }

  if (req.method === "POST" && pathname === "/api/archives") {
    const body = await readJsonBody(req);
    // Optional chaining: a JSON body of `null` must yield "URL is required",
    // not a TypeError.
    const sourceUrl = parseSourceUrl(body?.url);
    const response = await createOrFindArchive(sourceUrl);
    sendJson(res, response.statusCode, response.body);
    return;
  }

  const jobMatch = pathname.match(/^\/api\/jobs\/([^/]+)$/);
  if (isGet && jobMatch) {
    const job = jobs.get(jobMatch[1]);
    if (!job) {
      sendJson(res, 404, { ok: false, error: "Job not found" });
      return;
    }
    sendJson(res, 200, {
      ok: true,
      job: publicJob(job)
    });
    return;
  }

  if (isGet && pathname.startsWith(`${PUBLIC_ARCHIVES_PATH}/`)) {
    await serveArchive(pathname.slice(PUBLIC_ARCHIVES_PATH.length + 1), res);
    return;
  }

  sendJson(res, 404, { ok: false, error: "Not found" });
}
/**
 * Resolve an archive request to an existing archive, the job already working
 * on that URL, or a newly queued job.
 *
 * @param {string} sourceUrl Normalized URL.
 * @returns {Promise<{statusCode: number, body: object}>} 200 with the archive
 *   when one exists, otherwise 202 with the (active or created) job.
 */
async function createOrFindArchive(sourceUrl) {
  // Response shape shared by the "active" and "created" outcomes.
  const accepted = (mode, job) => ({
    statusCode: 202,
    body: {
      ok: true,
      status: job.status,
      mode,
      job: publicJob(job)
    }
  });

  const stored = await catalog.findByUrl(sourceUrl);
  if (stored) {
    return {
      statusCode: 200,
      body: { ok: true, status: "done", mode: "existing", archive: stored }
    };
  }

  const runningJob = jobs.get(activeJobByUrl.get(sourceUrl));
  if (runningJob && !isTerminal(runningJob.status)) {
    return accepted("active", runningJob);
  }

  const job = {
    id: cryptoRandomId(),
    archiveId: archiveIdForUrl(sourceUrl),
    sourceUrl,
    status: "queued",
    message: "Queued",
    createdAt: new Date().toISOString(),
    startedAt: null,
    updatedAt: new Date().toISOString(),
    finishedAt: null,
    archive: null,
    error: null
  };
  jobs.set(job.id, job);
  activeJobByUrl.set(sourceUrl, job.id);
  enqueueJob(job);

  return accepted("created", job);
}
/**
 * Append a job to the serial worker queue.
 *
 * @param {object} job Job record to execute once earlier jobs have settled.
 */
function enqueueJob(job) {
  // Registered for both outcomes so a failed predecessor cannot stall the queue.
  const start = () => executeJob(job);
  workerQueue = workerQueue.then(start, start);
}
/**
 * Run one queued job: call the worker, record the result in the catalog and
 * update the job record with the outcome. Never rejects; failures are stored
 * on the job.
 *
 * @param {object} job Job record; ignored unless its status is "queued".
 */
async function executeJob(job) {
  if (job.status !== "queued") {
    return;
  }

  const timestamp = () => new Date().toISOString();
  updateJob(job, { status: "running", message: "Archiving", startedAt: timestamp() });

  try {
    const workerResult = await requestWorkerArchive(job.sourceUrl, job.archiveId);
    const archive = await catalog.recordResult(job.sourceUrl, workerResult);
    updateJob(job, {
      status: "done",
      message: "Opening",
      archive,
      finishedAt: timestamp()
    });
  } catch (failure) {
    updateJob(job, {
      status: "failed",
      message: "Failed",
      error: failure.message || "Archive failed",
      finishedAt: timestamp()
    });
  } finally {
    // The URL is free for a new job whatever the outcome was.
    activeJobByUrl.delete(job.sourceUrl);
  }
}
/**
 * Ask the worker service to archive a URL and wait for its answer.
 *
 * @param {string} sourceUrl Normalized URL to archive.
 * @param {string} archiveId Id the worker should store the archive under.
 * @returns {Promise<object>} The worker's `result` field, or the whole
 *   payload when there is none.
 * @throws {Error} On a non-2xx answer, an `ok: false` payload, invalid JSON,
 *   or when JOB_TIMEOUT_MS elapses first.
 */
async function requestWorkerArchive(sourceUrl, archiveId) {
  const endpoint = new URL("/archive", ARCHIVE_WORKER_URL);
  const aborter = new AbortController();
  const deadline = setTimeout(() => aborter.abort(), JOB_TIMEOUT_MS);
  deadline.unref?.();

  try {
    const response = await fetch(endpoint, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({ url: sourceUrl, id: archiveId }),
      signal: aborter.signal
    });
    const rawBody = await response.text();
    const payload = parseJson(rawBody);

    const failed = !response.ok || payload.ok === false;
    if (failed) {
      throw new Error(payload.error || rawBody || `Worker returned ${response.status}`);
    }
    return payload.result || payload;
  } catch (failure) {
    if (failure.name !== "AbortError") {
      throw failure;
    }
    throw new Error(`Archive timed out after ${Math.round(JOB_TIMEOUT_MS / 1000)} seconds`);
  } finally {
    clearTimeout(deadline);
  }
}
/**
 * Stream one archive file to the client.
 *
 * @param {string} rawFileName Still percent-encoded file name taken from the
 *   request path.
 * @param {import("node:http").ServerResponse} res
 */
async function serveArchive(rawFileName, res) {
  let fileName;
  try {
    fileName = decodeURIComponent(rawFileName);
  } catch {
    sendJson(res, 400, { ok: false, error: "Invalid archive path" });
    return;
  }

  if (!isSafeArchiveFileName(fileName)) {
    sendJson(res, 404, { ok: false, error: "Archive not found" });
    return;
  }

  const filePath = path.join(ARCHIVE_PATH, fileName);
  const stat = await fs.stat(filePath).catch(() => null);
  if (!stat?.isFile()) {
    sendJson(res, 404, { ok: false, error: "Archive not found" });
    return;
  }

  // Raw bytes are piped through: the file is already UTF-8 as declared in the
  // content-type, so decoding and re-encoding every chunk adds nothing.
  const stream = createReadStream(filePath);

  // Commit to a 200 only once the file is actually open. Writing the headers
  // up front made the 404 branch below unreachable, so a file removed between
  // stat() and open() produced a 200 with a truncated body.
  stream.once("open", () => {
    res.writeHead(200, {
      "content-type": "text/html; charset=utf-8",
      "cache-control": "no-store"
    });
    stream.pipe(res);
  });

  stream.on("error", () => {
    if (!res.headersSent) {
      sendJson(res, 404, { ok: false, error: "Archive not found" });
    } else {
      res.destroy();
    }
  });

  // pipe() does not close the source when the client disconnects; release the
  // file descriptor explicitly.
  res.on("close", () => stream.destroy());
}
/**
 * Merge `values` into a job record and refresh its `updatedAt` stamp.
 *
 * @param {object} job Job record, mutated in place.
 * @param {object} values Fields to overwrite.
 */
function updateJob(job, values) {
  Object.assign(job, values);
  // Set last so it always wins over an `updatedAt` carried in `values`.
  job.updatedAt = new Date().toISOString();
}
/**
 * Build the API representation of a job.
 *
 * `startedAt` falls back to `createdAt` while the job has not started, and
 * `elapsedMs` is measured from that instant. Internal fields such as
 * `archiveId` are left out.
 *
 * @param {object} job Internal job record.
 * @returns {object} Serializable job view.
 */
function publicJob(job) {
  const { id, sourceUrl, status, message, createdAt, updatedAt, finishedAt, archive, error } = job;
  const startedAt = job.startedAt || createdAt;
  const elapsedMs = startedAt ? Date.now() - Date.parse(startedAt) : 0;

  return {
    id,
    sourceUrl,
    status,
    message,
    createdAt,
    startedAt,
    updatedAt,
    finishedAt,
    elapsedMs,
    archive,
    error
  };
}
/**
 * @param {string} status Job status.
 * @returns {boolean} True for the two final states, "done" and "failed".
 */
function isTerminal(status) {
  switch (status) {
    case "done":
    case "failed":
      return true;
    default:
      return false;
  }
}
/** Forget jobs that reached a terminal state more than one hour ago. */
function cleanupJobs() {
  const oldestKept = Date.now() - 60 * 60 * 1000;
  jobs.forEach((job, id) => {
    const settledAt = Date.parse(job.finishedAt || job.updatedAt);
    if (isTerminal(job.status) && settledAt < oldestKept) {
      jobs.delete(id);
    }
  });
}
/**
 * Read and parse a JSON object from the request body.
 *
 * @param {import("node:http").IncomingMessage} req
 * @returns {Promise<object>} The parsed JSON object.
 * @throws {Error} With `statusCode` 400 when the body is empty, not JSON, or
 *   JSON that is not an object; 413 (from readRequestBody) when too large.
 */
async function readJsonBody(req) {
  const text = await readRequestBody(req, MAX_BODY_BYTES);
  if (!text.trim()) {
    throw httpError(400, "Request body is required");
  }

  let parsed;
  try {
    parsed = JSON.parse(text);
  } catch {
    throw httpError(400, "Request body must be JSON");
  }

  // `null`, arrays and primitives are valid JSON but not a request object.
  // Letting them through made callers fail on property access with a 500.
  if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
    throw httpError(400, "Request body must be a JSON object");
  }
  return parsed;
}
/**
 * Buffer the request body as a UTF-8 string, enforcing a size limit.
 *
 * @param {import("node:http").IncomingMessage} req
 * @param {number} maxBytes Largest accepted body size in bytes.
 * @returns {Promise<string>} The decoded body.
 * @throws {Error} With `statusCode` 413 when the body exceeds `maxBytes`;
 *   stream errors are passed through.
 */
function readRequestBody(req, maxBytes) {
  return new Promise((resolve, reject) => {
    const chunks = [];
    let total = 0;

    const onEnd = () => resolve(Buffer.concat(chunks).toString("utf8"));
    const onData = (chunk) => {
      total += chunk.length;
      if (total <= maxBytes) {
        chunks.push(chunk);
        return;
      }
      // Stop buffering but keep the connection: destroying the request tears
      // down the socket, so the caller's 413 response could never reach the
      // client. The remaining body is drained and discarded instead.
      req.off("data", onData);
      req.off("end", onEnd);
      req.resume();
      reject(httpError(413, "Request body is too large"));
    };

    req.on("data", onData);
    req.on("end", onEnd);
    req.on("error", reject);
  });
}
/**
 * Send a JSON response terminated by a newline; responses are never cached.
 *
 * @param {import("node:http").ServerResponse} res
 * @param {number} statusCode HTTP status.
 * @param {unknown} payload Value to serialize.
 */
function sendJson(res, statusCode, payload) {
  const headers = {
    "content-type": "application/json; charset=utf-8",
    "cache-control": "no-store"
  };
  res.writeHead(statusCode, headers);
  const serialized = JSON.stringify(payload);
  res.end(`${serialized}\n`);
}
/**
 * Parse the worker's response body.
 *
 * @param {string} text Raw response text.
 * @returns {unknown} The parsed value.
 * @throws {Error} Carrying the raw text (or a generic message when the text
 *   is empty) if it is not valid JSON.
 */
function parseJson(text) {
  let value;
  try {
    value = JSON.parse(text);
  } catch {
    throw new Error(text || "Worker returned invalid JSON");
  }
  return value;
}
/**
 * @returns {string} A fresh random UUID used as job id.
 */
function cryptoRandomId() {
  const id = randomUUID();
  return id;
}
/**
 * Create an Error tagged with the HTTP status the server should answer with.
 *
 * @param {number} statusCode HTTP status.
 * @param {string} message Error message.
 * @returns {Error} Error with a `statusCode` property.
 */
function httpError(statusCode, message) {
  return Object.assign(new Error(message), { statusCode });
}

157
src/frontend-server.mjs Normal file
View File

@@ -0,0 +1,157 @@
#!/usr/bin/env node
import fs from "node:fs/promises";
import http from "node:http";
import path from "node:path";
import { Readable } from "node:stream";
import { fileURLToPath } from "node:url";
// ESM has no __dirname; derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const PORT = Number(process.env.PORT || 5731);
const BACKEND_URL = process.env.BACKEND_URL || "http://127.0.0.1:5732";
// Static files are served from the repository's public/ directory.
const PUBLIC_DIR = path.resolve(__dirname, "..", "public");
const MAX_PROXY_BODY_BYTES = 128 * 1024;
// Content types by file extension; anything else is served as a byte stream.
const CONTENT_TYPES = new Map([
[".css", "text/css; charset=utf-8"],
[".html", "text/html; charset=utf-8"],
[".js", "text/javascript; charset=utf-8"],
[".svg", "image/svg+xml"]
]);
// Errors thrown while routing become plain-text responses, using the error's
// statusCode when it has one and 500 otherwise.
const server = http.createServer(async (req, res) => {
try {
await route(req, res);
} catch (error) {
res.writeHead(error.statusCode || 500, {
"content-type": "text/plain; charset=utf-8",
"cache-control": "no-store"
});
res.end(error.message || "Unexpected error");
}
});
server.listen(PORT, () => {
console.log(`archive frontend listening on ${PORT}`);
console.log(`archive backend: ${BACKEND_URL}`);
});
/**
 * Dispatch one frontend request.
 *
 * `/healthz` is answered locally, `/api/*` and `/archives/*` are proxied to
 * the backend, `/assets/*` is served from disk, and every other GET/HEAD path
 * gets index.html so URLs embedded in the path reach the client script.
 *
 * @param {import("node:http").IncomingMessage} req
 * @param {import("node:http").ServerResponse} res
 * @throws {Error} With `statusCode` 405 for non-GET/HEAD static requests.
 */
async function route(req, res) {
  const requestUrl = new URL(req.url, `http://${req.headers.host || "localhost"}`);
  const { pathname } = requestUrl;

  if (pathname === "/healthz") {
    res.writeHead(200, {
      "content-type": "application/json; charset=utf-8",
      "cache-control": "no-store"
    });
    res.end('{"ok":true}\n');
    return;
  }

  const isBackendPath = ["/api/", "/archives/"].some((prefix) => pathname.startsWith(prefix));
  if (isBackendPath) {
    await proxyToBackend(req, res, requestUrl);
    return;
  }

  const isReadOnly = req.method === "GET" || req.method === "HEAD";
  if (!isReadOnly) {
    throw httpError(405, "Method not allowed");
  }

  const staticPath = pathname.startsWith("/assets/") ? pathname : "/index.html";
  await serveStatic(staticPath, res);
}
/**
 * Serve one file from PUBLIC_DIR.
 *
 * @param {string} urlPath Percent-encoded URL path, e.g. "/assets/app.js".
 * @param {import("node:http").ServerResponse} res
 * @throws {Error} With `statusCode` 400 for an undecodable path and 404 when
 *   the path escapes PUBLIC_DIR or does not name a readable file.
 */
async function serveStatic(urlPath, res) {
  let decodedPath;
  try {
    decodedPath = decodeURIComponent(urlPath);
  } catch {
    throw httpError(400, "Invalid path");
  }

  // Resolve inside PUBLIC_DIR and refuse anything that climbs out of it.
  const filePath = path.join(PUBLIC_DIR, decodedPath);
  const relative = path.relative(PUBLIC_DIR, filePath);
  if (relative.startsWith("..") || path.isAbsolute(relative)) {
    throw httpError(404, "Not found");
  }

  const bytes = await fs.readFile(filePath).catch((error) => {
    // A directory (`/assets/`) or a path through a regular file is simply
    // "not a file here"; it used to surface as a 500 carrying the raw
    // filesystem error text.
    if (["ENOENT", "EISDIR", "ENOTDIR"].includes(error.code)) {
      throw httpError(404, "Not found");
    }
    throw error;
  });

  const type = CONTENT_TYPES.get(path.extname(filePath)) || "application/octet-stream";
  res.writeHead(200, {
    "content-type": type,
    "cache-control": "no-store"
  });
  res.end(bytes);
}
/**
 * Forward a request to the backend and stream its answer back.
 *
 * @param {import("node:http").IncomingMessage} req
 * @param {import("node:http").ServerResponse} res
 * @param {URL} requestUrl Parsed request URL; path and query are forwarded.
 * @throws {Error} With `statusCode` 502 when the backend cannot be reached,
 *   413 (from readRequestBody) when the body is too large.
 */
async function proxyToBackend(req, res, requestUrl) {
  const upstreamUrl = new URL(`${requestUrl.pathname}${requestUrl.search}`, BACKEND_URL);

  // Connection-specific (hop-by-hop) request headers describe this hop only
  // and must not be forwarded. content-length and host are recomputed by
  // fetch for the buffered body and the upstream URL.
  const skippedRequestHeaders = new Set([
    "connection",
    "content-length",
    "expect",
    "host",
    "keep-alive",
    "te",
    "trailer",
    "transfer-encoding",
    "upgrade"
  ]);
  const headers = {};
  for (const [key, value] of Object.entries(req.headers)) {
    if (skippedRequestHeaders.has(key.toLowerCase())) {
      continue;
    }
    if (Array.isArray(value)) {
      headers[key] = value.join(", ");
    } else if (value !== undefined) {
      headers[key] = value;
    }
  }

  const body = req.method === "GET" || req.method === "HEAD"
    ? undefined
    : await readRequestBody(req, MAX_PROXY_BODY_BYTES);

  let upstream;
  try {
    upstream = await fetch(upstreamUrl, {
      method: req.method,
      headers,
      body
    });
  } catch {
    // An unreachable backend is a gateway problem, not an internal error.
    throw httpError(502, "Backend unavailable");
  }

  // fetch hands over an already-decoded body, so a content-length that
  // described the encoded bytes would be wrong.
  const wasEncoded = upstream.headers.has("content-encoding");
  const skippedResponseHeaders = new Set([
    "connection",
    "content-encoding",
    "keep-alive",
    "transfer-encoding"
  ]);
  const responseHeaders = {};
  upstream.headers.forEach((value, key) => {
    const name = key.toLowerCase();
    if (skippedResponseHeaders.has(name) || (wasEncoded && name === "content-length")) {
      return;
    }
    responseHeaders[key] = value;
  });

  res.writeHead(upstream.status, responseHeaders);
  if (req.method === "HEAD" || !upstream.body) {
    res.end();
    return;
  }

  const upstreamBody = Readable.fromWeb(upstream.body);
  // Without a listener, an upstream failure mid-stream is an unhandled
  // 'error' event that would crash the process. Abort the response instead.
  upstreamBody.on("error", () => res.destroy());
  // Stop reading from the backend when the client goes away.
  res.on("close", () => upstreamBody.destroy());
  upstreamBody.pipe(res);
}
/**
 * Buffer the request body as raw bytes, enforcing a size limit.
 *
 * @param {import("node:http").IncomingMessage} req
 * @param {number} maxBytes Largest accepted body size in bytes.
 * @returns {Promise<Buffer>} The complete body.
 * @throws {Error} With `statusCode` 413 when the body exceeds `maxBytes`;
 *   stream errors are passed through.
 */
function readRequestBody(req, maxBytes) {
  return new Promise((resolve, reject) => {
    const chunks = [];
    let total = 0;

    const onEnd = () => resolve(Buffer.concat(chunks));
    const onData = (chunk) => {
      total += chunk.length;
      if (total <= maxBytes) {
        chunks.push(chunk);
        return;
      }
      // Stop buffering but keep the connection: destroying the request tears
      // down the socket, so the 413 response could never reach the client.
      // The remaining body is drained and discarded instead.
      req.off("data", onData);
      req.off("end", onEnd);
      req.resume();
      reject(httpError(413, "Request body is too large"));
    };

    req.on("data", onData);
    req.on("end", onEnd);
    req.on("error", reject);
  });
}
/**
 * Create an Error tagged with the HTTP status the server should answer with.
 *
 * @param {number} statusCode HTTP status.
 * @param {string} message Error message, sent as the response body.
 * @returns {Error} Error with a `statusCode` property.
 */
function httpError(statusCode, message) {
  const failure = new Error(message);
  return Object.assign(failure, { statusCode });
}

103
src/worker-server.mjs Normal file
View File

@@ -0,0 +1,103 @@
#!/usr/bin/env node
import http from "node:http";
import path from "node:path";
import { archivePage, defaultArchivePath } from "./archiver.mjs";
import { archiveIdForUrl, normalizeArchiveUrl } from "./archive-catalog.mjs";
// PORT wins over ARCHIVE_WORKER_PORT; 5733 is the fallback.
const PORT = Number(process.env.PORT || process.env.ARCHIVE_WORKER_PORT || 5733);
const ARCHIVE_PATH = path.resolve(process.env.ARCHIVE_PATH || defaultArchivePath());
const MAX_BODY_BYTES = 64 * 1024;
// Promise chain that runs archive requests one at a time.
let queue = Promise.resolve();
// Errors thrown while routing are reported as JSON, using the error's
// statusCode when it has one and 500 otherwise.
const server = http.createServer(async (req, res) => {
try {
await route(req, res);
} catch (error) {
sendJson(res, error.statusCode || 500, {
ok: false,
error: error.message || "Unexpected error"
});
}
});
server.listen(PORT, () => {
console.log(`archive worker listening on ${PORT}`);
console.log(`archive path: ${ARCHIVE_PATH}`);
});
/**
 * Dispatch one worker request.
 *
 * Routes: `GET /healthz` and `POST /archive` with a JSON body
 * `{ url, id? }`. Anything else gets a JSON 404.
 *
 * @param {import("node:http").IncomingMessage} req
 * @param {import("node:http").ServerResponse} res
 * @throws {Error} With `statusCode` 400 when the URL or the id is invalid.
 */
async function route(req, res) {
  const requestUrl = new URL(req.url, `http://${req.headers.host || "localhost"}`);

  if (req.method === "GET" && requestUrl.pathname === "/healthz") {
    sendJson(res, 200, { ok: true });
    return;
  }

  if (req.method === "POST" && requestUrl.pathname === "/archive") {
    const body = await readJsonBody(req);

    // A bad URL is the caller's mistake: answer 400 rather than letting the
    // plain Error fall through to the generic 500 handler.
    let sourceUrl;
    try {
      sourceUrl = normalizeArchiveUrl(body?.url);
    } catch (error) {
      throw httpError(400, error.message);
    }

    const requestedId = typeof body?.id === "string" ? body.id.trim() : "";
    const id = requestedId || archiveIdForUrl(sourceUrl);
    // NOTE(review): the id is handed to archivePage together with the archive
    // directory and presumably becomes part of the output file name — confirm.
    // Reject path separators and dot-prefixed names so a caller-chosen id
    // cannot point outside that directory.
    if (/[\\/]/.test(id) || id.startsWith(".")) {
      throw httpError(400, "Invalid archive id");
    }

    const result = await enqueueArchive(sourceUrl, id);
    sendJson(res, 200, {
      ok: true,
      result
    });
    return;
  }

  sendJson(res, 404, { ok: false, error: "Not found" });
}
/**
 * Archive a page after all previously queued archives have settled.
 *
 * @param {string} sourceUrl Normalized URL to archive.
 * @param {string} id Id handed to `archivePage`.
 * @returns {Promise<object>} Settles with the outcome of this archive run.
 */
function enqueueArchive(sourceUrl, id) {
  const options = { archivePath: ARCHIVE_PATH, id };
  // Registered for both outcomes so a failed predecessor cannot stall the queue.
  const task = () => archivePage(sourceUrl, options);
  const scheduled = queue.then(task, task);
  queue = scheduled;
  return scheduled;
}
/**
 * Read and parse a JSON object from the request body.
 *
 * @param {import("node:http").IncomingMessage} req
 * @returns {Promise<object>} The parsed JSON object.
 * @throws {Error} With `statusCode` 400 when the body is empty, not JSON, or
 *   JSON that is not an object; 413 (from readRequestBody) when too large.
 */
async function readJsonBody(req) {
  const text = await readRequestBody(req, MAX_BODY_BYTES);
  if (!text.trim()) {
    throw httpError(400, "Request body is required");
  }

  let parsed;
  try {
    parsed = JSON.parse(text);
  } catch {
    throw httpError(400, "Request body must be JSON");
  }

  // `null`, arrays and primitives are valid JSON but not a request object.
  // Letting them through made callers fail on property access with a 500.
  if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
    throw httpError(400, "Request body must be a JSON object");
  }
  return parsed;
}
/**
 * Buffer the request body as a UTF-8 string, enforcing a size limit.
 *
 * @param {import("node:http").IncomingMessage} req
 * @param {number} maxBytes Largest accepted body size in bytes.
 * @returns {Promise<string>} The decoded body.
 * @throws {Error} With `statusCode` 413 when the body exceeds `maxBytes`;
 *   stream errors are passed through.
 */
function readRequestBody(req, maxBytes) {
  return new Promise((resolve, reject) => {
    const chunks = [];
    let total = 0;

    const onEnd = () => resolve(Buffer.concat(chunks).toString("utf8"));
    const onData = (chunk) => {
      total += chunk.length;
      if (total <= maxBytes) {
        chunks.push(chunk);
        return;
      }
      // Stop buffering but keep the connection: destroying the request tears
      // down the socket, so the caller's 413 response could never reach the
      // client. The remaining body is drained and discarded instead.
      req.off("data", onData);
      req.off("end", onEnd);
      req.resume();
      reject(httpError(413, "Request body is too large"));
    };

    req.on("data", onData);
    req.on("end", onEnd);
    req.on("error", reject);
  });
}
/**
 * Sends `payload` as a newline-terminated JSON response that must not be cached.
 *
 * @param {import("node:http").ServerResponse} res Response to write to.
 * @param {number} statusCode HTTP status code.
 * @param {*} payload Value to serialise with JSON.stringify().
 * @throws {TypeError} When `payload` cannot be serialised; nothing has been
 *   written to `res` in that case, so the caller can still send an error response.
 */
function sendJson(res, statusCode, payload) {
  // Serialise before touching the response: if JSON.stringify() throws after
  // writeHead(), the response is left half-started and no error can be sent.
  const body = `${JSON.stringify(payload)}\n`;
  res.writeHead(statusCode, {
    "content-type": "application/json; charset=utf-8",
    "cache-control": "no-store"
  });
  res.end(body);
}
/**
 * Builds an Error tagged with the HTTP status that should be reported for it.
 *
 * @param {number} statusCode Stored on the error as `statusCode`.
 * @param {string} message Error message.
 * @returns {Error}
 */
function httpError(statusCode, message) {
  return Object.assign(new Error(message), { statusCode });
}

View File

@@ -0,0 +1,49 @@
import assert from "node:assert/strict";
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import test from "node:test";
import { ArchiveCatalog, archiveFileNameForUrl, archiveIdForUrl, normalizeArchiveUrl } from "../src/archive-catalog.mjs";
// normalizeArchiveUrl() trims surrounding whitespace and rejects anything that
// is not an http(s) URL.
test("normalizes only http and https archive URLs", () => {
  const normalized = normalizeArchiveUrl(" https://example.com/path ");
  assert.equal(normalized, "https://example.com/path");

  const rejectedInputs = [
    ["file:///tmp/page.html", /Only http and https/],
    ["not a url", /valid URL/]
  ];
  for (const [input, expectedMessage] of rejectedInputs) {
    assert.throws(() => normalizeArchiveUrl(input), expectedMessage);
  }
});
// The id is deterministic for a URL, changes when the query string changes,
// and has the form "<slug>-<16 hex characters>".
test("builds stable archive ids from the full URL", () => {
  const [first, second, third] = [
    "https://example.com/article?x=1",
    "https://example.com/article?x=1",
    "https://example.com/article?x=2"
  ].map((url) => archiveIdForUrl(url));

  assert.equal(first, second);
  assert.notEqual(first, third);
  assert.match(first, /^example-com-article-[a-f0-9]{16}$/);
});
// Writes a file under the name archiveFileNameForUrl() yields for a URL and
// checks that ArchiveCatalog.findByUrl() returns a record pointing at it.
test("finds stable archive files without rerendering", async () => {
  const archivePath = await fs.mkdtemp(path.join(os.tmpdir(), "archive-catalog-"));
  try {
    const sourceUrl = "https://example.com/";
    const fileName = archiveFileNameForUrl(sourceUrl);
    await fs.writeFile(path.join(archivePath, fileName), "<!doctype html>", "utf8");
    const catalog = new ArchiveCatalog({ archivePath });
    const record = await catalog.findByUrl(sourceUrl);
    assert.equal(record.fileName, fileName);
    assert.equal(record.archiveUrl, `/archives/${encodeURIComponent(fileName)}`);
  } finally {
    // Remove the scratch directory even when an assertion fails, so repeated
    // test runs do not pile up directories in the OS temp folder.
    await fs.rm(archivePath, { recursive: true, force: true });
  }
});
// Stores an archive under a timestamped file name whose HTML comment names the
// source URL, then checks that findByUrl() resolves that URL to the file.
test("indexes older timestamped archives from the archive comment", async () => {
  const archivePath = await fs.mkdtemp(path.join(os.tmpdir(), "archive-catalog-"));
  try {
    await fs.writeFile(
      path.join(archivePath, "example-com-2026-05-16T00-00-00-000Z.html"),
      '<!doctype html>\n<!-- Archived locally. Source: https://example.com/story. Created: 2026-05-16T00:00:00.000Z. -->\n<html></html>',
      "utf8"
    );
    const catalog = new ArchiveCatalog({ archivePath });
    const record = await catalog.findByUrl("https://example.com/story");
    assert.equal(record.fileName, "example-com-2026-05-16T00-00-00-000Z.html");
    assert.equal(record.sourceUrl, "https://example.com/story");
  } finally {
    // Remove the scratch directory even when an assertion fails, so repeated
    // test runs do not pile up directories in the OS temp folder.
    await fs.rm(archivePath, { recursive: true, force: true });
  }
});

26
test/archiver.test.mjs Normal file
View File

@@ -0,0 +1,26 @@
import assert from "node:assert/strict";
import test from "node:test";
import { renderPage } from "../src/archiver.mjs";
// The page adds a rule through CSSStyleSheet.insertRule(), which leaves the
// <style> element's text empty; the rendered output must still contain the rule.
test("renderPage serializes CSSOM-inserted style rules", async () => {
  // The fixture lines stay unindented so the markup passed to renderPage() is unchanged.
  const pageHtml = `<!doctype html>
<html>
<head>
<style id="runtime-style"></style>
<script>
document
.getElementById("runtime-style")
.sheet
.insertRule(".runtime-rule { color: rgb(1, 2, 3); }", 0);
</script>
</head>
<body><div class="runtime-rule">Styled by CSSOM</div></body>
</html>`;
  const dataUrl = `data:text/html,${encodeURIComponent(pageHtml)}`;
  const renderOptions = { userscriptDelay: 0 };

  const output = await renderPage(dataUrl, renderOptions);

  assert.match(output, /<style id="runtime-style">[\s\S]*\.runtime-rule/);
  assert.match(output, /color:\s*rgb\(1,\s*2,\s*3\)/);
});

View File

@@ -0,0 +1,83 @@
import assert from "node:assert/strict";
import test from "node:test";
import { AssetInliner, splitSrcset } from "../src/asset-inliner.mjs";
import { findExternalAssetRefs } from "../src/archiver.mjs";
// Expects both srcset candidates and the image nested inside the srcdoc to be
// fetched, no js.stripe.com reference to remain in the output, and no warnings.
test("inlines real srcset attributes without reading escaped src text from srcdoc", async () => {
  const requestedUrls = [];
  const assetInliner = new AssetInliner();
  // Replace network fetching with a stub that records the URL and returns a fixed payload.
  assetInliner.fetchAsset = async (rawUrl) => {
    requestedUrls.push(rawUrl);
    return { bytes: Buffer.from("asset"), contentType: "image/png" };
  };

  // The fixture lines stay unindented so the markup under test is unchanged.
  const markup = `
<img srcset="/small.png 1x, /large.png 2x">
<iframe srcdoc="&lt;script src=&quot;https://js.stripe.com/v3/foo.js&quot;&gt;&lt;/script&gt;&lt;img src=&quot;/nested.png&quot;&gt;"></iframe>
`;
  const inlined = await assetInliner.inlineHtml(markup, "https://example.com/article");

  const expectedUrls = [
    "https://example.com/large.png",
    "https://example.com/nested.png",
    "https://example.com/small.png",
  ];
  // Fetch order is not asserted, so compare the sorted list.
  assert.deepEqual(requestedUrls.sort(), expectedUrls);
  assert.doesNotMatch(inlined, /js\.stripe\.com/);
  assert.equal(assetInliner.warnings.length, 0);
});
// Only the real <img src> is reported; the entity-escaped src inside the
// srcdoc attribute value is not.
test("external asset reporting ignores escaped nested attributes inside srcdoc", () => {
  // The fixture lines stay unindented so the markup under test is unchanged.
  const markup = `
<iframe srcdoc="&lt;img src=&quot;https://tracker.example/pixel.gif&quot;&gt;"></iframe>
<img src="https://cdn.example/picture.jpg">
`;
  const reported = findExternalAssetRefs(markup);
  const expected = ["https://cdn.example/picture.jpg"];
  assert.deepEqual(reported, expected);
});
// URLs that contain commas (CDN transform segments) must survive both
// splitSrcset() and AssetInliner.inlineSrcset() as whole candidates.
test("srcset parsing keeps image CDN transform commas inside URLs", async () => {
  const candidates = [
    "https://media.example/photos/id/master/w_120,c_limit/photo.jpg 120w",
    "https://media.example/photos/id/master/w_240,c_limit/photo.jpg 240w"
  ];
  const parsed = splitSrcset(candidates.join(", "));
  assert.deepEqual(parsed, [
    "https://media.example/photos/id/master/w_120,c_limit/photo.jpg 120w",
    "https://media.example/photos/id/master/w_240,c_limit/photo.jpg 240w"
  ]);

  const requestedUrls = [];
  const assetInliner = new AssetInliner();
  // Replace network fetching with a stub that records the URL and returns a fixed payload.
  assetInliner.fetchAsset = async (rawUrl) => {
    requestedUrls.push(rawUrl);
    return { bytes: Buffer.from("asset"), contentType: "image/jpeg" };
  };

  const srcsetValue =
    "https://media.example/photos/id/master/w_120,c_limit/photo.jpg 120w, https://media.example/photos/id/master/w_240,c_limit/photo.jpg 240w";
  await assetInliner.inlineSrcset(srcsetValue, "https://example.com/article");

  assert.deepEqual(requestedUrls, [
    "https://media.example/photos/id/master/w_120,c_limit/photo.jpg",
    "https://media.example/photos/id/master/w_240,c_limit/photo.jpg"
  ]);
});
// The data: candidate is not reported, and comma-bearing URLs in srcset and
// imagesrcset are each reported as one reference.
test("external asset reporting parses srcset-like attributes without splitting URL commas", () => {
  // The fixture lines stay unindented so the markup under test is unchanged.
  const markup = `
<img srcset="data:image/gif;base64,R0lGODlhAQABAAAAACw= 1x, https://cdn.example/image.jpg 2x">
<link rel="preload" as="image" imagesrcset="https://media.example/photos/id/master/w_120,c_limit/photo.jpg 120w, https://media.example/photos/id/master/w_240,c_limit/photo.jpg 240w">
`;
  const expected = [
    "https://cdn.example/image.jpg",
    "https://media.example/photos/id/master/w_120,c_limit/photo.jpg",
    "https://media.example/photos/id/master/w_240,c_limit/photo.jpg"
  ];
  const reported = findExternalAssetRefs(markup);
  assert.deepEqual(reported, expected);
});