adds frontend

This commit is contained in:
2026-05-16 16:36:51 -07:00
parent 40c63dc4e2
commit c00913ec35
17 changed files with 1473 additions and 0 deletions

View File

@@ -22,6 +22,7 @@ COPY . .
RUN mkdir -p /archives && chmod 0777 /archives
VOLUME ["/archives"]
EXPOSE 5733
ENTRYPOINT ["dumb-init", "--", "/app/scripts/archive-worker-entrypoint.sh"]
CMD ["help"]

16
Dockerfile.web Normal file
View File

@@ -0,0 +1,16 @@
# Runtime image for the Node web services (the same image runs either server script).
FROM node:22-slim
WORKDIR /app
ENV NODE_ENV=production
# Keep dependency installation from downloading Playwright browsers.
# NOTE(review): presumably browsers are only needed in the worker image - confirm.
ENV PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1
# Copy only the manifests first so the install layer is reused when sources change.
COPY package.json package-lock.json ./
RUN npm ci --omit=dev
COPY src ./src
COPY public ./public
EXPOSE 5731 5732
# Default process is the backend; a different script can be selected by overriding the command.
CMD ["node", "src/backend-server.mjs"]

View File

@@ -51,3 +51,23 @@ For visual debugging, expose VNC from the worker:
```
The worker image starts Xvfb internally, so callers do not need to mount the host X11 socket or override the entrypoint.
## Web UI
The web path is split into three roles:
- `src/frontend-server.mjs` serves the static UI and proxies `/api/*` and `/archives/*` to the backend.
- `src/backend-server.mjs` manages archive lookup, job state, and the archive index.
- `src/worker-server.mjs` runs inside the browser worker container and wraps `archivePage()` over HTTP.
Run the full stack with:
```sh
docker compose -f docker-compose.example.yml up --build
```
Then open `http://localhost:5731`. Direct path archival is supported, for example:
```text
http://localhost:5731/https://example.com
```

1
Tiltfile Normal file
View File

@@ -0,0 +1 @@
docker_compose("docker-compose.example.yml")

View File

@@ -0,0 +1,66 @@
# Example stack with three services: frontend -> backend -> browser.
services:
# Runs src/frontend-server.mjs; the only service with a published host port (5731).
frontend:
build:
context: .
dockerfile: Dockerfile.web
image: local-page-archiver-web:latest
command: ["node", "src/frontend-server.mjs"]
environment:
PORT: "5731"
BACKEND_URL: "http://backend:5732"
ports:
- "5731:5731"
# Start only after the backend reports healthy.
depends_on:
backend:
condition: service_healthy
healthcheck:
test: ["CMD", "node", "-e", "fetch('http://127.0.0.1:5731/healthz').then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))"]
interval: 10s
timeout: 3s
retries: 5
# Runs src/backend-server.mjs from the same image; no host port is published for it.
backend:
build:
context: .
dockerfile: Dockerfile.web
image: local-page-archiver-web:latest
command: ["node", "src/backend-server.mjs"]
environment:
PORT: "5732"
ARCHIVE_PATH: /archives
ARCHIVE_WORKER_URL: "http://browser:5733"
PUBLIC_ARCHIVES_PATH: /archives
# Mounts the same named volume as the browser service.
volumes:
- archives:/archives
depends_on:
browser:
condition: service_healthy
healthcheck:
test: ["CMD", "node", "-e", "fetch('http://127.0.0.1:5732/healthz').then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))"]
interval: 10s
timeout: 3s
retries: 5
# Archive worker built from the main Dockerfile and started with the "serve-worker" command.
browser:
build:
context: .
dockerfile: Dockerfile
image: local-page-archiver-browser:latest
command: ["serve-worker"]
environment:
PORT: "5733"
ARCHIVE_PATH: /archives
ARCHIVE_WORKER_XVFB: "1"
volumes:
- archives:/archives
# Port 5733 is exposed to other services only, not published on the host.
expose:
- "5733"
# NOTE(review): presumably sized for the browser's shared-memory needs - confirm.
shm_size: 1gb
healthcheck:
test: ["CMD", "node", "-e", "fetch('http://127.0.0.1:5733/healthz').then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))"]
interval: 10s
timeout: 3s
retries: 10
# Named volume mounted at /archives by both backend and browser.
volumes:
archives:

View File

@@ -12,6 +12,9 @@
"archive": "node src/cli.mjs archive",
"container:archive": "node src/container-runner.mjs archive",
"container:build": "node src/container-runner.mjs build",
"start:backend": "node src/backend-server.mjs",
"start:frontend": "node src/frontend-server.mjs",
"start:worker": "node src/worker-server.mjs",
"test": "node --test test/*.test.mjs",
"install-browsers": "playwright install chromium"
},

153
public/assets/app.css Normal file
View File

@@ -0,0 +1,153 @@
:root {
color-scheme: light;
--bg: #f6f5f1;
--surface: #ffffff;
--ink: #161616;
--muted: #696963;
--line: #d8d6ce;
--accent: #2f7664;
--accent-strong: #245d50;
--danger: #a43d32;
font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
}
* {
box-sizing: border-box;
}
html,
body {
min-height: 100%;
}
body {
margin: 0;
background: var(--bg);
color: var(--ink);
}
button,
input {
font: inherit;
letter-spacing: 0;
}
.shell {
min-height: 100vh;
display: grid;
place-items: center;
padding: 24px;
}
.archive-box {
width: min(680px, 100%);
background: var(--surface);
border: 1px solid var(--line);
border-radius: 8px;
padding: 12px;
box-shadow: 0 18px 42px rgba(28, 25, 19, 0.08);
}
.input-row {
display: grid;
grid-template-columns: 1fr auto;
gap: 10px;
}
input {
min-width: 0;
width: 100%;
height: 48px;
border: 1px solid var(--line);
border-radius: 6px;
color: var(--ink);
background: #fbfaf7;
padding: 0 14px;
outline: none;
}
input:focus {
border-color: var(--accent);
box-shadow: 0 0 0 3px rgba(47, 118, 100, 0.16);
}
button {
height: 48px;
min-width: 112px;
border: 0;
border-radius: 6px;
color: #ffffff;
background: var(--accent);
padding: 0 18px;
cursor: pointer;
}
button:hover {
background: var(--accent-strong);
}
button:disabled {
cursor: wait;
opacity: 0.72;
}
.progress-wrap {
padding-top: 12px;
}
.progress-track {
height: 6px;
overflow: hidden;
border-radius: 999px;
background: #e7e4dc;
}
.progress-bar {
width: 0%;
height: 100%;
border-radius: inherit;
background: var(--accent);
transition: width 220ms ease;
}
.status-line {
min-height: 22px;
margin-top: 8px;
color: var(--muted);
font-size: 14px;
line-height: 22px;
}
.status-line.error {
color: var(--danger);
}
.sr-only {
position: absolute;
width: 1px;
height: 1px;
overflow: hidden;
clip: rect(0, 0, 0, 0);
white-space: nowrap;
clip-path: inset(50%);
}
@media (max-width: 560px) {
.shell {
align-items: start;
padding: 16px;
padding-top: 20vh;
}
.archive-box {
padding: 10px;
}
.input-row {
grid-template-columns: 1fr;
}
button {
width: 100%;
}
}

169
public/assets/app.js Normal file
View File

@@ -0,0 +1,169 @@
// Cached references to the archive form controls and the progress/status widgets.
const form = document.querySelector("#archive-form");
const input = document.querySelector("#archive-url");
const button = document.querySelector("#archive-submit");
const progressWrap = document.querySelector("#progress-wrap");
const progressBar = document.querySelector("#progress-bar");
const statusLine = document.querySelector("#status-line");
// Interval handles for job polling and for the progress animation; null while idle.
let pollTimer = null;
let visualTimer = null;
// Epoch milliseconds at which the watched job is considered to have started;
// optimisticProgress() derives the bar position from it.
let startedAt = Date.now();
// Submit through submitArchive() instead of letting the browser post the form.
form.addEventListener("submit", (event) => {
event.preventDefault();
submitArchive(input.value);
});
// Direct path archival: when the page was opened at /<http(s) URL>, start archiving
// that URL immediately; otherwise just focus the input.
const pathUrl = urlFromPath();
if (pathUrl) {
input.value = pathUrl;
submitArchive(pathUrl);
} else {
input.focus();
}
/**
 * Posts `rawUrl` to `/api/archives` and reacts to the reply: opens the archive
 * when one is returned, watches the job when one is returned, and otherwise
 * shows an error and re-enables the form.
 *
 * @param {string} rawUrl - URL exactly as typed or taken from the path.
 * @returns {Promise<void>}
 */
async function submitArchive(rawUrl) {
  stopTimers();
  setBusy(true);
  setStatus("Checking", 8);

  try {
    const reply = await fetch("/api/archives", {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({ url: rawUrl })
    });
    const payload = await readApiResponse(reply);

    if (payload.archive?.archiveUrl) {
      openArchive(payload.archive.archiveUrl);
    } else if (payload.job?.id) {
      watchJob(payload.job);
    } else {
      throw new Error(payload.error || "Archive did not start");
    }
  } catch (failure) {
    // Any failure (network, API error, unexpected payload) ends the busy state.
    setError(failure.message || "Archive failed");
    setBusy(false);
  }
}
/**
 * Shows the state of `job` and polls `/api/jobs/<id>` until it finishes.
 *
 * Differences from a naive interval loop:
 * - no timers are started when the job handed in is already finished
 *   (updateFromJob has handled it and stopped the timers);
 * - at most one poll request is in flight, so a slow backend cannot cause
 *   overlapping responses;
 * - a response that arrives after the timers were stopped (or after a newer
 *   watch replaced this one) is ignored instead of redirecting or reporting twice.
 *
 * @param {{id: string, status: string, startedAt?: string, createdAt?: string, archive?: {archiveUrl?: string}}} job
 */
function watchJob(job) {
  startedAt = Date.parse(job.startedAt || job.createdAt) || Date.now();
  updateFromJob(job);

  // Same terminal conditions updateFromJob() acts on.
  const alreadyFinished =
    job.status === "failed" || (job.status === "done" && Boolean(job.archive?.archiveUrl));
  if (alreadyFinished) {
    return;
  }

  visualTimer = window.setInterval(updateVisualProgress, 250);

  let requestInFlight = false;
  const timer = window.setInterval(async () => {
    if (requestInFlight) {
      return;
    }
    requestInFlight = true;
    try {
      const response = await fetch(`/api/jobs/${encodeURIComponent(job.id)}`);
      const data = await readApiResponse(response);
      // pollTimer changes when stopTimers() ran or another watch started meanwhile.
      if (pollTimer === timer) {
        updateFromJob(data.job);
      }
    } catch (error) {
      if (pollTimer === timer) {
        stopTimers();
        setError(error.message || "Archive failed");
        setBusy(false);
      }
    } finally {
      requestInFlight = false;
    }
  }, 850);
  pollTimer = timer;
}
/**
 * Applies one job snapshot to the UI.
 *
 * - "done" with an archive URL: stop timers, show "Opening", navigate.
 * - "failed": stop timers, show the error, re-enable the form.
 * - anything else: refresh the start time and show "Queued" or the elapsed seconds.
 *
 * @param {object} job - Job object as returned by the API.
 */
function updateFromJob(job) {
  const status = job.status;

  if (status === "done" && job.archive?.archiveUrl) {
    stopTimers();
    setStatus("Opening", 100);
    openArchive(job.archive.archiveUrl);
    return;
  }

  if (status === "failed") {
    stopTimers();
    setError(job.error || "Archive failed");
    setBusy(false);
    return;
  }

  // Keep the previous start time when the snapshot carries no parseable timestamp.
  startedAt = Date.parse(job.startedAt || job.createdAt) || startedAt;
  const seconds = Math.max(0, Math.round((Date.now() - startedAt) / 1000));
  const label = status === "queued" ? "Queued" : `Archiving ${seconds}s`;
  setStatus(label, optimisticProgress());
}
/** Moves the progress bar to the current optimistic estimate while the progress area is visible. */
function updateVisualProgress() {
  if (progressWrap.hidden) {
    return;
  }
  progressBar.style.width = `${optimisticProgress()}%`;
}
/**
 * Estimates a progress percentage purely from the time elapsed since `startedAt`.
 *
 * Under 1 s: 12. From 1 s to 12 s: 12 plus 6.3 per second, capped at 88.
 * After that: 88 plus 0.6 per second, capped at 96.
 *
 * @returns {number} Percentage for the progress bar.
 */
function optimisticProgress() {
  const seconds = Math.max(0, (Date.now() - startedAt) / 1000);
  if (seconds < 1) {
    return 12;
  }
  const [ceiling, estimate] = seconds < 12
    ? [88, 12 + seconds * 6.3]
    : [96, 88 + (seconds - 12) * 0.6];
  return Math.min(ceiling, estimate);
}
/**
 * Parses an API reply as JSON and rejects unless it succeeded.
 *
 * @param {Response} response - Result of a fetch() call.
 * @returns {Promise<any>} Parsed body (null when the body is not JSON but the status is OK).
 * @throws {Error} With the body's `error` text, or a status-based message, when the
 *   HTTP status is not OK or the body carries `ok: false`.
 */
async function readApiResponse(response) {
  const payload = await response.json().catch(() => null);
  const succeeded = response.ok && payload?.ok !== false;
  if (succeeded) {
    return payload;
  }
  throw new Error(payload?.error || `Request failed with ${response.status}`);
}
/**
 * Locks or unlocks the form: the submit button is disabled and the input made read-only.
 *
 * @param {boolean} isBusy
 */
function setBusy(isBusy) {
  const toggles = [
    [button, "disabled"],
    [input, "readOnly"]
  ];
  for (const [control, property] of toggles) {
    control[property] = isBusy;
  }
}
/**
 * Shows a normal (non-error) status line and sets the progress bar width.
 *
 * @param {string} text - Status text.
 * @param {number} progress - Percentage; clamped to the 0-100 range.
 */
function setStatus(text, progress) {
  const clamped = Math.max(0, Math.min(100, progress));
  progressWrap.hidden = false;
  statusLine.classList.remove("error");
  statusLine.textContent = text;
  progressBar.style.width = `${clamped}%`;
}
/**
 * Shows `text` as an error: reveals the progress area, marks the status line
 * with the "error" class and fills the bar.
 *
 * @param {string} text - Error message to display.
 */
function setError(text) {
  progressWrap.hidden = false;
  const line = statusLine;
  line.classList.add("error");
  line.textContent = text;
  progressBar.style.width = "100%";
}
/** Cancels the polling and animation intervals, if running, and resets their handles to null. */
function stopTimers() {
  const cancel = (handle) => {
    if (!handle) {
      return handle;
    }
    window.clearInterval(handle);
    return null;
  };
  pollTimer = cancel(pollTimer);
  visualTimer = cancel(visualTimer);
}
/**
 * Navigates the current window to the archived page.
 *
 * @param {string} archiveUrl - URL of the stored archive.
 */
function openArchive(archiveUrl) {
  const { location } = window;
  location.assign(archiveUrl);
}
/**
 * Extracts a target URL from the page path for direct path archival, e.g.
 * `/https://example.com/page?x=1` -> `https://example.com/page?x=1`.
 *
 * A path that already reads as a literal http(s) URL is used verbatim, so the
 * target's own percent escapes (`%2F`, `%3F`, `%23`, a bare `%`) are preserved.
 * Only when the path is not a literal URL is it percent-decoded, which supports
 * the fully encoded form (`/https%3A%2F%2Fexample.com`).
 *
 * @returns {string} The URL including the current query string and hash, or ""
 *   when the path is empty, belongs to the app itself, or is not an http(s) URL.
 */
function urlFromPath() {
  const rawPath = window.location.pathname.replace(/^\/+/, "");
  const ownPrefixes = ["assets/", "api/", "archives/"];
  if (!rawPath || ownPrefixes.some((prefix) => rawPath.startsWith(prefix))) {
    return "";
  }

  const isHttpUrl = (text) => /^https?:\/\//i.test(text);
  let target = rawPath;
  if (!isHttpUrl(target)) {
    try {
      target = decodeURIComponent(rawPath);
    } catch {
      return "";
    }
    if (!isHttpUrl(target)) {
      return "";
    }
  }
  return `${target}${window.location.search}${window.location.hash}`;
}

27
public/index.html Normal file
View File

@@ -0,0 +1,27 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Archive</title>
<link rel="stylesheet" href="/assets/app.css">
<script src="/assets/app.js" defer></script>
</head>
<body>
<main class="shell">
<form class="archive-box" id="archive-form" autocomplete="off">
<label class="sr-only" for="archive-url">URL</label>
<div class="input-row">
<input id="archive-url" name="url" type="url" inputmode="url" spellcheck="false" autocomplete="url" placeholder="https://example.com" required>
<button id="archive-submit" type="submit">Archive</button>
</div>
<div class="progress-wrap" id="progress-wrap" hidden>
<div class="progress-track" aria-hidden="true">
<div class="progress-bar" id="progress-bar"></div>
</div>
<div class="status-line" id="status-line" aria-live="polite"></div>
</div>
</form>
</main>
</body>
</html>

View File

@@ -52,6 +52,10 @@ case "$1" in
archive|help)
set -- node src/cli.mjs "$@"
;;
serve-worker)
shift
set -- node src/worker-server.mjs "$@"
;;
esac
"$@" &

250
src/archive-catalog.mjs Normal file
View File

@@ -0,0 +1,250 @@
import crypto from "node:crypto";
import fs from "node:fs/promises";
import path from "node:path";
import { defaultArchivePath } from "./asset-inliner.mjs";
// Name of the JSON index file kept inside the archive directory.
const INDEX_FILE = ".archive-index.json";
// Index schema version; a loaded index whose version differs is not used.
const INDEX_VERSION = 1;
// Matches the "Archived locally" HTML comment; group 1 is the source URL and
// group 2 the creation timestamp.
const COMMENT_RE = /<!--\s*Archived locally\. Source: ([\s\S]*?)\. Created: ([^.]*(?:\.[0-9]+)?Z)\.\s*-->/;
/**
 * Validates a user-supplied URL and returns its canonical `href`.
 *
 * @param {unknown} rawUrl - Value to validate; surrounding whitespace is ignored.
 * @returns {string} Serialized form produced by the URL parser.
 * @throws {Error} When the value is empty, unparsable, or not http/https.
 */
export function normalizeArchiveUrl(rawUrl) {
  const candidate = String(rawUrl || "").trim();
  if (candidate === "") {
    throw new Error("URL is required");
  }

  let parsed;
  try {
    parsed = new URL(candidate);
  } catch {
    throw new Error("Enter a valid URL");
  }

  const allowedProtocols = ["http:", "https:"];
  if (!allowedProtocols.includes(parsed.protocol)) {
    throw new Error("Only http and https URLs can be archived");
  }
  return parsed.href;
}
/**
 * Builds the archive ID for a URL: a readable slug of host + path (at most 72
 * characters, "archive" when empty) followed by the first 16 hex digits of the
 * SHA-256 of the normalized URL.
 *
 * @param {string} sourceUrl - URL to identify; normalized first.
 * @returns {string} ID of the form `<slug>-<16 hex chars>`.
 * @throws {Error} When the URL is rejected by normalizeArchiveUrl().
 */
export function archiveIdForUrl(sourceUrl) {
  const parsed = new URL(normalizeArchiveUrl(sourceUrl));

  const slug = `${parsed.hostname}${parsed.pathname}`
    .replace(/\/+$/, "")
    .replace(/[^a-z0-9]+/gi, "-")
    .replace(/^-+|-+$/g, "")
    .slice(0, 72);

  const hasher = crypto.createHash("sha256");
  hasher.update(parsed.href);
  const digest = hasher.digest("hex").slice(0, 16);

  return `${slug || "archive"}-${digest}`;
}
/**
 * Returns the file name for a URL: its archive ID plus ".html".
 *
 * @param {string} sourceUrl
 * @returns {string}
 */
export function archiveFileNameForUrl(sourceUrl) {
  const archiveId = archiveIdForUrl(sourceUrl);
  return `${archiveId}.html`;
}
/**
 * Index of archived pages keyed by normalized source URL.
 *
 * The index lives in memory and is persisted as JSON inside the archive
 * directory. On first use it is loaded from disk and reconciled with the
 * `.html` files actually present (using the metadata comment each archive
 * carries), so stale entries are dropped and unindexed archives are picked up.
 *
 * Failure handling:
 * - a failed load is not cached, so the next call retries;
 * - an index file that is not valid JSON is treated like one with an unknown
 *   version: it is ignored and the index is rebuilt from the files on disk;
 * - a failed save rejects for its caller but does not block later saves.
 */
export class ArchiveCatalog {
  /**
   * @param {object} [options]
   * @param {string} [options.archivePath] - Directory holding the archives
   *   (defaults to defaultArchivePath()).
   * @param {string} [options.publicBasePath] - URL prefix used for `archiveUrl`
   *   (defaults to "/archives").
   */
  constructor(options = {}) {
    this.archivePath = path.resolve(options.archivePath || defaultArchivePath());
    this.publicBasePath = options.publicBasePath || "/archives";
    this.indexPath = path.join(this.archivePath, INDEX_FILE);
    this.index = {
      version: INDEX_VERSION,
      archives: {}
    };
    this.loadPromise = null;
    // Tail of the serialized save queue.
    this.savePromise = Promise.resolve();
  }

  /**
   * Looks up the archive for a URL.
   *
   * Falls back to the stable file name derived from the URL when the index has
   * no usable entry, and indexes that file when it exists.
   *
   * @param {string} rawUrl
   * @returns {Promise<object|null>} Public record (with `archiveUrl`) or null.
   * @throws {Error} When the URL is invalid or the index cannot be loaded/saved.
   */
  async findByUrl(rawUrl) {
    const sourceUrl = normalizeArchiveUrl(rawUrl);
    await this.ensureLoaded();

    const indexed = this.index.archives[sourceUrl];
    if (indexed && await this.hasArchiveFile(indexed.fileName)) {
      return this.toPublicRecord(indexed);
    }
    if (indexed) {
      // The file behind the entry is gone; forget the entry.
      delete this.index.archives[sourceUrl];
      await this.saveIndex();
    }

    const stableFileName = archiveFileNameForUrl(sourceUrl);
    if (await this.hasArchiveFile(stableFileName)) {
      const record = this.upsertRecord(sourceUrl, {
        id: path.basename(stableFileName, ".html"),
        fileName: stableFileName
      });
      await this.saveIndex();
      return this.toPublicRecord(record);
    }
    return null;
  }

  /**
   * Stores the outcome of an archive run in the index.
   *
   * @param {string} rawUrl - URL that was archived.
   * @param {object} result - Archive result; `filePath`, `id`, `warnings` and
   *   `externalAssets` are read when present.
   * @returns {Promise<object>} Public record (with `archiveUrl`).
   */
  async recordResult(rawUrl, result) {
    const sourceUrl = normalizeArchiveUrl(rawUrl);
    await this.ensureLoaded();

    const fileName = path.basename(result.filePath || `${result.id}.html`);
    const id = result.id || path.basename(fileName, ".html");
    const record = this.upsertRecord(sourceUrl, {
      id,
      fileName,
      warningsCount: Array.isArray(result.warnings) ? result.warnings.length : 0,
      externalAssetsCount: Array.isArray(result.externalAssets) ? result.externalAssets.length : 0
    });
    await this.saveIndex();
    return this.toPublicRecord(record);
  }

  /**
   * Loads the index once; concurrent callers share the same load. A failed
   * load is forgotten so that a later call can try again.
   *
   * @returns {Promise<void>}
   */
  async ensureLoaded() {
    if (!this.loadPromise) {
      const attempt = this.loadIndex();
      this.loadPromise = attempt;
      attempt.catch(() => {
        if (this.loadPromise === attempt) {
          this.loadPromise = null;
        }
      });
    }
    await this.loadPromise;
  }

  /**
   * Reads the index file (when usable) and reconciles it with the directory.
   *
   * @returns {Promise<void>}
   * @throws {Error} On I/O errors other than a missing index file.
   */
  async loadIndex() {
    await fs.mkdir(this.archivePath, { recursive: true });
    try {
      const data = JSON.parse(await fs.readFile(this.indexPath, "utf8"));
      if (data && data.version === INDEX_VERSION && data.archives && typeof data.archives === "object") {
        this.index = data;
      }
    } catch (error) {
      // Missing or unparsable index: start empty and let the scan rebuild it.
      const unusable = error.code === "ENOENT" || error instanceof SyntaxError;
      if (!unusable) {
        throw error;
      }
    }
    if (await this.scanArchiveFiles()) {
      await this.saveIndex();
    }
  }

  /**
   * Adds archives found on disk (identified by their metadata comment) and
   * removes entries whose file no longer exists.
   *
   * @returns {Promise<boolean>} True when the in-memory index was modified.
   */
  async scanArchiveFiles() {
    let changed = false;
    const entries = await fs.readdir(this.archivePath, { withFileTypes: true }).catch((error) => {
      if (error.code === "ENOENT") {
        return [];
      }
      throw error;
    });

    for (const entry of entries) {
      if (!entry.isFile() || !entry.name.endsWith(".html")) {
        continue;
      }
      const filePath = path.join(this.archivePath, entry.name);
      const metadata = await readArchiveMetadata(filePath);
      if (!metadata?.sourceUrl) {
        continue;
      }
      let sourceUrl;
      try {
        sourceUrl = normalizeArchiveUrl(metadata.sourceUrl);
      } catch {
        // Metadata names a URL that cannot be archived; not indexable.
        continue;
      }
      const current = this.index.archives[sourceUrl];
      if (current?.fileName === entry.name) {
        continue;
      }
      this.index.archives[sourceUrl] = {
        id: path.basename(entry.name, ".html"),
        fileName: entry.name,
        sourceUrl,
        createdAt: metadata.createdAt || new Date().toISOString(),
        updatedAt: new Date().toISOString()
      };
      changed = true;
    }

    for (const [sourceUrl, record] of Object.entries(this.index.archives)) {
      if (!record?.fileName || !await this.hasArchiveFile(record.fileName)) {
        delete this.index.archives[sourceUrl];
        changed = true;
      }
    }
    return changed;
  }

  /**
   * Creates or replaces the in-memory record for a URL, keeping the original
   * `createdAt` and any counts not supplied in `values`.
   *
   * @param {string} sourceUrl - Normalized URL.
   * @param {{id: string, fileName: string, warningsCount?: number, externalAssetsCount?: number}} values
   * @returns {object} The stored record.
   */
  upsertRecord(sourceUrl, values) {
    const previous = this.index.archives[sourceUrl];
    const now = new Date().toISOString();
    const record = {
      id: values.id,
      fileName: values.fileName,
      sourceUrl,
      createdAt: previous?.createdAt || now,
      updatedAt: now,
      warningsCount: values.warningsCount ?? previous?.warningsCount ?? 0,
      externalAssetsCount: values.externalAssetsCount ?? previous?.externalAssetsCount ?? 0
    };
    this.index.archives[sourceUrl] = record;
    return record;
  }

  /**
   * @param {string} fileName
   * @returns {Promise<boolean>} True when the name is safe and a regular file exists.
   */
  async hasArchiveFile(fileName) {
    if (!isSafeArchiveFileName(fileName)) {
      return false;
    }
    const stat = await fs.stat(path.join(this.archivePath, fileName)).catch(() => null);
    return !!stat?.isFile();
  }

  /**
   * @param {object} record - Stored record.
   * @returns {object} Copy of the record with its public `archiveUrl` added.
   */
  toPublicRecord(record) {
    return {
      ...record,
      archiveUrl: `${this.publicBasePath}/${encodeURIComponent(record.fileName)}`
    };
  }

  /**
   * Persists the index: writes a temp file, then renames it over the index.
   * Saves run one after another; a failed save rejects for its caller only and
   * the next save still runs.
   *
   * @returns {Promise<void>}
   */
  async saveIndex() {
    const write = async () => {
      await fs.mkdir(this.archivePath, { recursive: true });
      const tmpPath = `${this.indexPath}.${process.pid}.tmp`;
      await fs.writeFile(tmpPath, `${JSON.stringify(this.index, null, 2)}\n`, "utf8");
      await fs.rename(tmpPath, this.indexPath);
    };
    // Run after the previous save whether it succeeded or failed.
    const pending = this.savePromise.then(write, write);
    this.savePromise = pending;
    return pending;
  }
}
/**
 * Checks that a value is a plain archive file name: a string without any
 * directory part, ending in ".html" and not starting with a dot.
 *
 * @param {unknown} fileName
 * @returns {boolean}
 */
export function isSafeArchiveFileName(fileName) {
  if (typeof fileName !== "string") {
    return false;
  }
  const hasNoDirectoryPart = path.basename(fileName) === fileName;
  return hasNoDirectoryPart && fileName.endsWith(".html") && !fileName.startsWith(".");
}
/**
 * Reads the metadata comment from the first 4096 bytes of an archive file.
 *
 * @param {string} filePath - File to inspect.
 * @returns {Promise<{sourceUrl: string, createdAt: string}|null>} Null when the
 *   file cannot be opened or the comment is not found in the inspected bytes.
 */
async function readArchiveMetadata(filePath) {
  let handle;
  try {
    handle = await fs.open(filePath, "r");
  } catch {
    return null;
  }

  try {
    const head = Buffer.alloc(4096);
    const { bytesRead } = await handle.read(head, 0, head.length, 0);
    const found = COMMENT_RE.exec(head.toString("utf8", 0, bytesRead));
    if (found === null) {
      return null;
    }
    const [, source, createdAt] = found;
    // "- -" in the stored URL is turned back into "--".
    return { sourceUrl: source.replaceAll("- -", "--"), createdAt };
  } finally {
    await handle.close();
  }
}

345
src/backend-server.mjs Normal file
View File

@@ -0,0 +1,345 @@
#!/usr/bin/env node
import { createReadStream } from "node:fs";
import fs from "node:fs/promises";
import http from "node:http";
import path from "node:path";
import { randomUUID } from "node:crypto";
import { ArchiveCatalog, archiveIdForUrl, isSafeArchiveFileName, normalizeArchiveUrl } from "./archive-catalog.mjs";
import { defaultArchivePath } from "./asset-inliner.mjs";
// Configuration, read from the environment with local-development defaults.
const PORT = Number(process.env.PORT || 5732);
const ARCHIVE_PATH = path.resolve(process.env.ARCHIVE_PATH || defaultArchivePath());
const ARCHIVE_WORKER_URL = process.env.ARCHIVE_WORKER_URL || "http://127.0.0.1:5733";
// URL prefix under which archive files are served and linked.
const PUBLIC_ARCHIVES_PATH = process.env.PUBLIC_ARCHIVES_PATH || "/archives";
// Upper bound for one worker request before it is aborted.
const JOB_TIMEOUT_MS = Number(process.env.ARCHIVE_JOB_TIMEOUT_MS || 120000);
// Largest accepted JSON request body.
const MAX_BODY_BYTES = 64 * 1024;
const catalog = new ArchiveCatalog({
archivePath: ARCHIVE_PATH,
publicBasePath: PUBLIC_ARCHIVES_PATH
});
// All known jobs by ID; finished ones are pruned by cleanupJobs().
const jobs = new Map();
// Source URL -> ID of the job currently handling it, used to avoid duplicate jobs.
const activeJobByUrl = new Map();
// Promise chain that runs jobs one at a time.
let workerQueue = Promise.resolve();
// Every request goes through route(); thrown errors become a JSON error response
// using error.statusCode when present, otherwise 500.
// NOTE(review): the catch block calls sendJson() without checking res.headersSent - confirm
// no route can throw after it has started a response.
const server = http.createServer(async (req, res) => {
try {
await route(req, res);
} catch (error) {
sendJson(res, error.statusCode || 500, {
ok: false,
error: error.message || "Unexpected error"
});
}
});
server.listen(PORT, () => {
console.log(`archive backend listening on ${PORT}`);
console.log(`archive path: ${ARCHIVE_PATH}`);
console.log(`archive worker: ${ARCHIVE_WORKER_URL}`);
});
// Prune finished jobs every ten minutes; unref() keeps the timer from holding the process open.
const cleanupTimer = setInterval(cleanupJobs, 10 * 60 * 1000);
cleanupTimer.unref?.();
/**
 * Dispatches a backend request.
 *
 * Routes:
 * - GET  /healthz                      -> liveness probe
 * - GET  /api/archives/lookup?url=...  -> whether an archive exists
 * - POST /api/archives  {url}          -> existing archive or a (new/active) job
 * - GET  /api/jobs/<id>                -> job status
 * - GET  <PUBLIC_ARCHIVES_PATH>/<file> -> archived HTML
 *
 * A URL rejected by normalizeArchiveUrl() is a client error, so it is reported
 * as HTTP 400 rather than falling through to the generic 500 handler.
 *
 * @param {import("node:http").IncomingMessage} req
 * @param {import("node:http").ServerResponse} res
 * @returns {Promise<void>}
 * @throws {Error} Errors carrying `statusCode` for client mistakes; others are unexpected.
 */
async function route(req, res) {
  const requestUrl = new URL(req.url, `http://${req.headers.host || "localhost"}`);
  const { pathname } = requestUrl;

  // Turns a validation failure into a 400 while keeping its message.
  const parseSourceUrl = (raw) => {
    try {
      return normalizeArchiveUrl(raw);
    } catch (error) {
      throw httpError(400, error.message || "Invalid URL");
    }
  };

  if (req.method === "GET" && pathname === "/healthz") {
    sendJson(res, 200, { ok: true });
    return;
  }

  if (req.method === "GET" && pathname === "/api/archives/lookup") {
    const sourceUrl = parseSourceUrl(requestUrl.searchParams.get("url"));
    const archive = await catalog.findByUrl(sourceUrl);
    sendJson(res, 200, {
      ok: true,
      exists: !!archive,
      archive
    });
    return;
  }

  if (req.method === "POST" && pathname === "/api/archives") {
    const body = await readJsonBody(req);
    // `?.` so that a JSON `null` body is reported as a missing URL.
    const sourceUrl = parseSourceUrl(body?.url);
    const response = await createOrFindArchive(sourceUrl);
    sendJson(res, response.statusCode, response.body);
    return;
  }

  const jobMatch = pathname.match(/^\/api\/jobs\/([^/]+)$/);
  if (req.method === "GET" && jobMatch) {
    const job = jobs.get(jobMatch[1]);
    if (!job) {
      sendJson(res, 404, { ok: false, error: "Job not found" });
      return;
    }
    sendJson(res, 200, {
      ok: true,
      job: publicJob(job)
    });
    return;
  }

  if (req.method === "GET" && pathname.startsWith(`${PUBLIC_ARCHIVES_PATH}/`)) {
    await serveArchive(pathname.slice(PUBLIC_ARCHIVES_PATH.length + 1), res);
    return;
  }

  sendJson(res, 404, { ok: false, error: "Not found" });
}
/**
 * Resolves an archive request to one of three outcomes:
 * an existing archive (200), the job already working on that URL (202,
 * mode "active"), or a newly queued job (202, mode "created").
 *
 * @param {string} sourceUrl - Normalized URL.
 * @returns {Promise<{statusCode: number, body: object}>}
 */
async function createOrFindArchive(sourceUrl) {
  const accepted = (mode, job) => ({
    statusCode: 202,
    body: {
      ok: true,
      status: job.status,
      mode,
      job: publicJob(job)
    }
  });

  const archived = await catalog.findByUrl(sourceUrl);
  if (archived) {
    return {
      statusCode: 200,
      body: {
        ok: true,
        status: "done",
        mode: "existing",
        archive: archived
      }
    };
  }

  // Reuse the job for this URL unless it has already finished.
  const runningId = activeJobByUrl.get(sourceUrl);
  const running = runningId ? jobs.get(runningId) : null;
  if (running && !isTerminal(running.status)) {
    return accepted("active", running);
  }

  const job = {
    id: cryptoRandomId(),
    archiveId: archiveIdForUrl(sourceUrl),
    sourceUrl,
    status: "queued",
    message: "Queued",
    createdAt: new Date().toISOString(),
    startedAt: null,
    updatedAt: new Date().toISOString(),
    finishedAt: null,
    archive: null,
    error: null
  };
  jobs.set(job.id, job);
  activeJobByUrl.set(sourceUrl, job.id);
  enqueueJob(job);
  return accepted("created", job);
}
/**
 * Appends a job to the queue so that it starts once the job before it has
 * settled, whether that one succeeded or failed.
 *
 * @param {object} job
 */
function enqueueJob(job) {
  function start() {
    return executeJob(job);
  }
  workerQueue = workerQueue.then(start, start);
}
/**
 * Runs one queued job: asks the worker for the archive, records the result in
 * the catalog and marks the job "done", or marks it "failed" with the error
 * message. Jobs that are not "queued" are left untouched. The URL's active-job
 * entry is removed once the job has been processed.
 *
 * @param {object} job - Job created by createOrFindArchive().
 * @returns {Promise<void>} Never rejects for worker or catalog failures.
 */
async function executeJob(job) {
  if (job.status !== "queued") {
    return;
  }

  const finish = (status, message, extra) =>
    updateJob(job, { status, message, ...extra, finishedAt: new Date().toISOString() });

  updateJob(job, {
    status: "running",
    message: "Archiving",
    startedAt: new Date().toISOString()
  });

  try {
    const workerResult = await requestWorkerArchive(job.sourceUrl, job.archiveId);
    const archive = await catalog.recordResult(job.sourceUrl, workerResult);
    finish("done", "Opening", { archive });
  } catch (failure) {
    finish("failed", "Failed", { error: failure.message || "Archive failed" });
  } finally {
    activeJobByUrl.delete(job.sourceUrl);
  }
}
/**
 * Asks the archive worker to archive a URL and returns its result.
 *
 * @param {string} sourceUrl - URL to archive.
 * @param {string} archiveId - ID the worker should use for the archive.
 * @returns {Promise<object>} The reply's `result` property, or the whole reply
 *   when it has none.
 * @throws {Error} When the worker reports a failure, replies with non-JSON, or
 *   does not answer within JOB_TIMEOUT_MS.
 */
async function requestWorkerArchive(sourceUrl, archiveId) {
  const endpoint = new URL("/archive", ARCHIVE_WORKER_URL);
  const aborter = new AbortController();
  const timer = setTimeout(() => aborter.abort(), JOB_TIMEOUT_MS);
  // Do not keep the process alive just for this timer.
  timer.unref?.();

  try {
    const reply = await fetch(endpoint, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({ url: sourceUrl, id: archiveId }),
      signal: aborter.signal
    });
    const raw = await reply.text();
    const payload = parseJson(raw);
    const failed = !reply.ok || payload.ok === false;
    if (failed) {
      throw new Error(payload.error || raw || `Worker returned ${reply.status}`);
    }
    return payload.result || payload;
  } catch (failure) {
    if (failure.name !== "AbortError") {
      throw failure;
    }
    throw new Error(`Archive timed out after ${Math.round(JOB_TIMEOUT_MS / 1000)} seconds`);
  } finally {
    clearTimeout(timer);
  }
}
/**
 * Streams one archived HTML file to the client.
 *
 * The file is sent as stored (no decode/re-encode pass), the response head is
 * written only once the file has actually been opened so that an open failure
 * can still be answered with 404, and the file stream is destroyed when the
 * client connection closes early.
 *
 * NOTE(review): archived pages are served as text/html from the application's
 * own origin; consider whether they should be isolated (e.g. CSP sandbox).
 *
 * @param {string} rawFileName - Percent-encoded file name taken from the URL path.
 * @param {import("node:http").ServerResponse} res
 * @returns {Promise<void>} Resolves once streaming has been set up (or an error
 *   response has been sent), not when the transfer completes.
 */
async function serveArchive(rawFileName, res) {
  let fileName;
  try {
    fileName = decodeURIComponent(rawFileName);
  } catch {
    sendJson(res, 400, { ok: false, error: "Invalid archive path" });
    return;
  }

  if (!isSafeArchiveFileName(fileName)) {
    sendJson(res, 404, { ok: false, error: "Archive not found" });
    return;
  }

  const filePath = path.join(ARCHIVE_PATH, fileName);
  const stat = await fs.stat(filePath).catch(() => null);
  if (!stat?.isFile()) {
    sendJson(res, 404, { ok: false, error: "Archive not found" });
    return;
  }

  const stream = createReadStream(filePath);
  stream.on("error", () => {
    if (!res.headersSent) {
      // The file vanished or became unreadable between stat() and open.
      sendJson(res, 404, { ok: false, error: "Archive not found" });
    } else {
      res.destroy();
    }
  });
  // pipe() does not destroy its source when the destination closes.
  res.on("close", () => stream.destroy());
  stream.once("open", () => {
    res.writeHead(200, {
      "content-type": "text/html; charset=utf-8",
      "cache-control": "no-store"
    });
    stream.pipe(res);
  });
}
/**
 * Merges `values` into the job in place and stamps `updatedAt` with the
 * current time (overriding any `updatedAt` in `values`).
 *
 * @param {object} job - Job to mutate.
 * @param {object} values - Fields to overwrite.
 */
function updateJob(job, values) {
  const stamp = { updatedAt: new Date().toISOString() };
  Object.assign(job, values, stamp);
}
/**
 * Builds the job representation returned by the API.
 *
 * `startedAt` falls back to `createdAt` while the job has not started, and
 * `elapsedMs` is measured from that timestamp (0 when neither is set).
 *
 * @param {object} job - Internal job record.
 * @returns {object} Fresh object; internal fields such as `archiveId` are omitted.
 */
function publicJob(job) {
  const { id, sourceUrl, status, message, createdAt, updatedAt, finishedAt, archive, error } = job;
  const startedAt = job.startedAt || createdAt;
  const elapsedMs = startedAt ? Date.now() - Date.parse(startedAt) : 0;
  return {
    id,
    sourceUrl,
    status,
    message,
    createdAt,
    startedAt,
    updatedAt,
    finishedAt,
    elapsedMs,
    archive,
    error
  };
}
/**
 * @param {string} status - Job status.
 * @returns {boolean} True for the final states "done" and "failed".
 */
function isTerminal(status) {
  const finalStates = ["done", "failed"];
  return finalStates.includes(status);
}
/**
 * Forgets finished jobs whose `finishedAt` (or, failing that, `updatedAt`) is
 * more than one hour old. Unfinished jobs are always kept.
 */
function cleanupJobs() {
  const oldestKept = Date.now() - 60 * 60 * 1000;
  const expiredIds = [...jobs]
    .filter(([, job]) => isTerminal(job.status) && Date.parse(job.finishedAt || job.updatedAt) < oldestKept)
    .map(([id]) => id);
  for (const id of expiredIds) {
    jobs.delete(id);
  }
}
/**
 * Reads the request body and parses it as a JSON object.
 *
 * Valid JSON that is not an object (`null`, numbers, strings, arrays) is
 * rejected with 400 as well, so callers can read properties of the result
 * without hitting a TypeError that would surface as a 500.
 *
 * @param {import("node:http").IncomingMessage} req
 * @returns {Promise<object>} The parsed object.
 * @throws {Error} With `statusCode` 400 when the body is empty, not JSON or not
 *   an object; size-limit errors from readRequestBody() pass through.
 */
async function readJsonBody(req) {
  const text = await readRequestBody(req, MAX_BODY_BYTES);
  if (!text.trim()) {
    throw httpError(400, "Request body is required");
  }

  let parsed;
  try {
    parsed = JSON.parse(text);
  } catch {
    throw httpError(400, "Request body must be JSON");
  }

  if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
    throw httpError(400, "Request body must be a JSON object");
  }
  return parsed;
}
/**
 * Collects a request body as a UTF-8 string, up to `maxBytes`.
 *
 * When the limit is exceeded the promise rejects with a 413 error right away
 * and the rest of the body is read and discarded. The request is deliberately
 * not destroyed: destroying it closes the underlying socket, which would keep
 * the 413 response from ever reaching the client.
 *
 * @param {import("node:http").IncomingMessage} req
 * @param {number} maxBytes - Largest accepted body size in bytes.
 * @returns {Promise<string>}
 * @throws {Error} With `statusCode` 413 when the body is too large; stream
 *   errors are passed through.
 */
function readRequestBody(req, maxBytes) {
  return new Promise((resolve, reject) => {
    const chunks = [];
    let total = 0;
    let overflowed = false;

    req.on("data", (chunk) => {
      if (overflowed) {
        // Keep draining so the connection stays usable, but retain nothing.
        return;
      }
      total += chunk.length;
      if (total > maxBytes) {
        overflowed = true;
        chunks.length = 0;
        reject(httpError(413, "Request body is too large"));
        return;
      }
      chunks.push(chunk);
    });
    // After an overflow the promise is already settled, so this is a no-op.
    req.on("end", () => resolve(Buffer.concat(chunks).toString("utf8")));
    req.on("error", reject);
  });
}
/**
 * Sends `payload` as a non-cacheable JSON response followed by a newline.
 *
 * @param {import("node:http").ServerResponse} res
 * @param {number} statusCode
 * @param {unknown} payload - Value to serialize.
 */
function sendJson(res, statusCode, payload) {
  const headers = {
    "content-type": "application/json; charset=utf-8",
    "cache-control": "no-store"
  };
  res.writeHead(statusCode, headers);
  res.end(`${JSON.stringify(payload)}\n`);
}
/**
 * Parses the worker's reply text as JSON.
 *
 * @param {string} text
 * @returns {any} Parsed value.
 * @throws {Error} Carrying the raw text (or a fixed message when it is empty)
 *   when the text is not valid JSON.
 */
function parseJson(text) {
  let value;
  try {
    value = JSON.parse(text);
  } catch {
    throw new Error(text || "Worker returned invalid JSON");
  }
  return value;
}
/**
 * @returns {string} A new random UUID, used as the job ID.
 */
function cryptoRandomId() {
  const id = randomUUID();
  return id;
}
/**
 * Creates an Error tagged with the HTTP status the request handler should answer with.
 *
 * @param {number} statusCode
 * @param {string} message
 * @returns {Error & {statusCode: number}}
 */
function httpError(statusCode, message) {
  return Object.assign(new Error(message), { statusCode });
}

157
src/frontend-server.mjs Normal file
View File

@@ -0,0 +1,157 @@
#!/usr/bin/env node
import fs from "node:fs/promises";
import http from "node:http";
import path from "node:path";
import { Readable } from "node:stream";
import { fileURLToPath } from "node:url";
// Directory of this module, used to locate the public/ folder next to src/.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const PORT = Number(process.env.PORT || 5731);
// Base URL of the backend that receives proxied requests.
const BACKEND_URL = process.env.BACKEND_URL || "http://127.0.0.1:5732";
const PUBLIC_DIR = path.resolve(__dirname, "..", "public");
// Largest request body that will be buffered and forwarded to the backend.
const MAX_PROXY_BODY_BYTES = 128 * 1024;
// Content types by file extension; anything else is served as application/octet-stream.
const CONTENT_TYPES = new Map([
[".css", "text/css; charset=utf-8"],
[".html", "text/html; charset=utf-8"],
[".js", "text/javascript; charset=utf-8"],
[".svg", "image/svg+xml"]
]);
// Every request goes through route(); thrown errors become a plain-text response
// using error.statusCode when present, otherwise 500.
// NOTE(review): the catch block calls writeHead() without checking res.headersSent - confirm
// no route can throw after it has started a response.
const server = http.createServer(async (req, res) => {
try {
await route(req, res);
} catch (error) {
res.writeHead(error.statusCode || 500, {
"content-type": "text/plain; charset=utf-8",
"cache-control": "no-store"
});
res.end(error.message || "Unexpected error");
}
});
server.listen(PORT, () => {
console.log(`archive frontend listening on ${PORT}`);
console.log(`archive backend: ${BACKEND_URL}`);
});
/**
 * Dispatches a frontend request.
 *
 * `/healthz` is answered directly, `/api/*` and `/archives/*` are proxied to
 * the backend, `/assets/*` is served from the public directory, and every
 * other GET/HEAD path gets `index.html`.
 *
 * @param {import("node:http").IncomingMessage} req
 * @param {import("node:http").ServerResponse} res
 * @returns {Promise<void>}
 * @throws {Error} With `statusCode` 405 for other methods on non-proxied paths.
 */
async function route(req, res) {
  const requestUrl = new URL(req.url, `http://${req.headers.host || "localhost"}`);
  const { pathname } = requestUrl;

  if (pathname === "/healthz") {
    res.writeHead(200, {
      "content-type": "application/json; charset=utf-8",
      "cache-control": "no-store"
    });
    res.end('{"ok":true}\n');
    return;
  }

  const isProxied = ["/api/", "/archives/"].some((prefix) => pathname.startsWith(prefix));
  if (isProxied) {
    await proxyToBackend(req, res, requestUrl);
    return;
  }

  if (!["GET", "HEAD"].includes(req.method)) {
    throw httpError(405, "Method not allowed");
  }

  const assetPath = pathname.startsWith("/assets/") ? pathname : "/index.html";
  await serveStatic(assetPath, res);
}
/**
 * Serves one file from PUBLIC_DIR.
 *
 * Paths that leave PUBLIC_DIR, do not exist, name a directory, or go through a
 * regular file are all answered with 404; undecodable paths and paths
 * containing a NUL byte with 400.
 *
 * @param {string} urlPath - Percent-encoded URL path, e.g. "/assets/app.css".
 * @param {import("node:http").ServerResponse} res
 * @returns {Promise<void>}
 * @throws {Error} With `statusCode` 400 or 404 as described; other I/O errors
 *   are passed through.
 */
async function serveStatic(urlPath, res) {
  let decodedPath;
  try {
    decodedPath = decodeURIComponent(urlPath);
  } catch {
    throw httpError(400, "Invalid path");
  }
  // A NUL byte can never name a file and would otherwise surface as a 500.
  if (decodedPath.includes("\0")) {
    throw httpError(400, "Invalid path");
  }

  const filePath = path.join(PUBLIC_DIR, decodedPath);
  const relative = path.relative(PUBLIC_DIR, filePath);
  // Compare whole path segments so that a name like "..data.css" is not
  // mistaken for a parent-directory reference.
  const escapesPublicDir =
    relative === ".." || relative.startsWith(`..${path.sep}`) || path.isAbsolute(relative);
  if (escapesPublicDir) {
    throw httpError(404, "Not found");
  }

  const notFoundCodes = ["ENOENT", "EISDIR", "ENOTDIR"];
  const bytes = await fs.readFile(filePath).catch((error) => {
    if (notFoundCodes.includes(error.code)) {
      throw httpError(404, "Not found");
    }
    throw error;
  });

  const type = CONTENT_TYPES.get(path.extname(filePath)) || "application/octet-stream";
  res.writeHead(200, {
    "content-type": type,
    "cache-control": "no-store"
  });
  res.end(bytes);
}
/**
 * Forwards a request to the backend and relays the answer.
 *
 * - Connection-specific (hop-by-hop) request headers and `Expect` are not
 *   forwarded; the body is buffered here, so they do not apply to the
 *   upstream request.
 * - A backend that cannot be reached is reported as 502.
 * - The relayed body stream has an error handler (an unhandled stream "error"
 *   event would otherwise crash the process) and is destroyed when the client
 *   goes away.
 * - `content-length` is dropped when the upstream body was content-encoded,
 *   because the length no longer matches the decoded bytes relayed here.
 *
 * @param {import("node:http").IncomingMessage} req
 * @param {import("node:http").ServerResponse} res
 * @param {URL} requestUrl - Parsed request URL; path and query are forwarded.
 * @returns {Promise<void>} Resolves once relaying has started.
 * @throws {Error} With `statusCode` 502 when the backend is unreachable; body
 *   size errors from readRequestBody() pass through.
 */
async function proxyToBackend(req, res, requestUrl) {
  const upstreamUrl = new URL(`${requestUrl.pathname}${requestUrl.search}`, BACKEND_URL);

  const skippedRequestHeaders = new Set([
    "connection",
    "content-length",
    "expect",
    "host",
    "keep-alive",
    "proxy-authenticate",
    "proxy-authorization",
    "te",
    "trailer",
    "transfer-encoding",
    "upgrade"
  ]);
  const headers = {};
  for (const [key, value] of Object.entries(req.headers)) {
    if (skippedRequestHeaders.has(key.toLowerCase())) {
      continue;
    }
    if (Array.isArray(value)) {
      headers[key] = value.join(", ");
    } else if (value !== undefined) {
      headers[key] = value;
    }
  }

  const hasBody = req.method !== "GET" && req.method !== "HEAD";
  const body = hasBody ? await readRequestBody(req, MAX_PROXY_BODY_BYTES) : undefined;

  let upstream;
  try {
    upstream = await fetch(upstreamUrl, {
      method: req.method,
      headers,
      body
    });
  } catch (error) {
    const failure = httpError(502, "Backend is unavailable");
    failure.cause = error;
    throw failure;
  }

  const skippedResponseHeaders = new Set(["connection", "content-encoding", "keep-alive", "transfer-encoding"]);
  if (upstream.headers.has("content-encoding")) {
    skippedResponseHeaders.add("content-length");
  }
  const responseHeaders = {};
  upstream.headers.forEach((value, key) => {
    if (!skippedResponseHeaders.has(key.toLowerCase())) {
      responseHeaders[key] = value;
    }
  });

  res.writeHead(upstream.status, responseHeaders);
  if (req.method === "HEAD" || !upstream.body) {
    res.end();
    return;
  }

  const bodyStream = Readable.fromWeb(upstream.body);
  // The head is already sent, so the only option on failure is to cut the response.
  bodyStream.on("error", () => res.destroy());
  res.on("close", () => bodyStream.destroy());
  bodyStream.pipe(res);
}
/**
 * Collects a request body as a Buffer, up to `maxBytes`.
 *
 * When the limit is exceeded the promise rejects with a 413 error right away
 * and the rest of the body is read and discarded. The request is deliberately
 * not destroyed: destroying it closes the underlying socket, which would keep
 * the 413 response from ever reaching the client.
 *
 * @param {import("node:http").IncomingMessage} req
 * @param {number} maxBytes - Largest accepted body size in bytes.
 * @returns {Promise<Buffer>}
 * @throws {Error} With `statusCode` 413 when the body is too large; stream
 *   errors are passed through.
 */
function readRequestBody(req, maxBytes) {
  return new Promise((resolve, reject) => {
    const chunks = [];
    let total = 0;
    let overflowed = false;

    req.on("data", (chunk) => {
      if (overflowed) {
        // Keep draining so the connection stays usable, but retain nothing.
        return;
      }
      total += chunk.length;
      if (total > maxBytes) {
        overflowed = true;
        chunks.length = 0;
        reject(httpError(413, "Request body is too large"));
        return;
      }
      chunks.push(chunk);
    });
    // After an overflow the promise is already settled, so this is a no-op.
    req.on("end", () => resolve(Buffer.concat(chunks)));
    req.on("error", reject);
  });
}
/**
 * Creates an Error tagged with the HTTP status the request handler should answer with.
 *
 * @param {number} statusCode
 * @param {string} message
 * @returns {Error & {statusCode: number}}
 */
function httpError(statusCode, message) {
  return Object.assign(new Error(message), { statusCode });
}

103
src/worker-server.mjs Normal file
View File

@@ -0,0 +1,103 @@
#!/usr/bin/env node
import http from "node:http";
import path from "node:path";
import { archivePage, defaultArchivePath } from "./archiver.mjs";
import { archiveIdForUrl, normalizeArchiveUrl } from "./archive-catalog.mjs";
// PORT takes precedence over ARCHIVE_WORKER_PORT; 5733 is the default.
const PORT = Number(process.env.PORT || process.env.ARCHIVE_WORKER_PORT || 5733);
const ARCHIVE_PATH = path.resolve(process.env.ARCHIVE_PATH || defaultArchivePath());
// Largest accepted JSON request body.
const MAX_BODY_BYTES = 64 * 1024;
// Promise chain that runs archivePage() calls one at a time.
let queue = Promise.resolve();
// Every request goes through route(); thrown errors become a JSON error response
// using error.statusCode when present, otherwise 500.
const server = http.createServer(async (req, res) => {
try {
await route(req, res);
} catch (error) {
sendJson(res, error.statusCode || 500, {
ok: false,
error: error.message || "Unexpected error"
});
}
});
server.listen(PORT, () => {
console.log(`archive worker listening on ${PORT}`);
console.log(`archive path: ${ARCHIVE_PATH}`);
});
/**
 * Dispatches a worker request.
 *
 * - GET  /healthz            -> liveness probe
 * - POST /archive {url, id?} -> archives the page and returns the result
 *
 * The optional `id` comes from the request body and is handed on to
 * archivePage(), so it is restricted to letters, digits, ".", "_" and "-"
 * (starting with a letter or digit, at most 200 characters). That keeps
 * path-like values such as "../x" out.
 * NOTE(review): presumably archivePage() derives the output file name from
 * `id` - confirm; IDs from archiveIdForUrl() always satisfy this pattern.
 *
 * Invalid input (bad URL, bad id) is reported as HTTP 400.
 *
 * @param {import("node:http").IncomingMessage} req
 * @param {import("node:http").ServerResponse} res
 * @returns {Promise<void>}
 * @throws {Error} With `statusCode` 400 for invalid input; archiving errors pass through.
 */
async function route(req, res) {
  const requestUrl = new URL(req.url, `http://${req.headers.host || "localhost"}`);

  if (req.method === "GET" && requestUrl.pathname === "/healthz") {
    sendJson(res, 200, { ok: true });
    return;
  }

  if (req.method === "POST" && requestUrl.pathname === "/archive") {
    const body = await readJsonBody(req);

    let sourceUrl;
    try {
      // `?.` so that a JSON `null` body is reported as a missing URL.
      sourceUrl = normalizeArchiveUrl(body?.url);
    } catch (error) {
      throw httpError(400, error.message || "Invalid URL");
    }

    const requestedId = typeof body?.id === "string" ? body.id.trim() : "";
    if (requestedId && !/^[a-z0-9][a-z0-9._-]{0,199}$/i.test(requestedId)) {
      throw httpError(400, "Invalid archive id");
    }
    const id = requestedId || archiveIdForUrl(sourceUrl);

    const result = await enqueueArchive(sourceUrl, id);
    sendJson(res, 200, {
      ok: true,
      result
    });
    return;
  }

  sendJson(res, 404, { ok: false, error: "Not found" });
}
/**
 * Queues an archivePage() call so that it starts once the previously queued
 * call has settled, whether that one succeeded or failed.
 *
 * @param {string} sourceUrl - URL to archive.
 * @param {string} id - Archive ID passed to archivePage().
 * @returns {Promise<any>} Settles with the outcome of this archivePage() call.
 */
function enqueueArchive(sourceUrl, id) {
  const start = () =>
    archivePage(sourceUrl, {
      archivePath: ARCHIVE_PATH,
      id
    });
  const settled = queue.then(start, start);
  queue = settled;
  return settled;
}
/**
 * Reads the request body and parses it as a JSON object.
 *
 * Valid JSON that is not an object (`null`, numbers, strings, arrays) is
 * rejected with 400 as well, so callers can read properties of the result
 * without hitting a TypeError that would surface as a 500.
 *
 * @param {import("node:http").IncomingMessage} req
 * @returns {Promise<object>} The parsed object.
 * @throws {Error} With `statusCode` 400 when the body is empty, not JSON or not
 *   an object; size-limit errors from readRequestBody() pass through.
 */
async function readJsonBody(req) {
  const text = await readRequestBody(req, MAX_BODY_BYTES);
  if (!text.trim()) {
    throw httpError(400, "Request body is required");
  }

  let parsed;
  try {
    parsed = JSON.parse(text);
  } catch {
    throw httpError(400, "Request body must be JSON");
  }

  if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
    throw httpError(400, "Request body must be a JSON object");
  }
  return parsed;
}
/**
 * Collects a request body as a UTF-8 string, up to `maxBytes`.
 *
 * When the limit is exceeded the promise rejects with a 413 error right away
 * and the rest of the body is read and discarded. The request is deliberately
 * not destroyed: destroying it closes the underlying socket, which would keep
 * the 413 response from ever reaching the client.
 *
 * @param {import("node:http").IncomingMessage} req
 * @param {number} maxBytes - Largest accepted body size in bytes.
 * @returns {Promise<string>}
 * @throws {Error} With `statusCode` 413 when the body is too large; stream
 *   errors are passed through.
 */
function readRequestBody(req, maxBytes) {
  return new Promise((resolve, reject) => {
    const chunks = [];
    let total = 0;
    let overflowed = false;

    req.on("data", (chunk) => {
      if (overflowed) {
        // Keep draining so the connection stays usable, but retain nothing.
        return;
      }
      total += chunk.length;
      if (total > maxBytes) {
        overflowed = true;
        chunks.length = 0;
        reject(httpError(413, "Request body is too large"));
        return;
      }
      chunks.push(chunk);
    });
    // After an overflow the promise is already settled, so this is a no-op.
    req.on("end", () => resolve(Buffer.concat(chunks).toString("utf8")));
    req.on("error", reject);
  });
}
/**
 * Sends `payload` as a non-cacheable JSON response followed by a newline.
 *
 * @param {import("node:http").ServerResponse} res
 * @param {number} statusCode
 * @param {unknown} payload - Value to serialize.
 */
function sendJson(res, statusCode, payload) {
  const headers = {
    "content-type": "application/json; charset=utf-8",
    "cache-control": "no-store"
  };
  res.writeHead(statusCode, headers);
  res.end(`${JSON.stringify(payload)}\n`);
}
/**
 * Creates an Error tagged with the HTTP status the request handler should answer with.
 *
 * @param {number} statusCode
 * @param {string} message
 * @returns {Error & {statusCode: number}}
 */
function httpError(statusCode, message) {
  return Object.assign(new Error(message), { statusCode });
}

View File

@@ -0,0 +1,49 @@
import assert from "node:assert/strict";
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import test from "node:test";
import { ArchiveCatalog, archiveFileNameForUrl, archiveIdForUrl, normalizeArchiveUrl } from "../src/archive-catalog.mjs";
// normalizeArchiveUrl trims surrounding whitespace from an http(s) URL and
// rejects other schemes and unparseable input.
test("normalizes only http and https archive URLs", () => {
  const normalized = normalizeArchiveUrl(" https://example.com/path ");
  assert.equal(normalized, "https://example.com/path");

  // Each rejected input is paired with the pattern its error message must match.
  const rejected = [
    ["file:///tmp/page.html", /Only http and https/],
    ["not a url", /valid URL/]
  ];
  for (const [input, expectedMessage] of rejected) {
    assert.throws(() => normalizeArchiveUrl(input), expectedMessage);
  }
});
// archiveIdForUrl must be deterministic for a given URL and must change when
// the query string changes.
test("builds stable archive ids from the full URL", () => {
  const articleUrl = "https://example.com/article?x=1";

  const baseline = archiveIdForUrl(articleUrl);
  const repeated = archiveIdForUrl(articleUrl);
  const otherQuery = archiveIdForUrl("https://example.com/article?x=2");

  assert.equal(baseline, repeated);
  assert.notEqual(baseline, otherQuery);
  // Expected shape: a slug followed by 16 lowercase hex characters.
  assert.match(baseline, /^example-com-article-[a-f0-9]{16}$/);
});
// A file already stored under the URL's stable file name must be found by
// ArchiveCatalog.findByUrl.
test("finds stable archive files without rerendering", async () => {
  const archivePath = await fs.mkdtemp(path.join(os.tmpdir(), "archive-catalog-"));
  try {
    const sourceUrl = "https://example.com/";
    const fileName = archiveFileNameForUrl(sourceUrl);
    await fs.writeFile(path.join(archivePath, fileName), "<!doctype html>", "utf8");

    const catalog = new ArchiveCatalog({ archivePath });
    const record = await catalog.findByUrl(sourceUrl);

    assert.equal(record.fileName, fileName);
    assert.equal(record.archiveUrl, `/archives/${encodeURIComponent(fileName)}`);
  } finally {
    // Remove the temp directory even when an assertion fails, so repeated
    // test runs do not pile up directories under os.tmpdir().
    await fs.rm(archivePath, { recursive: true, force: true });
  }
});
// An archive whose file name is timestamp-based must still be found by URL,
// using the "Source:" value in the HTML comment at the top of the file.
test("indexes older timestamped archives from the archive comment", async () => {
  const archivePath = await fs.mkdtemp(path.join(os.tmpdir(), "archive-catalog-"));
  try {
    await fs.writeFile(
      path.join(archivePath, "example-com-2026-05-16T00-00-00-000Z.html"),
      '<!doctype html>\n<!-- Archived locally. Source: https://example.com/story. Created: 2026-05-16T00:00:00.000Z. -->\n<html></html>',
      "utf8"
    );

    const catalog = new ArchiveCatalog({ archivePath });
    const record = await catalog.findByUrl("https://example.com/story");

    assert.equal(record.fileName, "example-com-2026-05-16T00-00-00-000Z.html");
    assert.equal(record.sourceUrl, "https://example.com/story");
  } finally {
    // Remove the temp directory even when an assertion fails, so repeated
    // test runs do not pile up directories under os.tmpdir().
    await fs.rm(archivePath, { recursive: true, force: true });
  }
});

26
test/archiver.test.mjs Normal file
View File

@@ -0,0 +1,26 @@
import assert from "node:assert/strict";
import test from "node:test";
import { renderPage } from "../src/archiver.mjs";
// A rule added through CSSOM (`sheet.insertRule`) is not part of the <style>
// element's text, so the rendered output must carry it explicitly.
test("renderPage serializes CSSOM-inserted style rules", async () => {
  // Template content stays flush-left so the fixture bytes are unchanged.
  const fixtureHtml = `<!doctype html>
<html>
<head>
<style id="runtime-style"></style>
<script>
document
.getElementById("runtime-style")
.sheet
.insertRule(".runtime-rule { color: rgb(1, 2, 3); }", 0);
</script>
</head>
<body><div class="runtime-rule">Styled by CSSOM</div></body>
</html>`;
  const fixtureUrl = `data:text/html,${encodeURIComponent(fixtureHtml)}`;

  const output = await renderPage(fixtureUrl, { userscriptDelay: 0 });

  assert.match(output, /<style id="runtime-style">[\s\S]*\.runtime-rule/);
  assert.match(output, /color:\s*rgb\(1,\s*2,\s*3\)/);
});

View File

@@ -0,0 +1,83 @@
import assert from "node:assert/strict";
import test from "node:test";
import { AssetInliner, splitSrcset } from "../src/asset-inliner.mjs";
import { findExternalAssetRefs } from "../src/archiver.mjs";
// Checks which URLs AssetInliner.inlineHtml requests for a document that has
// a real srcset attribute plus an iframe whose srcdoc holds escaped markup.
test("inlines real srcset attributes without reading escaped src text from srcdoc", async () => {
  const requestedUrls = [];
  const inliner = new AssetInliner();
  // Replace fetchAsset with a stub that records each requested URL and returns
  // a fixed PNG payload.
  inliner.fetchAsset = async (rawUrl) => {
    requestedUrls.push(rawUrl);
    return { bytes: Buffer.from("asset"), contentType: "image/png" };
  };

  // Template content stays flush-left so the fixture bytes are unchanged.
  const html = `
<img srcset="/small.png 1x, /large.png 2x">
<iframe srcdoc="&lt;script src=&quot;https://js.stripe.com/v3/foo.js&quot;&gt;&lt;/script&gt;&lt;img src=&quot;/nested.png&quot;&gt;"></iframe>
`;

  const output = await inliner.inlineHtml(html, "https://example.com/article");

  // Request order is not part of the contract, so compare sorted.
  requestedUrls.sort();
  assert.deepEqual(requestedUrls, [
    "https://example.com/large.png",
    "https://example.com/nested.png",
    "https://example.com/small.png"
  ]);
  assert.doesNotMatch(output, /js\.stripe\.com/);
  assert.equal(inliner.warnings.length, 0);
});
// Only the real <img src> should be reported; the entity-escaped src inside
// the iframe's srcdoc attribute value must not be.
test("external asset reporting ignores escaped nested attributes inside srcdoc", () => {
  // Template content stays flush-left so the fixture bytes are unchanged.
  const markup = `
<iframe srcdoc="&lt;img src=&quot;https://tracker.example/pixel.gif&quot;&gt;"></iframe>
<img src="https://cdn.example/picture.jpg">
`;

  const reported = findExternalAssetRefs(markup);

  assert.deepEqual(reported, ["https://cdn.example/picture.jpg"]);
});
// URLs that contain commas (CDN transform segments such as `w_120,c_limit`)
// must survive srcset splitting, both in splitSrcset and in inlineSrcset.
test("srcset parsing keeps image CDN transform commas inside URLs", async () => {
  const candidates = [
    "https://media.example/photos/id/master/w_120,c_limit/photo.jpg 120w",
    "https://media.example/photos/id/master/w_240,c_limit/photo.jpg 240w"
  ];
  // Joining and splitting must round-trip to the same candidate list.
  assert.deepEqual(splitSrcset(candidates.join(", ")), candidates);

  const requestedUrls = [];
  const inliner = new AssetInliner();
  // Stub fetchAsset: record the requested URL and return a fixed JPEG payload.
  inliner.fetchAsset = async (rawUrl) => {
    requestedUrls.push(rawUrl);
    return { bytes: Buffer.from("asset"), contentType: "image/jpeg" };
  };

  await inliner.inlineSrcset(
    "https://media.example/photos/id/master/w_120,c_limit/photo.jpg 120w, https://media.example/photos/id/master/w_240,c_limit/photo.jpg 240w",
    "https://example.com/article"
  );

  assert.deepEqual(requestedUrls, [
    "https://media.example/photos/id/master/w_120,c_limit/photo.jpg",
    "https://media.example/photos/id/master/w_240,c_limit/photo.jpg"
  ]);
});
// findExternalAssetRefs must read `srcset` and `imagesrcset` as candidate
// lists: the data: URI is not reported and URLs containing commas stay whole.
test("external asset reporting parses srcset-like attributes without splitting URL commas", () => {
  // Template content stays flush-left so the fixture bytes are unchanged.
  const markup = `
<img srcset="data:image/gif;base64,R0lGODlhAQABAAAAACw= 1x, https://cdn.example/image.jpg 2x">
<link rel="preload" as="image" imagesrcset="https://media.example/photos/id/master/w_120,c_limit/photo.jpg 120w, https://media.example/photos/id/master/w_240,c_limit/photo.jpg 240w">
`;
  const expected = [
    "https://cdn.example/image.jpg",
    "https://media.example/photos/id/master/w_120,c_limit/photo.jpg",
    "https://media.example/photos/id/master/w_240,c_limit/photo.jpg"
  ];

  const reported = findExternalAssetRefs(markup);

  assert.deepEqual(reported, expected);
});