diff --git a/Dockerfile b/Dockerfile index cc49052..aaab698 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,6 +22,7 @@ COPY . . RUN mkdir -p /archives && chmod 0777 /archives VOLUME ["/archives"] +EXPOSE 5733 ENTRYPOINT ["dumb-init", "--", "/app/scripts/archive-worker-entrypoint.sh"] CMD ["help"] diff --git a/Dockerfile.web b/Dockerfile.web new file mode 100644 index 0000000..2f7dfe1 --- /dev/null +++ b/Dockerfile.web @@ -0,0 +1,16 @@ +FROM node:22-slim + +WORKDIR /app + +ENV NODE_ENV=production +ENV PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 + +COPY package.json package-lock.json ./ +RUN npm ci --omit=dev + +COPY src ./src +COPY public ./public + +EXPOSE 5731 5732 + +CMD ["node", "src/backend-server.mjs"] diff --git a/README.md b/README.md index ac0fc2b..fcaa6c1 100644 --- a/README.md +++ b/README.md @@ -51,3 +51,23 @@ For visual debugging, expose VNC from the worker: ``` The worker image starts Xvfb internally, so callers do not need to mount the host X11 socket or override the entrypoint. + +## Web UI + +The web path is split into three roles: + +- `src/frontend-server.mjs` serves the static UI and proxies `/api/*` and `/archives/*` to the backend. +- `src/backend-server.mjs` manages archive lookup, job state, and the archive index. +- `src/worker-server.mjs` runs inside the browser worker container and wraps `archivePage()` over HTTP. + +Run the full stack with: + +```sh +docker compose -f docker-compose.example.yml up --build +``` + +Then open `http://localhost:5731`. Direct path archival is supported, for example: + +```text +http://localhost:5731/https://example.com +``` diff --git a/Tiltfile b/Tiltfile new file mode 100644 index 0000000..0036a1d --- /dev/null +++ b/Tiltfile @@ -0,0 +1 @@ +docker_compose("docker-compose.example.yml") diff --git a/docker-compose.example.yml b/docker-compose.example.yml new file mode 100644 index 0000000..9fbc876 --- /dev/null +++ b/docker-compose.example.yml @@ -0,0 +1,66 @@ +services: + frontend: + build: + context: . + dockerfile: Dockerfile.web + image: local-page-archiver-web:latest + command: ["node", "src/frontend-server.mjs"] + environment: + PORT: "5731" + BACKEND_URL: "http://backend:5732" + ports: + - "5731:5731" + depends_on: + backend: + condition: service_healthy + healthcheck: + test: ["CMD", "node", "-e", "fetch('http://127.0.0.1:5731/healthz').then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))"] + interval: 10s + timeout: 3s + retries: 5 + + backend: + build: + context: . + dockerfile: Dockerfile.web + image: local-page-archiver-web:latest + command: ["node", "src/backend-server.mjs"] + environment: + PORT: "5732" + ARCHIVE_PATH: /archives + ARCHIVE_WORKER_URL: "http://browser:5733" + PUBLIC_ARCHIVES_PATH: /archives + volumes: + - archives:/archives + depends_on: + browser: + condition: service_healthy + healthcheck: + test: ["CMD", "node", "-e", "fetch('http://127.0.0.1:5732/healthz').then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))"] + interval: 10s + timeout: 3s + retries: 5 + + browser: + build: + context: . + dockerfile: Dockerfile + image: local-page-archiver-browser:latest + command: ["serve-worker"] + environment: + PORT: "5733" + ARCHIVE_PATH: /archives + ARCHIVE_WORKER_XVFB: "1" + volumes: + - archives:/archives + expose: + - "5733" + shm_size: 1gb + healthcheck: + test: ["CMD", "node", "-e", "fetch('http://127.0.0.1:5733/healthz').then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))"] + interval: 10s + timeout: 3s + retries: 10 + +volumes: + archives: diff --git a/package.json b/package.json index 6002cfd..1356b16 100644 --- a/package.json +++ b/package.json @@ -12,6 +12,9 @@ "archive": "node src/cli.mjs archive", "container:archive": "node src/container-runner.mjs archive", "container:build": "node src/container-runner.mjs build", + "start:backend": "node src/backend-server.mjs", + "start:frontend": "node src/frontend-server.mjs", + "start:worker": "node src/worker-server.mjs", "test": "node --test test/*.test.mjs", "install-browsers": "playwright install chromium" }, diff --git a/public/assets/app.css b/public/assets/app.css new file mode 100644 index 0000000..ebe7ad2 --- /dev/null +++ b/public/assets/app.css @@ -0,0 +1,153 @@ +:root { + color-scheme: light; + --bg: #f6f5f1; + --surface: #ffffff; + --ink: #161616; + --muted: #696963; + --line: #d8d6ce; + --accent: #2f7664; + --accent-strong: #245d50; + --danger: #a43d32; + font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif; +} + +* { + box-sizing: border-box; +} + +html, +body { + min-height: 100%; +} + +body { + margin: 0; + background: var(--bg); + color: var(--ink); +} + +button, +input { + font: inherit; + letter-spacing: 0; +} + +.shell { + min-height: 100vh; + display: grid; + place-items: center; + padding: 24px; +} + +.archive-box { + width: min(680px, 100%); + background: var(--surface); + border: 1px solid var(--line); + border-radius: 8px; + padding: 12px; + box-shadow: 0 18px 42px rgba(28, 25, 19, 0.08); +} + +.input-row { + display: grid; + grid-template-columns: 1fr auto; + gap: 10px; +} + +input { + min-width: 0; + width: 100%; + height: 48px; + border: 1px solid var(--line); + border-radius: 6px; + color: var(--ink); + background: #fbfaf7; + padding: 0 14px; + outline: none; +} + +input:focus { + border-color: var(--accent); + box-shadow: 0 0 0 3px rgba(47, 118, 100, 0.16); +} + +button { + height: 48px; + min-width: 112px; + border: 0; + border-radius: 6px; + color: #ffffff; + background: var(--accent); + padding: 0 18px; + cursor: pointer; +} + +button:hover { + background: var(--accent-strong); +} + +button:disabled { + cursor: wait; + opacity: 0.72; +} + +.progress-wrap { + padding-top: 12px; +} + +.progress-track { + height: 6px; + overflow: hidden; + border-radius: 999px; + background: #e7e4dc; +} + +.progress-bar { + width: 0%; + height: 100%; + border-radius: inherit; + background: var(--accent); + transition: width 220ms ease; +} + +.status-line { + min-height: 22px; + margin-top: 8px; + color: var(--muted); + font-size: 14px; + line-height: 22px; +} + +.status-line.error { + color: var(--danger); +} + +.sr-only { + position: absolute; + width: 1px; + height: 1px; + overflow: hidden; + clip: rect(0, 0, 0, 0); + white-space: nowrap; + clip-path: inset(50%); +} + +@media (max-width: 560px) { + .shell { + align-items: start; + padding: 16px; + padding-top: 20vh; + } + + .archive-box { + padding: 10px; + } + + .input-row { + grid-template-columns: 1fr; + } + + button { + width: 100%; + } +} diff --git a/public/assets/app.js b/public/assets/app.js new file mode 100644 index 0000000..445a5fd --- /dev/null +++ b/public/assets/app.js @@ -0,0 +1,169 @@ +const form = document.querySelector("#archive-form"); +const input = document.querySelector("#archive-url"); +const button = document.querySelector("#archive-submit"); +const progressWrap = document.querySelector("#progress-wrap"); +const progressBar = document.querySelector("#progress-bar"); +const statusLine = document.querySelector("#status-line"); + +let pollTimer = null; +let visualTimer = null; +let startedAt = Date.now(); + +form.addEventListener("submit", (event) => { + event.preventDefault(); + submitArchive(input.value); +}); + +const pathUrl = urlFromPath(); +if (pathUrl) { + input.value = pathUrl; + submitArchive(pathUrl); +} else { + input.focus(); +} + +async function submitArchive(rawUrl) { + stopTimers(); + setBusy(true); + setStatus("Checking", 8); + + try { + const response = await fetch("/api/archives", { + method: "POST", + headers: { + "content-type": "application/json" + }, + body: JSON.stringify({ url: rawUrl }) + }); + const data = await readApiResponse(response); + if (data.archive?.archiveUrl) { + openArchive(data.archive.archiveUrl); + return; + } + if (data.job?.id) { + watchJob(data.job); + return; + } + throw new Error(data.error || "Archive did not start"); + } catch (error) { + setError(error.message || "Archive failed"); + setBusy(false); + } +} + +function watchJob(job) { + startedAt = Date.parse(job.startedAt || job.createdAt) || Date.now(); + updateFromJob(job); + visualTimer = window.setInterval(updateVisualProgress, 250); + pollTimer = window.setInterval(async () => { + try { + const response = await fetch(`/api/jobs/${encodeURIComponent(job.id)}`); + const data = await readApiResponse(response); + updateFromJob(data.job); + } catch (error) { + stopTimers(); + setError(error.message || "Archive failed"); + setBusy(false); + } + }, 850); +} + +function updateFromJob(job) { + if (job.status === "done" && job.archive?.archiveUrl) { + stopTimers(); + setStatus("Opening", 100); + openArchive(job.archive.archiveUrl); + return; + } + + if (job.status === "failed") { + stopTimers(); + setError(job.error || "Archive failed"); + setBusy(false); + return; + } + + startedAt = Date.parse(job.startedAt || job.createdAt) || startedAt; + const elapsed = Math.max(0, Math.round((Date.now() - startedAt) / 1000)); + const label = job.status === "queued" ? "Queued" : `Archiving ${elapsed}s`; + setStatus(label, optimisticProgress()); +} + +function updateVisualProgress() { + if (!progressWrap.hidden) { + progressBar.style.width = `${optimisticProgress()}%`; + } +} + +function optimisticProgress() { + const elapsed = Math.max(0, (Date.now() - startedAt) / 1000); + if (elapsed < 1) { + return 12; + } + if (elapsed < 12) { + return Math.min(88, 12 + elapsed * 6.3); + } + return Math.min(96, 88 + (elapsed - 12) * 0.6); +} + +async function readApiResponse(response) { + const data = await response.json().catch(() => null); + if (!response.ok || data?.ok === false) { + throw new Error(data?.error || `Request failed with ${response.status}`); + } + return data; +} + +function setBusy(isBusy) { + button.disabled = isBusy; + input.readOnly = isBusy; +} + +function setStatus(text, progress) { + progressWrap.hidden = false; + statusLine.classList.remove("error"); + statusLine.textContent = text; + progressBar.style.width = `${Math.max(0, Math.min(100, progress))}%`; +} + +function setError(text) { + progressWrap.hidden = false; + statusLine.classList.add("error"); + statusLine.textContent = text; + progressBar.style.width = "100%"; +} + +function stopTimers() { + if (pollTimer) { + window.clearInterval(pollTimer); + pollTimer = null; + } + if (visualTimer) { + window.clearInterval(visualTimer); + visualTimer = null; + } +} + +function openArchive(archiveUrl) { + window.location.assign(archiveUrl); +} + +function urlFromPath() { + const rawPath = window.location.pathname.replace(/^\/+/, ""); + if (!rawPath || rawPath.startsWith("assets/") || rawPath.startsWith("api/") || rawPath.startsWith("archives/")) { + return ""; + } + + let decoded; + try { + decoded = decodeURIComponent(rawPath); + } catch { + return ""; + } + + if (!/^https?:\/\//i.test(decoded)) { + return ""; + } + + return `${decoded}${window.location.search}${window.location.hash}`; +} diff --git a/public/index.html b/public/index.html new file mode 100644 index 0000000..b82cf84 --- /dev/null +++ b/public/index.html @@ -0,0 +1,27 @@ + + + + + + Archive + + + + +
+
+ +
+ + +
+ +
+
+ + diff --git a/scripts/archive-worker-entrypoint.sh b/scripts/archive-worker-entrypoint.sh index 541d88a..27c0655 100755 --- a/scripts/archive-worker-entrypoint.sh +++ b/scripts/archive-worker-entrypoint.sh @@ -52,6 +52,10 @@ case "$1" in archive|help) set -- node src/cli.mjs "$@" ;; + serve-worker) + shift + set -- node src/worker-server.mjs "$@" + ;; esac "$@" & diff --git a/src/archive-catalog.mjs b/src/archive-catalog.mjs new file mode 100644 index 0000000..4f1c6d7 --- /dev/null +++ b/src/archive-catalog.mjs @@ -0,0 +1,250 @@ +import crypto from "node:crypto"; +import fs from "node:fs/promises"; +import path from "node:path"; +import { defaultArchivePath } from "./asset-inliner.mjs"; + +const INDEX_FILE = ".archive-index.json"; +const INDEX_VERSION = 1; +const COMMENT_RE = //; + +export function normalizeArchiveUrl(rawUrl) { + const text = String(rawUrl || "").trim(); + if (!text) { + throw new Error("URL is required"); + } + + let url; + try { + url = new URL(text); + } catch { + throw new Error("Enter a valid URL"); + } + + if (url.protocol !== "http:" && url.protocol !== "https:") { + throw new Error("Only http and https URLs can be archived"); + } + + return url.href; +} + +export function archiveIdForUrl(sourceUrl) { + const url = new URL(normalizeArchiveUrl(sourceUrl)); + const stem = + `${url.hostname}${url.pathname}` + .replace(/\/+$/, "") + .replace(/[^a-z0-9]+/gi, "-") + .replace(/^-+|-+$/g, "") + .slice(0, 72) || "archive"; + const digest = crypto.createHash("sha256").update(url.href).digest("hex").slice(0, 16); + return `${stem}-${digest}`; +} + +export function archiveFileNameForUrl(sourceUrl) { + return `${archiveIdForUrl(sourceUrl)}.html`; +} + +export class ArchiveCatalog { + constructor(options = {}) { + this.archivePath = path.resolve(options.archivePath || defaultArchivePath()); + this.publicBasePath = options.publicBasePath || "/archives"; + this.indexPath = path.join(this.archivePath, INDEX_FILE); + this.index = { + version: INDEX_VERSION, + archives: {} + }; + this.loadPromise = null; + this.savePromise = Promise.resolve(); + } + + async findByUrl(rawUrl) { + const sourceUrl = normalizeArchiveUrl(rawUrl); + await this.ensureLoaded(); + + const indexed = this.index.archives[sourceUrl]; + if (indexed && await this.hasArchiveFile(indexed.fileName)) { + return this.toPublicRecord(indexed); + } + + if (indexed) { + delete this.index.archives[sourceUrl]; + await this.saveIndex(); + } + + const stableFileName = archiveFileNameForUrl(sourceUrl); + if (await this.hasArchiveFile(stableFileName)) { + const record = this.upsertRecord(sourceUrl, { + id: path.basename(stableFileName, ".html"), + fileName: stableFileName + }); + await this.saveIndex(); + return this.toPublicRecord(record); + } + + return null; + } + + async recordResult(rawUrl, result) { + const sourceUrl = normalizeArchiveUrl(rawUrl); + await this.ensureLoaded(); + + const fileName = path.basename(result.filePath || `${result.id}.html`); + const id = result.id || path.basename(fileName, ".html"); + const record = this.upsertRecord(sourceUrl, { + id, + fileName, + warningsCount: Array.isArray(result.warnings) ? result.warnings.length : 0, + externalAssetsCount: Array.isArray(result.externalAssets) ? result.externalAssets.length : 0 + }); + await this.saveIndex(); + return this.toPublicRecord(record); + } + + async ensureLoaded() { + if (!this.loadPromise) { + this.loadPromise = this.loadIndex(); + } + await this.loadPromise; + } + + async loadIndex() { + await fs.mkdir(this.archivePath, { recursive: true }); + try { + const data = JSON.parse(await fs.readFile(this.indexPath, "utf8")); + if (data && data.version === INDEX_VERSION && data.archives && typeof data.archives === "object") { + this.index = data; + } + } catch (error) { + if (error.code !== "ENOENT") { + throw error; + } + } + + if (await this.scanArchiveFiles()) { + await this.saveIndex(); + } + } + + async scanArchiveFiles() { + let changed = false; + const entries = await fs.readdir(this.archivePath, { withFileTypes: true }).catch((error) => { + if (error.code === "ENOENT") { + return []; + } + throw error; + }); + + for (const entry of entries) { + if (!entry.isFile() || !entry.name.endsWith(".html")) { + continue; + } + + const filePath = path.join(this.archivePath, entry.name); + const metadata = await readArchiveMetadata(filePath); + if (!metadata?.sourceUrl) { + continue; + } + + let sourceUrl; + try { + sourceUrl = normalizeArchiveUrl(metadata.sourceUrl); + } catch { + continue; + } + const current = this.index.archives[sourceUrl]; + if (current?.fileName === entry.name) { + continue; + } + + this.index.archives[sourceUrl] = { + id: path.basename(entry.name, ".html"), + fileName: entry.name, + sourceUrl, + createdAt: metadata.createdAt || new Date().toISOString(), + updatedAt: new Date().toISOString() + }; + changed = true; + } + + for (const [sourceUrl, record] of Object.entries(this.index.archives)) { + if (!record?.fileName || !await this.hasArchiveFile(record.fileName)) { + delete this.index.archives[sourceUrl]; + changed = true; + } + } + + return changed; + } + + upsertRecord(sourceUrl, values) { + const previous = this.index.archives[sourceUrl]; + const now = new Date().toISOString(); + const record = { + id: values.id, + fileName: values.fileName, + sourceUrl, + createdAt: previous?.createdAt || now, + updatedAt: now, + warningsCount: values.warningsCount ?? previous?.warningsCount ?? 0, + externalAssetsCount: values.externalAssetsCount ?? previous?.externalAssetsCount ?? 0 + }; + this.index.archives[sourceUrl] = record; + return record; + } + + async hasArchiveFile(fileName) { + if (!isSafeArchiveFileName(fileName)) { + return false; + } + const stat = await fs.stat(path.join(this.archivePath, fileName)).catch(() => null); + return !!stat?.isFile(); + } + + toPublicRecord(record) { + return { + ...record, + archiveUrl: `${this.publicBasePath}/${encodeURIComponent(record.fileName)}` + }; + } + + async saveIndex() { + this.savePromise = this.savePromise.then(async () => { + await fs.mkdir(this.archivePath, { recursive: true }); + const tmpPath = `${this.indexPath}.${process.pid}.tmp`; + await fs.writeFile(tmpPath, `${JSON.stringify(this.index, null, 2)}\n`, "utf8"); + await fs.rename(tmpPath, this.indexPath); + }); + return this.savePromise; + } +} + +export function isSafeArchiveFileName(fileName) { + return ( + typeof fileName === "string" && + fileName === path.basename(fileName) && + fileName.endsWith(".html") && + !fileName.startsWith(".") + ); +} + +async function readArchiveMetadata(filePath) { + const handle = await fs.open(filePath, "r").catch(() => null); + if (!handle) { + return null; + } + + try { + const buffer = Buffer.alloc(4096); + const { bytesRead } = await handle.read(buffer, 0, buffer.length, 0); + const head = buffer.subarray(0, bytesRead).toString("utf8"); + const match = head.match(COMMENT_RE); + if (!match) { + return null; + } + return { + sourceUrl: match[1].replaceAll("- -", "--"), + createdAt: match[2] + }; + } finally { + await handle.close(); + } +} diff --git a/src/backend-server.mjs b/src/backend-server.mjs new file mode 100644 index 0000000..084ead3 --- /dev/null +++ b/src/backend-server.mjs @@ -0,0 +1,345 @@ +#!/usr/bin/env node +import { createReadStream } from "node:fs"; +import fs from "node:fs/promises"; +import http from "node:http"; +import path from "node:path"; +import { randomUUID } from "node:crypto"; +import { ArchiveCatalog, archiveIdForUrl, isSafeArchiveFileName, normalizeArchiveUrl } from "./archive-catalog.mjs"; +import { defaultArchivePath } from "./asset-inliner.mjs"; + +const PORT = Number(process.env.PORT || 5732); +const ARCHIVE_PATH = path.resolve(process.env.ARCHIVE_PATH || defaultArchivePath()); +const ARCHIVE_WORKER_URL = process.env.ARCHIVE_WORKER_URL || "http://127.0.0.1:5733"; +const PUBLIC_ARCHIVES_PATH = process.env.PUBLIC_ARCHIVES_PATH || "/archives"; +const JOB_TIMEOUT_MS = Number(process.env.ARCHIVE_JOB_TIMEOUT_MS || 120000); +const MAX_BODY_BYTES = 64 * 1024; + +const catalog = new ArchiveCatalog({ + archivePath: ARCHIVE_PATH, + publicBasePath: PUBLIC_ARCHIVES_PATH +}); + +const jobs = new Map(); +const activeJobByUrl = new Map(); +let workerQueue = Promise.resolve(); + +const server = http.createServer(async (req, res) => { + try { + await route(req, res); + } catch (error) { + sendJson(res, error.statusCode || 500, { + ok: false, + error: error.message || "Unexpected error" + }); + } +}); + +server.listen(PORT, () => { + console.log(`archive backend listening on ${PORT}`); + console.log(`archive path: ${ARCHIVE_PATH}`); + console.log(`archive worker: ${ARCHIVE_WORKER_URL}`); +}); + +const cleanupTimer = setInterval(cleanupJobs, 10 * 60 * 1000); +cleanupTimer.unref?.(); + +async function route(req, res) { + const requestUrl = new URL(req.url, `http://${req.headers.host || "localhost"}`); + + if (req.method === "GET" && requestUrl.pathname === "/healthz") { + sendJson(res, 200, { ok: true }); + return; + } + + if (req.method === "GET" && requestUrl.pathname === "/api/archives/lookup") { + const sourceUrl = normalizeArchiveUrl(requestUrl.searchParams.get("url")); + const archive = await catalog.findByUrl(sourceUrl); + sendJson(res, 200, { + ok: true, + exists: !!archive, + archive + }); + return; + } + + if (req.method === "POST" && requestUrl.pathname === "/api/archives") { + const body = await readJsonBody(req); + const sourceUrl = normalizeArchiveUrl(body.url); + const response = await createOrFindArchive(sourceUrl); + sendJson(res, response.statusCode, response.body); + return; + } + + const jobMatch = requestUrl.pathname.match(/^\/api\/jobs\/([^/]+)$/); + if (req.method === "GET" && jobMatch) { + const job = jobs.get(jobMatch[1]); + if (!job) { + sendJson(res, 404, { ok: false, error: "Job not found" }); + return; + } + sendJson(res, 200, { + ok: true, + job: publicJob(job) + }); + return; + } + + if (req.method === "GET" && requestUrl.pathname.startsWith(`${PUBLIC_ARCHIVES_PATH}/`)) { + await serveArchive(requestUrl.pathname.slice(PUBLIC_ARCHIVES_PATH.length + 1), res); + return; + } + + sendJson(res, 404, { ok: false, error: "Not found" }); +} + +async function createOrFindArchive(sourceUrl) { + const existing = await catalog.findByUrl(sourceUrl); + if (existing) { + return { + statusCode: 200, + body: { + ok: true, + status: "done", + mode: "existing", + archive: existing + } + }; + } + + const activeJobId = activeJobByUrl.get(sourceUrl); + const activeJob = activeJobId ? jobs.get(activeJobId) : null; + if (activeJob && !isTerminal(activeJob.status)) { + return { + statusCode: 202, + body: { + ok: true, + status: activeJob.status, + mode: "active", + job: publicJob(activeJob) + } + }; + } + + const job = { + id: cryptoRandomId(), + archiveId: archiveIdForUrl(sourceUrl), + sourceUrl, + status: "queued", + message: "Queued", + createdAt: new Date().toISOString(), + startedAt: null, + updatedAt: new Date().toISOString(), + finishedAt: null, + archive: null, + error: null + }; + jobs.set(job.id, job); + activeJobByUrl.set(sourceUrl, job.id); + enqueueJob(job); + + return { + statusCode: 202, + body: { + ok: true, + status: job.status, + mode: "created", + job: publicJob(job) + } + }; +} + +function enqueueJob(job) { + const run = () => executeJob(job); + workerQueue = workerQueue.then(run, run); +} + +async function executeJob(job) { + if (job.status !== "queued") { + return; + } + + updateJob(job, { + status: "running", + message: "Archiving", + startedAt: new Date().toISOString() + }); + + try { + const result = await requestWorkerArchive(job.sourceUrl, job.archiveId); + const archive = await catalog.recordResult(job.sourceUrl, result); + updateJob(job, { + status: "done", + message: "Opening", + archive, + finishedAt: new Date().toISOString() + }); + } catch (error) { + updateJob(job, { + status: "failed", + message: "Failed", + error: error.message || "Archive failed", + finishedAt: new Date().toISOString() + }); + } finally { + activeJobByUrl.delete(job.sourceUrl); + } +} + +async function requestWorkerArchive(sourceUrl, archiveId) { + const workerUrl = new URL("/archive", ARCHIVE_WORKER_URL); + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), JOB_TIMEOUT_MS); + timeout.unref?.(); + + try { + const response = await fetch(workerUrl, { + method: "POST", + headers: { + "content-type": "application/json" + }, + body: JSON.stringify({ url: sourceUrl, id: archiveId }), + signal: controller.signal + }); + const text = await response.text(); + const parsed = parseJson(text); + if (!response.ok || parsed.ok === false) { + throw new Error(parsed.error || text || `Worker returned ${response.status}`); + } + return parsed.result || parsed; + } catch (error) { + if (error.name === "AbortError") { + throw new Error(`Archive timed out after ${Math.round(JOB_TIMEOUT_MS / 1000)} seconds`); + } + throw error; + } finally { + clearTimeout(timeout); + } +} + +async function serveArchive(rawFileName, res) { + let fileName; + try { + fileName = decodeURIComponent(rawFileName); + } catch { + sendJson(res, 400, { ok: false, error: "Invalid archive path" }); + return; + } + + if (!isSafeArchiveFileName(fileName)) { + sendJson(res, 404, { ok: false, error: "Archive not found" }); + return; + } + + const filePath = path.join(ARCHIVE_PATH, fileName); + const stat = await fs.stat(filePath).catch(() => null); + if (!stat?.isFile()) { + sendJson(res, 404, { ok: false, error: "Archive not found" }); + return; + } + + const stream = createReadStream(filePath, { encoding: "utf8" }); + stream.on("error", () => { + if (!res.headersSent) { + sendJson(res, 404, { ok: false, error: "Archive not found" }); + } else { + res.destroy(); + } + }); + res.writeHead(200, { + "content-type": "text/html; charset=utf-8", + "cache-control": "no-store" + }); + stream.pipe(res); +} + +function updateJob(job, values) { + Object.assign(job, values, { + updatedAt: new Date().toISOString() + }); +} + +function publicJob(job) { + const startedAt = job.startedAt || job.createdAt; + return { + id: job.id, + sourceUrl: job.sourceUrl, + status: job.status, + message: job.message, + createdAt: job.createdAt, + startedAt, + updatedAt: job.updatedAt, + finishedAt: job.finishedAt, + elapsedMs: startedAt ? Date.now() - Date.parse(startedAt) : 0, + archive: job.archive, + error: job.error + }; +} + +function isTerminal(status) { + return status === "done" || status === "failed"; +} + +function cleanupJobs() { + const cutoff = Date.now() - 60 * 60 * 1000; + for (const [id, job] of jobs) { + if (isTerminal(job.status) && Date.parse(job.finishedAt || job.updatedAt) < cutoff) { + jobs.delete(id); + } + } +} + +async function readJsonBody(req) { + const text = await readRequestBody(req, MAX_BODY_BYTES); + if (!text.trim()) { + throw httpError(400, "Request body is required"); + } + try { + return JSON.parse(text); + } catch { + throw httpError(400, "Request body must be JSON"); + } +} + +function readRequestBody(req, maxBytes) { + return new Promise((resolve, reject) => { + const chunks = []; + let total = 0; + req.on("data", (chunk) => { + total += chunk.length; + if (total > maxBytes) { + reject(httpError(413, "Request body is too large")); + req.destroy(); + return; + } + chunks.push(chunk); + }); + req.on("end", () => resolve(Buffer.concat(chunks).toString("utf8"))); + req.on("error", reject); + }); +} + +function sendJson(res, statusCode, payload) { + res.writeHead(statusCode, { + "content-type": "application/json; charset=utf-8", + "cache-control": "no-store" + }); + res.end(`${JSON.stringify(payload)}\n`); +} + +function parseJson(text) { + try { + return JSON.parse(text); + } catch { + throw new Error(text || "Worker returned invalid JSON"); + } +} + +function cryptoRandomId() { + return randomUUID(); +} + +function httpError(statusCode, message) { + const error = new Error(message); + error.statusCode = statusCode; + return error; +} diff --git a/src/frontend-server.mjs b/src/frontend-server.mjs new file mode 100644 index 0000000..8b51c31 --- /dev/null +++ b/src/frontend-server.mjs @@ -0,0 +1,157 @@ +#!/usr/bin/env node +import fs from "node:fs/promises"; +import http from "node:http"; +import path from "node:path"; +import { Readable } from "node:stream"; +import { fileURLToPath } from "node:url"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const PORT = Number(process.env.PORT || 5731); +const BACKEND_URL = process.env.BACKEND_URL || "http://127.0.0.1:5732"; +const PUBLIC_DIR = path.resolve(__dirname, "..", "public"); +const MAX_PROXY_BODY_BYTES = 128 * 1024; + +const CONTENT_TYPES = new Map([ + [".css", "text/css; charset=utf-8"], + [".html", "text/html; charset=utf-8"], + [".js", "text/javascript; charset=utf-8"], + [".svg", "image/svg+xml"] +]); + +const server = http.createServer(async (req, res) => { + try { + await route(req, res); + } catch (error) { + res.writeHead(error.statusCode || 500, { + "content-type": "text/plain; charset=utf-8", + "cache-control": "no-store" + }); + res.end(error.message || "Unexpected error"); + } +}); + +server.listen(PORT, () => { + console.log(`archive frontend listening on ${PORT}`); + console.log(`archive backend: ${BACKEND_URL}`); +}); + +async function route(req, res) { + const requestUrl = new URL(req.url, `http://${req.headers.host || "localhost"}`); + + if (requestUrl.pathname === "/healthz") { + res.writeHead(200, { + "content-type": "application/json; charset=utf-8", + "cache-control": "no-store" + }); + res.end('{"ok":true}\n'); + return; + } + + if (requestUrl.pathname.startsWith("/api/") || requestUrl.pathname.startsWith("/archives/")) { + await proxyToBackend(req, res, requestUrl); + return; + } + + if (req.method !== "GET" && req.method !== "HEAD") { + throw httpError(405, "Method not allowed"); + } + + if (requestUrl.pathname.startsWith("/assets/")) { + await serveStatic(requestUrl.pathname, res); + return; + } + + await serveStatic("/index.html", res); +} + +async function serveStatic(urlPath, res) { + let decodedPath; + try { + decodedPath = decodeURIComponent(urlPath); + } catch { + throw httpError(400, "Invalid path"); + } + + const filePath = path.join(PUBLIC_DIR, decodedPath); + const relative = path.relative(PUBLIC_DIR, filePath); + if (relative.startsWith("..") || path.isAbsolute(relative)) { + throw httpError(404, "Not found"); + } + + const bytes = await fs.readFile(filePath).catch((error) => { + if (error.code === "ENOENT") { + throw httpError(404, "Not found"); + } + throw error; + }); + + const type = CONTENT_TYPES.get(path.extname(filePath)) || "application/octet-stream"; + res.writeHead(200, { + "content-type": type, + "cache-control": "no-store" + }); + res.end(bytes); +} + +async function proxyToBackend(req, res, requestUrl) { + const upstreamUrl = new URL(`${requestUrl.pathname}${requestUrl.search}`, BACKEND_URL); + const headers = {}; + for (const [key, value] of Object.entries(req.headers)) { + if (["connection", "content-length", "host"].includes(key.toLowerCase())) { + continue; + } + if (Array.isArray(value)) { + headers[key] = value.join(", "); + } else if (value !== undefined) { + headers[key] = value; + } + } + + const body = req.method === "GET" || req.method === "HEAD" + ? undefined + : await readRequestBody(req, MAX_PROXY_BODY_BYTES); + + const upstream = await fetch(upstreamUrl, { + method: req.method, + headers, + body + }); + + const responseHeaders = {}; + upstream.headers.forEach((value, key) => { + if (!["connection", "content-encoding", "transfer-encoding"].includes(key.toLowerCase())) { + responseHeaders[key] = value; + } + }); + + res.writeHead(upstream.status, responseHeaders); + if (req.method === "HEAD" || !upstream.body) { + res.end(); + return; + } + Readable.fromWeb(upstream.body).pipe(res); +} + +function readRequestBody(req, maxBytes) { + return new Promise((resolve, reject) => { + const chunks = []; + let total = 0; + req.on("data", (chunk) => { + total += chunk.length; + if (total > maxBytes) { + reject(httpError(413, "Request body is too large")); + req.destroy(); + return; + } + chunks.push(chunk); + }); + req.on("end", () => resolve(Buffer.concat(chunks))); + req.on("error", reject); + }); +} + +function httpError(statusCode, message) { + const error = new Error(message); + error.statusCode = statusCode; + return error; +} diff --git a/src/worker-server.mjs b/src/worker-server.mjs new file mode 100644 index 0000000..37ba402 --- /dev/null +++ b/src/worker-server.mjs @@ -0,0 +1,103 @@ +#!/usr/bin/env node +import http from "node:http"; +import path from "node:path"; +import { archivePage, defaultArchivePath } from "./archiver.mjs"; +import { archiveIdForUrl, normalizeArchiveUrl } from "./archive-catalog.mjs"; + +const PORT = Number(process.env.PORT || process.env.ARCHIVE_WORKER_PORT || 5733); +const ARCHIVE_PATH = path.resolve(process.env.ARCHIVE_PATH || defaultArchivePath()); +const MAX_BODY_BYTES = 64 * 1024; + +let queue = Promise.resolve(); + +const server = http.createServer(async (req, res) => { + try { + await route(req, res); + } catch (error) { + sendJson(res, error.statusCode || 500, { + ok: false, + error: error.message || "Unexpected error" + }); + } +}); + +server.listen(PORT, () => { + console.log(`archive worker listening on ${PORT}`); + console.log(`archive path: ${ARCHIVE_PATH}`); +}); + +async function route(req, res) { + const requestUrl = new URL(req.url, `http://${req.headers.host || "localhost"}`); + + if (req.method === "GET" && requestUrl.pathname === "/healthz") { + sendJson(res, 200, { ok: true }); + return; + } + + if (req.method === "POST" && requestUrl.pathname === "/archive") { + const body = await readJsonBody(req); + const sourceUrl = normalizeArchiveUrl(body.url); + const id = typeof body.id === "string" && body.id.trim() ? body.id.trim() : archiveIdForUrl(sourceUrl); + const result = await enqueueArchive(sourceUrl, id); + sendJson(res, 200, { + ok: true, + result + }); + return; + } + + sendJson(res, 404, { ok: false, error: "Not found" }); +} + +function enqueueArchive(sourceUrl, id) { + const run = () => archivePage(sourceUrl, { + archivePath: ARCHIVE_PATH, + id + }); + queue = queue.then(run, run); + return queue; +} + +async function readJsonBody(req) { + const text = await readRequestBody(req, MAX_BODY_BYTES); + if (!text.trim()) { + throw httpError(400, "Request body is required"); + } + try { + return JSON.parse(text); + } catch { + throw httpError(400, "Request body must be JSON"); + } +} + +function readRequestBody(req, maxBytes) { + return new Promise((resolve, reject) => { + const chunks = []; + let total = 0; + req.on("data", (chunk) => { + total += chunk.length; + if (total > maxBytes) { + reject(httpError(413, "Request body is too large")); + req.destroy(); + return; + } + chunks.push(chunk); + }); + req.on("end", () => resolve(Buffer.concat(chunks).toString("utf8"))); + req.on("error", reject); + }); +} + +function sendJson(res, statusCode, payload) { + res.writeHead(statusCode, { + "content-type": "application/json; charset=utf-8", + "cache-control": "no-store" + }); + res.end(`${JSON.stringify(payload)}\n`); +} + +function httpError(statusCode, message) { + const error = new Error(message); + error.statusCode = statusCode; + return error; +} diff --git a/test/archive-catalog.test.mjs b/test/archive-catalog.test.mjs new file mode 100644 index 0000000..c34bf06 --- /dev/null +++ b/test/archive-catalog.test.mjs @@ -0,0 +1,49 @@ +import assert from "node:assert/strict"; +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; +import test from "node:test"; +import { ArchiveCatalog, archiveFileNameForUrl, archiveIdForUrl, normalizeArchiveUrl } from "../src/archive-catalog.mjs"; + +test("normalizes only http and https archive URLs", () => { + assert.equal(normalizeArchiveUrl(" https://example.com/path "), "https://example.com/path"); + assert.throws(() => normalizeArchiveUrl("file:///tmp/page.html"), /Only http and https/); + assert.throws(() => normalizeArchiveUrl("not a url"), /valid URL/); +}); + +test("builds stable archive ids from the full URL", () => { + const first = archiveIdForUrl("https://example.com/article?x=1"); + const second = archiveIdForUrl("https://example.com/article?x=1"); + const third = archiveIdForUrl("https://example.com/article?x=2"); + assert.equal(first, second); + assert.notEqual(first, third); + assert.match(first, /^example-com-article-[a-f0-9]{16}$/); +}); + +test("finds stable archive files without rerendering", async () => { + const archivePath = await fs.mkdtemp(path.join(os.tmpdir(), "archive-catalog-")); + const sourceUrl = "https://example.com/"; + const fileName = archiveFileNameForUrl(sourceUrl); + await fs.writeFile(path.join(archivePath, fileName), "", "utf8"); + + const catalog = new ArchiveCatalog({ archivePath }); + const record = await catalog.findByUrl(sourceUrl); + + assert.equal(record.fileName, fileName); + assert.equal(record.archiveUrl, `/archives/${encodeURIComponent(fileName)}`); +}); + +test("indexes older timestamped archives from the archive comment", async () => { + const archivePath = await fs.mkdtemp(path.join(os.tmpdir(), "archive-catalog-")); + await fs.writeFile( + path.join(archivePath, "example-com-2026-05-16T00-00-00-000Z.html"), + '\n\n', + "utf8" + ); + + const catalog = new ArchiveCatalog({ archivePath }); + const record = await catalog.findByUrl("https://example.com/story"); + + assert.equal(record.fileName, "example-com-2026-05-16T00-00-00-000Z.html"); + assert.equal(record.sourceUrl, "https://example.com/story"); +}); diff --git a/test/archiver.test.mjs b/test/archiver.test.mjs new file mode 100644 index 0000000..1ebfd03 --- /dev/null +++ b/test/archiver.test.mjs @@ -0,0 +1,26 @@ +import assert from "node:assert/strict"; +import test from "node:test"; +import { renderPage } from "../src/archiver.mjs"; + +test("renderPage serializes CSSOM-inserted style rules", async () => { + const html = ` + + + + + +
Styled by CSSOM
+ `; + + const rendered = await renderPage(`data:text/html,${encodeURIComponent(html)}`, { + userscriptDelay: 0 + }); + + assert.match(rendered, /