adds frontend

This commit is contained in:
2026-05-16 16:36:51 -07:00
parent 40c63dc4e2
commit c00913ec35
17 changed files with 1473 additions and 0 deletions

250
src/archive-catalog.mjs Normal file
View File

@@ -0,0 +1,250 @@
import crypto from "node:crypto";
import fs from "node:fs/promises";
import path from "node:path";
import { defaultArchivePath } from "./asset-inliner.mjs";
// File name of the JSON index; ArchiveCatalog joins it onto the archive directory.
const INDEX_FILE = ".archive-index.json";
// Schema version stamped into the index; loadIndex() ignores an index whose version differs.
const INDEX_VERSION = 1;
// Matches the HTML comment marker read from the head of an archive file.
// Group 1: the source URL (lazy, may span lines). Group 2: the creation
// timestamp, ending in "Z" with optional fractional seconds.
const COMMENT_RE = /<!--\s*Archived locally\. Source: ([\s\S]*?)\. Created: ([^.]*(?:\.[0-9]+)?Z)\.\s*-->/;
/**
 * Validate a user-supplied URL and return its canonical `href`.
 *
 * @param {unknown} rawUrl - Value to validate; falsy values count as missing.
 * @returns {string} The WHATWG-normalized URL string.
 * @throws {Error} When the value is empty, unparseable, or not http/https.
 */
export function normalizeArchiveUrl(rawUrl) {
  const candidate = String(rawUrl || "").trim();
  if (candidate === "") {
    throw new Error("URL is required");
  }

  let parsed;
  try {
    parsed = new URL(candidate);
  } catch {
    throw new Error("Enter a valid URL");
  }

  const allowedProtocols = ["http:", "https:"];
  if (!allowedProtocols.includes(parsed.protocol)) {
    throw new Error("Only http and https URLs can be archived");
  }
  return parsed.href;
}
/**
 * Derive a deterministic archive id for a URL.
 *
 * The id is a readable slug built from host + path (at most 72 characters,
 * "archive" when the slug is empty) followed by the first 16 hex characters
 * of the SHA-256 of the normalized URL.
 *
 * @param {string} sourceUrl - URL to derive the id from.
 * @returns {string} `<slug>-<digest>`.
 * @throws {Error} When the URL fails normalizeArchiveUrl validation.
 */
export function archiveIdForUrl(sourceUrl) {
  const parsed = new URL(normalizeArchiveUrl(sourceUrl));

  const slug = `${parsed.hostname}${parsed.pathname}`
    .replace(/\/+$/, "") // drop trailing slashes
    .replace(/[^a-z0-9]+/gi, "-") // collapse everything else into single dashes
    .replace(/^-+|-+$/g, "") // trim dashes at both ends
    .slice(0, 72);
  const stem = slug || "archive";

  const hash = crypto.createHash("sha256");
  hash.update(parsed.href);
  const digest = hash.digest("hex").slice(0, 16);

  return `${stem}-${digest}`;
}
/**
 * Deterministic archive file name for a URL: the archive id plus ".html".
 *
 * @param {string} sourceUrl - URL to derive the file name from.
 * @returns {string} File name (no directory component).
 */
export function archiveFileNameForUrl(sourceUrl) {
  const archiveId = archiveIdForUrl(sourceUrl);
  return `${archiveId}.html`;
}
/**
 * Catalog mapping normalized source URLs to archived HTML files.
 *
 * The mapping lives in memory (`this.index`) and is persisted as a JSON file
 * inside the archive directory. On first use the index is loaded from disk and
 * reconciled against the `.html` files actually present in that directory.
 */
export class ArchiveCatalog {
  /**
   * @param {object} [options]
   * @param {string} [options.archivePath] - Archive directory; falls back to
   *   `defaultArchivePath()` when omitted.
   * @param {string} [options.publicBasePath] - URL prefix used to build
   *   `archiveUrl` in public records; defaults to "/archives".
   */
  constructor(options = {}) {
    this.archivePath = path.resolve(options.archivePath || defaultArchivePath());
    this.publicBasePath = options.publicBasePath || "/archives";
    this.indexPath = path.join(this.archivePath, INDEX_FILE);
    this.index = {
      version: INDEX_VERSION,
      archives: {}
    };
    // In-flight or completed load; null until first use and after a failed load.
    this.loadPromise = null;
    // Tail of the serialized index-write chain; never left in a rejected state.
    this.savePromise = Promise.resolve();
  }

  /**
   * Look up the archive for a URL.
   *
   * Checks the index first (dropping the entry if its file has disappeared),
   * then falls back to the deterministic file name for the URL and adopts that
   * file into the index when it exists.
   *
   * @param {string} rawUrl - URL to look up.
   * @returns {Promise<object|null>} Public record, or null when not archived.
   * @throws {Error} When the URL is invalid.
   */
  async findByUrl(rawUrl) {
    const sourceUrl = normalizeArchiveUrl(rawUrl);
    await this.ensureLoaded();

    const indexed = this.index.archives[sourceUrl];
    if (indexed && await this.hasArchiveFile(indexed.fileName)) {
      return this.toPublicRecord(indexed);
    }
    if (indexed) {
      // Stale entry: the file it points to is gone.
      delete this.index.archives[sourceUrl];
      await this.saveIndex();
    }

    const stableFileName = archiveFileNameForUrl(sourceUrl);
    if (await this.hasArchiveFile(stableFileName)) {
      const record = this.upsertRecord(sourceUrl, {
        id: path.basename(stableFileName, ".html"),
        fileName: stableFileName
      });
      await this.saveIndex();
      return this.toPublicRecord(record);
    }
    return null;
  }

  /**
   * Store the outcome of an archive run for a URL and persist the index.
   *
   * @param {string} rawUrl - URL that was archived.
   * @param {object} result - Archive result; reads `filePath`, `id`,
   *   `warnings` and `externalAssets`.
   * @returns {Promise<object>} Public record for the stored entry.
   * @throws {Error} When the URL is invalid.
   */
  async recordResult(rawUrl, result) {
    const sourceUrl = normalizeArchiveUrl(rawUrl);
    await this.ensureLoaded();

    const fileName = path.basename(result.filePath || `${result.id}.html`);
    const id = result.id || path.basename(fileName, ".html");
    const record = this.upsertRecord(sourceUrl, {
      id,
      fileName,
      warningsCount: Array.isArray(result.warnings) ? result.warnings.length : 0,
      externalAssetsCount: Array.isArray(result.externalAssets) ? result.externalAssets.length : 0
    });
    await this.saveIndex();
    return this.toPublicRecord(record);
  }

  /**
   * Load the index once; concurrent callers share the same load.
   *
   * A failed load is not cached: the promise is cleared so the next call
   * retries instead of re-throwing the same error forever.
   *
   * @returns {Promise<void>}
   */
  async ensureLoaded() {
    if (!this.loadPromise) {
      this.loadPromise = this.loadIndex().catch((error) => {
        this.loadPromise = null;
        throw error;
      });
    }
    await this.loadPromise;
  }

  /**
   * Read the index file (if any), reconcile it with the files on disk and
   * rewrite it when anything changed.
   *
   * A missing index is treated as empty. An index that is not valid JSON is
   * discarded and rebuilt from the archive files, because every entry can be
   * re-derived from them. Other read errors are rethrown.
   *
   * @returns {Promise<void>}
   */
  async loadIndex() {
    await fs.mkdir(this.archivePath, { recursive: true });

    let raw = null;
    try {
      raw = await fs.readFile(this.indexPath, "utf8");
    } catch (error) {
      if (error.code !== "ENOENT") {
        throw error;
      }
    }

    let corrupt = false;
    if (raw !== null) {
      let data = null;
      try {
        data = JSON.parse(raw);
      } catch {
        corrupt = true;
      }
      const usable =
        data &&
        data.version === INDEX_VERSION &&
        data.archives &&
        typeof data.archives === "object" &&
        !Array.isArray(data.archives);
      if (usable) {
        this.index = data;
      }
    }

    const changed = await this.scanArchiveFiles();
    // Also rewrite after discarding a corrupt file so it does not linger on disk.
    if (changed || corrupt) {
      await this.saveIndex();
    }
  }

  /**
   * Reconcile the in-memory index with the archive directory.
   *
   * Adds or repoints entries for `.html` files whose embedded metadata names a
   * valid source URL, then removes entries whose file no longer exists.
   *
   * @returns {Promise<boolean>} True when the index was modified.
   */
  async scanArchiveFiles() {
    let changed = false;
    const entries = await fs.readdir(this.archivePath, { withFileTypes: true }).catch((error) => {
      if (error.code === "ENOENT") {
        return [];
      }
      throw error;
    });

    for (const entry of entries) {
      if (!entry.isFile() || !entry.name.endsWith(".html")) {
        continue;
      }
      const filePath = path.join(this.archivePath, entry.name);
      const metadata = await readArchiveMetadata(filePath);
      if (!metadata?.sourceUrl) {
        continue;
      }
      let sourceUrl;
      try {
        sourceUrl = normalizeArchiveUrl(metadata.sourceUrl);
      } catch {
        // Embedded URL is not archivable; leave the file out of the index.
        continue;
      }
      const current = this.index.archives[sourceUrl];
      if (current?.fileName === entry.name) {
        continue;
      }
      this.index.archives[sourceUrl] = {
        id: path.basename(entry.name, ".html"),
        fileName: entry.name,
        sourceUrl,
        createdAt: metadata.createdAt || new Date().toISOString(),
        updatedAt: new Date().toISOString()
      };
      changed = true;
    }

    for (const [sourceUrl, record] of Object.entries(this.index.archives)) {
      if (!record?.fileName || !await this.hasArchiveFile(record.fileName)) {
        delete this.index.archives[sourceUrl];
        changed = true;
      }
    }
    return changed;
  }

  /**
   * Create or replace the in-memory record for a URL (does not persist).
   *
   * `createdAt` and any counts not supplied in `values` are carried over from
   * the previous record when one exists.
   *
   * @param {string} sourceUrl - Normalized URL used as the index key.
   * @param {object} values - `id`, `fileName`, optional `warningsCount` and
   *   `externalAssetsCount`.
   * @returns {object} The stored record.
   */
  upsertRecord(sourceUrl, values) {
    const previous = this.index.archives[sourceUrl];
    const now = new Date().toISOString();
    const record = {
      id: values.id,
      fileName: values.fileName,
      sourceUrl,
      createdAt: previous?.createdAt || now,
      updatedAt: now,
      warningsCount: values.warningsCount ?? previous?.warningsCount ?? 0,
      externalAssetsCount: values.externalAssetsCount ?? previous?.externalAssetsCount ?? 0
    };
    this.index.archives[sourceUrl] = record;
    return record;
  }

  /**
   * @param {unknown} fileName - Candidate archive file name.
   * @returns {Promise<boolean>} True when the name is safe and refers to a
   *   regular file in the archive directory. Any stat failure counts as absent.
   */
  async hasArchiveFile(fileName) {
    if (!isSafeArchiveFileName(fileName)) {
      return false;
    }
    const stat = await fs.stat(path.join(this.archivePath, fileName)).catch(() => null);
    return !!stat?.isFile();
  }

  /**
   * @param {object} record - Stored index record.
   * @returns {object} Copy of the record plus its public `archiveUrl`.
   */
  toPublicRecord(record) {
    return {
      ...record,
      archiveUrl: `${this.publicBasePath}/${encodeURIComponent(record.fileName)}`
    };
  }

  /**
   * Persist the index: write a temp file, then rename it over the index file.
   *
   * Writes are serialized through `savePromise`. The chain itself never stays
   * rejected, so one failed write does not prevent later writes from running;
   * the failure is still reported to the caller of that particular save.
   *
   * @returns {Promise<void>} Settles with the outcome of this write.
   */
  async saveIndex() {
    const writeIndex = async () => {
      await fs.mkdir(this.archivePath, { recursive: true });
      const tmpPath = `${this.indexPath}.${process.pid}.tmp`;
      await fs.writeFile(tmpPath, `${JSON.stringify(this.index, null, 2)}\n`, "utf8");
      await fs.rename(tmpPath, this.indexPath);
    };
    const attempt = this.savePromise.then(writeIndex, writeIndex);
    this.savePromise = attempt.catch(() => {});
    return attempt;
  }
}
/**
 * Decide whether a value is acceptable as an archive file name.
 *
 * Accepts only strings that are a bare file name (no directory component),
 * end in ".html" and do not start with a dot.
 *
 * @param {unknown} fileName - Candidate name.
 * @returns {boolean} True when the name passes every check.
 */
export function isSafeArchiveFileName(fileName) {
  if (typeof fileName !== "string") {
    return false;
  }
  if (fileName.startsWith(".")) {
    return false;
  }
  if (!fileName.endsWith(".html")) {
    return false;
  }
  return path.basename(fileName) === fileName;
}
/**
 * Read the source URL and creation timestamp from the head of an archive file.
 *
 * Only the first 4096 bytes are inspected for the marker comment.
 *
 * @param {string} filePath - Path of the archive file.
 * @returns {Promise<{sourceUrl: string, createdAt: string}|null>} Parsed
 *   metadata, or null when the file cannot be opened or has no marker.
 */
async function readArchiveMetadata(filePath) {
  let handle;
  try {
    handle = await fs.open(filePath, "r");
  } catch {
    return null;
  }
  if (!handle) {
    return null;
  }

  try {
    const headBytes = Buffer.alloc(4096);
    const { bytesRead } = await handle.read(headBytes, 0, headBytes.length, 0);
    const headText = headBytes.toString("utf8", 0, bytesRead);
    const found = COMMENT_RE.exec(headText);
    if (found === null) {
      return null;
    }
    const [, rawSource, createdAt] = found;
    // "- -" in the stored URL is turned back into "--" (presumably undoing an
    // escape applied by the writer so the URL cannot close the HTML comment).
    return {
      sourceUrl: rawSource.replaceAll("- -", "--"),
      createdAt
    };
  } finally {
    await handle.close();
  }
}

345
src/backend-server.mjs Normal file
View File

@@ -0,0 +1,345 @@
#!/usr/bin/env node
import { createReadStream } from "node:fs";
import fs from "node:fs/promises";
import http from "node:http";
import path from "node:path";
import { randomUUID } from "node:crypto";
import { ArchiveCatalog, archiveIdForUrl, isSafeArchiveFileName, normalizeArchiveUrl } from "./archive-catalog.mjs";
import { defaultArchivePath } from "./asset-inliner.mjs";
// --- Configuration (environment overrides with defaults) ---
// TCP port this backend listens on.
const PORT = Number(process.env.PORT || 5732);
// Absolute directory that archived HTML files are served from.
const ARCHIVE_PATH = path.resolve(process.env.ARCHIVE_PATH || defaultArchivePath());
// Base URL of the archive worker; "/archive" is resolved against it.
const ARCHIVE_WORKER_URL = process.env.ARCHIVE_WORKER_URL || "http://127.0.0.1:5733";
// URL prefix under which archive files are exposed by this server.
const PUBLIC_ARCHIVES_PATH = process.env.PUBLIC_ARCHIVES_PATH || "/archives";
// Milliseconds before a worker request is aborted.
const JOB_TIMEOUT_MS = Number(process.env.ARCHIVE_JOB_TIMEOUT_MS || 120000);
// Upper bound on accepted JSON request bodies.
const MAX_BODY_BYTES = 64 * 1024;
// --- Process-wide state ---
const catalog = new ArchiveCatalog({
archivePath: ARCHIVE_PATH,
publicBasePath: PUBLIC_ARCHIVES_PATH
});
// Job id -> job object, for every job not yet cleaned up.
const jobs = new Map();
// Source URL -> id of the job currently queued or running for that URL.
const activeJobByUrl = new Map();
// Promise chain that runs jobs one after another.
let workerQueue = Promise.resolve();
// Every request goes through route(); any thrown error becomes a JSON error
// response using error.statusCode when set, otherwise 500.
const server = http.createServer(async (req, res) => {
try {
await route(req, res);
} catch (error) {
sendJson(res, error.statusCode || 500, {
ok: false,
error: error.message || "Unexpected error"
});
}
});
server.listen(PORT, () => {
console.log(`archive backend listening on ${PORT}`);
console.log(`archive path: ${ARCHIVE_PATH}`);
console.log(`archive worker: ${ARCHIVE_WORKER_URL}`);
});
// Prune finished jobs every 10 minutes; unref() keeps the timer from holding
// the process open on its own.
const cleanupTimer = setInterval(cleanupJobs, 10 * 60 * 1000);
cleanupTimer.unref?.();
/**
 * Dispatch one HTTP request to the matching backend endpoint.
 *
 * Endpoints:
 *   GET  /healthz                    liveness probe
 *   GET  /api/archives/lookup?url=   look up an existing archive
 *   POST /api/archives  {url}        find or start an archive job
 *   GET  /api/jobs/:id               job status
 *   GET  <PUBLIC_ARCHIVES_PATH>/:file  archived HTML
 * Anything else gets a JSON 404.
 *
 * @param {import("node:http").IncomingMessage} req
 * @param {import("node:http").ServerResponse} res
 * @returns {Promise<void>}
 * @throws {Error} With `statusCode` 400 for invalid client input; other
 *   errors propagate to the server-level handler.
 */
async function route(req, res) {
  const requestUrl = new URL(req.url, `http://${req.headers.host || "localhost"}`);
  const { pathname } = requestUrl;

  if (req.method === "GET" && pathname === "/healthz") {
    sendJson(res, 200, { ok: true });
    return;
  }

  if (req.method === "GET" && pathname === "/api/archives/lookup") {
    const sourceUrl = requireSourceUrl(requestUrl.searchParams.get("url"));
    const archive = await catalog.findByUrl(sourceUrl);
    sendJson(res, 200, {
      ok: true,
      exists: !!archive,
      archive
    });
    return;
  }

  if (req.method === "POST" && pathname === "/api/archives") {
    const body = await readJsonBody(req);
    // `body` may legitimately parse to null or a primitive; treat that as a
    // missing URL instead of crashing on property access.
    const sourceUrl = requireSourceUrl(body?.url);
    const response = await createOrFindArchive(sourceUrl);
    sendJson(res, response.statusCode, response.body);
    return;
  }

  const jobMatch = pathname.match(/^\/api\/jobs\/([^/]+)$/);
  if (req.method === "GET" && jobMatch) {
    const job = jobs.get(jobMatch[1]);
    if (!job) {
      sendJson(res, 404, { ok: false, error: "Job not found" });
      return;
    }
    sendJson(res, 200, {
      ok: true,
      job: publicJob(job)
    });
    return;
  }

  if (req.method === "GET" && pathname.startsWith(`${PUBLIC_ARCHIVES_PATH}/`)) {
    await serveArchive(pathname.slice(PUBLIC_ARCHIVES_PATH.length + 1), res);
    return;
  }

  sendJson(res, 404, { ok: false, error: "Not found" });
}

/**
 * Normalize a client-supplied URL, reporting validation failures as HTTP 400
 * (a bad URL is the client's mistake, not a server error).
 *
 * @param {unknown} rawUrl
 * @returns {string} Normalized URL.
 * @throws {Error} With `statusCode` 400 and the validation message.
 */
function requireSourceUrl(rawUrl) {
  try {
    return normalizeArchiveUrl(rawUrl);
  } catch (error) {
    throw httpError(400, error.message);
  }
}
/**
 * Resolve an archive request to one of three outcomes:
 *   - "existing": the URL is already archived (HTTP 200),
 *   - "active":   a job for the URL is already queued or running (HTTP 202),
 *   - "created":  a new job was queued (HTTP 202).
 *
 * @param {string} sourceUrl - Normalized URL to archive.
 * @returns {Promise<{statusCode: number, body: object}>} Response descriptor.
 */
async function createOrFindArchive(sourceUrl) {
  const archived = await catalog.findByUrl(sourceUrl);
  if (archived) {
    const body = {
      ok: true,
      status: "done",
      mode: "existing",
      archive: archived
    };
    return { statusCode: 200, body };
  }

  const pendingId = activeJobByUrl.get(sourceUrl);
  const pendingJob = pendingId ? jobs.get(pendingId) : null;
  if (pendingJob && !isTerminal(pendingJob.status)) {
    const body = {
      ok: true,
      status: pendingJob.status,
      mode: "active",
      job: publicJob(pendingJob)
    };
    return { statusCode: 202, body };
  }

  const newJob = {
    id: cryptoRandomId(),
    archiveId: archiveIdForUrl(sourceUrl),
    sourceUrl,
    status: "queued",
    message: "Queued",
    createdAt: new Date().toISOString(),
    startedAt: null,
    updatedAt: new Date().toISOString(),
    finishedAt: null,
    archive: null,
    error: null
  };
  // Register the job before queueing so status lookups and duplicate
  // requests can see it straight away.
  jobs.set(newJob.id, newJob);
  activeJobByUrl.set(sourceUrl, newJob.id);
  enqueueJob(newJob);

  const body = {
    ok: true,
    status: newJob.status,
    mode: "created",
    job: publicJob(newJob)
  };
  return { statusCode: 202, body };
}
/**
 * Append a job to the serial worker queue.
 *
 * The job starts once the previous queue entry has settled, whether that
 * entry fulfilled or rejected.
 *
 * @param {object} job - Job created by createOrFindArchive.
 * @returns {void}
 */
function enqueueJob(job) {
  const startJob = function startJob() {
    return executeJob(job);
  };
  const tail = workerQueue;
  workerQueue = tail.then(startJob, startJob);
}
/**
 * Run one queued job: ask the worker to archive the URL, record the result in
 * the catalog and update the job's status.
 *
 * Failures are captured on the job (`status: "failed"`) rather than thrown.
 * Jobs that are no longer "queued" are skipped.
 *
 * @param {object} job - Job to run; mutated in place.
 * @returns {Promise<void>}
 */
async function executeJob(job) {
  if (job.status !== "queued") {
    return;
  }

  const timestamp = () => new Date().toISOString();

  updateJob(job, {
    status: "running",
    message: "Archiving",
    startedAt: timestamp()
  });

  try {
    const workerResult = await requestWorkerArchive(job.sourceUrl, job.archiveId);
    const archive = await catalog.recordResult(job.sourceUrl, workerResult);
    updateJob(job, {
      status: "done",
      message: "Opening",
      archive,
      finishedAt: timestamp()
    });
  } catch (failure) {
    updateJob(job, {
      status: "failed",
      message: "Failed",
      error: failure.message || "Archive failed",
      finishedAt: timestamp()
    });
  } finally {
    // The URL is free for a new job whatever the outcome.
    activeJobByUrl.delete(job.sourceUrl);
  }
}
/**
 * POST an archive request to the worker and return its result.
 *
 * The request is aborted after JOB_TIMEOUT_MS.
 *
 * @param {string} sourceUrl - URL to archive.
 * @param {string} archiveId - Id the worker should use for the archive.
 * @returns {Promise<object>} The worker's `result` field when present,
 *   otherwise the whole parsed response.
 * @throws {Error} On timeout, transport failure, a non-2xx status, an
 *   `ok: false` payload, or a non-JSON response.
 */
async function requestWorkerArchive(sourceUrl, archiveId) {
  const endpoint = new URL("/archive", ARCHIVE_WORKER_URL);
  const aborter = new AbortController();
  const timer = setTimeout(() => aborter.abort(), JOB_TIMEOUT_MS);
  timer.unref?.();

  try {
    const requestInit = {
      method: "POST",
      headers: {
        "content-type": "application/json"
      },
      body: JSON.stringify({ url: sourceUrl, id: archiveId }),
      signal: aborter.signal
    };
    const reply = await fetch(endpoint, requestInit);
    const rawBody = await reply.text();
    const payload = parseJson(rawBody);
    const failed = !reply.ok || payload.ok === false;
    if (failed) {
      throw new Error(payload.error || rawBody || `Worker returned ${reply.status}`);
    }
    return payload.result || payload;
  } catch (error) {
    if (error.name !== "AbortError") {
      throw error;
    }
    throw new Error(`Archive timed out after ${Math.round(JOB_TIMEOUT_MS / 1000)} seconds`);
  } finally {
    clearTimeout(timer);
  }
}
/**
 * Stream an archived HTML file to the client.
 *
 * Responds with JSON 400 for an undecodable path and JSON 404 when the name
 * is unsafe, the file is missing, or it cannot be opened.
 *
 * @param {string} rawFileName - Percent-encoded file name taken from the URL.
 * @param {import("node:http").ServerResponse} res
 * @returns {Promise<void>} Resolves once streaming has been set up.
 */
async function serveArchive(rawFileName, res) {
  let fileName;
  try {
    fileName = decodeURIComponent(rawFileName);
  } catch {
    sendJson(res, 400, { ok: false, error: "Invalid archive path" });
    return;
  }
  if (!isSafeArchiveFileName(fileName)) {
    sendJson(res, 404, { ok: false, error: "Archive not found" });
    return;
  }

  const filePath = path.join(ARCHIVE_PATH, fileName);
  const stat = await fs.stat(filePath).catch(() => null);
  if (!stat?.isFile()) {
    sendJson(res, 404, { ok: false, error: "Archive not found" });
    return;
  }

  // Bytes are sent verbatim; no stream encoding is set, so the file is not
  // decoded and re-encoded on the way through.
  const stream = createReadStream(filePath);

  stream.on("error", () => {
    if (!res.headersSent) {
      // Open failed (file removed or unreadable after the stat above).
      sendJson(res, 404, { ok: false, error: "Archive not found" });
    } else {
      res.destroy();
    }
  });

  // Commit to a 200 only after the file has actually been opened, so an open
  // failure can still be reported as a 404.
  stream.once("open", () => {
    if (stream.destroyed) {
      return;
    }
    res.writeHead(200, {
      "content-type": "text/html; charset=utf-8",
      "cache-control": "no-store"
    });
    stream.pipe(res);
  });

  // pipe() does not destroy its source when the destination closes early;
  // release the file descriptor if the client goes away mid-transfer.
  res.once("close", () => {
    stream.destroy();
  });
}
/**
 * Merge new field values into a job and refresh its `updatedAt` timestamp.
 *
 * @param {object} job - Job to mutate.
 * @param {object} values - Fields to overwrite; any `updatedAt` in here is
 *   superseded by the current time.
 * @returns {void}
 */
function updateJob(job, values) {
  const updatedAt = new Date().toISOString();
  Object.assign(job, values, { updatedAt });
}
/**
 * Build the client-facing view of a job.
 *
 * `startedAt` falls back to `createdAt` while the job has not started, and
 * `elapsedMs` is measured from that moment to now.
 *
 * @param {object} job - Internal job object.
 * @returns {object} Snapshot containing only the public fields.
 */
function publicJob(job) {
  const { id, sourceUrl, status, message, createdAt, updatedAt, finishedAt, archive, error } = job;
  const startedAt = job.startedAt || createdAt;

  let elapsedMs = 0;
  if (startedAt) {
    elapsedMs = Date.now() - Date.parse(startedAt);
  }

  return {
    id,
    sourceUrl,
    status,
    message,
    createdAt,
    startedAt,
    updatedAt,
    finishedAt,
    elapsedMs,
    archive,
    error
  };
}
/**
 * @param {string} status - Job status.
 * @returns {boolean} True for the finished states "done" and "failed".
 */
function isTerminal(status) {
  switch (status) {
    case "done":
    case "failed":
      return true;
    default:
      return false;
  }
}
/**
 * Forget finished jobs whose finish time (or last update, when no finish time
 * is set) is more than one hour in the past.
 *
 * @returns {void}
 */
function cleanupJobs() {
  const oneHourMs = 60 * 60 * 1000;
  const cutoff = Date.now() - oneHourMs;
  jobs.forEach((job, id) => {
    if (!isTerminal(job.status)) {
      return;
    }
    const settledAt = Date.parse(job.finishedAt || job.updatedAt);
    if (settledAt < cutoff) {
      jobs.delete(id);
    }
  });
}
/**
 * Read the request body (bounded by MAX_BODY_BYTES) and parse it as JSON.
 *
 * @param {import("node:http").IncomingMessage} req
 * @returns {Promise<any>} Parsed JSON value.
 * @throws {Error} With `statusCode` 400 when the body is blank or not JSON.
 */
async function readJsonBody(req) {
  const raw = await readRequestBody(req, MAX_BODY_BYTES);
  if (raw.trim() === "") {
    throw httpError(400, "Request body is required");
  }

  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch {
    throw httpError(400, "Request body must be JSON");
  }
  return parsed;
}
/**
 * Collect a request body into a UTF-8 string, enforcing a size limit.
 *
 * @param {import("node:http").IncomingMessage} req - Request stream.
 * @param {number} maxBytes - Largest body accepted.
 * @returns {Promise<string>} The decoded body. Rejects with a 413 error (and
 *   destroys the request) once more than `maxBytes` have arrived, or with the
 *   stream's own error.
 */
function readRequestBody(req, maxBytes) {
  return new Promise((resolve, reject) => {
    const parts = [];
    let received = 0;

    const onData = (part) => {
      received += part.length;
      if (received <= maxBytes) {
        parts.push(part);
        return;
      }
      reject(httpError(413, "Request body is too large"));
      req.destroy();
    };
    const onEnd = () => {
      resolve(Buffer.concat(parts).toString("utf8"));
    };

    req.on("data", onData);
    req.on("end", onEnd);
    req.on("error", reject);
  });
}
/**
 * Send a JSON response (newline-terminated) with caching disabled.
 *
 * @param {import("node:http").ServerResponse} res
 * @param {number} statusCode - HTTP status to send.
 * @param {unknown} payload - Value to serialize.
 * @returns {void}
 */
function sendJson(res, statusCode, payload) {
  const headers = {
    "content-type": "application/json; charset=utf-8",
    "cache-control": "no-store"
  };
  res.writeHead(statusCode, headers);
  const serialized = JSON.stringify(payload);
  res.end(`${serialized}\n`);
}
/**
 * Parse a worker response body as JSON.
 *
 * @param {string} text - Raw response text.
 * @returns {any} Parsed value.
 * @throws {Error} Carrying the raw text as its message (or a generic message
 *   when the text is empty) if it is not valid JSON.
 */
function parseJson(text) {
  let value;
  try {
    value = JSON.parse(text);
  } catch {
    throw new Error(text || "Worker returned invalid JSON");
  }
  return value;
}
/**
 * Generate a fresh job id.
 *
 * @returns {string} A random UUID from node:crypto.
 */
function cryptoRandomId() {
  const id = randomUUID();
  return id;
}
/**
 * Create an Error tagged with the HTTP status the server should answer with.
 *
 * @param {number} statusCode - HTTP status code.
 * @param {string} message - Error message sent to the client.
 * @returns {Error} Error with an extra `statusCode` property.
 */
function httpError(statusCode, message) {
  return Object.assign(new Error(message), { statusCode });
}

157
src/frontend-server.mjs Normal file
View File

@@ -0,0 +1,157 @@
#!/usr/bin/env node
import fs from "node:fs/promises";
import http from "node:http";
import path from "node:path";
import { Readable } from "node:stream";
import { fileURLToPath } from "node:url";
// Directory containing this module (ES modules have no built-in __dirname).
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// TCP port this frontend listens on.
const PORT = Number(process.env.PORT || 5731);
// Base URL that /api/ and /archives/ requests are proxied to.
const BACKEND_URL = process.env.BACKEND_URL || "http://127.0.0.1:5732";
// Static files are served from "<module dir>/../public".
const PUBLIC_DIR = path.resolve(__dirname, "..", "public");
// Upper bound on request bodies buffered before proxying.
const MAX_PROXY_BODY_BYTES = 128 * 1024;
// File extension -> Content-Type for static files; extensions not listed here
// are served as application/octet-stream.
const CONTENT_TYPES = new Map([
[".css", "text/css; charset=utf-8"],
[".html", "text/html; charset=utf-8"],
[".js", "text/javascript; charset=utf-8"],
[".svg", "image/svg+xml"]
]);
// Every request goes through route(); any thrown error becomes a plain-text
// response using error.statusCode when set, otherwise 500.
const server = http.createServer(async (req, res) => {
try {
await route(req, res);
} catch (error) {
res.writeHead(error.statusCode || 500, {
"content-type": "text/plain; charset=utf-8",
"cache-control": "no-store"
});
res.end(error.message || "Unexpected error");
}
});
server.listen(PORT, () => {
console.log(`archive frontend listening on ${PORT}`);
console.log(`archive backend: ${BACKEND_URL}`);
});
/**
 * Dispatch one HTTP request on the frontend server.
 *
 *   /healthz              -> static JSON liveness reply (any method)
 *   /api/*, /archives/*   -> proxied to the backend (any method)
 *   /assets/*             -> static file (GET/HEAD only)
 *   anything else         -> index.html (GET/HEAD only)
 *
 * @param {import("node:http").IncomingMessage} req
 * @param {import("node:http").ServerResponse} res
 * @returns {Promise<void>}
 * @throws {Error} With `statusCode` 405 for other methods on static routes.
 */
async function route(req, res) {
  const requestUrl = new URL(req.url, `http://${req.headers.host || "localhost"}`);
  const { pathname } = requestUrl;

  if (pathname === "/healthz") {
    res.writeHead(200, {
      "content-type": "application/json; charset=utf-8",
      "cache-control": "no-store"
    });
    res.end('{"ok":true}\n');
    return;
  }

  const isProxied = ["/api/", "/archives/"].some((prefix) => pathname.startsWith(prefix));
  if (isProxied) {
    await proxyToBackend(req, res, requestUrl);
    return;
  }

  const isReadOnly = req.method === "GET" || req.method === "HEAD";
  if (!isReadOnly) {
    throw httpError(405, "Method not allowed");
  }

  // Unknown paths fall back to the single-page app shell.
  const staticPath = pathname.startsWith("/assets/") ? pathname : "/index.html";
  await serveStatic(staticPath, res);
}
/**
 * Serve one file from PUBLIC_DIR.
 *
 * @param {string} urlPath - Percent-encoded URL path, e.g. "/assets/app.js".
 * @param {import("node:http").ServerResponse} res
 * @returns {Promise<void>}
 * @throws {Error} `statusCode` 400 for an undecodable path or one containing
 *   a NUL byte; 404 when the path escapes PUBLIC_DIR or does not name a
 *   readable regular file; other read errors are rethrown unchanged.
 */
async function serveStatic(urlPath, res) {
  let decodedPath;
  try {
    decodedPath = decodeURIComponent(urlPath);
  } catch {
    throw httpError(400, "Invalid path");
  }
  // fs APIs reject NUL bytes with an argument error, which would otherwise
  // surface as a 500.
  if (decodedPath.includes("\0")) {
    throw httpError(400, "Invalid path");
  }

  const filePath = path.join(PUBLIC_DIR, decodedPath);
  const relative = path.relative(PUBLIC_DIR, filePath);
  // Only a leading ".." path segment means the target is outside PUBLIC_DIR;
  // a file whose name merely begins with ".." is still inside it.
  const escapesRoot =
    relative === ".." ||
    relative.startsWith(`..${path.sep}`) ||
    path.isAbsolute(relative);
  if (escapesRoot) {
    throw httpError(404, "Not found");
  }

  // Paths that cannot name a regular file are "not found", not server errors.
  const notFoundCodes = ["ENOENT", "EISDIR", "ENOTDIR", "ENAMETOOLONG"];
  const bytes = await fs.readFile(filePath).catch((error) => {
    if (notFoundCodes.includes(error.code)) {
      throw httpError(404, "Not found");
    }
    throw error;
  });

  const type = CONTENT_TYPES.get(path.extname(filePath)) || "application/octet-stream";
  res.writeHead(200, {
    "content-type": type,
    "cache-control": "no-store"
  });
  res.end(bytes);
}
/**
 * Forward a request to the backend and relay the response to the client.
 *
 * The request body (for methods other than GET/HEAD) is buffered up to
 * MAX_PROXY_BODY_BYTES before being sent upstream.
 *
 * @param {import("node:http").IncomingMessage} req
 * @param {import("node:http").ServerResponse} res
 * @param {URL} requestUrl - Parsed request URL; its path and query are reused.
 * @returns {Promise<void>} Resolves once the response has been started (the
 *   body may still be streaming).
 * @throws {Error} When the body is too large or the upstream fetch fails.
 */
async function proxyToBackend(req, res, requestUrl) {
  const upstreamUrl = new URL(`${requestUrl.pathname}${requestUrl.search}`, BACKEND_URL);

  // Headers that describe this hop's connection or framing must not be
  // forwarded; fetch computes its own host, length and framing.
  const skippedRequestHeaders = new Set([
    "connection",
    "content-length",
    "expect",
    "host",
    "keep-alive",
    "proxy-authenticate",
    "proxy-authorization",
    "te",
    "trailer",
    "transfer-encoding",
    "upgrade"
  ]);
  const headers = {};
  for (const [key, value] of Object.entries(req.headers)) {
    if (value === undefined || skippedRequestHeaders.has(key.toLowerCase())) {
      continue;
    }
    headers[key] = Array.isArray(value) ? value.join(", ") : value;
  }

  const body = req.method === "GET" || req.method === "HEAD"
    ? undefined
    : await readRequestBody(req, MAX_PROXY_BODY_BYTES);

  const upstream = await fetch(upstreamUrl, {
    method: req.method,
    headers,
    body
  });

  // fetch hands back an already-decoded body, so the upstream
  // content-encoding (and a content-length that counted encoded bytes) no
  // longer describe what is relayed.
  const upstreamWasEncoded = upstream.headers.has("content-encoding");
  const skippedResponseHeaders = new Set([
    "connection",
    "content-encoding",
    "keep-alive",
    "transfer-encoding"
  ]);
  const responseHeaders = {};
  upstream.headers.forEach((value, key) => {
    const name = key.toLowerCase();
    if (skippedResponseHeaders.has(name)) {
      return;
    }
    if (name === "content-length" && upstreamWasEncoded) {
      return;
    }
    responseHeaders[key] = value;
  });

  res.writeHead(upstream.status, responseHeaders);
  if (req.method === "HEAD" || !upstream.body) {
    res.end();
    return;
  }

  const upstreamBody = Readable.fromWeb(upstream.body);
  // A stream 'error' with no listener is thrown as an uncaught exception;
  // abort the client response instead of taking the process down.
  upstreamBody.on("error", () => {
    res.destroy();
  });
  // Stop reading from the backend once the client connection is gone.
  res.once("close", () => {
    upstreamBody.destroy();
  });
  upstreamBody.pipe(res);
}
/**
 * Buffer a request body, enforcing a size limit.
 *
 * @param {import("node:http").IncomingMessage} req - Request stream.
 * @param {number} maxBytes - Largest body accepted.
 * @returns {Promise<Buffer>} The raw body bytes. Rejects with a 413 error
 *   (and destroys the request) once more than `maxBytes` have arrived, or
 *   with the stream's own error.
 */
function readRequestBody(req, maxBytes) {
  return new Promise((resolve, reject) => {
    const buffered = [];
    let byteCount = 0;

    function handleChunk(chunk) {
      byteCount += chunk.length;
      const withinLimit = byteCount <= maxBytes;
      if (withinLimit) {
        buffered.push(chunk);
        return;
      }
      reject(httpError(413, "Request body is too large"));
      req.destroy();
    }

    function handleEnd() {
      resolve(Buffer.concat(buffered));
    }

    req.on("data", handleChunk);
    req.on("end", handleEnd);
    req.on("error", reject);
  });
}
/**
 * Build an Error carrying the HTTP status the server-level handler should use.
 *
 * @param {number} statusCode - HTTP status code.
 * @param {string} message - Text sent as the response body.
 * @returns {Error} Error with a `statusCode` property.
 */
function httpError(statusCode, message) {
  const tagged = new Error(message);
  Object.assign(tagged, { statusCode });
  return tagged;
}

103
src/worker-server.mjs Normal file
View File

@@ -0,0 +1,103 @@
#!/usr/bin/env node
import http from "node:http";
import path from "node:path";
import { archivePage, defaultArchivePath } from "./archiver.mjs";
import { archiveIdForUrl, normalizeArchiveUrl } from "./archive-catalog.mjs";
// TCP port this worker listens on; PORT takes precedence over ARCHIVE_WORKER_PORT.
const PORT = Number(process.env.PORT || process.env.ARCHIVE_WORKER_PORT || 5733);
// Absolute directory passed to archivePage() as the archive destination.
const ARCHIVE_PATH = path.resolve(process.env.ARCHIVE_PATH || defaultArchivePath());
// Upper bound on accepted JSON request bodies.
const MAX_BODY_BYTES = 64 * 1024;
// Promise chain that runs archive requests one after another.
let queue = Promise.resolve();
// Every request goes through route(); any thrown error becomes a JSON error
// response using error.statusCode when set, otherwise 500.
const server = http.createServer(async (req, res) => {
try {
await route(req, res);
} catch (error) {
sendJson(res, error.statusCode || 500, {
ok: false,
error: error.message || "Unexpected error"
});
}
});
server.listen(PORT, () => {
console.log(`archive worker listening on ${PORT}`);
console.log(`archive path: ${ARCHIVE_PATH}`);
});
/**
 * Dispatch one HTTP request on the worker.
 *
 *   GET  /healthz            liveness probe
 *   POST /archive {url, id}  archive the URL and reply with the result
 * Anything else gets a JSON 404.
 *
 * @param {import("node:http").IncomingMessage} req
 * @param {import("node:http").ServerResponse} res
 * @returns {Promise<void>}
 * @throws {Error} With `statusCode` 400 for invalid client input; archive
 *   failures propagate to the server-level handler.
 */
async function route(req, res) {
  const requestUrl = new URL(req.url, `http://${req.headers.host || "localhost"}`);

  if (req.method === "GET" && requestUrl.pathname === "/healthz") {
    sendJson(res, 200, { ok: true });
    return;
  }

  if (req.method === "POST" && requestUrl.pathname === "/archive") {
    const body = await readJsonBody(req);
    const { sourceUrl, id } = parseArchiveRequest(body);
    const result = await enqueueArchive(sourceUrl, id);
    sendJson(res, 200, {
      ok: true,
      result
    });
    return;
  }

  sendJson(res, 404, { ok: false, error: "Not found" });
}

/**
 * Validate the JSON body of an archive request.
 *
 * @param {any} body - Parsed request body; may be null or a primitive.
 * @returns {{sourceUrl: string, id: string}} Normalized URL and the archive
 *   id to use (the caller's non-blank `id`, otherwise one derived from the URL).
 * @throws {Error} With `statusCode` 400 when the URL is invalid or the id
 *   could address a path outside a single file name.
 */
function parseArchiveRequest(body) {
  let sourceUrl;
  try {
    sourceUrl = normalizeArchiveUrl(body?.url);
  } catch (error) {
    // Validation failures are the client's mistake, not a server error.
    throw httpError(400, error.message);
  }

  const requestedId = typeof body?.id === "string" ? body.id.trim() : "";
  if (!requestedId) {
    return { sourceUrl, id: archiveIdForUrl(sourceUrl) };
  }

  // The id comes from the network and is handed to archivePage() as the
  // archive id; refuse anything containing a path separator, a NUL byte, or
  // a dot-only segment. NOTE(review): archivePage may sanitize on its own —
  // this check is defensive.
  const isUnsafe =
    requestedId === "." ||
    requestedId === ".." ||
    /[\\\/\0]/.test(requestedId);
  if (isUnsafe) {
    throw httpError(400, "Invalid archive id");
  }
  return { sourceUrl, id: requestedId };
}
/**
 * Queue an archive run behind any run already in progress.
 *
 * Runs execute one at a time; a run starts once the previous one has settled,
 * whether it fulfilled or rejected.
 *
 * @param {string} sourceUrl - URL to archive.
 * @param {string} id - Archive id passed through to archivePage().
 * @returns {Promise<any>} Settles with the outcome of this run.
 */
function enqueueArchive(sourceUrl, id) {
  const task = () => {
    const options = { archivePath: ARCHIVE_PATH, id };
    return archivePage(sourceUrl, options);
  };
  const scheduled = queue.then(task, task);
  queue = scheduled;
  return scheduled;
}
/**
 * Read the request body (bounded by MAX_BODY_BYTES) and parse it as JSON.
 *
 * @param {import("node:http").IncomingMessage} req
 * @returns {Promise<any>} Parsed JSON value.
 * @throws {Error} With `statusCode` 400 when the body is blank or not JSON.
 */
async function readJsonBody(req) {
  const bodyText = await readRequestBody(req, MAX_BODY_BYTES);
  const isBlank = bodyText.trim() === "";
  if (isBlank) {
    throw httpError(400, "Request body is required");
  }

  let decoded;
  try {
    decoded = JSON.parse(bodyText);
  } catch {
    throw httpError(400, "Request body must be JSON");
  }
  return decoded;
}
/**
 * Collect a request body into a UTF-8 string, enforcing a size limit.
 *
 * @param {import("node:http").IncomingMessage} req - Request stream.
 * @param {number} maxBytes - Largest body accepted.
 * @returns {Promise<string>} The decoded body. Rejects with a 413 error (and
 *   destroys the request) once more than `maxBytes` have arrived, or with the
 *   stream's own error.
 */
function readRequestBody(req, maxBytes) {
  return new Promise((resolve, reject) => {
    const pieces = [];
    let seen = 0;

    req.on("data", (piece) => {
      seen += piece.length;
      const overLimit = seen > maxBytes;
      if (!overLimit) {
        pieces.push(piece);
        return;
      }
      reject(httpError(413, "Request body is too large"));
      req.destroy();
    });

    req.on("end", () => {
      const whole = Buffer.concat(pieces);
      resolve(whole.toString("utf8"));
    });

    req.on("error", reject);
  });
}
/**
 * Write a JSON response (newline-terminated) with caching disabled.
 *
 * @param {import("node:http").ServerResponse} res
 * @param {number} statusCode - HTTP status to send.
 * @param {unknown} payload - Value to serialize.
 * @returns {void}
 */
function sendJson(res, statusCode, payload) {
  const jsonHeaders = {
    "content-type": "application/json; charset=utf-8",
    "cache-control": "no-store"
  };
  res.writeHead(statusCode, jsonHeaders);
  const text = JSON.stringify(payload);
  res.end(`${text}\n`);
}
/**
 * Create an Error annotated with the HTTP status to respond with.
 *
 * @param {number} statusCode - HTTP status code.
 * @param {string} message - Error message returned to the client.
 * @returns {Error} Error with a `statusCode` property.
 */
function httpError(statusCode, message) {
  const failure = new Error(message);
  failure["statusCode"] = statusCode;
  return failure;
}