// File: sigilbox/src/archiver.mjs (201 lines, 5.5 KiB, JavaScript)
import fs from "node:fs/promises";
import path from "node:path";
import { createRequire } from "node:module";
import {
AssetInliner,
DEFAULT_USER_AGENT,
defaultArchivePath,
findEffectiveBase,
inputToUrl,
isHttpUrl,
2026-05-15 01:00:27 -07:00
slugForUrl
2026-05-14 08:12:13 -07:00
} from "./asset-inliner.mjs";
// ES modules have no built-in `require`; create one so loadPlaywright() can
// call require("playwright") inside a try/catch.
const require = createRequire(import.meta.url);
2026-05-15 01:00:27 -07:00
// Timeout (ms) passed to page.goto() for the initial navigation.
const PAGE_TIMEOUT_MS = 60_000;

// Timeout (ms) passed to page.waitForLoadState("networkidle").
const NETWORK_IDLE_TIMEOUT_MS = 5_000;

// Viewport dimensions handed to browser.newContext().
const VIEWPORT = { width: 1024, height: 768 };
2026-05-14 08:12:13 -07:00
// Re-export these two names (imported above from ./asset-inliner.mjs) so they
// are also available from this module.
export { DEFAULT_USER_AGENT, defaultArchivePath };
/**
 * Render a page, inline its assets, and write the result to
 * `<archivePath>/<id>.html`.
 *
 * @param {*} input - Page reference; converted to a URL by inputToUrl().
 * @param {object} [options]
 * @param {string} [options.archivePath] - Output directory. Falls back to
 *   defaultArchivePath() when falsy. Created recursively if missing.
 * @param {string} [options.id] - File name stem. Falls back to
 *   slugForUrl(sourceUrl) when falsy.
 * @returns {Promise<object>} `{ id, filePath, sourceUrl, archivePath,
 *   warnings, externalAssets }`, where `warnings` comes from the inliner and
 *   `externalAssets` is the result of findExternalAssetRefs() on the final HTML.
 */
export async function archivePage(input, options = {}) {
  const pageUrl = inputToUrl(input);
  const outputDir = options.archivePath || defaultArchivePath();
  const archiveId = options.id || slugForUrl(pageUrl);
  const outputFile = path.join(outputDir, `${archiveId}.html`);

  await fs.mkdir(outputDir, { recursive: true });

  // Snapshot the live DOM first, then resolve relative URLs against whatever
  // base findEffectiveBase() derives from that snapshot.
  const snapshot = await renderPage(pageUrl);
  const effectiveBase = findEffectiveBase(snapshot, pageUrl);

  // Only HTTP(S) sources are sent as a Referer.
  const referer = isHttpUrl(pageUrl) ? pageUrl : undefined;
  const inliner = new AssetInliner({ userAgent: DEFAULT_USER_AGENT, referer });

  const selfContained = await inliner.inlineHtml(snapshot, effectiveBase);
  const stamped = addArchiveComment(selfContained, pageUrl);
  await fs.writeFile(outputFile, stamped, "utf8");

  return {
    id: archiveId,
    filePath: outputFile,
    sourceUrl: pageUrl,
    archivePath: outputDir,
    warnings: inliner.warnings,
    externalAssets: findExternalAssetRefs(stamped)
  };
}
2026-05-15 01:00:27 -07:00
/**
 * Load a URL in headless Chromium and return the serialized DOM.
 *
 * Navigation waits for "domcontentloaded", then gives the network a bounded
 * chance to go idle, then rewrites loaded resource URLs into the DOM before
 * taking the snapshot. The browser is always closed, even on failure.
 *
 * @param {string} sourceUrl - URL handed to page.goto().
 * @returns {Promise<string>} The result of page.content().
 */
export async function renderPage(sourceUrl) {
  const { chromium } = loadPlaywright();
  const browser = await chromium.launch({ headless: true });

  try {
    const contextOptions = { userAgent: DEFAULT_USER_AGENT, viewport: VIEWPORT };
    const context = await browser.newContext(contextOptions);
    const page = await context.newPage();

    const navigation = { waitUntil: "domcontentloaded", timeout: PAGE_TIMEOUT_MS };
    await page.goto(sourceUrl, navigation);

    await waitForNetworkIdle(page);
    await snapshotLoadedResourceUrls(page);

    // Await here (not after the try) so the browser is still open while the
    // content is read.
    const html = await page.content();
    return html;
  } finally {
    await browser.close();
  }
}
2026-05-15 01:00:27 -07:00
/**
 * Best-effort wait for the page to reach the "networkidle" load state.
 *
 * Any failure (including the timeout elapsing) is deliberately ignored.
 *
 * @param {object} page - Object exposing waitForLoadState(state, options).
 * @returns {Promise<void>}
 */
async function waitForNetworkIdle(page) {
  const idleOptions = { timeout: NETWORK_IDLE_TIMEOUT_MS };
  try {
    await page.waitForLoadState("networkidle", idleOptions);
  } catch {
    // Some pages keep sockets open; the DOM snapshot is still useful.
  }
}
2026-05-15 01:00:27 -07:00
/**
 * Rewrite the live DOM so it records what the browser actually loaded.
 *
 * Inside the page:
 *  - every <img>, <video> and <audio> with a non-empty `currentSrc` gets that
 *    value written to its `src` attribute;
 *  - every <iframe> whose `contentDocument` is readable has its document
 *    serialized into `srcdoc`, and its `src` attribute removed.
 *
 * @param {object} page - Object exposing evaluate(fn); fn runs in the page.
 * @returns {Promise<void>}
 */
async function snapshotLoadedResourceUrls(page) {
  await page.evaluate(() => {
    // Images first, then media elements.
    for (const selector of ["img", "video,audio"]) {
      for (const element of document.querySelectorAll(selector)) {
        const loadedUrl = element.currentSrc;
        if (loadedUrl) {
          element.setAttribute("src", loadedUrl);
        }
      }
    }

    for (const frame of document.querySelectorAll("iframe")) {
      try {
        const root = frame.contentDocument?.documentElement;
        if (!root) {
          continue;
        }
        frame.setAttribute("srcdoc", `<!doctype html>${root.outerHTML}`);
        frame.removeAttribute("src");
      } catch {
        // Cross-origin frames are handled later by the asset inliner when possible.
      }
    }
  });
}
/**
 * Load the `playwright` package on demand.
 *
 * @returns {object} The module returned by require("playwright").
 * @throws {Error} With installation instructions when loading fails. The
 *   underlying failure is kept on `error.cause` so its stack and code are
 *   not lost.
 */
function loadPlaywright() {
  try {
    return require("playwright");
  } catch (error) {
    throw new Error(
      `Playwright is required. Run "npm install" and "npm run install-browsers". Original error: ${error.message}`,
      { cause: error }
    );
  }
}
2026-05-15 01:00:27 -07:00
/**
 * Insert an HTML comment recording the source and creation time of an archive.
 *
 * The comment goes right after the document's own doctype when one leads the
 * document (optionally preceded by whitespace, comments or processing
 * instructions). Otherwise a `<!doctype html>` line and the comment are
 * prepended.
 *
 * @param {string} html - Document markup.
 * @param {*} sourceUrl - Source reference; stringified into the comment.
 * @returns {string} The markup with the archive comment added.
 */
function addArchiveComment(html, sourceUrl) {
  // Break up every run of hyphens, not just non-overlapping pairs: with a
  // pairwise replace "--->" became "- -->" and closed the comment early.
  const safeSource = String(sourceUrl).replace(/-(?=-)/g, "- ");
  const comment = `<!-- Archived locally. Source: ${safeSource}. Created: ${new Date().toISOString()}. -->`;

  // Only accept a doctype in the document prologue. A "<!doctype" appearing
  // later (for example inside an iframe's srcdoc attribute) must not be used
  // as the insertion point.
  const prologue = /^(?:\s|<!--(?:(?!-->)[\s\S])*-->|<\?[^>]*>)*<!doctype[^>]*>/i.exec(html);
  if (prologue) {
    const end = prologue[0].length;
    return `${html.slice(0, end)}\n${comment}${html.slice(end)}`;
  }
  return `<!doctype html>\n${comment}\n${html}`;
}
/**
 * List asset references in the markup that still point outside the document.
 *
 * Three sources are scanned:
 *  1. quoted `src`, `srcset`, `poster` and `data` attribute values;
 *  2. `href` of <link> tags whose `rel` names a stylesheet, icon,
 *     apple-touch-icon, image_src or preload;
 *  3. CSS `url(...)` references anywhere in the text.
 *
 * References for which isSelfContainedAssetRef() is true are skipped.
 *
 * @param {string} html - Markup to scan.
 * @returns {string[]} Unique references, sorted with the default sort order.
 */
export function findExternalAssetRefs(html) {
  const refs = new Set();
  const addIfExternal = (candidate) => {
    if (candidate && !isSelfContainedAssetRef(candidate)) {
      refs.add(candidate);
    }
  };

  const attrPattern = /\s(src|srcset|poster|data)\s*=\s*(["'])([\s\S]*?)\2/gi;
  for (const [, attrName, , attrValue] of html.matchAll(attrPattern)) {
    if (attrName.toLowerCase() === "srcset") {
      // Only srcset is a list; each candidate is judged on its own so an
      // inlined data: candidate neither hides nor pollutes the others.
      for (const url of splitSrcsetUrls(attrValue)) {
        addIfExternal(url);
      }
    } else {
      // src/poster/data hold a single URL, which may itself contain commas.
      addIfExternal(attrValue.trim());
    }
  }

  const linkPattern = /<link\b[^>]*>/gi;
  for (const [tag] of html.matchAll(linkPattern)) {
    const rel = readAttribute(tag, "rel") || "";
    if (!/\b(?:stylesheet|icon|apple-touch-icon|image_src|preload)\b/i.test(rel)) {
      continue;
    }
    addIfExternal(readAttribute(tag, "href"));
  }

  const cssUrlPattern = /url\(\s*(["']?)([^"')]+)\1\s*\)/gi;
  for (const match of html.matchAll(cssUrlPattern)) {
    addIfExternal(cleanCssUrl(match[2]));
  }

  return Array.from(refs).sort();
}

// Extract the URL of each image candidate from a srcset value. A URL is a run
// of non-whitespace characters (so it may contain commas, as data: URIs do);
// trailing commas end the candidate, otherwise descriptors run to the next comma.
function splitSrcsetUrls(srcset) {
  const urls = [];
  let rest = String(srcset);
  for (;;) {
    rest = rest.replace(/^[\s,]+/, "");
    if (rest === "") {
      break;
    }
    const token = /^\S+/.exec(rest)[0];
    rest = rest.slice(token.length);
    const url = token.replace(/,+$/, "");
    if (url.length === token.length) {
      // No trailing comma on the URL: skip its descriptors (e.g. "2x", "640w").
      const comma = rest.indexOf(",");
      rest = comma === -1 ? "" : rest.slice(comma + 1);
    }
    urls.push(url);
  }
  return urls;
}
/**
 * Decide whether a reference needs nothing fetched from outside the document.
 *
 * The value is first normalized with cleanCssUrl(). It counts as
 * self-contained when it is empty, is a fragment (`#...` or percent-encoded
 * `%23...`), or uses one of the schemes data, about, javascript, mailto, tel.
 *
 * @param {*} value - Raw reference text.
 * @returns {boolean} True when the reference is self-contained.
 */
function isSelfContainedAssetRef(value) {
  const ref = cleanCssUrl(value);
  if (!ref) {
    return true;
  }
  if (ref.startsWith("#")) {
    return true;
  }
  if (/^%23/i.test(ref)) {
    return true;
  }
  return /^(?:data|about|javascript|mailto|tel):/i.test(ref);
}
/**
 * Read one attribute's value from the text of a single tag.
 *
 * Attributes are walked in order as whole `name[=value]` units, so the
 * requested name is compared against complete attribute names only. It can
 * no longer match the tail of another name (`data-href`) or text inside
 * another attribute's quoted value.
 *
 * @param {string} tag - Tag text, e.g. `<link rel="icon" href="a.ico">`.
 * @param {string} attr - Attribute name; compared case-insensitively.
 * @returns {string} The value (quoted or unquoted) of the first matching
 *   attribute, or "" when it is absent or has no value.
 */
function readAttribute(tag, attr) {
  const wanted = String(attr).toLowerCase();
  // Drop "<tagname" so the element name is not mistaken for an attribute.
  const attributes = String(tag).replace(/^<\s*[^\s>/]+/, "");
  const attributePattern = /([^\s"'<>/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+)))?/g;
  for (const match of attributes.matchAll(attributePattern)) {
    if (match[1].toLowerCase() === wanted) {
      return match[2] ?? match[3] ?? match[4] ?? "";
    }
  }
  return "";
}
2026-05-14 09:11:05 -07:00
/**
 * Normalize a URL token taken from markup or CSS.
 *
 * Trims the text, decodes the entities `&amp;`, `&quot;`, `&#39;` and
 * `&apos;`, and strips one pair of matching surrounding quotes.
 *
 * Decoding is a single pass, so text produced by decoding is never decoded
 * again (`&amp;quot;` yields the literal text `&quot;`, not a quote).
 *
 * @param {*} value - Raw token; stringified before processing.
 * @returns {string} The cleaned URL text.
 */
function cleanCssUrl(value) {
  const entities = { amp: "&", quot: '"', "#39": "'", apos: "'" };
  const decoded = String(value)
    .trim()
    .replace(/&(amp|quot|#39|apos);/g, (_, name) => entities[name]);

  const quote = decoded[0];
  if ((quote === '"' || quote === "'") && decoded.at(-1) === quote) {
    return decoded.slice(1, -1).trim();
  }
  return decoded;
}