import fs from "node:fs/promises";
import path from "node:path";
import { createRequire } from "node:module";
import {
  AssetInliner,
  DEFAULT_USER_AGENT,
  defaultArchivePath,
  findEffectiveBase,
  inputToUrl,
  isHttpUrl,
  slugForUrl,
} from "./asset-inliner.mjs";

// Playwright is loaded lazily through a CommonJS `require` (see loadPlaywright)
// so a missing installation surfaces as a friendly error at call time.
const require = createRequire(import.meta.url);

const PAGE_TIMEOUT_MS = 60000;
const NETWORK_IDLE_TIMEOUT_MS = 5000;
const VIEWPORT = { width: 1024, height: 768 };

// Matches the first doctype declaration of a document (case-insensitive).
// Deliberately not global, so `.test()` carries no `lastIndex` state.
const DOCTYPE_PATTERN = /<!doctype[^>]*>/i;

// HTML character references that cleanCssUrl decodes, keyed in lower case.
const HTML_ENTITY_TEXT = Object.freeze({
  "&amp;": "&",
  "&quot;": '"',
  "&apos;": "'",
  "&#39;": "'",
  "&#x27;": "'",
});
const HTML_ENTITY_PATTERN = /&(?:amp|quot|apos|#39|#[xX]27);/g;

export { DEFAULT_USER_AGENT, defaultArchivePath };

/**
 * Renders a page, inlines its assets and writes the result as a single HTML
 * file named `<id>.html` inside the archive directory.
 *
 * @param {*} input - Page to archive; converted to a URL by `inputToUrl`.
 * @param {object} [options]
 * @param {string} [options.archivePath] - Output directory; falls back to
 *   `defaultArchivePath()` when falsy. Created recursively if missing.
 * @param {string} [options.id] - File base name; falls back to
 *   `slugForUrl(sourceUrl)` when falsy.
 * @returns {Promise<{id: string, filePath: string, sourceUrl: *, archivePath: string, warnings: *, externalAssets: string[]}>}
 *   Details of the written archive. `warnings` is the inliner's `warnings`
 *   property; `externalAssets` lists references that are still external in
 *   the final HTML (see findExternalAssetRefs).
 */
export async function archivePage(input, options = {}) {
  const sourceUrl = inputToUrl(input);
  const archivePath = options.archivePath || defaultArchivePath();
  const id = options.id || slugForUrl(sourceUrl);
  const filePath = path.join(archivePath, `${id}.html`);

  await fs.mkdir(archivePath, { recursive: true });

  const renderedHtml = await renderPage(sourceUrl);
  const baseUrl = findEffectiveBase(renderedHtml, sourceUrl);
  const inliner = new AssetInliner({
    userAgent: DEFAULT_USER_AGENT,
    // A Referer is only passed along when the source is an HTTP(S) URL.
    referer: isHttpUrl(sourceUrl) ? sourceUrl : undefined,
  });
  const inlined = await inliner.inlineHtml(renderedHtml, baseUrl);
  const finalHtml = addArchiveComment(inlined, sourceUrl);

  await fs.writeFile(filePath, finalHtml, "utf8");

  return {
    id,
    filePath,
    sourceUrl,
    archivePath,
    warnings: inliner.warnings,
    externalAssets: findExternalAssetRefs(finalHtml),
  };
}

/**
 * Loads a URL in headless Chromium and returns the serialized DOM.
 *
 * Navigation waits for `domcontentloaded` (up to PAGE_TIMEOUT_MS), then makes
 * a best-effort wait for network idle and rewrites resource attributes to
 * what the browser actually loaded before taking the snapshot. The browser is
 * always closed, even when a step throws.
 *
 * @param {string} sourceUrl - URL handed to `page.goto`.
 * @returns {Promise<string>} Result of `page.content()`.
 * @throws {Error} If Playwright cannot be loaded, or if launch/navigation fails.
 */
export async function renderPage(sourceUrl) {
  const playwright = loadPlaywright();
  const browser = await playwright.chromium.launch({ headless: true });
  try {
    const context = await browser.newContext({
      userAgent: DEFAULT_USER_AGENT,
      viewport: VIEWPORT,
    });
    const page = await context.newPage();
    await page.goto(sourceUrl, {
      waitUntil: "domcontentloaded",
      timeout: PAGE_TIMEOUT_MS,
    });
    await waitForNetworkIdle(page);
    await snapshotLoadedResourceUrls(page);
    return await page.content();
  } finally {
    await browser.close();
  }
}

/**
 * Waits up to NETWORK_IDLE_TIMEOUT_MS for the page to reach "networkidle".
 * Any failure of that wait is ignored on purpose.
 *
 * @param {object} page - Playwright page.
 * @returns {Promise<void>}
 */
async function waitForNetworkIdle(page) {
  try {
    await page.waitForLoadState("networkidle", {
      timeout: NETWORK_IDLE_TIMEOUT_MS,
    });
  } catch {
    // Some pages keep sockets open; the DOM snapshot is still useful.
  }
}

/**
 * Runs inside the page and pins what the browser actually loaded into the DOM
 * so that it survives serialization:
 * - `img`, `video` and `audio` get `src` set to their `currentSrc`;
 * - iframes whose document is accessible are replaced by a `srcdoc` snapshot
 *   and lose their `src`.
 *
 * @param {object} page - Playwright page.
 * @returns {Promise<void>}
 */
async function snapshotLoadedResourceUrls(page) {
  await page.evaluate(() => {
    document.querySelectorAll("img").forEach((img) => {
      if (img.currentSrc) {
        img.setAttribute("src", img.currentSrc);
      }
    });

    document.querySelectorAll("video,audio").forEach((media) => {
      if (media.currentSrc) {
        media.setAttribute("src", media.currentSrc);
      }
    });

    document.querySelectorAll("iframe").forEach((frame) => {
      try {
        const doc = frame.contentDocument;
        if (doc?.documentElement) {
          // NOTE(review): the prefix literal was lost in extraction; a doctype
          // is the presumed original, since outerHTML does not include one.
          frame.setAttribute(
            "srcdoc",
            "<!doctype html>" + doc.documentElement.outerHTML
          );
          frame.removeAttribute("src");
        }
      } catch {
        // Cross-origin frames are handled later by the asset inliner when possible.
      }
    });
  });
}

/**
 * Requires the `playwright` package on demand.
 *
 * @returns {object} The `playwright` module.
 * @throws {Error} With installation instructions when the package cannot be
 *   loaded; the underlying error is attached as `cause`.
 */
function loadPlaywright() {
  try {
    return require("playwright");
  } catch (error) {
    throw new Error(
      `Playwright is required. Run "npm install" and "npm run install-browsers". Original error: ${error.message}`,
      { cause: error }
    );
  }
}

/**
 * Inserts an HTML comment naming the source URL right after the document's
 * doctype. When the document has no doctype, one is prepended together with
 * the comment.
 *
 * @param {string} html - Document markup.
 * @param {*} sourceUrl - Stringified into the comment.
 * @returns {string} Markup carrying the provenance comment.
 */
function addArchiveComment(html, sourceUrl) {
  // Split EVERY pair of adjacent hyphens. Replacing the literal "--" leaves
  // odd runs such as "--->" as "- -->", which would close the comment early.
  const safeSource = String(sourceUrl).replace(/-(?=-)/g, "- ");
  // NOTE(review): the original comment wording was lost in extraction; only
  // the use of `safeSource` is certain. Confirm the text.
  const comment = `<!-- Archived from ${safeSource} -->`;

  if (DOCTYPE_PATTERN.test(html)) {
    // Function replacer: "$" sequences in the URL must not be interpreted as
    // replacement patterns.
    return html.replace(DOCTYPE_PATTERN, (doctype) => `${doctype}\n${comment}`);
  }
  return `<!doctype html>\n${comment}\n${html}`;
}

/**
 * Scans markup for asset references that are not self-contained.
 *
 * Three sources are inspected with regular expressions (no HTML parsing):
 * quoted `src`/`srcset`/`poster`/`data` attributes, `href` of `<link>` tags
 * whose `rel` names a stylesheet, icon, apple-touch-icon, image_src or
 * preload, and CSS `url(...)` tokens.
 *
 * @param {string} html - Markup to scan.
 * @returns {string[]} Unique references, sorted in default string order.
 */
export function findExternalAssetRefs(html) {
  const refs = new Set();
  const addIfExternal = (candidate) => {
    if (candidate && !isSelfContainedAssetRef(candidate)) {
      refs.add(candidate);
    }
  };

  const attrPattern = /\s(src|srcset|poster|data)\s*=\s*(["'])([\s\S]*?)\2/gi;
  for (const [, attrName, , attrValue] of html.matchAll(attrPattern)) {
    if (isSelfContainedAssetRef(attrValue)) {
      continue;
    }
    if (attrName.toLowerCase() === "srcset") {
      // Candidates are comma separated; the URL is the first token of each.
      // URLs that themselves contain commas are not supported here.
      for (const part of attrValue.split(",")) {
        addIfExternal(part.trim().split(/\s+/)[0]);
      }
    } else {
      // Single-URL attributes: a comma is part of the URL, not a separator.
      addIfExternal(attrValue.trim());
    }
  }

  const linkPattern = /<link\b[^>]*>/gi;
  const assetRelPattern =
    /\b(?:stylesheet|icon|apple-touch-icon|image_src|preload)\b/i;
  for (const [tag] of html.matchAll(linkPattern)) {
    const rel = readAttribute(tag, "rel") || "";
    if (!assetRelPattern.test(rel)) {
      continue;
    }
    addIfExternal(readAttribute(tag, "href"));
  }

  const cssUrlPattern = /url\(\s*(["']?)([^"')]+)\1\s*\)/gi;
  for (const match of html.matchAll(cssUrlPattern)) {
    addIfExternal(cleanCssUrl(match[2]));
  }

  return [...refs].sort();
}

/**
 * Tells whether a reference needs no external fetch: empty, a fragment
 * (`#...` or percent-encoded `%23...`), or a `data:`, `about:`,
 * `javascript:`, `mailto:` or `tel:` URL.
 *
 * @param {*} value - Raw reference; normalised with cleanCssUrl first.
 * @returns {boolean}
 */
function isSelfContainedAssetRef(value) {
  const trimmed = cleanCssUrl(value);
  return (
    !trimmed ||
    trimmed.startsWith("#") ||
    /^%23/i.test(trimmed) ||
    /^(?:data|about|javascript|mailto|tel):/i.test(trimmed)
  );
}

/**
 * Reads one attribute from the source text of a single tag.
 *
 * @param {string} tag - Tag markup, e.g. `<link rel="icon" href="x">`.
 * @param {string} attr - Attribute name; interpolated into a regular
 *   expression unescaped, so it must not contain regex metacharacters.
 * @returns {string} The double-quoted, single-quoted or unquoted value, or
 *   `""` when the attribute is absent.
 */
function readAttribute(tag, attr) {
  // The lookbehind keeps `href` from matching inside `data-href`, which a
  // plain `\b` would allow because "-" is not a word character.
  const pattern = new RegExp(
    `(?<![\\w-])${attr}\\s*=\\s*("([^"]*)"|'([^']*)'|([^\\s>]+))`,
    "i"
  );
  const match = tag.match(pattern);
  return match ? match[2] ?? match[3] ?? match[4] ?? "" : "";
}

/**
 * Normalises a URL token: trims it, decodes the character references for
 * `&`, `"` and `'`, and strips one pair of matching surrounding quotes.
 *
 * @param {*} value - Raw token; stringified first.
 * @returns {string} The cleaned URL text.
 */
function cleanCssUrl(value) {
  // Single pass so that "&amp;quot;" decodes to the text "&quot;" instead of
  // being decoded twice into a quote character.
  const decoded = String(value)
    .trim()
    .replace(HTML_ENTITY_PATTERN, (entity) => HTML_ENTITY_TEXT[entity.toLowerCase()]);

  const quote = decoded[0];
  if ((quote === '"' || quote === "'") && decoded.at(-1) === quote) {
    return decoded.slice(1, -1).trim();
  }
  return decoded;
}