2026-05-14 08:12:13 -07:00
|
|
|
import fs from "node:fs/promises";
|
|
|
|
|
import path from "node:path";
|
|
|
|
|
import { createRequire } from "node:module";
|
|
|
|
|
import {
|
|
|
|
|
AssetInliner,
|
|
|
|
|
DEFAULT_USER_AGENT,
|
|
|
|
|
defaultArchivePath,
|
|
|
|
|
findEffectiveBase,
|
|
|
|
|
inputToUrl,
|
|
|
|
|
isHttpUrl,
|
2026-05-15 01:00:27 -07:00
|
|
|
slugForUrl
|
2026-05-14 08:12:13 -07:00
|
|
|
} from "./asset-inliner.mjs";
|
|
|
|
|
|
|
|
|
|
// CommonJS-style loader bound to this module's URL; used to pull in Playwright lazily.
const require = createRequire(import.meta.url);

// Navigation timeout handed to page.goto(), in milliseconds.
const PAGE_TIMEOUT_MS = 60_000;

// Longest time to wait for the "networkidle" load state, in milliseconds.
const NETWORK_IDLE_TIMEOUT_MS = 5_000;

// Viewport size given to the browser context.
const VIEWPORT = { width: 1024, height: 768 };
|
2026-05-14 08:12:13 -07:00
|
|
|
|
|
|
|
|
export { DEFAULT_USER_AGENT, defaultArchivePath };
|
|
|
|
|
|
|
|
|
|
/**
 * Render a page, inline its assets, and save the result as a single HTML file.
 *
 * @param {*} input - Page reference; turned into a URL by `inputToUrl`.
 * @param {object} [options]
 * @param {string} [options.archivePath] - Output directory; `defaultArchivePath()` is used when falsy.
 * @param {string} [options.id] - File name stem; `slugForUrl(sourceUrl)` is used when falsy.
 * @returns {Promise<{id: string, filePath: string, sourceUrl: string, archivePath: string, warnings: *, externalAssets: string[]}>}
 *   Where the archive was written, the inliner's warnings, and the asset
 *   references that are still external in the saved HTML.
 */
export async function archivePage(input, options = {}) {
  const sourceUrl = inputToUrl(input);
  const archivePath = options.archivePath || defaultArchivePath();
  const id = options.id || slugForUrl(sourceUrl);
  const fileName = `${id}.html`;
  const filePath = path.join(archivePath, fileName);

  // Create the output directory before the (slow) render so a bad path fails early.
  await fs.mkdir(archivePath, { recursive: true });

  const snapshotHtml = await renderPage(sourceUrl);
  const baseUrl = findEffectiveBase(snapshotHtml, sourceUrl);

  // A Referer header is only sent for http(s) sources.
  const referer = isHttpUrl(sourceUrl) ? sourceUrl : undefined;
  const inliner = new AssetInliner({ userAgent: DEFAULT_USER_AGENT, referer });

  const inlinedHtml = await inliner.inlineHtml(snapshotHtml, baseUrl);
  const finalHtml = addArchiveComment(inlinedHtml, sourceUrl);
  await fs.writeFile(filePath, finalHtml, "utf8");

  const { warnings } = inliner;
  const externalAssets = findExternalAssetRefs(finalHtml);
  return { id, filePath, sourceUrl, archivePath, warnings, externalAssets };
}
|
|
|
|
|
|
2026-05-15 01:00:27 -07:00
|
|
|
/**
 * Load a URL in headless Chromium and return the serialized page markup.
 *
 * The browser is always closed, whether rendering succeeds or throws.
 *
 * @param {string} sourceUrl - Address to navigate to.
 * @returns {Promise<string>} Result of `page.content()` after the snapshot pass.
 */
export async function renderPage(sourceUrl) {
  const { chromium } = loadPlaywright();
  const launchOptions = { headless: true };
  const browser = await chromium.launch(launchOptions);

  try {
    const contextOptions = { userAgent: DEFAULT_USER_AGENT, viewport: VIEWPORT };
    const context = await browser.newContext(contextOptions);
    const page = await context.newPage();

    const navigationOptions = { waitUntil: "domcontentloaded", timeout: PAGE_TIMEOUT_MS };
    await page.goto(sourceUrl, navigationOptions);

    // Give late requests a chance to settle, then pin resolved URLs into the DOM.
    await waitForNetworkIdle(page);
    await snapshotLoadedResourceUrls(page);

    const html = await page.content();
    return html;
  } finally {
    await browser.close();
  }
}
|
|
|
|
|
|
2026-05-15 01:00:27 -07:00
|
|
|
/**
 * Best-effort wait for the page to reach the "networkidle" load state.
 *
 * Never rejects: any failure of the wait (including the timeout) is ignored.
 *
 * @param {object} page - Object exposing `waitForLoadState(state, options)`.
 * @returns {Promise<void>}
 */
async function waitForNetworkIdle(page) {
  try {
    const idleOptions = { timeout: NETWORK_IDLE_TIMEOUT_MS };
    await page.waitForLoadState("networkidle", idleOptions);
  } catch {
    // Pages that hold connections open never go idle; whatever DOM exists
    // at this point is still worth archiving, so carry on.
  }
}
|
|
|
|
|
|
2026-05-15 01:00:27 -07:00
|
|
|
/**
 * Rewrite the live DOM so the serialized markup records what was actually loaded.
 *
 * Runs inside the page via `page.evaluate`:
 *  - every `img`, `video` and `audio` with a non-empty `currentSrc` gets that
 *    value written to its `src` attribute;
 *  - every `iframe` whose `contentDocument` is readable has its document
 *    copied into `srcdoc` and its `src` attribute removed.
 *
 * @param {object} page - Object exposing `evaluate(fn)`.
 * @returns {Promise<void>}
 */
async function snapshotLoadedResourceUrls(page) {
  await page.evaluate(() => {
    // Helpers must live inside this callback: only its source reaches the page.
    const pinCurrentSrc = (selector) => {
      for (const element of document.querySelectorAll(selector)) {
        const resolved = element.currentSrc;
        if (resolved) {
          element.setAttribute("src", resolved);
        }
      }
    };

    pinCurrentSrc("img");
    pinCurrentSrc("video,audio");

    for (const frame of document.querySelectorAll("iframe")) {
      try {
        const root = frame.contentDocument?.documentElement;
        if (!root) {
          continue;
        }
        frame.setAttribute("srcdoc", `<!doctype html>${root.outerHTML}`);
        frame.removeAttribute("src");
      } catch {
        // Frames whose document cannot be read are left untouched here;
        // the asset inliner deals with them later when it can.
      }
    }
  });
}
|
|
|
|
|
|
|
|
|
|
/**
 * Load the `playwright` package on demand.
 *
 * @returns {object} The module returned by `require("playwright")`.
 * @throws {Error} With setup instructions when the package cannot be loaded.
 *   The underlying failure is attached as `cause` so its stack and error
 *   code stay available to callers and logs.
 */
function loadPlaywright() {
  try {
    return require("playwright");
  } catch (error) {
    // Tolerate non-Error throwables when building the message.
    const detail = error?.message ?? String(error);
    throw new Error(
      `Playwright is required. Run "npm install" and "npm run install-browsers". Original error: ${detail}`,
      { cause: error }
    );
  }
}
|
|
|
|
|
|
2026-05-15 01:00:27 -07:00
|
|
|
/**
 * Insert an "Archived locally" HTML comment recording the source and the
 * creation time (ISO 8601) near the top of the document.
 *
 * If the markup begins with a doctype (optionally preceded by whitespace or
 * comments), the comment goes on the line after it. Otherwise a
 * `<!doctype html>` line and the comment are prepended.
 *
 * @param {string} html - Markup to annotate.
 * @param {*} sourceUrl - Source address; stringified before use.
 * @returns {string} The annotated markup.
 */
function addArchiveComment(html, sourceUrl) {
  // An HTML comment must not contain "--". Splitting every hyphen that is
  // followed by another hyphen also covers runs of three or more, which a
  // pairwise replace leaves partly intact ("---" would become "- --").
  const safeSource = String(sourceUrl).replace(/-(?=-)/g, "- ");
  const comment = `<!-- Archived locally. Source: ${safeSource}. Created: ${new Date().toISOString()}. -->`;

  // Only a doctype at the start of the document counts. The text "<!doctype"
  // can also occur later, e.g. inside an iframe srcdoc attribute, and the
  // banner must not be injected there.
  const leadingDoctype = /^(?:\s|<!--[\s\S]*?-->)*<!doctype[^>]*>/i;
  if (leadingDoctype.test(html)) {
    // Function replacer so "$" sequences in the source URL are taken literally.
    return html.replace(leadingDoctype, (prefix) => `${prefix}\n${comment}`);
  }
  return `<!doctype html>\n${comment}\n${html}`;
}
|
|
|
|
|
|
|
|
|
|
/**
 * List the asset references in `html` that are not self-contained.
 *
 * Three sources are scanned:
 *  1. quoted `src`, `srcset`, `poster` and `data` attribute values;
 *  2. `href` of `<link>` tags whose `rel` names a stylesheet, icon,
 *     apple-touch-icon, image_src or preload;
 *  3. CSS `url(...)` tokens anywhere in the markup.
 *
 * @param {string} html - Markup to scan.
 * @returns {string[]} Unique references, sorted with the default string order.
 */
export function findExternalAssetRefs(html) {
  const refs = new Set();

  const attrPattern = /\s(src|srcset|poster|data)\s*=\s*(["'])([\s\S]*?)\2/gi;
  for (const [, attrName, , value] of html.matchAll(attrPattern)) {
    if (isSelfContainedAssetRef(value)) {
      continue;
    }
    if (attrName.toLowerCase() !== "srcset") {
      // Single-URL attributes: keep the value whole. Commas and spaces are
      // legal inside a URL and must not split it into several refs.
      refs.add(value.trim());
      continue;
    }
    // srcset is a comma-separated list of "<url> <descriptor>" candidates.
    for (const part of value.split(",")) {
      const candidate = part.trim().split(/\s+/)[0];
      if (candidate && !isSelfContainedAssetRef(candidate)) {
        refs.add(candidate);
      }
    }
  }

  const linkPattern = /<link\b[^>]*>/gi;
  for (const [tag] of html.matchAll(linkPattern)) {
    const rel = readAttribute(tag, "rel") || "";
    if (!/\b(?:stylesheet|icon|apple-touch-icon|image_src|preload)\b/i.test(rel)) {
      continue;
    }
    const href = readAttribute(tag, "href");
    if (href && !isSelfContainedAssetRef(href)) {
      refs.add(href);
    }
  }

  const cssUrlPattern = /url\(\s*(["']?)([^"')]+)\1\s*\)/gi;
  for (const match of html.matchAll(cssUrlPattern)) {
    const candidate = cleanCssUrl(match[2]);
    if (candidate && !isSelfContainedAssetRef(candidate)) {
      refs.add(candidate);
    }
  }

  return Array.from(refs).sort();
}
|
|
|
|
|
|
|
|
|
|
/**
 * Decide whether a reference needs no external fetch.
 *
 * After normalization through `cleanCssUrl`, a reference counts as
 * self-contained when it is empty, is a fragment (`#...` or its
 * percent-encoded form `%23...`), or uses one of the `data:`, `about:`,
 * `javascript:`, `mailto:` or `tel:` schemes.
 *
 * @param {*} value - Raw attribute or CSS value.
 * @returns {boolean} `true` when nothing external is referenced.
 */
function isSelfContainedAssetRef(value) {
  const ref = cleanCssUrl(value);
  if (!ref) {
    return true;
  }
  if (ref.startsWith("#")) {
    return true;
  }
  if (/^%23/i.test(ref)) {
    return true;
  }
  return /^(?:data|about|javascript|mailto|tel):/i.test(ref);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Read one attribute value out of a tag's source text.
 *
 * Handles double-quoted, single-quoted and unquoted values; the name is
 * matched case-insensitively.
 *
 * @param {string} tag - Source text of a single tag, e.g. `<link rel="icon">`.
 * @param {string} attr - Attribute name to look up.
 * @returns {string} The attribute value, or `""` when the attribute is absent.
 */
function readAttribute(tag, attr) {
  // The name goes into a RegExp, so neutralize any metacharacters in it.
  const name = String(attr).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // Require start-of-string or whitespace before the name. A plain word
  // boundary also matches after "-", which made a lookup of "href" return
  // the value of "data-href".
  const pattern = new RegExp(
    `(?:^|\\s)${name}\\s*=\\s*(?:"([^"]*)"|'([^']*)'|([^\\s>]+))`,
    "i"
  );
  const match = tag.match(pattern);
  if (!match) {
    return "";
  }
  return match[1] ?? match[2] ?? match[3] ?? "";
}
|
2026-05-14 09:11:05 -07:00
|
|
|
|
|
|
|
|
/**
 * Normalize a URL taken from an attribute value or a CSS `url(...)` token.
 *
 * Steps: stringify and trim; decode the character references for ampersand
 * (amp), double quote (quot) and apostrophe (#39, apos); then, if the result
 * is wrapped in a matching pair of quotes, strip them and trim again.
 *
 * @param {*} value - Raw value; stringified before processing.
 * @returns {string} The cleaned reference.
 */
function cleanCssUrl(value) {
  // Keyed by entity name. The ampersand in the pattern below is spelled
  // \x26 so that tools which HTML-decode this file cannot collapse the
  // entity text into the character it stands for.
  const entityText = { amp: "&", quot: '"', "#39": "'", apos: "'" };
  // Single pass: an escaped ampersand followed by "quot;" decodes to the
  // literal entity text, not to a quote character.
  const decoded = String(value)
    .trim()
    .replace(/\x26(amp|quot|#39|apos);/g, (_, name) => entityText[name]);

  const quote = decoded[0];
  if ((quote === '"' || quote === "'") && decoded.at(-1) === quote) {
    return decoded.slice(1, -1).trim();
  }
  return decoded;
}
|