// Page archiver: turns a URL or a local HTML file into one HTML file stored in
// an archive directory, either by rendering it in headless Chromium
// (Playwright) or by using the raw HTML directly, then inlining assets.
// Helpers imported from ./asset-inliner.mjs (AssetInliner, findEffectiveBase,
// inputToUrl, isFileUrl, isHttpUrl, slugForUrl, stripArchiveShell, htmlEscape)
// are not visible here; what they do is assumed from how they are called.
//
// NOTE(review): this copy of the file is collapsed onto a few very long lines
// and several spans look truncated: the two regex literals in
// prepareRenderInputHtml are empty, the base-tag check and template there are
// cut off, the comment template in addArchiveComment is empty, and the doctype
// and link-tag patterns have lost their opening part. htmlEscape is imported
// but never referenced in the visible text, so it is presumably used inside
// one of the lost spans. Confirm against the upstream file before editing
// those functions. The code lines below are intentionally left untouched.
//
// Module-level data
//   AD_SELECTORS           CSS selectors whose matches cleanupAndFreezePage
//                          removes from the DOM when ad stripping is enabled.
//   TRACKER_HOST_PATTERNS  Host names that isTrackerUrl matches exactly or as a
//                          parent domain; renderPage aborts those requests.
//
// archivePage(input, options = {})                       exported, async
//   Resolves input with inputToUrl, creates the archive directory
//   (options.archivePath, else defaultArchivePath()), obtains the HTML,
//   inlines assets with AssetInliner.inlineHtml against the effective base
//   URL, passes the result through addArchiveComment and writes it as UTF-8
//   to <archivePath>/<id>.html, where id is options.id or slugForUrl(sourceUrl).
//   The static path (prepareStaticHtml, no browser) is taken when
//   options.static is truthy, or when the source is a file URL,
//   options.stripArchiveShell is truthy and options.render is not exactly
//   true; otherwise renderPage is called with rawHtml and baseUrl added to
//   the options. The Referer given to the inliner is the source URL only when
//   isHttpUrl(sourceUrl) holds.
//   Returns { id, filePath, sourceUrl, archivePath, warnings, externalAssets }
//   where warnings comes from the inliner and externalAssets from
//   findExternalAssetRefs on the final HTML.
//   NOTE(review): options.id is joined into the output path as-is; confirm
//   callers never pass a value containing path separators.
//
// readInputHtml(sourceUrl, options = {})                 exported, async
//   File URL: returns the file content read as UTF-8.
//   Otherwise returns null unless isHttpUrl(sourceUrl) holds and
//   options.static is truthy. In that remaining case it fetches the URL with
//   a user-agent header (options.userAgent or DEFAULT_USER_AGENT) and an HTML
//   accept header, following redirects, throws an Error carrying the HTTP
//   status when response.ok is false, and returns the response text.
//
// prepareStaticHtml(rawHtml, options = {})
//   Throws an Error when rawHtml is falsy. Returns stripArchiveShell(rawHtml)
//   when options.stripArchiveShell is truthy, else rawHtml unchanged.
//
// renderPage(sourceUrl, options = {})                    exported, async
//   Launches headless Chromium through loadPlaywright(). The context's
//   javaScriptEnabled comes from options.javaScriptEnabled when that is not
//   null/undefined; otherwise it is false only when options.rawHtml is set
//   and the source is a file URL. Viewport defaults to 1024x768.
//   Unless options.stripAds is exactly false, every request is routed and
//   aborted when isTrackerUrl matches its URL.
//   The document is loaded with page.setContent (rawHtml plus file URL, after
//   prepareRenderInputHtml) or page.goto, both waiting for domcontentloaded
//   with timeout options.timeoutMs || 60000. settlePage and
//   cleanupAndFreezePage then run and page.content() is returned. The browser
//   is closed in a finally block, so it is released on errors too.
//   NOTE(review): the numeric defaults here use ||, so an explicit 0 is
//   replaced by the default. Playwright documents 0 as "no timeout" for
//   these options; confirm whether that value is meant to be reachable.
//
// settlePage(page, options)                              async
//   Waits for the networkidle load state for up to
//   options.networkIdleTimeoutMs || 15000 ms and ignores a failure of that
//   wait. Unless options.scroll is exactly false, it then scrolls inside the
//   page every 120 ms by max(400, 80% of window.innerHeight) pixels, stopping
//   when the scroll position no longer changes or the total requested
//   distance exceeds max(document.body.scrollHeight, 20000), and finally
//   scrolls back to the top.
//   NOTE(review): the scroll loop has no time limit of its own; confirm this
//   is acceptable for pages that keep growing while being scrolled.
//
// cleanupAndFreezePage(page, options)                    async
//   Runs a single page.evaluate callback that performs, in this order:
//   1. When options.stripArchiveShell is truthy and "#CONTENT .html1" or
//      "#CONTENT" exists: replaces the body content with a deep clone of that
//      element and removes the prefix, itemscope and itemtype attributes
//      from the root element.
//   2. Removes script and noscript elements, preconnect / dns-prefetch /
//      modulepreload links and the next-head-count meta element.
//   3. When ad stripping is on (options.stripAds not exactly false): removes
//      every AD_SELECTORS match, ignoring selectors that throw.
//   4. Images: when currentSrc is set, stores the old src attribute in
//      data-original-src and sets src to currentSrc; always removes srcset
//      and sizes and sets loading="lazy".
//   5. Removes srcset from source elements; sets src to currentSrc on video
//      and audio elements that have one.
//   6. Iframes: copies src to data-archived-src; when contentDocument is
//      accessible, writes its outerHTML into srcdoc and removes src.
//   7. Walks elements from the root, at most options.maxSanitizeElements ||
//      5000 of them, removing attributes whose name starts with "on"
//      (case-insensitive) and the integrity and nonce attributes.
//   8. Writes every computed style property with a non-empty value into each
//      walked element's style attribute (SCRIPT and STYLE skipped). This step
//      is skipped entirely when options.freezeStyles is exactly false, when
//      the walk in step 7 stopped before reaching the end, or when more than
//      options.maxFreezeElements || 2500 elements were walked.
//
// prepareRenderInputHtml(rawHtml, options)
//   Visible behavior: optionally applies stripArchiveShell, then chains two
//   replace(..., "") calls with gi-flagged patterns, and returns the result
//   early when options.baseUrl is falsy. The remainder inserts a baseTag
//   string directly after the first match of a tag-like pattern, or prefixes
//   it to the html when nothing matches.
//   NOTE(review): both replace patterns, the check that precedes baseTag and
//   the baseTag template itself are unreadable in this copy; presumably a
//   base element pointing at options.baseUrl is injected after the head tag.
//   Verify before relying on this description.
//
// loadPlaywright()
//   Returns require("playwright") through the createRequire-based require.
//   On failure throws a new Error with installation instructions and the
//   original error message appended.
//
// isTrackerUrl(rawUrl)
//   Returns false when rawUrl cannot be parsed by the URL constructor.
//   Otherwise returns true when the hostname equals an entry of
//   TRACKER_HOST_PATTERNS or ends with "." followed by an entry.
//
// addArchiveComment(html, sourceUrl, options)
//   Visible behavior: builds a comment string and, when a doctype-like
//   pattern matches, re-emits the match followed by a newline and the
//   comment; otherwise returns the comment placed before the html.
//   NOTE(review): the comment template and the start of both the doctype
//   pattern and the fallback template are missing in this copy, so the exact
//   text written, and how sourceUrl and options are used, cannot be stated.
//
// findExternalAssetRefs(html)                            exported
//   Returns a sorted array of unique reference strings found in html:
//   - values of src, srcset, poster and data attributes, split on commas,
//     keeping the first whitespace-delimited token of each part;
//   - href of link tags whose rel contains stylesheet, icon,
//     apple-touch-icon, image_src or preload;
//   - arguments of CSS url(...).
//   Anything accepted by isSelfContainedAssetRef is skipped.
//   NOTE(review): the attribute pattern starts with a word boundary, which
//   also matches the tail of hyphenated names such as data-original-src and
//   data-archived-src written by cleanupAndFreezePage; confirm whether those
//   are meant to be reported. The link-tag pattern has lost its opening part
//   in this copy.
//
// isSelfContainedAssetRef(value)
//   True when the trimmed value is empty, starts with "#", or starts with
//   one of the schemes data, about, javascript, mailto or tel
//   (case-insensitive).
//
// readAttribute(tag, attr)
//   Returns the value of attribute attr inside the tag string, accepting
//   double-quoted, single-quoted and unquoted forms (case-insensitive name);
//   returns "" when the attribute is absent. attr is interpolated into the
//   pattern unescaped, and every visible caller passes a literal name.
//
// Line 1 below: imports, re-exports, AD_SELECTORS, TRACKER_HOST_PATTERNS and
// the first part of archivePage.
import fs from "node:fs/promises"; import path from "node:path"; import { createRequire } from "node:module"; import { AssetInliner, DEFAULT_USER_AGENT, defaultArchivePath, findEffectiveBase, htmlEscape, inputToUrl, isFileUrl, isHttpUrl, slugForUrl, stripArchiveShell } from "./asset-inliner.mjs"; const require = createRequire(import.meta.url); export { DEFAULT_USER_AGENT, defaultArchivePath }; const AD_SELECTORS = [ "[data-ad-status]", "[data-ad-type]", "[aria-label*='advertisement' i]", "[id^='leaderboard']", "[class*='LeaderboardAd_']", "[class*='FullWidthAd_']", "[class*='BaseAd_']", ".adWrapper", ".dvz-v0-ad", "amp-ad", "iframe[src*='doubleclick']", "iframe[src*='googletagmanager']", "iframe[src*='googlesyndication']" ]; const TRACKER_HOST_PATTERNS = [ "doubleclick.net", "googletagmanager.com", "googlesyndication.com", "google-analytics.com", "pub.doubleverify.com", "securepubads.g.doubleclick.net", "s10.histats.com", "sstatic1.histats.com" ]; export async function archivePage(input, options = {}) { const sourceUrl = inputToUrl(input); const archivePath = options.archivePath || defaultArchivePath(); const id = options.id || slugForUrl(sourceUrl); const filePath = path.join(archivePath, `${id}.html`); await fs.mkdir(archivePath, { recursive: true }); const rawHtml = await readInputHtml(sourceUrl, options); const baseUrl = rawHtml ? findEffectiveBase(rawHtml, sourceUrl) : sourceUrl; const useStatic = options.static || (isFileUrl(sourceUrl) && options.stripArchiveShell && options.render !== true); const renderedHtml = useStatic ? prepareStaticHtml(rawHtml, options) : await renderPage(sourceUrl, { ...options, rawHtml, baseUrl }); const inliner = new AssetInliner({ userAgent: options.userAgent || DEFAULT_USER_AGENT, referer: isHttpUrl(sourceUrl) ? 
// Next line: rest of archivePage, then readInputHtml, prepareStaticHtml and
// the first part of renderPage.
sourceUrl : undefined, maxAssetBytes: options.maxAssetBytes }); const inlined = await inliner.inlineHtml(renderedHtml, baseUrl); const finalHtml = addArchiveComment(inlined, sourceUrl, options); await fs.writeFile(filePath, finalHtml, "utf8"); return { id, filePath, sourceUrl, archivePath, warnings: inliner.warnings, externalAssets: findExternalAssetRefs(finalHtml) }; } export async function readInputHtml(sourceUrl, options = {}) { if (isFileUrl(sourceUrl)) { return fs.readFile(new URL(sourceUrl), "utf8"); } if (!isHttpUrl(sourceUrl) || !options.static) { return null; } const response = await fetch(sourceUrl, { headers: { "user-agent": options.userAgent || DEFAULT_USER_AGENT, accept: "text/html,application/xhtml+xml" }, redirect: "follow" }); if (!response.ok) { throw new Error(`Failed to fetch ${sourceUrl}: HTTP ${response.status}`); } return response.text(); } function prepareStaticHtml(rawHtml, options = {}) { if (!rawHtml) { throw new Error("Static mode requires an HTML input file or fetched HTML document."); } return options.stripArchiveShell ? stripArchiveShell(rawHtml) : rawHtml; } export async function renderPage(sourceUrl, options = {}) { const playwright = loadPlaywright(); const browser = await playwright.chromium.launch({ headless: true }); try { const context = await browser.newContext({ javaScriptEnabled: options.javaScriptEnabled ?? 
// Next line: rest of renderPage and the first part of settlePage.
!(options.rawHtml && isFileUrl(sourceUrl)), userAgent: options.userAgent || DEFAULT_USER_AGENT, viewport: { width: options.viewportWidth || 1024, height: options.viewportHeight || 768 } }); const page = await context.newPage(); if (options.stripAds !== false) { await page.route("**/*", (route) => { const url = route.request().url(); if (isTrackerUrl(url)) { return route.abort(); } return route.continue(); }); } if (options.rawHtml && isFileUrl(sourceUrl)) { const content = prepareRenderInputHtml(options.rawHtml, options); await page.setContent(content, { waitUntil: "domcontentloaded", timeout: options.timeoutMs || 60000 }); } else { await page.goto(sourceUrl, { waitUntil: "domcontentloaded", timeout: options.timeoutMs || 60000 }); } await settlePage(page, options); await cleanupAndFreezePage(page, options); return await page.content(); } finally { await browser.close(); } } async function settlePage(page, options) { try { await page.waitForLoadState("networkidle", { timeout: options.networkIdleTimeoutMs || 15000 }); } catch { // Dynamic pages often keep long-lived connections open; DOM capture can still proceed. 
// Next line: rest of settlePage (scroll loop) and the first part of
// cleanupAndFreezePage (shell stripping, element removal, ad removal).
} if (options.scroll !== false) { await page.evaluate(async () => { await new Promise((resolve) => { let total = 0; const step = Math.max(400, Math.floor(window.innerHeight * 0.8)); const timer = setInterval(() => { const previous = document.scrollingElement?.scrollTop || window.scrollY; window.scrollBy(0, step); total += step; const current = document.scrollingElement?.scrollTop || window.scrollY; if (current === previous || total > Math.max(document.body.scrollHeight, 20000)) { clearInterval(timer); window.scrollTo(0, 0); resolve(); } }, 120); }); }); } } async function cleanupAndFreezePage(page, options) { await page.evaluate( ({ adSelectors, freezeStyles, maxFreezeElements, maxSanitizeElements, stripAds, stripArchiveShell: shouldStripArchiveShell }) => { function removeAll(selector) { document.querySelectorAll(selector).forEach((node) => node.remove()); } if (shouldStripArchiveShell) { const content = document.querySelector("#CONTENT .html1") || document.querySelector("#CONTENT"); if (content) { document.body.innerHTML = ""; document.body.appendChild(content.cloneNode(true)); document.documentElement.removeAttribute("prefix"); document.documentElement.removeAttribute("itemscope"); document.documentElement.removeAttribute("itemtype"); } } removeAll("script"); removeAll("noscript"); removeAll("link[rel='preconnect'],link[rel='dns-prefetch'],link[rel='modulepreload']"); removeAll("meta[name='next-head-count']"); if (stripAds) { for (const selector of adSelectors) { try { removeAll(selector); } catch { // Ignore unsupported selectors in older browser engines. 
// Next line: cleanupAndFreezePage continued (media and iframe rewriting,
// attribute sanitizing walk, computed-style freezing).
} } } document.querySelectorAll("img").forEach((img) => { if (img.currentSrc) { img.setAttribute("data-original-src", img.getAttribute("src") || ""); img.setAttribute("src", img.currentSrc); } img.removeAttribute("srcset"); img.removeAttribute("sizes"); img.setAttribute("loading", "lazy"); }); document.querySelectorAll("source").forEach((source) => { source.removeAttribute("srcset"); }); document.querySelectorAll("video,audio").forEach((media) => { if (media.currentSrc) { media.setAttribute("src", media.currentSrc); } }); document.querySelectorAll("iframe").forEach((frame) => { const src = frame.getAttribute("src"); if (src) { frame.setAttribute("data-archived-src", src); } try { const doc = frame.contentDocument; if (doc?.documentElement) { frame.setAttribute("srcdoc", "" + doc.documentElement.outerHTML); frame.removeAttribute("src"); } } catch { // Cross-origin iframe sources are handled in the Node-side inliner when possible. } }); const walkedElements = []; const walker = document.createTreeWalker(document.documentElement, NodeFilter.SHOW_ELEMENT); let element = document.documentElement; let visited = 0; while (element && visited < maxSanitizeElements) { walkedElements.push(element); for (const attr of Array.from(element.attributes)) { if (/^on/i.test(attr.name) || attr.name === "integrity" || attr.name === "nonce") { element.removeAttribute(attr.name); } } visited += 1; element = walker.nextNode(); } if (!freezeStyles || element || walkedElements.length > maxFreezeElements) { return; } for (const element of walkedElements) { if (element.tagName === "SCRIPT" || element.tagName === "STYLE") { continue; } const computed = window.getComputedStyle(element); const declarations = []; for (let i = 0; i < computed.length; i += 1) { const property = computed[i]; const value = computed.getPropertyValue(property); if (value) { declarations.push(`${property}:${value}`); } } if (declarations.length) { element.setAttribute("style", declarations.join(";")); } } }, { 
// Next line: option values passed to the page callback, then
// prepareRenderInputHtml, loadPlaywright, isTrackerUrl, addArchiveComment and
// the first part of findExternalAssetRefs. Several patterns and templates on
// this line are truncated in this copy (see the note at the top).
adSelectors: AD_SELECTORS, freezeStyles: options.freezeStyles !== false, maxFreezeElements: options.maxFreezeElements || 2500, maxSanitizeElements: options.maxSanitizeElements || 5000, stripAds: options.stripAds !== false, stripArchiveShell: Boolean(options.stripArchiveShell) } ); } function prepareRenderInputHtml(rawHtml, options) { let html = options.stripArchiveShell ? stripArchiveShell(rawHtml) : rawHtml; html = html .replace(//gi, "") .replace(//gi, ""); if (!options.baseUrl) { return html; } if (/`; if (/]*>/i.test(html)) { return html.replace(/]*>/i, (match) => `${match}${baseTag}`); } return `${baseTag}${html}`; } function loadPlaywright() { try { return require("playwright"); } catch (error) { throw new Error( `Playwright is required for render mode. Run "npm install" and "npm run install-browsers", or use --static for HTML input files. Original error: ${error.message}` ); } } function isTrackerUrl(rawUrl) { let host = ""; try { host = new URL(rawUrl).hostname; } catch { return false; } return TRACKER_HOST_PATTERNS.some((pattern) => host === pattern || host.endsWith(`.${pattern}`)); } function addArchiveComment(html, sourceUrl, options) { const comment = ``; if (/]*>/i, (doctype) => `${doctype}\n${comment}`); } return `\n${comment}\n${html}`; } export function findExternalAssetRefs(html) { const refs = new Set(); const attrPattern = /\b(?:src|srcset|poster|data)\s*=\s*(["'])([\s\S]*?)\1/gi; for (const match of html.matchAll(attrPattern)) { if (isSelfContainedAssetRef(match[2])) { continue; } for (const part of match[2].split(",")) { const candidate = part.trim().split(/\s+/)[0]; if (candidate && !isSelfContainedAssetRef(candidate)) { refs.add(candidate); } } } const linkPattern = /]*>/gi; for (const match of html.matchAll(linkPattern)) { const tag = match[0]; const rel = readAttribute(tag, "rel") || ""; if (!/\b(?:stylesheet|icon|apple-touch-icon|image_src|preload)\b/i.test(rel)) { continue; } const href = readAttribute(tag, "href"); if (href && 
// Next line: rest of findExternalAssetRefs (link href and CSS url() scan),
// then isSelfContainedAssetRef and readAttribute.
!isSelfContainedAssetRef(href)) { refs.add(href); } } const cssUrlPattern = /url\(\s*(["']?)([^"')]+)\1\s*\)/gi; for (const match of html.matchAll(cssUrlPattern)) { const candidate = match[2].trim(); if (candidate && !isSelfContainedAssetRef(candidate)) { refs.add(candidate); } } return Array.from(refs).sort(); } function isSelfContainedAssetRef(value) { const trimmed = value.trim(); return ( !trimmed || trimmed.startsWith("#") || /^(?:data|about|javascript|mailto|tel):/i.test(trimmed) ); } function readAttribute(tag, attr) { const match = tag.match(new RegExp(`\\b${attr}\\s*=\\s*("([^"]*)"|'([^']*)'|([^\\s>]+))`, "i")); return match ? match[2] ?? match[3] ?? match[4] ?? "" : ""; }