/**
 * Page archiver module.
 *
 * Captures one HTML page (a file: URL or an http(s) URL), optionally renders it in headless
 * Chromium through Playwright, cleans the DOM, inlines assets through AssetInliner and writes
 * a single "<id>.html" file into the archive directory.
 *
 * NOTE(review): in this copy of the file several regular-expression and string literals look
 * truncated (for example the title patterns in BLOCKED_CAPTURE_PATTERNS, the patterns and
 * template in prepareRenderInputHtml and addArchiveComment, linkPattern in
 * findExternalAssetRefs, and the entity replacements in cleanCssUrl). Confirm them against the
 * canonical source before relying on the descriptions marked "presumably" below.
 *
 * Module-level constants
 *   AD_SELECTORS                        CSS selectors removed from the rendered DOM when ad
 *                                       stripping is enabled.
 *   TRACKER_HOST_PATTERNS               Host names whose requests are aborted while rendering.
 *   ANTI_ADBLOCK_PROVIDER_HOST_PATTERNS Host names treated as anti-adblock providers; used both
 *                                       for request blocking and for overlay detection.
 *   BLOCKED_HOST_PATTERNS               Concatenation of the two host lists above.
 *   ANTI_ADBLOCK_TEXT_PATTERNS          Regex sources (strings) compiled with the "i" flag inside
 *                                       the page to spot "please disable your ad blocker" text.
 *   BLOCKED_CAPTURE_PATTERNS            Rules of the form { reason, any?, all? } used to detect
 *                                       challenge / CAPTCHA pages.
 *
 * Exported functions
 *   archivePage(input, options = {})
 *     Resolves input with inputToUrl, creates the archive directory (recursive mkdir), obtains
 *     HTML either statically (prepareStaticHtml) or by rendering (renderPage), rejects challenge
 *     pages (assertNotBlockedCapture), inlines assets, adds the archive comment and writes the
 *     file as UTF-8. Static mode is used when options.static is truthy, or when the source is a
 *     file URL, options.stripArchiveShell is truthy and options.render is not exactly true.
 *     Returns { id, filePath, sourceUrl, archivePath, warnings, externalAssets }.
 *     Options read here: archivePath, id, static, stripArchiveShell, render, userAgent,
 *     maxAssetBytes, maxInlineStyleBytes (the whole object is also forwarded to helpers).
 *
 *   readInputHtml(sourceUrl, options = {})
 *     File URL: returns the file contents read as UTF-8. Non-http(s) URL, or http(s) URL without
 *     options.static: returns null. Otherwise fetches the URL (following redirects, with
 *     user-agent and accept headers) and returns the body text.
 *     Throws Error when the HTTP response is not ok.
 *
 *   renderPage(sourceUrl, options = {})
 *     Launches headless Chromium, creates a context (viewport defaults 1024x768; JavaScript is
 *     enabled unless options.javaScriptEnabled says otherwise or raw HTML for a file URL was
 *     supplied), aborts requests for which isTrackerUrl is true unless options.stripAds is
 *     false, loads the page (setContent for raw file HTML, goto otherwise; waits for
 *     "domcontentloaded" with timeout options.timeoutMs || 60000), then runs settlePage and
 *     cleanupAndFreezePage and returns page.content(). The browser is closed in a finally block.
 *
 *   findExternalAssetRefs(html)
 *     Scans the HTML text with regular expressions and returns a sorted array of unique
 *     references that are not self-contained: values of src / srcset / poster / data attributes
 *     (split on commas, keeping the first whitespace-delimited token of each part), href values
 *     of tags matched by linkPattern whose rel contains stylesheet, icon, apple-touch-icon,
 *     image_src or preload, and CSS url(...) arguments.
 *
 * Internal helpers
 *   prepareStaticHtml(rawHtml, options)   Throws when rawHtml is falsy; otherwise returns it,
 *                                         passed through stripArchiveShell when requested.
 *   assertNotBlockedCapture(html, url)    Throws Error when detectBlockedCapture finds a reason.
 *   detectBlockedCapture(html)            Returns the reason of the first rule whose "any" list
 *                                         (if present) has at least one match and whose "all"
 *                                         list (if present) matches entirely; otherwise null.
 *   settlePage(page, options)             Waits for "networkidle" (timeout
 *                                         options.networkIdleTimeoutMs || 15000, failures
 *                                         ignored); unless options.scroll is false, scrolls down
 *                                         every 120 ms until the scroll position stops changing
 *                                         or the scrolled total exceeds
 *                                         max(body.scrollHeight, 20000), then scrolls to the top.
 *   cleanupAndFreezePage(page, options)   Runs one page.evaluate that: optionally replaces the
 *                                         body with a clone of "#CONTENT .html1" or "#CONTENT";
 *                                         removes script, noscript, preconnect / dns-prefetch /
 *                                         modulepreload links and the next-head-count meta; when
 *                                         stripAds is on removes AD_SELECTORS matches and
 *                                         anti-adblock overlays; pins img src to currentSrc
 *                                         (original kept in data-original-src), drops srcset and
 *                                         sizes, sets loading="lazy"; drops srcset on source;
 *                                         pins video/audio src to currentSrc; records iframe src
 *                                         in data-archived-src and, when contentDocument is
 *                                         readable, stores its markup in srcdoc and removes src;
 *                                         walks at most maxSanitizeElements (default 5000)
 *                                         elements removing attributes whose name starts with
 *                                         "on" plus integrity and nonce; and, only when
 *                                         options.freezeStyles === true, the walk finished and at
 *                                         most maxFreezeElements (default 2500) elements were
 *                                         visited, writes each element's computed style into its
 *                                         style attribute (SCRIPT and STYLE are skipped).
 *     In-page helpers of cleanupAndFreezePage:
 *       removeAntiAdblockOverlays   Collects elements with an anti-adblock signal, removes the
 *                                   container chosen by findRoadblockContainer when it qualifies
 *                                   as a roadblock, and if anything was removed clears inline
 *                                   overflow / position / inset on html and body.
 *       findRoadblockContainer      Walks up from a node (stopping below body, or when the
 *                                   parent's normalized text exceeds 8000 characters) and
 *                                   returns the outermost ancestor that qualifies, else the node
 *                                   itself if it qualifies, else null.
 *       normalizeText               Collapses whitespace runs to single spaces and trims.
 *       hasAntiAdblockSignal        Text match, provider URL on the node, or provider URL on a
 *                                   descendant.
 *       isLikelyAntiAdblockRoadblock Requires a signal plus a "blocking" trait (overlay-like,
 *                                   dialog semantics, blocking class/id name, an action control,
 *                                   or a provider descendant); nodes inside article/main must
 *                                   additionally be overlay-like, dialog-like or have a blocking
 *                                   class/id name.
 *       hasAntiAdblockText          Normalized text of 1..2000 characters matching any pattern.
 *       hasAntiAdblockProviderDescendant / hasAntiAdblockProviderUrl
 *                                   Check href, src and data-src (URI-decoded, lower-cased) for
 *                                   a provider host substring.
 *       isOverlayLike               fixed/sticky position with z-index >= 10 or covering at
 *                                   least 15% of the viewport, or z-index >= 10 covering at
 *                                   least 45% of the viewport.
 *       hasDialogSemantics          DIALOG tag, role dialog/alertdialog, or aria-modal="true".
 *       hasBlockingClassName        id/class matching adblock / overlay / modal style names.
 *       hasActionControl            Contains a button, role=button, link or button-type input.
 *       decodedText                 decodeURIComponent + lower-case, falling back to lower-case
 *                                   of the raw value when decoding throws.
 *   prepareRenderInputHtml(rawHtml, options)
 *                                         Optionally strips the archive shell, applies two
 *                                         regex removals, and returns the HTML unchanged when
 *                                         options.baseUrl is falsy; otherwise presumably injects
 *                                         a base tag (after the first matching tag if found,
 *                                         else prepended) - TODO confirm, literals look truncated.
 *   loadPlaywright()                      Requires "playwright" through createRequire; throws an
 *                                         Error with installation hints when that fails.
 *   isTrackerUrl(rawUrl)                  false for unparsable URLs; true when the host name
 *                                         equals, or is a subdomain of, a BLOCKED_HOST_PATTERNS
 *                                         entry.
 *   addArchiveComment(html, sourceUrl, options)
 *                                         Presumably inserts an archive comment after the
 *                                         doctype, or prepends one - TODO confirm, literals look
 *                                         truncated.
 *   isSelfContainedAssetRef(value)        true for an empty value, a value starting with "#" or
 *                                         "%23", or a data: / about: / javascript: / mailto: /
 *                                         tel: URL (after cleanCssUrl).
 *   readAttribute(tag, attr)              Returns the double-quoted, single-quoted or unquoted
 *                                         value of attr found in the tag text, or "".
 *   cleanCssUrl(value)                    Stringifies and trims the value, applies a fixed set
 *                                         of replaceAll substitutions (presumably HTML entity
 *                                         decoding - TODO confirm) and strips one pair of
 *                                         matching surrounding quotes.
 */
import fs from "node:fs/promises"; import path from "node:path"; import { createRequire } from "node:module"; import { AssetInliner, DEFAULT_USER_AGENT, defaultArchivePath, findEffectiveBase, htmlEscape, inputToUrl, isFileUrl, isHttpUrl, slugForUrl, stripArchiveShell } from "./asset-inliner.mjs"; const require = createRequire(import.meta.url); export { DEFAULT_USER_AGENT, defaultArchivePath }; const AD_SELECTORS = [ "[data-ad-status]", "[data-ad-type]", "[aria-label*='advertisement' i]", "[id^='leaderboard']", "[class*='LeaderboardAd_']", "[class*='FullWidthAd_']", "[class*='BaseAd_']", ".adWrapper", ".dvz-v0-ad", "amp-ad", "iframe[src*='doubleclick']", "iframe[src*='googletagmanager']", "iframe[src*='googlesyndication']" ]; const TRACKER_HOST_PATTERNS = [ "doubleclick.net", "googletagmanager.com", "googlesyndication.com", "google-analytics.com", "amazon-adsystem.com", "pub.doubleverify.com", "securepubads.g.doubleclick.net", "s10.histats.com", "sstatic1.histats.com" ]; const ANTI_ADBLOCK_PROVIDER_HOST_PATTERNS = [ "getadmiral.com" ]; const BLOCKED_HOST_PATTERNS = [ ...TRACKER_HOST_PATTERNS, ...ANTI_ADBLOCK_PROVIDER_HOST_PATTERNS ]; const ANTI_ADBLOCK_TEXT_PATTERNS = [ "\\bad\\s*block(?:er|ing)?\\b", "\\bad[-\\s]?block\\b", "\\badblock(?:er|ing)?\\b", "\\badvertis(?:e|ing)\\s+block(?:er|ing)?\\b", "\\bblock(?:ing)?\\s+(?:ads|advertis(?:ements?|ing))\\b", "\\b(?:disable|turn\\s+off|pause)\\s+(?:your\\s+)?ad[-\\s]?block(?:er|ing)?\\b", "\\b(?:allowlist|whitelist)\\s+(?:our\\s+|this\\s+)?(?:site|website|domain)\\b", "\\ballow(?:ing)?\\s+(?:our\\s+)?(?:ads|advertis(?:ements?|ing))\\b", "\\bads?\\s+(?:are\\s+)?blocked\\b" ]; const BLOCKED_CAPTURE_PATTERNS = [ { reason: "DataDome CAPTCHA/bot challenge", any: [ /DataDome CAPTCHA/i, /captcha-delivery\.com/i ] }, { reason: "blocked/CAPTCHA challenge", any: [ /]*>\s*You have been blocked\s*<\/title>/i, /]*>\s*Access Denied\s*<\/title>/i, /\bunusual traffic\b/i ] }, { reason: "human verification challenge", all: [ /\bverify 
you are (?:a )?human\b/i, /\b(?:captcha|challenge|g-recaptcha|hcaptcha|turnstile)\b/i ] } ]; export async function archivePage(input, options = {}) { const sourceUrl = inputToUrl(input); const archivePath = options.archivePath || defaultArchivePath(); const id = options.id || slugForUrl(sourceUrl); const filePath = path.join(archivePath, `${id}.html`); await fs.mkdir(archivePath, { recursive: true }); const rawHtml = await readInputHtml(sourceUrl, options); const baseUrl = rawHtml ? findEffectiveBase(rawHtml, sourceUrl) : sourceUrl; const useStatic = options.static || (isFileUrl(sourceUrl) && options.stripArchiveShell && options.render !== true); const renderedHtml = useStatic ? prepareStaticHtml(rawHtml, options) : await renderPage(sourceUrl, { ...options, rawHtml, baseUrl }); assertNotBlockedCapture(renderedHtml, sourceUrl); const inliner = new AssetInliner({ userAgent: options.userAgent || DEFAULT_USER_AGENT, referer: isHttpUrl(sourceUrl) ? sourceUrl : undefined, maxAssetBytes: options.maxAssetBytes, maxInlineStyleBytes: options.maxInlineStyleBytes }); const inlined = await inliner.inlineHtml(renderedHtml, baseUrl); const finalHtml = addArchiveComment(inlined, sourceUrl, options); await fs.writeFile(filePath, finalHtml, "utf8"); return { id, filePath, sourceUrl, archivePath, warnings: inliner.warnings, externalAssets: findExternalAssetRefs(finalHtml) }; } export async function readInputHtml(sourceUrl, options = {}) { if (isFileUrl(sourceUrl)) { return fs.readFile(new URL(sourceUrl), "utf8"); } if (!isHttpUrl(sourceUrl) || !options.static) { return null; } const response = await fetch(sourceUrl, { headers: { "user-agent": options.userAgent || DEFAULT_USER_AGENT, accept: "text/html,application/xhtml+xml" }, redirect: "follow" }); if (!response.ok) { throw new Error(`Failed to fetch ${sourceUrl}: HTTP ${response.status}`); } return response.text(); } function prepareStaticHtml(rawHtml, options = {}) { if (!rawHtml) { throw new Error("Static mode requires an HTML 
input file or fetched HTML document."); } return options.stripArchiveShell ? stripArchiveShell(rawHtml) : rawHtml; } export async function renderPage(sourceUrl, options = {}) { const playwright = loadPlaywright(); const browser = await playwright.chromium.launch({ headless: true }); try { const context = await browser.newContext({ javaScriptEnabled: options.javaScriptEnabled ?? !(options.rawHtml && isFileUrl(sourceUrl)), userAgent: options.userAgent || DEFAULT_USER_AGENT, viewport: { width: options.viewportWidth || 1024, height: options.viewportHeight || 768 } }); const page = await context.newPage(); if (options.stripAds !== false) { await page.route("**/*", (route) => { const url = route.request().url(); if (isTrackerUrl(url)) { return route.abort(); } return route.continue(); }); } if (options.rawHtml && isFileUrl(sourceUrl)) { const content = prepareRenderInputHtml(options.rawHtml, options); await page.setContent(content, { waitUntil: "domcontentloaded", timeout: options.timeoutMs || 60000 }); } else { await page.goto(sourceUrl, { waitUntil: "domcontentloaded", timeout: options.timeoutMs || 60000 }); } await settlePage(page, options); await cleanupAndFreezePage(page, options); return await page.content(); } finally { await browser.close(); } } function assertNotBlockedCapture(html, sourceUrl) { const detected = detectBlockedCapture(html); if (!detected) { return; } throw new Error( `Archive capture failed for ${sourceUrl}: ${detected}. 
The renderer received a challenge page instead of the requested content, so no archive was written.` ); } function detectBlockedCapture(html) { for (const { reason, any, all } of BLOCKED_CAPTURE_PATTERNS) { const anyMatched = !any || any.some((pattern) => pattern.test(html)); const allMatched = !all || all.every((pattern) => pattern.test(html)); if (anyMatched && allMatched) { return reason; } } return null; } async function settlePage(page, options) { try { await page.waitForLoadState("networkidle", { timeout: options.networkIdleTimeoutMs || 15000 }); } catch { // Dynamic pages often keep long-lived connections open; DOM capture can still proceed. } if (options.scroll !== false) { await page.evaluate(async () => { await new Promise((resolve) => { let total = 0; const step = Math.max(400, Math.floor(window.innerHeight * 0.8)); const timer = setInterval(() => { const previous = document.scrollingElement?.scrollTop || window.scrollY; window.scrollBy(0, step); total += step; const current = document.scrollingElement?.scrollTop || window.scrollY; if (current === previous || total > Math.max(document.body.scrollHeight, 20000)) { clearInterval(timer); window.scrollTo(0, 0); resolve(); } }, 120); }); }); } } async function cleanupAndFreezePage(page, options) { await page.evaluate( ({ adSelectors, antiAdblockProviderHostPatterns, antiAdblockTextPatterns, freezeStyles, maxFreezeElements, maxSanitizeElements, stripAds, stripArchiveShell: shouldStripArchiveShell }) => { function removeAll(selector) { document.querySelectorAll(selector).forEach((node) => node.remove()); } if (shouldStripArchiveShell) { const content = document.querySelector("#CONTENT .html1") || document.querySelector("#CONTENT"); if (content) { document.body.innerHTML = ""; document.body.appendChild(content.cloneNode(true)); document.documentElement.removeAttribute("prefix"); document.documentElement.removeAttribute("itemscope"); document.documentElement.removeAttribute("itemtype"); } } removeAll("script"); 
removeAll("noscript"); removeAll("link[rel='preconnect'],link[rel='dns-prefetch'],link[rel='modulepreload']"); removeAll("meta[name='next-head-count']"); if (stripAds) { for (const selector of adSelectors) { try { removeAll(selector); } catch { // Ignore unsupported selectors in older browser engines. } } removeAntiAdblockOverlays(); } document.querySelectorAll("img").forEach((img) => { if (img.currentSrc) { img.setAttribute("data-original-src", img.getAttribute("src") || ""); img.setAttribute("src", img.currentSrc); } img.removeAttribute("srcset"); img.removeAttribute("sizes"); img.setAttribute("loading", "lazy"); }); document.querySelectorAll("source").forEach((source) => { source.removeAttribute("srcset"); }); document.querySelectorAll("video,audio").forEach((media) => { if (media.currentSrc) { media.setAttribute("src", media.currentSrc); } }); document.querySelectorAll("iframe").forEach((frame) => { const src = frame.getAttribute("src"); if (src) { frame.setAttribute("data-archived-src", src); } try { const doc = frame.contentDocument; if (doc?.documentElement) { frame.setAttribute("srcdoc", "" + doc.documentElement.outerHTML); frame.removeAttribute("src"); } } catch { // Cross-origin iframe sources are handled in the Node-side inliner when possible. 
} }); function removeAntiAdblockOverlays() { const textPatterns = antiAdblockTextPatterns.map((pattern) => new RegExp(pattern, "i")); const candidates = new Set(); const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_ELEMENT); let node = walker.currentNode; let visited = 0; while (node && visited < maxSanitizeElements) { if (node !== document.body && node !== document.documentElement && hasAntiAdblockSignal(node, textPatterns)) { candidates.add(node); } visited += 1; node = walker.nextNode(); } let removed = 0; candidates.forEach((node) => { const container = findRoadblockContainer(node, textPatterns); if (container && isLikelyAntiAdblockRoadblock(container, textPatterns)) { container.remove(); removed += 1; } }); if (removed > 0) { for (const element of [document.documentElement, document.body]) { element.style.removeProperty("overflow"); element.style.removeProperty("position"); element.style.removeProperty("inset"); } } } function findRoadblockContainer(node, textPatterns) { let current = node; let best = null; while (current?.parentElement && current.parentElement !== document.body) { if (isLikelyAntiAdblockRoadblock(current, textPatterns)) { best = current; } const parentTextLength = normalizeText(current.parentElement.textContent || "").length; if (parentTextLength > 8000) { break; } current = current.parentElement; } return best || (isLikelyAntiAdblockRoadblock(node, textPatterns) ? 
node : null); } function normalizeText(text) { return text.replace(/\s+/g, " ").trim(); } function hasAntiAdblockSignal(node, textPatterns) { return hasAntiAdblockText(node, textPatterns) || hasAntiAdblockProviderUrl(node) || hasAntiAdblockProviderDescendant(node); } function isLikelyAntiAdblockRoadblock(node, textPatterns) { if (!node || node === document.body || node === document.documentElement) { return false; } const hasSignal = hasAntiAdblockText(node, textPatterns) || hasAntiAdblockProviderUrl(node) || hasAntiAdblockProviderDescendant(node); if (!hasSignal) { return false; } const looksBlocking = isOverlayLike(node) || hasDialogSemantics(node) || hasBlockingClassName(node) || hasActionControl(node) || hasAntiAdblockProviderDescendant(node); if (!looksBlocking) { return false; } const embeddedInContent = node.closest("article, main"); return !embeddedInContent || isOverlayLike(node) || hasDialogSemantics(node) || hasBlockingClassName(node); } function hasAntiAdblockText(node, textPatterns) { const text = normalizeText(node.textContent || ""); return text.length > 0 && text.length <= 2000 && textPatterns.some((pattern) => pattern.test(text)); } function hasAntiAdblockProviderDescendant(node) { return Array.from(node.querySelectorAll?.("a[href], iframe[src], script[src]") || []).some((descendant) => hasAntiAdblockProviderUrl(descendant) ); } function hasAntiAdblockProviderUrl(node) { for (const attr of ["href", "src", "data-src"]) { const value = node.getAttribute?.(attr) || node[attr] || ""; if (value && antiAdblockProviderHostPatterns.some((host) => decodedText(value).includes(host))) { return true; } } return false; } function isOverlayLike(node) { const style = window.getComputedStyle(node); const rect = node.getBoundingClientRect(); const viewportArea = Math.max(1, window.innerWidth * window.innerHeight); const area = Math.max(0, rect.width) * Math.max(0, rect.height); const zIndex = Number.parseInt(style.zIndex, 10); const hasHighZIndex = 
Number.isFinite(zIndex) && zIndex >= 10; const positionIsBlocking = style.position === "fixed" || style.position === "sticky"; const coversMeaningfulArea = area / viewportArea >= 0.15; const coversMostViewport = area / viewportArea >= 0.45; return ( (positionIsBlocking && (hasHighZIndex || coversMeaningfulArea)) || (hasHighZIndex && coversMostViewport) ); } function hasDialogSemantics(node) { const role = node.getAttribute?.("role"); return node.tagName === "DIALOG" || role === "dialog" || role === "alertdialog" || node.getAttribute?.("aria-modal") === "true"; } function hasBlockingClassName(node) { return /(?:ad[-_ ]?block|adblock|allow[-_ ]?ads|overlay|modal|interstitial|backdrop|roadblock)/i.test( `${node.id || ""} ${node.className || ""}` ); } function hasActionControl(node) { return Boolean(node.querySelector?.('button, [role="button"], a[href], input[type="button"], input[type="submit"]')); } function decodedText(value) { try { return decodeURIComponent(value).toLowerCase(); } catch { return value.toLowerCase(); } } const walkedElements = []; const walker = document.createTreeWalker(document.documentElement, NodeFilter.SHOW_ELEMENT); let element = document.documentElement; let visited = 0; while (element && visited < maxSanitizeElements) { walkedElements.push(element); for (const attr of Array.from(element.attributes)) { if (/^on/i.test(attr.name) || attr.name === "integrity" || attr.name === "nonce") { element.removeAttribute(attr.name); } } visited += 1; element = walker.nextNode(); } if (!freezeStyles || element || walkedElements.length > maxFreezeElements) { return; } for (const element of walkedElements) { if (element.tagName === "SCRIPT" || element.tagName === "STYLE") { continue; } const computed = window.getComputedStyle(element); const declarations = []; for (let i = 0; i < computed.length; i += 1) { const property = computed[i]; const value = computed.getPropertyValue(property); if (value) { declarations.push(`${property}:${value}`); } } if 
(declarations.length) { element.setAttribute("style", declarations.join(";")); } } }, { adSelectors: AD_SELECTORS, antiAdblockProviderHostPatterns: ANTI_ADBLOCK_PROVIDER_HOST_PATTERNS, antiAdblockTextPatterns: ANTI_ADBLOCK_TEXT_PATTERNS, freezeStyles: options.freezeStyles === true, maxFreezeElements: options.maxFreezeElements || 2500, maxSanitizeElements: options.maxSanitizeElements || 5000, stripAds: options.stripAds !== false, stripArchiveShell: Boolean(options.stripArchiveShell) } ); } function prepareRenderInputHtml(rawHtml, options) { let html = options.stripArchiveShell ? stripArchiveShell(rawHtml) : rawHtml; html = html .replace(//gi, "") .replace(//gi, ""); if (!options.baseUrl) { return html; } if (/`; if (/]*>/i.test(html)) { return html.replace(/]*>/i, (match) => `${match}${baseTag}`); } return `${baseTag}${html}`; } function loadPlaywright() { try { return require("playwright"); } catch (error) { throw new Error( `Playwright is required for render mode. Run "npm install" and "npm run install-browsers", or use --static for HTML input files. 
Original error: ${error.message}` ); } } function isTrackerUrl(rawUrl) { let host = ""; try { host = new URL(rawUrl).hostname; } catch { return false; } return BLOCKED_HOST_PATTERNS.some((pattern) => host === pattern || host.endsWith(`.${pattern}`)); } function addArchiveComment(html, sourceUrl, options) { const comment = ``; if (/]*>/i, (doctype) => `${doctype}\n${comment}`); } return `\n${comment}\n${html}`; } export function findExternalAssetRefs(html) { const refs = new Set(); const attrPattern = /\s(?:src|srcset|poster|data)\s*=\s*(["'])([\s\S]*?)\1/gi; for (const match of html.matchAll(attrPattern)) { if (isSelfContainedAssetRef(match[2])) { continue; } for (const part of match[2].split(",")) { const candidate = part.trim().split(/\s+/)[0]; if (candidate && !isSelfContainedAssetRef(candidate)) { refs.add(candidate); } } } const linkPattern = /]*>/gi; for (const match of html.matchAll(linkPattern)) { const tag = match[0]; const rel = readAttribute(tag, "rel") || ""; if (!/\b(?:stylesheet|icon|apple-touch-icon|image_src|preload)\b/i.test(rel)) { continue; } const href = readAttribute(tag, "href"); if (href && !isSelfContainedAssetRef(href)) { refs.add(href); } } const cssUrlPattern = /url\(\s*(["']?)([^"')]+)\1\s*\)/gi; for (const match of html.matchAll(cssUrlPattern)) { const candidate = cleanCssUrl(match[2]); if (candidate && !isSelfContainedAssetRef(candidate)) { refs.add(candidate); } } return Array.from(refs).sort(); } function isSelfContainedAssetRef(value) { const trimmed = cleanCssUrl(value); return ( !trimmed || trimmed.startsWith("#") || /^%23/i.test(trimmed) || /^(?:data|about|javascript|mailto|tel):/i.test(trimmed) ); } function readAttribute(tag, attr) { const match = tag.match(new RegExp(`\\b${attr}\\s*=\\s*("([^"]*)"|'([^']*)'|([^\\s>]+))`, "i")); return match ? match[2] ?? match[3] ?? match[4] ?? 
"" : ""; } function cleanCssUrl(value) { const decoded = String(value) .trim() .replaceAll("&", "&") .replaceAll(""", '"') .replaceAll("'", "'") .replaceAll("'", "'"); const quote = decoded[0]; if ((quote === '"' || quote === "'") && decoded.at(-1) === quote) { return decoded.slice(1, -1).trim(); } return decoded; }