From 4bf0e3396161b9d5b212e9fdfc6d7d86caca255d Mon Sep 17 00:00:00 2001 From: James Magahern Date: Thu, 14 May 2026 09:11:05 -0700 Subject: [PATCH] better errors --- README.md | 2 + src/archiver.mjs | 243 ++++++++++++++++++++++++++++++++++++++++-- src/asset-inliner.mjs | 49 ++++++++- src/cli.mjs | 7 +- src/server.mjs | 4 +- 5 files changed, 292 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 0b2870d..a522386 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,8 @@ node src/cli.mjs archive ./bloomberg-archive.html --static --strip-archive-shell Local `archive.ph` HTML inputs with `--strip-archive-shell` use the static extractor by default because those files already contain the rendered page. Add `--render` only when you explicitly want Chromium to load the local HTML first. +Computed-style freezing is off by default for live web pages because it can inflate modern article pages into very large HTML files. Add `--freeze-styles` only when stylesheet inlining is not enough to preserve layout. + Archives are written to `ARCHIVE_PATH`, or to a development directory under the system temp directory when `ARCHIVE_PATH` is not set. ## API diff --git a/src/archiver.mjs b/src/archiver.mjs index 8165482..dcf510c 100644 --- a/src/archiver.mjs +++ b/src/archiver.mjs @@ -39,12 +39,59 @@ const TRACKER_HOST_PATTERNS = [ "googletagmanager.com", "googlesyndication.com", "google-analytics.com", + "amazon-adsystem.com", "pub.doubleverify.com", "securepubads.g.doubleclick.net", "s10.histats.com", "sstatic1.histats.com" ]; +const ANTI_ADBLOCK_PROVIDER_HOST_PATTERNS = [ + "getadmiral.com" +]; + +const BLOCKED_HOST_PATTERNS = [ + ...TRACKER_HOST_PATTERNS, + ...ANTI_ADBLOCK_PROVIDER_HOST_PATTERNS +]; + +const ANTI_ADBLOCK_TEXT_PATTERNS = [ + "\\bad\\s*block(?:er|ing)?\\b", + "\\bad[-\\s]?block\\b", + "\\badblock(?:er|ing)?\\b", + "\\badvertis(?:e|ing)\\s+block(?:er|ing)?\\b", + "\\bblock(?:ing)?\\s+(?:ads|advertis(?:ements?|ing))\\b", + "\\b(?:disable|turn\\s+off|pause)\\s+(?:your\\s+)?ad[-\\s]?block(?:er|ing)?\\b", + "\\b(?:allowlist|whitelist)\\s+(?:our\\s+|this\\s+)?(?:site|website|domain)\\b", + "\\ballow(?:ing)?\\s+(?:our\\s+)?(?:ads|advertis(?:ements?|ing))\\b", + "\\bads?\\s+(?:are\\s+)?blocked\\b" +]; + +const BLOCKED_CAPTURE_PATTERNS = [ + { + reason: "DataDome CAPTCHA/bot challenge", + any: [ + /DataDome CAPTCHA/i, + /captcha-delivery\.com/i + ] + }, + { + reason: "blocked/CAPTCHA challenge", + any: [ + /]*>\s*You have been blocked\s*<\/title>/i, + /]*>\s*Access Denied\s*<\/title>/i, + /\bunusual traffic\b/i + ] + }, + { + reason: "human verification challenge", + all: [ + /\bverify you are (?:a )?human\b/i, + /\b(?:captcha|challenge|g-recaptcha|hcaptcha|turnstile)\b/i + ] + } +]; + export async function archivePage(input, options = {}) { const sourceUrl = inputToUrl(input); const archivePath = options.archivePath || defaultArchivePath(); @@ -58,11 +105,13 @@ export async function archivePage(input, options = {}) { const renderedHtml = useStatic ? prepareStaticHtml(rawHtml, options) : await renderPage(sourceUrl, { ...options, rawHtml, baseUrl }); + assertNotBlockedCapture(renderedHtml, sourceUrl); const inliner = new AssetInliner({ userAgent: options.userAgent || DEFAULT_USER_AGENT, referer: isHttpUrl(sourceUrl) ? sourceUrl : undefined, - maxAssetBytes: options.maxAssetBytes + maxAssetBytes: options.maxAssetBytes, + maxInlineStyleBytes: options.maxInlineStyleBytes }); const inlined = await inliner.inlineHtml(renderedHtml, baseUrl); const finalHtml = addArchiveComment(inlined, sourceUrl, options); @@ -146,6 +195,27 @@ export async function renderPage(sourceUrl, options = {}) { } } +function assertNotBlockedCapture(html, sourceUrl) { + const detected = detectBlockedCapture(html); + if (!detected) { + return; + } + throw new Error( + `Archive capture failed for ${sourceUrl}: ${detected}. The renderer received a challenge page instead of the requested content, so no archive was written.` + ); +} + +function detectBlockedCapture(html) { + for (const { reason, any, all } of BLOCKED_CAPTURE_PATTERNS) { + const anyMatched = !any || any.some((pattern) => pattern.test(html)); + const allMatched = !all || all.every((pattern) => pattern.test(html)); + if (anyMatched && allMatched) { + return reason; + } + } + return null; +} + async function settlePage(page, options) { try { await page.waitForLoadState("networkidle", { timeout: options.networkIdleTimeoutMs || 15000 }); @@ -178,6 +248,8 @@ async function cleanupAndFreezePage(page, options) { await page.evaluate( ({ adSelectors, + antiAdblockProviderHostPatterns, + antiAdblockTextPatterns, freezeStyles, maxFreezeElements, maxSanitizeElements, @@ -212,6 +284,7 @@ async function cleanupAndFreezePage(page, options) { // Ignore unsupported selectors in older browser engines. } } + removeAntiAdblockOverlays(); } document.querySelectorAll("img").forEach((img) => { @@ -250,6 +323,147 @@ async function cleanupAndFreezePage(page, options) { } }); + function removeAntiAdblockOverlays() { + const textPatterns = antiAdblockTextPatterns.map((pattern) => new RegExp(pattern, "i")); + const candidates = new Set(); + const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_ELEMENT); + let node = walker.currentNode; + let visited = 0; + + while (node && visited < maxSanitizeElements) { + if (node !== document.body && node !== document.documentElement && hasAntiAdblockSignal(node, textPatterns)) { + candidates.add(node); + } + visited += 1; + node = walker.nextNode(); + } + + let removed = 0; + candidates.forEach((node) => { + const container = findRoadblockContainer(node, textPatterns); + if (container && isLikelyAntiAdblockRoadblock(container, textPatterns)) { + container.remove(); + removed += 1; + } + }); + + if (removed > 0) { + for (const element of [document.documentElement, document.body]) { + element.style.removeProperty("overflow"); + element.style.removeProperty("position"); + element.style.removeProperty("inset"); + } + } + } + + function findRoadblockContainer(node, textPatterns) { + let current = node; + let best = null; + while (current?.parentElement && current.parentElement !== document.body) { + if (isLikelyAntiAdblockRoadblock(current, textPatterns)) { + best = current; + } + const parentTextLength = normalizeText(current.parentElement.textContent || "").length; + if (parentTextLength > 8000) { + break; + } + current = current.parentElement; + } + return best || (isLikelyAntiAdblockRoadblock(node, textPatterns) ? node : null); + } + + function normalizeText(text) { + return text.replace(/\s+/g, " ").trim(); + } + + function hasAntiAdblockSignal(node, textPatterns) { + return hasAntiAdblockText(node, textPatterns) || hasAntiAdblockProviderUrl(node) || hasAntiAdblockProviderDescendant(node); + } + + function isLikelyAntiAdblockRoadblock(node, textPatterns) { + if (!node || node === document.body || node === document.documentElement) { + return false; + } + const hasSignal = + hasAntiAdblockText(node, textPatterns) || + hasAntiAdblockProviderUrl(node) || + hasAntiAdblockProviderDescendant(node); + if (!hasSignal) { + return false; + } + const looksBlocking = + isOverlayLike(node) || + hasDialogSemantics(node) || + hasBlockingClassName(node) || + hasActionControl(node) || + hasAntiAdblockProviderDescendant(node); + if (!looksBlocking) { + return false; + } + const embeddedInContent = node.closest("article, main"); + return !embeddedInContent || isOverlayLike(node) || hasDialogSemantics(node) || hasBlockingClassName(node); + } + + function hasAntiAdblockText(node, textPatterns) { + const text = normalizeText(node.textContent || ""); + return text.length > 0 && text.length <= 2000 && textPatterns.some((pattern) => pattern.test(text)); + } + + function hasAntiAdblockProviderDescendant(node) { + return Array.from(node.querySelectorAll?.("a[href], iframe[src], script[src]") || []).some((descendant) => + hasAntiAdblockProviderUrl(descendant) + ); + } + + function hasAntiAdblockProviderUrl(node) { + for (const attr of ["href", "src", "data-src"]) { + const value = node.getAttribute?.(attr) || node[attr] || ""; + if (value && antiAdblockProviderHostPatterns.some((host) => decodedText(value).includes(host))) { + return true; + } + } + return false; + } + + function isOverlayLike(node) { + const style = window.getComputedStyle(node); + const rect = node.getBoundingClientRect(); + const viewportArea = Math.max(1, window.innerWidth * window.innerHeight); + const area = Math.max(0, rect.width) * Math.max(0, rect.height); + const zIndex = Number.parseInt(style.zIndex, 10); + const hasHighZIndex = Number.isFinite(zIndex) && zIndex >= 10; + const positionIsBlocking = style.position === "fixed" || style.position === "sticky"; + const coversMeaningfulArea = area / viewportArea >= 0.15; + const coversMostViewport = area / viewportArea >= 0.45; + return ( + (positionIsBlocking && (hasHighZIndex || coversMeaningfulArea)) || + (hasHighZIndex && coversMostViewport) + ); + } + + function hasDialogSemantics(node) { + const role = node.getAttribute?.("role"); + return node.tagName === "DIALOG" || role === "dialog" || role === "alertdialog" || node.getAttribute?.("aria-modal") === "true"; + } + + function hasBlockingClassName(node) { + return /(?:ad[-_ ]?block|adblock|allow[-_ ]?ads|overlay|modal|interstitial|backdrop|roadblock)/i.test( + `${node.id || ""} ${node.className || ""}` + ); + } + + function hasActionControl(node) { + return Boolean(node.querySelector?.('button, [role="button"], a[href], input[type="button"], input[type="submit"]')); + } + + function decodedText(value) { + try { + return decodeURIComponent(value).toLowerCase(); + } catch { + return value.toLowerCase(); + } + } + const walkedElements = []; const walker = document.createTreeWalker(document.documentElement, NodeFilter.SHOW_ELEMENT); let element = document.documentElement; @@ -289,7 +503,9 @@ async function cleanupAndFreezePage(page, options) { }, { adSelectors: AD_SELECTORS, - freezeStyles: options.freezeStyles !== false, + antiAdblockProviderHostPatterns: ANTI_ADBLOCK_PROVIDER_HOST_PATTERNS, + antiAdblockTextPatterns: ANTI_ADBLOCK_TEXT_PATTERNS, + freezeStyles: options.freezeStyles === true, maxFreezeElements: options.maxFreezeElements || 2500, maxSanitizeElements: options.maxSanitizeElements || 5000, stripAds: options.stripAds !== false, @@ -333,7 +549,7 @@ function isTrackerUrl(rawUrl) { } catch { return false; } - return TRACKER_HOST_PATTERNS.some((pattern) => host === pattern || host.endsWith(`.${pattern}`)); + return BLOCKED_HOST_PATTERNS.some((pattern) => host === pattern || host.endsWith(`.${pattern}`)); } function addArchiveComment(html, sourceUrl, options) { @@ -346,7 +562,7 @@ function addArchiveComment(html, sourceUrl, options) { export function findExternalAssetRefs(html) { const refs = new Set(); - const attrPattern = /\b(?:src|srcset|poster|data)\s*=\s*(["'])([\s\S]*?)\1/gi; + const attrPattern = /\s(?:src|srcset|poster|data)\s*=\s*(["'])([\s\S]*?)\1/gi; for (const match of html.matchAll(attrPattern)) { if (isSelfContainedAssetRef(match[2])) { continue; @@ -372,7 +588,7 @@ export function findExternalAssetRefs(html) { } const cssUrlPattern = /url\(\s*(["']?)([^"')]+)\1\s*\)/gi; for (const match of html.matchAll(cssUrlPattern)) { - const candidate = match[2].trim(); + const candidate = cleanCssUrl(match[2]); if (candidate && !isSelfContainedAssetRef(candidate)) { refs.add(candidate); } @@ -381,10 +597,11 @@ export function findExternalAssetRefs(html) { } function isSelfContainedAssetRef(value) { - const trimmed = value.trim(); + const trimmed = cleanCssUrl(value); return ( !trimmed || trimmed.startsWith("#") || + /^%23/i.test(trimmed) || /^(?:data|about|javascript|mailto|tel):/i.test(trimmed) ); } @@ -393,3 +610,17 @@ function readAttribute(tag, attr) { const match = tag.match(new RegExp(`\\b${attr}\\s*=\\s*("([^"]*)"|'([^']*)'|([^\\s>]+))`, "i")); return match ? match[2] ?? match[3] ?? match[4] ?? "" : ""; } + +function cleanCssUrl(value) { + const decoded = String(value) + .trim() + .replaceAll("&", "&") + .replaceAll(""", '"') + .replaceAll("'", "'") + .replaceAll("'", "'"); + const quote = decoded[0]; + if ((quote === '"' || quote === "'") && decoded.at(-1) === quote) { + return decoded.slice(1, -1).trim(); + } + return decoded; +} diff --git a/src/asset-inliner.mjs b/src/asset-inliner.mjs index cb68a5d..fdcc82a 100644 --- a/src/asset-inliner.mjs +++ b/src/asset-inliner.mjs @@ -92,6 +92,7 @@ export function resolveUrl(rawUrl, baseUrl) { if ( !trimmed || trimmed.startsWith("#") || + /^%23/i.test(trimmed) || /^(?:about|blob|data|javascript|mailto|tel):/i.test(trimmed) ) { return trimmed; @@ -182,6 +183,7 @@ export class AssetInliner { this.userAgent = options.userAgent || DEFAULT_USER_AGENT; this.referer = options.referer; this.maxAssetBytes = options.maxAssetBytes || 30 * 1024 * 1024; + this.maxInlineStyleBytes = options.maxInlineStyleBytes || 128 * 1024; this.cache = new Map(); this.warnings = []; } @@ -195,6 +197,10 @@ export class AssetInliner { output = output.replace(//gi, ""); output = output.replace(//gi, ""); output = output.replace(/]*\brel=(["']?)(?:preconnect|dns-prefetch|modulepreload)\1[^>]*>/gi, ""); + output = output.replace( + /]*\brel=(["']?)preload\1)(?=[^>]*\bas=(["']?)script\2)[^>]*>/gi, + "" + ); output = await replaceAsync(output, /]*)>([\s\S]*?)<\/style>/gi, async (match) => { const attrs = match[1] || ""; @@ -203,7 +209,15 @@ export class AssetInliner { }); output = await replaceAsync(output, /\sstyle=(["'])([\s\S]*?)\1/gi, async (match) => { - const css = await this.inlineCss(htmlDecode(match[2]), effectiveBase); + const decoded = htmlDecode(match[2]); + if (!hasExternalCssReference(decoded)) { + return match[0]; + } + if (decoded.length > this.maxInlineStyleBytes) { + this.warnings.push(`Skipped inline style asset rewrite: ${decoded.length} bytes exceeds ${this.maxInlineStyleBytes}`); + return match[0]; + } + const css = await this.inlineCss(decoded, effectiveBase); return ` style=${match[1]}${htmlEscape(css)}${match[1]}`; }); @@ -260,6 +274,9 @@ export class AssetInliner { /\b(?:icon|apple-touch-icon|image_src)\b/i.test(rel) || (/\bpreload\b/i.test(rel) && /^(?:font|image|style)$/i.test(asValue)); if (!isInlineableLink) { + if (/\bpreload\b/i.test(rel) && /^script$/i.test(asValue)) { + return ""; + } return tag; } if (/\bpreload\b/i.test(rel) && /^style$/i.test(asValue)) { @@ -341,8 +358,8 @@ export class AssetInliner { ); output = await replaceAsync(output, /url\(\s*(["']?)([^"')]+)\1\s*\)/gi, async (match) => { - const raw = htmlDecode(match[2].trim()); - if (!raw || raw.startsWith("#") || /^(?:data|blob|about|javascript):/i.test(raw)) { + const raw = cleanCssUrl(match[2]); + if (!raw || raw.startsWith("#") || /^%23/i.test(raw) || /^(?:data|blob|about|javascript):/i.test(raw)) { return match[0]; } const dataUri = await this.toDataUri(raw, baseUrl); @@ -444,7 +461,7 @@ export class AssetInliner { function removeExternalBookkeepingUrls(html) { return html.replace( - /\s(?:old-src|currentSourceUrl|data-original-src|data-archived-src)=(["'])(https?:\/\/[\s\S]*?)\1/gi, + /\s(?:old-src|currentSourceUrl|data-original-src|data-archived-src)=(["'])([\s\S]*?)\1/gi, "" ); } @@ -505,7 +522,7 @@ function setAttribute(tag, attr, value) { if (attrRegex.test(tag)) { return tag.replace(attrRegex, `${attr}="${escaped}"`); } - return tag.replace(/\/?>$/, (end) => ` ${attr}="${escaped}"${end}`); + return tag.replace(/^<[^>]*>/, (openingTag) => openingTag.replace(/\/?>$/, (end) => ` ${attr}="${escaped}"${end}`)); } function removeAttribute(tag, attr) { @@ -519,3 +536,25 @@ function replaceMissingMediaAttribute(tag, attr) { } return removeAttribute(tag, attr); } + +function hasExternalCssReference(css) { + if (/@import\b/i.test(css)) { + return true; + } + for (const match of css.matchAll(/url\(\s*(["']?)([^"')]+)\1\s*\)/gi)) { + const raw = cleanCssUrl(match[2]); + if (raw && !raw.startsWith("#") && !/^%23/i.test(raw) && !/^(?:data|blob|about|javascript):/i.test(raw)) { + return true; + } + } + return false; +} + +function cleanCssUrl(value) { + const decoded = htmlDecode(String(value).trim()); + const quote = decoded[0]; + if ((quote === '"' || quote === "'") && decoded.at(-1) === quote) { + return decoded.slice(1, -1).trim(); + } + return decoded; +} diff --git a/src/cli.mjs b/src/cli.mjs index c1f86c8..2bbcfff 100644 --- a/src/cli.mjs +++ b/src/cli.mjs @@ -36,10 +36,13 @@ Options: --id Output id/file stem --static Do not use a browser; transform the input HTML only --render Force browser rendering for local archive-shell HTML + --freeze-styles Snapshot computed styles into inline style attributes --strip-archive-shell Remove an archive.ph shell from an already archived HTML file --no-strip-ads Keep ad-like elements --user-agent User agent to send for page and asset requests --max-asset-bytes Per-asset inline limit + --max-inline-style-bytes + Per-style-attribute inline rewrite limit Default user agent: ${DEFAULT_USER_AGENT}`); @@ -64,12 +67,14 @@ async function main() { const result = await archivePage(input, { archivePath: args["archive-path"], id: args.id, + freezeStyles: Boolean(args["freeze-styles"]), render: Boolean(args.render), static: Boolean(args.static), stripArchiveShell: Boolean(args["strip-archive-shell"]), stripAds: args["strip-ads"] !== false, userAgent: args["user-agent"] || DEFAULT_USER_AGENT, - maxAssetBytes: args["max-asset-bytes"] ? Number(args["max-asset-bytes"]) : undefined + maxAssetBytes: args["max-asset-bytes"] ? Number(args["max-asset-bytes"]) : undefined, + maxInlineStyleBytes: args["max-inline-style-bytes"] ? Number(args["max-inline-style-bytes"]) : undefined }); console.log(`Archived: ${result.sourceUrl}`); diff --git a/src/server.mjs b/src/server.mjs index 8c7c37c..9ebb772 100644 --- a/src/server.mjs +++ b/src/server.mjs @@ -21,12 +21,14 @@ const server = http.createServer(async (req, res) => { const result = await archivePage(body.url, { archivePath, id: body.id, + freezeStyles: Boolean(body.freezeStyles), render: Boolean(body.render), static: Boolean(body.static), stripArchiveShell: Boolean(body.stripArchiveShell), stripAds: body.stripAds !== false, userAgent: body.userAgent || DEFAULT_USER_AGENT, - maxAssetBytes: body.maxAssetBytes + maxAssetBytes: body.maxAssetBytes, + maxInlineStyleBytes: body.maxInlineStyleBytes }); return sendJson(res, 201, { id: result.id,