diff --git a/README.md b/README.md
index 0b2870d..a522386 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,8 @@ node src/cli.mjs archive ./bloomberg-archive.html --static --strip-archive-shell
Local `archive.ph` HTML inputs with `--strip-archive-shell` use the static extractor by default because those files already contain the rendered page. Add `--render` only when you explicitly want Chromium to load the local HTML first.
+Computed-style freezing is off by default for live web pages because it can inflate modern article pages into very large HTML files. Add `--freeze-styles` only when stylesheet inlining is not enough to preserve layout.
+
Archives are written to `ARCHIVE_PATH`, or to a development directory under the system temp directory when `ARCHIVE_PATH` is not set.
## API
diff --git a/src/archiver.mjs b/src/archiver.mjs
index 8165482..dcf510c 100644
--- a/src/archiver.mjs
+++ b/src/archiver.mjs
@@ -39,12 +39,59 @@ const TRACKER_HOST_PATTERNS = [
"googletagmanager.com",
"googlesyndication.com",
"google-analytics.com",
+ "amazon-adsystem.com",
"pub.doubleverify.com",
"securepubads.g.doubleclick.net",
"s10.histats.com",
"sstatic1.histats.com"
];
+const ANTI_ADBLOCK_PROVIDER_HOST_PATTERNS = [
+ "getadmiral.com"
+];
+
+const BLOCKED_HOST_PATTERNS = [
+ ...TRACKER_HOST_PATTERNS,
+ ...ANTI_ADBLOCK_PROVIDER_HOST_PATTERNS
+];
+
+const ANTI_ADBLOCK_TEXT_PATTERNS = [
+ "\\bad\\s*block(?:er|ing)?\\b",
+ "\\bad[-\\s]?block\\b",
+ "\\badblock(?:er|ing)?\\b",
+ "\\badvertis(?:e|ing)\\s+block(?:er|ing)?\\b",
+ "\\bblock(?:ing)?\\s+(?:ads|advertis(?:ements?|ing))\\b",
+ "\\b(?:disable|turn\\s+off|pause)\\s+(?:your\\s+)?ad[-\\s]?block(?:er|ing)?\\b",
+ "\\b(?:allowlist|whitelist)\\s+(?:our\\s+|this\\s+)?(?:site|website|domain)\\b",
+ "\\ballow(?:ing)?\\s+(?:our\\s+)?(?:ads|advertis(?:ements?|ing))\\b",
+ "\\bads?\\s+(?:are\\s+)?blocked\\b"
+];
+
+const BLOCKED_CAPTURE_PATTERNS = [
+ {
+ reason: "DataDome CAPTCHA/bot challenge",
+ any: [
+ /DataDome CAPTCHA/i,
+ /captcha-delivery\.com/i
+ ]
+ },
+ {
+ reason: "blocked/CAPTCHA challenge",
+ any: [
+ /<title[^>]*>\s*You have been blocked\s*<\/title>/i,
+ /<title[^>]*>\s*Access Denied\s*<\/title>/i,
+ /\bunusual traffic\b/i
+ ]
+ },
+ {
+ reason: "human verification challenge",
+ all: [
+ /\bverify you are (?:a )?human\b/i,
+ /\b(?:captcha|challenge|g-recaptcha|hcaptcha|turnstile)\b/i
+ ]
+ }
+];
+
export async function archivePage(input, options = {}) {
const sourceUrl = inputToUrl(input);
const archivePath = options.archivePath || defaultArchivePath();
@@ -58,11 +105,13 @@ export async function archivePage(input, options = {}) {
const renderedHtml = useStatic
? prepareStaticHtml(rawHtml, options)
: await renderPage(sourceUrl, { ...options, rawHtml, baseUrl });
+ assertNotBlockedCapture(renderedHtml, sourceUrl);
const inliner = new AssetInliner({
userAgent: options.userAgent || DEFAULT_USER_AGENT,
referer: isHttpUrl(sourceUrl) ? sourceUrl : undefined,
- maxAssetBytes: options.maxAssetBytes
+ maxAssetBytes: options.maxAssetBytes,
+ maxInlineStyleBytes: options.maxInlineStyleBytes
});
const inlined = await inliner.inlineHtml(renderedHtml, baseUrl);
const finalHtml = addArchiveComment(inlined, sourceUrl, options);
@@ -146,6 +195,27 @@ export async function renderPage(sourceUrl, options = {}) {
}
}
+function assertNotBlockedCapture(html, sourceUrl) {
+ const detected = detectBlockedCapture(html);
+ if (!detected) {
+ return;
+ }
+ throw new Error(
+ `Archive capture failed for ${sourceUrl}: ${detected}. The renderer received a challenge page instead of the requested content, so no archive was written.`
+ );
+}
+
+function detectBlockedCapture(html) {
+ for (const { reason, any, all } of BLOCKED_CAPTURE_PATTERNS) {
+ const anyMatched = !any || any.some((pattern) => pattern.test(html));
+ const allMatched = !all || all.every((pattern) => pattern.test(html));
+ if (anyMatched && allMatched) {
+ return reason;
+ }
+ }
+ return null;
+}
+
async function settlePage(page, options) {
try {
await page.waitForLoadState("networkidle", { timeout: options.networkIdleTimeoutMs || 15000 });
@@ -178,6 +248,8 @@ async function cleanupAndFreezePage(page, options) {
await page.evaluate(
({
adSelectors,
+ antiAdblockProviderHostPatterns,
+ antiAdblockTextPatterns,
freezeStyles,
maxFreezeElements,
maxSanitizeElements,
@@ -212,6 +284,7 @@ async function cleanupAndFreezePage(page, options) {
// Ignore unsupported selectors in older browser engines.
}
}
+ removeAntiAdblockOverlays();
}
document.querySelectorAll("img").forEach((img) => {
@@ -250,6 +323,147 @@ async function cleanupAndFreezePage(page, options) {
}
});
+ function removeAntiAdblockOverlays() {
+ const textPatterns = antiAdblockTextPatterns.map((pattern) => new RegExp(pattern, "i"));
+ const candidates = new Set();
+ const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_ELEMENT);
+ let node = walker.currentNode;
+ let visited = 0;
+
+ while (node && visited < maxSanitizeElements) {
+ if (node !== document.body && node !== document.documentElement && hasAntiAdblockSignal(node, textPatterns)) {
+ candidates.add(node);
+ }
+ visited += 1;
+ node = walker.nextNode();
+ }
+
+ let removed = 0;
+ candidates.forEach((node) => {
+ const container = findRoadblockContainer(node, textPatterns);
+ if (container && isLikelyAntiAdblockRoadblock(container, textPatterns)) {
+ container.remove();
+ removed += 1;
+ }
+ });
+
+ if (removed > 0) {
+ for (const element of [document.documentElement, document.body]) {
+ element.style.removeProperty("overflow");
+ element.style.removeProperty("position");
+ element.style.removeProperty("inset");
+ }
+ }
+ }
+
+ function findRoadblockContainer(node, textPatterns) {
+ let current = node;
+ let best = null;
+ while (current?.parentElement && current.parentElement !== document.body) {
+ if (isLikelyAntiAdblockRoadblock(current, textPatterns)) {
+ best = current;
+ }
+ const parentTextLength = normalizeText(current.parentElement.textContent || "").length;
+ if (parentTextLength > 8000) {
+ break;
+ }
+ current = current.parentElement;
+ }
+ return best || (isLikelyAntiAdblockRoadblock(node, textPatterns) ? node : null);
+ }
+
+ function normalizeText(text) {
+ return text.replace(/\s+/g, " ").trim();
+ }
+
+ function hasAntiAdblockSignal(node, textPatterns) {
+ return hasAntiAdblockText(node, textPatterns) || hasAntiAdblockProviderUrl(node) || hasAntiAdblockProviderDescendant(node);
+ }
+
+ function isLikelyAntiAdblockRoadblock(node, textPatterns) {
+ if (!node || node === document.body || node === document.documentElement) {
+ return false;
+ }
+ const hasSignal =
+ hasAntiAdblockText(node, textPatterns) ||
+ hasAntiAdblockProviderUrl(node) ||
+ hasAntiAdblockProviderDescendant(node);
+ if (!hasSignal) {
+ return false;
+ }
+ const looksBlocking =
+ isOverlayLike(node) ||
+ hasDialogSemantics(node) ||
+ hasBlockingClassName(node) ||
+ hasActionControl(node) ||
+ hasAntiAdblockProviderDescendant(node);
+ if (!looksBlocking) {
+ return false;
+ }
+ const embeddedInContent = node.closest("article, main");
+ return !embeddedInContent || isOverlayLike(node) || hasDialogSemantics(node) || hasBlockingClassName(node);
+ }
+
+ function hasAntiAdblockText(node, textPatterns) {
+ const text = normalizeText(node.textContent || "");
+ return text.length > 0 && text.length <= 2000 && textPatterns.some((pattern) => pattern.test(text));
+ }
+
+ function hasAntiAdblockProviderDescendant(node) {
+ return Array.from(node.querySelectorAll?.("a[href], iframe[src], script[src]") || []).some((descendant) =>
+ hasAntiAdblockProviderUrl(descendant)
+ );
+ }
+
+ function hasAntiAdblockProviderUrl(node) {
+ for (const attr of ["href", "src", "data-src"]) {
+ const value = node.getAttribute?.(attr) || node[attr] || "";
+ if (value && antiAdblockProviderHostPatterns.some((host) => decodedText(value).includes(host))) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ function isOverlayLike(node) {
+ const style = window.getComputedStyle(node);
+ const rect = node.getBoundingClientRect();
+ const viewportArea = Math.max(1, window.innerWidth * window.innerHeight);
+ const area = Math.max(0, rect.width) * Math.max(0, rect.height);
+ const zIndex = Number.parseInt(style.zIndex, 10);
+ const hasHighZIndex = Number.isFinite(zIndex) && zIndex >= 10;
+ const positionIsBlocking = style.position === "fixed" || style.position === "sticky";
+ const coversMeaningfulArea = area / viewportArea >= 0.15;
+ const coversMostViewport = area / viewportArea >= 0.45;
+ return (
+ (positionIsBlocking && (hasHighZIndex || coversMeaningfulArea)) ||
+ (hasHighZIndex && coversMostViewport)
+ );
+ }
+
+ function hasDialogSemantics(node) {
+ const role = node.getAttribute?.("role");
+ return node.tagName === "DIALOG" || role === "dialog" || role === "alertdialog" || node.getAttribute?.("aria-modal") === "true";
+ }
+
+ function hasBlockingClassName(node) {
+ return /(?:ad[-_ ]?block|adblock|allow[-_ ]?ads|overlay|modal|interstitial|backdrop|roadblock)/i.test(
+ `${node.id || ""} ${node.className || ""}`
+ );
+ }
+
+ function hasActionControl(node) {
+ return Boolean(node.querySelector?.('button, [role="button"], a[href], input[type="button"], input[type="submit"]'));
+ }
+
+ function decodedText(value) {
+ try {
+ return decodeURIComponent(value).toLowerCase();
+ } catch {
+ return value.toLowerCase();
+ }
+ }
+
const walkedElements = [];
const walker = document.createTreeWalker(document.documentElement, NodeFilter.SHOW_ELEMENT);
let element = document.documentElement;
@@ -289,7 +503,9 @@ async function cleanupAndFreezePage(page, options) {
},
{
adSelectors: AD_SELECTORS,
- freezeStyles: options.freezeStyles !== false,
+ antiAdblockProviderHostPatterns: ANTI_ADBLOCK_PROVIDER_HOST_PATTERNS,
+ antiAdblockTextPatterns: ANTI_ADBLOCK_TEXT_PATTERNS,
+ freezeStyles: options.freezeStyles === true,
maxFreezeElements: options.maxFreezeElements || 2500,
maxSanitizeElements: options.maxSanitizeElements || 5000,
stripAds: options.stripAds !== false,
@@ -333,7 +549,7 @@ function isTrackerUrl(rawUrl) {
} catch {
return false;
}
- return TRACKER_HOST_PATTERNS.some((pattern) => host === pattern || host.endsWith(`.${pattern}`));
+ return BLOCKED_HOST_PATTERNS.some((pattern) => host === pattern || host.endsWith(`.${pattern}`));
}
function addArchiveComment(html, sourceUrl, options) {
@@ -346,7 +562,7 @@ function addArchiveComment(html, sourceUrl, options) {
export function findExternalAssetRefs(html) {
const refs = new Set();
- const attrPattern = /\b(?:src|srcset|poster|data)\s*=\s*(["'])([\s\S]*?)\1/gi;
+ const attrPattern = /\s(?:src|srcset|poster|data)\s*=\s*(["'])([\s\S]*?)\1/gi;
for (const match of html.matchAll(attrPattern)) {
if (isSelfContainedAssetRef(match[2])) {
continue;
@@ -372,7 +588,7 @@ export function findExternalAssetRefs(html) {
}
const cssUrlPattern = /url\(\s*(["']?)([^"')]+)\1\s*\)/gi;
for (const match of html.matchAll(cssUrlPattern)) {
- const candidate = match[2].trim();
+ const candidate = cleanCssUrl(match[2]);
if (candidate && !isSelfContainedAssetRef(candidate)) {
refs.add(candidate);
}
@@ -381,10 +597,11 @@ export function findExternalAssetRefs(html) {
}
function isSelfContainedAssetRef(value) {
- const trimmed = value.trim();
+ const trimmed = cleanCssUrl(value);
return (
!trimmed ||
trimmed.startsWith("#") ||
+ /^%23/i.test(trimmed) ||
/^(?:data|about|javascript|mailto|tel):/i.test(trimmed)
);
}
@@ -393,3 +610,17 @@ function readAttribute(tag, attr) {
const match = tag.match(new RegExp(`\\b${attr}\\s*=\\s*("([^"]*)"|'([^']*)'|([^\\s>]+))`, "i"));
return match ? match[2] ?? match[3] ?? match[4] ?? "" : "";
}
+
+function cleanCssUrl(value) {
+ const decoded = String(value)
+ .trim()
+ .replaceAll("&amp;", "&")
+ .replaceAll("&quot;", '"')
+ .replaceAll("&#39;", "'")
+ .replaceAll("&apos;", "'");
+ const quote = decoded[0];
+ if ((quote === '"' || quote === "'") && decoded.at(-1) === quote) {
+ return decoded.slice(1, -1).trim();
+ }
+ return decoded;
+}
diff --git a/src/asset-inliner.mjs b/src/asset-inliner.mjs
index cb68a5d..fdcc82a 100644
--- a/src/asset-inliner.mjs
+++ b/src/asset-inliner.mjs
@@ -92,6 +92,7 @@ export function resolveUrl(rawUrl, baseUrl) {
if (
!trimmed ||
trimmed.startsWith("#") ||
+ /^%23/i.test(trimmed) ||
/^(?:about|blob|data|javascript|mailto|tel):/i.test(trimmed)
) {
return trimmed;
@@ -182,6 +183,7 @@ export class AssetInliner {
this.userAgent = options.userAgent || DEFAULT_USER_AGENT;
this.referer = options.referer;
this.maxAssetBytes = options.maxAssetBytes || 30 * 1024 * 1024;
+ this.maxInlineStyleBytes = options.maxInlineStyleBytes || 128 * 1024;
this.cache = new Map();
this.warnings = [];
}
@@ -195,6 +197,10 @@ export class AssetInliner {
output = output.replace(/