better errors
This commit is contained in:
@@ -26,6 +26,8 @@ node src/cli.mjs archive ./bloomberg-archive.html --static --strip-archive-shell
|
|||||||
|
|
||||||
Local `archive.ph` HTML inputs with `--strip-archive-shell` use the static extractor by default because those files already contain the rendered page. Add `--render` only when you explicitly want Chromium to load the local HTML first.
|
Local `archive.ph` HTML inputs with `--strip-archive-shell` use the static extractor by default because those files already contain the rendered page. Add `--render` only when you explicitly want Chromium to load the local HTML first.
|
||||||
|
|
||||||
|
Computed-style freezing is off by default for live web pages because it can inflate modern article pages into very large HTML files. Add `--freeze-styles` only when stylesheet inlining is not enough to preserve layout.
|
||||||
|
|
||||||
Archives are written to `ARCHIVE_PATH`, or to a development directory under the system temp directory when `ARCHIVE_PATH` is not set.
|
Archives are written to `ARCHIVE_PATH`, or to a development directory under the system temp directory when `ARCHIVE_PATH` is not set.
|
||||||
|
|
||||||
## API
|
## API
|
||||||
|
|||||||
243
src/archiver.mjs
243
src/archiver.mjs
@@ -39,12 +39,59 @@ const TRACKER_HOST_PATTERNS = [
|
|||||||
"googletagmanager.com",
|
"googletagmanager.com",
|
||||||
"googlesyndication.com",
|
"googlesyndication.com",
|
||||||
"google-analytics.com",
|
"google-analytics.com",
|
||||||
|
"amazon-adsystem.com",
|
||||||
"pub.doubleverify.com",
|
"pub.doubleverify.com",
|
||||||
"securepubads.g.doubleclick.net",
|
"securepubads.g.doubleclick.net",
|
||||||
"s10.histats.com",
|
"s10.histats.com",
|
||||||
"sstatic1.histats.com"
|
"sstatic1.histats.com"
|
||||||
];
|
];
|
||||||
|
|
||||||
|
// Host names of third-party anti-adblock vendors. The list is merged into
// BLOCKED_HOST_PATTERNS and is also handed to the in-page cleanup code, which
// looks for these hosts inside href/src/data-src attribute values.
const ANTI_ADBLOCK_PROVIDER_HOST_PATTERNS = [
  "getadmiral.com"
];
|
||||||
|
|
||||||
|
// Every host whose requests should be treated as blocked: the tracker hosts
// plus the anti-adblock vendor hosts. isTrackerUrl() matches a URL's host
// against this list (exact match or sub-domain suffix).
const BLOCKED_HOST_PATTERNS = [
  ...TRACKER_HOST_PATTERNS,
  ...ANTI_ADBLOCK_PROVIDER_HOST_PATTERNS
];
|
||||||
|
|
||||||
|
// Regular-expression SOURCES (strings, not RegExp objects) describing wording
// typically used by "please disable your ad blocker" roadblocks. They are kept
// as strings so they can be serialised into the page context, where they are
// compiled with the "i" flag.
const ANTI_ADBLOCK_TEXT_PATTERNS = [
  "\\bad\\s*block(?:er|ing)?\\b",
  // Hyphen/space separated spelling; the suffix is required so that a bare
  // "ad-blocker" / "ad-blocking" is recognised as well as "ad-block".
  "\\bad[-\\s]?block(?:er|ing)?\\b",
  "\\badblock(?:er|ing)?\\b",
  "\\badvertis(?:e|ing)\\s+block(?:er|ing)?\\b",
  "\\bblock(?:ing)?\\s+(?:ads|advertis(?:ements?|ing))\\b",
  "\\b(?:disable|turn\\s+off|pause)\\s+(?:your\\s+)?ad[-\\s]?block(?:er|ing)?\\b",
  "\\b(?:allowlist|whitelist)\\s+(?:our\\s+|this\\s+)?(?:site|website|domain)\\b",
  "\\ballow(?:ing)?\\s+(?:our\\s+)?(?:ads|advertis(?:ements?|ing))\\b",
  "\\bads?\\s+(?:are\\s+)?blocked\\b"
];
|
||||||
|
|
||||||
|
// Rules used to recognise a bot-challenge / block page in captured HTML.
// Each rule carries a human-readable `reason` plus either
//   - `any`: the rule fires when at least one pattern matches, or
//   - `all`: the rule fires only when every pattern matches.
// Rules are evaluated in order and the first one that fires wins.
const BLOCKED_CAPTURE_PATTERNS = [
  {
    reason: "DataDome CAPTCHA/bot challenge",
    any: [
      /DataDome CAPTCHA/i,
      /captcha-delivery\.com/i
    ]
  },
  {
    reason: "blocked/CAPTCHA challenge",
    any: [
      /<title[^>]*>\s*You have been blocked\s*<\/title>/i,
      /<title[^>]*>\s*Access Denied\s*<\/title>/i,
      /\bunusual traffic\b/i
    ]
  },
  {
    reason: "human verification challenge",
    all: [
      /\bverify you are (?:a )?human\b/i,
      /\b(?:captcha|challenge|g-recaptcha|hcaptcha|turnstile)\b/i
    ]
  }
];
|
||||||
|
|
||||||
export async function archivePage(input, options = {}) {
|
export async function archivePage(input, options = {}) {
|
||||||
const sourceUrl = inputToUrl(input);
|
const sourceUrl = inputToUrl(input);
|
||||||
const archivePath = options.archivePath || defaultArchivePath();
|
const archivePath = options.archivePath || defaultArchivePath();
|
||||||
@@ -58,11 +105,13 @@ export async function archivePage(input, options = {}) {
|
|||||||
const renderedHtml = useStatic
|
const renderedHtml = useStatic
|
||||||
? prepareStaticHtml(rawHtml, options)
|
? prepareStaticHtml(rawHtml, options)
|
||||||
: await renderPage(sourceUrl, { ...options, rawHtml, baseUrl });
|
: await renderPage(sourceUrl, { ...options, rawHtml, baseUrl });
|
||||||
|
assertNotBlockedCapture(renderedHtml, sourceUrl);
|
||||||
|
|
||||||
const inliner = new AssetInliner({
|
const inliner = new AssetInliner({
|
||||||
userAgent: options.userAgent || DEFAULT_USER_AGENT,
|
userAgent: options.userAgent || DEFAULT_USER_AGENT,
|
||||||
referer: isHttpUrl(sourceUrl) ? sourceUrl : undefined,
|
referer: isHttpUrl(sourceUrl) ? sourceUrl : undefined,
|
||||||
maxAssetBytes: options.maxAssetBytes
|
maxAssetBytes: options.maxAssetBytes,
|
||||||
|
maxInlineStyleBytes: options.maxInlineStyleBytes
|
||||||
});
|
});
|
||||||
const inlined = await inliner.inlineHtml(renderedHtml, baseUrl);
|
const inlined = await inliner.inlineHtml(renderedHtml, baseUrl);
|
||||||
const finalHtml = addArchiveComment(inlined, sourceUrl, options);
|
const finalHtml = addArchiveComment(inlined, sourceUrl, options);
|
||||||
@@ -146,6 +195,27 @@ export async function renderPage(sourceUrl, options = {}) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Abort the archive run when the captured HTML is a bot-challenge / block
 * page rather than the requested content.
 *
 * @param {string} html - HTML produced by the static or rendered capture step.
 * @param {string} sourceUrl - URL being archived; only used in the message.
 * @returns {void}
 * @throws {Error} When detectBlockedCapture() reports a reason. The error also
 *   carries `code` ("ARCHIVE_CAPTURE_BLOCKED") and `reason` so callers can
 *   react without parsing the message.
 */
function assertNotBlockedCapture(html, sourceUrl) {
  const detected = detectBlockedCapture(html);
  if (!detected) {
    return;
  }
  const error = new Error(
    `Archive capture failed for ${sourceUrl}: ${detected}. The renderer received a challenge page instead of the requested content, so no archive was written.`
  );
  error.code = "ARCHIVE_CAPTURE_BLOCKED";
  error.reason = detected;
  throw error;
}
|
||||||
|
|
||||||
|
/**
 * Check captured HTML against BLOCKED_CAPTURE_PATTERNS.
 *
 * A rule fires when its `any` list (if present) has at least one match AND
 * its `all` list (if present) matches completely. Rules are tried in order.
 *
 * @param {string} html - Captured HTML; null/undefined is treated as empty.
 * @returns {string|null} The `reason` of the first rule that fires, or null.
 */
function detectBlockedCapture(html) {
  const text = String(html ?? "");
  for (const { reason, any, all } of BLOCKED_CAPTURE_PATTERNS) {
    // A rule with no patterns at all would otherwise match every page.
    if (!any && !all) {
      continue;
    }
    const anyMatched = !any || any.some((pattern) => pattern.test(text));
    const allMatched = !all || all.every((pattern) => pattern.test(text));
    if (anyMatched && allMatched) {
      return reason;
    }
  }
  return null;
}
|
||||||
|
|
||||||
async function settlePage(page, options) {
|
async function settlePage(page, options) {
|
||||||
try {
|
try {
|
||||||
await page.waitForLoadState("networkidle", { timeout: options.networkIdleTimeoutMs || 15000 });
|
await page.waitForLoadState("networkidle", { timeout: options.networkIdleTimeoutMs || 15000 });
|
||||||
@@ -178,6 +248,8 @@ async function cleanupAndFreezePage(page, options) {
|
|||||||
await page.evaluate(
|
await page.evaluate(
|
||||||
({
|
({
|
||||||
adSelectors,
|
adSelectors,
|
||||||
|
antiAdblockProviderHostPatterns,
|
||||||
|
antiAdblockTextPatterns,
|
||||||
freezeStyles,
|
freezeStyles,
|
||||||
maxFreezeElements,
|
maxFreezeElements,
|
||||||
maxSanitizeElements,
|
maxSanitizeElements,
|
||||||
@@ -212,6 +284,7 @@ async function cleanupAndFreezePage(page, options) {
|
|||||||
// Ignore unsupported selectors in older browser engines.
|
// Ignore unsupported selectors in older browser engines.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
removeAntiAdblockOverlays();
|
||||||
}
|
}
|
||||||
|
|
||||||
document.querySelectorAll("img").forEach((img) => {
|
document.querySelectorAll("img").forEach((img) => {
|
||||||
@@ -250,6 +323,147 @@ async function cleanupAndFreezePage(page, options) {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
/**
 * Remove anti-adblock roadblocks from the live document.
 *
 * Walks at most `maxSanitizeElements` elements under <body>, collects those
 * that carry an anti-adblock signal, resolves each to the container that
 * should be removed and removes it. When anything was removed, the inline
 * overflow/position/inset styles on <html> and <body> are cleared.
 *
 * Reads `antiAdblockTextPatterns` and `maxSanitizeElements` from the
 * enclosing scope.
 */
function removeAntiAdblockOverlays() {
  // Nothing to walk when the document has no <body>.
  if (!document.body) {
    return;
  }

  const textPatterns = antiAdblockTextPatterns.map((pattern) => new RegExp(pattern, "i"));
  const candidates = new Set();
  const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_ELEMENT);
  let node = walker.currentNode;
  let visited = 0;

  while (node && visited < maxSanitizeElements) {
    if (node !== document.body && node !== document.documentElement && hasAntiAdblockSignal(node, textPatterns)) {
      candidates.add(node);
    }
    visited += 1;
    node = walker.nextNode();
  }

  let removed = 0;
  for (const candidate of candidates) {
    // Already detached together with a previously removed ancestor.
    if (candidate.isConnected === false) {
      continue;
    }
    const container = findRoadblockContainer(candidate, textPatterns);
    if (container && isLikelyAntiAdblockRoadblock(container, textPatterns)) {
      container.remove();
      removed += 1;
    }
  }

  if (removed > 0) {
    for (const element of [document.documentElement, document.body]) {
      element.style.removeProperty("overflow");
      element.style.removeProperty("position");
      element.style.removeProperty("inset");
    }
  }
}
|
||||||
|
|
||||||
|
/**
 * Starting at `node`, climb the ancestor chain and return the outermost
 * element that still looks like an anti-adblock roadblock.
 *
 * The climb never evaluates a direct child of <body> as an ancestor step and
 * stops once the parent holds more than 8000 characters of normalised text.
 *
 * @param {Element} node - Element that carried an anti-adblock signal.
 * @param {RegExp[]} textPatterns - Compiled anti-adblock text patterns.
 * @returns {Element|null} Container to remove, or null when none qualifies.
 */
function findRoadblockContainer(node, textPatterns) {
  const maxParentTextLength = 8000;
  let current = node;
  let best = null;
  while (current?.parentElement && current.parentElement !== document.body) {
    if (isLikelyAntiAdblockRoadblock(current, textPatterns)) {
      best = current;
    }
    const parentTextLength = normalizeText(current.parentElement.textContent || "").length;
    if (parentTextLength > maxParentTextLength) {
      break;
    }
    current = current.parentElement;
  }
  // The loop body does not run for direct children of <body>, so the start
  // node gets one final check here.
  return best || (isLikelyAntiAdblockRoadblock(node, textPatterns) ? node : null);
}
|
||||||
|
|
||||||
|
/**
 * Collapse every run of whitespace into a single space and trim both ends.
 *
 * @param {string} text
 * @returns {string}
 */
function normalizeText(text) {
  return text.replace(/\s+/g, " ").trim();
}
|
||||||
|
|
||||||
|
/**
 * True when the element shows any anti-adblock indicator: matching text, a
 * provider URL on the element itself, or a provider URL on a descendant.
 *
 * @param {Element} node
 * @param {RegExp[]} textPatterns - Compiled anti-adblock text patterns.
 * @returns {boolean}
 */
function hasAntiAdblockSignal(node, textPatterns) {
  return (
    hasAntiAdblockText(node, textPatterns) ||
    hasAntiAdblockProviderUrl(node) ||
    hasAntiAdblockProviderDescendant(node)
  );
}
|
||||||
|
|
||||||
|
/**
 * Decide whether an element is an anti-adblock roadblock that is safe to
 * remove.
 *
 * Requirements, in order:
 *   1. it is a real element other than <body>/<html>;
 *   2. it carries an anti-adblock signal (text or provider URL);
 *   3. it looks blocking: overlay-like, dialog semantics, a blocking class
 *      name, an action control, or a provider-URL descendant;
 *   4. if it sits inside <article>/<main>, only the three strong indicators
 *      (overlay-like, dialog semantics, blocking class name) are accepted, so
 *      ordinary article content that merely has a link is kept.
 *
 * @param {Element|null} node
 * @param {RegExp[]} textPatterns - Compiled anti-adblock text patterns.
 * @returns {boolean}
 */
function isLikelyAntiAdblockRoadblock(node, textPatterns) {
  if (!node || node === document.body || node === document.documentElement) {
    return false;
  }
  if (!hasAntiAdblockSignal(node, textPatterns)) {
    return false;
  }

  // Strong indicators are sufficient wherever the element lives.
  if (isOverlayLike(node) || hasDialogSemantics(node) || hasBlockingClassName(node)) {
    return true;
  }

  // Weak indicators only count outside the main content area.
  if (!hasActionControl(node) && !hasAntiAdblockProviderDescendant(node)) {
    return false;
  }
  return !node.closest("article, main");
}
|
||||||
|
|
||||||
|
/**
 * True when the element's normalised text is non-empty, at most 2000
 * characters long, and matches one of the anti-adblock patterns. The length
 * cap keeps large content containers from qualifying just because a short
 * notice sits somewhere inside them.
 *
 * @param {Element} node
 * @param {RegExp[]} textPatterns - Compiled anti-adblock text patterns.
 * @returns {boolean}
 */
function hasAntiAdblockText(node, textPatterns) {
  const maxTextLength = 2000;
  const text = normalizeText(node.textContent || "");
  if (text.length === 0 || text.length > maxTextLength) {
    return false;
  }
  return textPatterns.some((pattern) => pattern.test(text));
}
|
||||||
|
|
||||||
|
/**
 * True when any descendant link, iframe or script of the element points at an
 * anti-adblock provider host.
 *
 * @param {Element} node - Nodes without querySelectorAll yield false.
 * @returns {boolean}
 */
function hasAntiAdblockProviderDescendant(node) {
  const descendants = node.querySelectorAll?.("a[href], iframe[src], script[src]") || [];
  return Array.from(descendants).some((descendant) => hasAntiAdblockProviderUrl(descendant));
}
|
||||||
|
|
||||||
|
/**
 * True when the element's href, src or data-src value contains one of the
 * anti-adblock provider host names. The value is URL-decoded and lower-cased
 * first, so hosts hidden inside encoded redirect parameters are found too.
 *
 * Reads `antiAdblockProviderHostPatterns` from the enclosing scope.
 *
 * @param {Element} node
 * @returns {boolean}
 */
function hasAntiAdblockProviderUrl(node) {
  for (const attr of ["href", "src", "data-src"]) {
    const value = node.getAttribute?.(attr) || node[attr] || "";
    if (!value) {
      continue;
    }
    // Decode once per attribute instead of once per host pattern. String()
    // guards against property values that are objects rather than strings.
    const haystack = decodedText(String(value));
    if (antiAdblockProviderHostPatterns.some((host) => haystack.includes(host))) {
      return true;
    }
  }
  return false;
}
|
||||||
|
|
||||||
|
/**
 * Heuristic: does the element visually behave like an overlay?
 *
 * - fixed/sticky elements qualify with a z-index >= 10 OR when they cover at
 *   least 15% of the viewport;
 * - any other element qualifies only with a z-index >= 10 AND at least 45%
 *   viewport coverage.
 *
 * @param {Element} node
 * @returns {boolean}
 */
function isOverlayLike(node) {
  const style = window.getComputedStyle(node);
  const rect = node.getBoundingClientRect();

  // Guard against a zero-sized viewport so the ratio stays finite.
  const viewportArea = Math.max(1, window.innerWidth * window.innerHeight);
  const area = Math.max(0, rect.width) * Math.max(0, rect.height);
  const coverage = area / viewportArea;

  // "auto" parses to NaN and therefore never counts as a high z-index.
  const zIndex = Number.parseInt(style.zIndex, 10);
  const hasHighZIndex = Number.isFinite(zIndex) && zIndex >= 10;

  if (style.position === "fixed" || style.position === "sticky") {
    return hasHighZIndex || coverage >= 0.15;
  }
  return hasHighZIndex && coverage >= 0.45;
}
|
||||||
|
|
||||||
|
/**
 * True when the element is a <dialog>, has role="dialog"/"alertdialog", or
 * has aria-modal="true".
 *
 * @param {Element} node
 * @returns {boolean}
 */
function hasDialogSemantics(node) {
  // tagName is not upper-cased in XML/XHTML documents, so normalise it.
  if (String(node.tagName || "").toUpperCase() === "DIALOG") {
    return true;
  }
  const role = node.getAttribute?.("role");
  return role === "dialog" || role === "alertdialog" || node.getAttribute?.("aria-modal") === "true";
}
|
||||||
|
|
||||||
|
/**
 * True when the element's id or class list contains a word commonly used for
 * blocking UI (adblock, allow-ads, overlay, modal, interstitial, backdrop,
 * roadblock).
 *
 * @param {Element} node
 * @returns {boolean}
 */
function hasBlockingClassName(node) {
  // On SVG elements `className` is an object rather than a string, so it
  // cannot be interpolated directly; read the attribute instead.
  const className =
    typeof node.className === "string" ? node.className : node.getAttribute?.("class") || "";
  return /(?:ad[-_ ]?block|adblock|allow[-_ ]?ads|overlay|modal|interstitial|backdrop|roadblock)/i.test(
    `${node.id || ""} ${className}`
  );
}
|
||||||
|
|
||||||
|
/**
 * True when the element contains something the user is expected to click: a
 * button, an element with role="button", a link with href, or a
 * button/submit input.
 *
 * @param {Element} node - Nodes without querySelector yield false.
 * @returns {boolean}
 */
function hasActionControl(node) {
  const control = node.querySelector?.(
    'button, [role="button"], a[href], input[type="button"], input[type="submit"]'
  );
  return Boolean(control);
}
|
||||||
|
|
||||||
|
/**
 * URL-decode and lower-case a value for host matching. Malformed percent
 * sequences make decodeURIComponent throw; in that case the raw value is
 * lower-cased instead, because matching is best-effort.
 *
 * @param {string} value - Coerced to a string first.
 * @returns {string}
 */
function decodedText(value) {
  const text = String(value);
  try {
    return decodeURIComponent(text).toLowerCase();
  } catch {
    return text.toLowerCase();
  }
}
|
||||||
|
|
||||||
const walkedElements = [];
|
const walkedElements = [];
|
||||||
const walker = document.createTreeWalker(document.documentElement, NodeFilter.SHOW_ELEMENT);
|
const walker = document.createTreeWalker(document.documentElement, NodeFilter.SHOW_ELEMENT);
|
||||||
let element = document.documentElement;
|
let element = document.documentElement;
|
||||||
@@ -289,7 +503,9 @@ async function cleanupAndFreezePage(page, options) {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
adSelectors: AD_SELECTORS,
|
adSelectors: AD_SELECTORS,
|
||||||
freezeStyles: options.freezeStyles !== false,
|
antiAdblockProviderHostPatterns: ANTI_ADBLOCK_PROVIDER_HOST_PATTERNS,
|
||||||
|
antiAdblockTextPatterns: ANTI_ADBLOCK_TEXT_PATTERNS,
|
||||||
|
freezeStyles: options.freezeStyles === true,
|
||||||
maxFreezeElements: options.maxFreezeElements || 2500,
|
maxFreezeElements: options.maxFreezeElements || 2500,
|
||||||
maxSanitizeElements: options.maxSanitizeElements || 5000,
|
maxSanitizeElements: options.maxSanitizeElements || 5000,
|
||||||
stripAds: options.stripAds !== false,
|
stripAds: options.stripAds !== false,
|
||||||
@@ -333,7 +549,7 @@ function isTrackerUrl(rawUrl) {
|
|||||||
} catch {
|
} catch {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return TRACKER_HOST_PATTERNS.some((pattern) => host === pattern || host.endsWith(`.${pattern}`));
|
return BLOCKED_HOST_PATTERNS.some((pattern) => host === pattern || host.endsWith(`.${pattern}`));
|
||||||
}
|
}
|
||||||
|
|
||||||
function addArchiveComment(html, sourceUrl, options) {
|
function addArchiveComment(html, sourceUrl, options) {
|
||||||
@@ -346,7 +562,7 @@ function addArchiveComment(html, sourceUrl, options) {
|
|||||||
|
|
||||||
export function findExternalAssetRefs(html) {
|
export function findExternalAssetRefs(html) {
|
||||||
const refs = new Set();
|
const refs = new Set();
|
||||||
const attrPattern = /\b(?:src|srcset|poster|data)\s*=\s*(["'])([\s\S]*?)\1/gi;
|
const attrPattern = /\s(?:src|srcset|poster|data)\s*=\s*(["'])([\s\S]*?)\1/gi;
|
||||||
for (const match of html.matchAll(attrPattern)) {
|
for (const match of html.matchAll(attrPattern)) {
|
||||||
if (isSelfContainedAssetRef(match[2])) {
|
if (isSelfContainedAssetRef(match[2])) {
|
||||||
continue;
|
continue;
|
||||||
@@ -372,7 +588,7 @@ export function findExternalAssetRefs(html) {
|
|||||||
}
|
}
|
||||||
const cssUrlPattern = /url\(\s*(["']?)([^"')]+)\1\s*\)/gi;
|
const cssUrlPattern = /url\(\s*(["']?)([^"')]+)\1\s*\)/gi;
|
||||||
for (const match of html.matchAll(cssUrlPattern)) {
|
for (const match of html.matchAll(cssUrlPattern)) {
|
||||||
const candidate = match[2].trim();
|
const candidate = cleanCssUrl(match[2]);
|
||||||
if (candidate && !isSelfContainedAssetRef(candidate)) {
|
if (candidate && !isSelfContainedAssetRef(candidate)) {
|
||||||
refs.add(candidate);
|
refs.add(candidate);
|
||||||
}
|
}
|
||||||
@@ -381,10 +597,11 @@ export function findExternalAssetRefs(html) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
 * True when an asset reference needs no external fetch: it is empty, a
 * same-document fragment ("#..." or its percent-encoded form "%23..."), or
 * uses one of the data/about/javascript/mailto/tel schemes.
 *
 * @param {string} value - Raw attribute or CSS url() value; it is passed
 *   through cleanCssUrl() before being inspected.
 * @returns {boolean}
 */
function isSelfContainedAssetRef(value) {
  const trimmed = cleanCssUrl(value);
  return (
    !trimmed ||
    trimmed.startsWith("#") ||
    /^%23/i.test(trimmed) ||
    /^(?:data|about|javascript|mailto|tel):/i.test(trimmed)
  );
}
|
||||||
@@ -393,3 +610,17 @@ function readAttribute(tag, attr) {
|
|||||||
const match = tag.match(new RegExp(`\\b${attr}\\s*=\\s*("([^"]*)"|'([^']*)'|([^\\s>]+))`, "i"));
|
const match = tag.match(new RegExp(`\\b${attr}\\s*=\\s*("([^"]*)"|'([^']*)'|([^\\s>]+))`, "i"));
|
||||||
return match ? match[2] ?? match[3] ?? match[4] ?? "" : "";
|
return match ? match[2] ?? match[3] ?? match[4] ?? "" : "";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Normalise a raw CSS url() operand taken from HTML source: trim it, decode
 * the HTML entities for `&`, `"` and `'`, then strip one pair of matching
 * surrounding quotes.
 *
 * Entities are decoded in a single pass so an escaped entity such as
 * `&amp;quot;` becomes the literal text `&quot;` rather than being decoded
 * twice into a quote character.
 *
 * @param {*} value - Coerced to a string.
 * @returns {string}
 */
function cleanCssUrl(value) {
  const entities = {
    "&amp;": "&",
    "&quot;": '"',
    "&#39;": "'",
    "&#x27;": "'",
    "&apos;": "'"
  };
  const decoded = String(value)
    .trim()
    .replace(/&(?:amp|quot|apos|#39|#x27);/gi, (entity) => entities[entity.toLowerCase()]);
  const quote = decoded[0];
  if ((quote === '"' || quote === "'") && decoded.at(-1) === quote) {
    return decoded.slice(1, -1).trim();
  }
  return decoded;
}
|
||||||
|
|||||||
@@ -92,6 +92,7 @@ export function resolveUrl(rawUrl, baseUrl) {
|
|||||||
if (
|
if (
|
||||||
!trimmed ||
|
!trimmed ||
|
||||||
trimmed.startsWith("#") ||
|
trimmed.startsWith("#") ||
|
||||||
|
/^%23/i.test(trimmed) ||
|
||||||
/^(?:about|blob|data|javascript|mailto|tel):/i.test(trimmed)
|
/^(?:about|blob|data|javascript|mailto|tel):/i.test(trimmed)
|
||||||
) {
|
) {
|
||||||
return trimmed;
|
return trimmed;
|
||||||
@@ -182,6 +183,7 @@ export class AssetInliner {
|
|||||||
this.userAgent = options.userAgent || DEFAULT_USER_AGENT;
|
this.userAgent = options.userAgent || DEFAULT_USER_AGENT;
|
||||||
this.referer = options.referer;
|
this.referer = options.referer;
|
||||||
this.maxAssetBytes = options.maxAssetBytes || 30 * 1024 * 1024;
|
this.maxAssetBytes = options.maxAssetBytes || 30 * 1024 * 1024;
|
||||||
|
this.maxInlineStyleBytes = options.maxInlineStyleBytes || 128 * 1024;
|
||||||
this.cache = new Map();
|
this.cache = new Map();
|
||||||
this.warnings = [];
|
this.warnings = [];
|
||||||
}
|
}
|
||||||
@@ -195,6 +197,10 @@ export class AssetInliner {
|
|||||||
output = output.replace(/<script\b[\s\S]*?<\/script>/gi, "");
|
output = output.replace(/<script\b[\s\S]*?<\/script>/gi, "");
|
||||||
output = output.replace(/<noscript\b[\s\S]*?<\/noscript>/gi, "");
|
output = output.replace(/<noscript\b[\s\S]*?<\/noscript>/gi, "");
|
||||||
output = output.replace(/<link\b[^>]*\brel=(["']?)(?:preconnect|dns-prefetch|modulepreload)\1[^>]*>/gi, "");
|
output = output.replace(/<link\b[^>]*\brel=(["']?)(?:preconnect|dns-prefetch|modulepreload)\1[^>]*>/gi, "");
|
||||||
|
output = output.replace(
|
||||||
|
/<link\b(?=[^>]*\brel=(["']?)preload\1)(?=[^>]*\bas=(["']?)script\2)[^>]*>/gi,
|
||||||
|
""
|
||||||
|
);
|
||||||
|
|
||||||
output = await replaceAsync(output, /<style\b([^>]*)>([\s\S]*?)<\/style>/gi, async (match) => {
|
output = await replaceAsync(output, /<style\b([^>]*)>([\s\S]*?)<\/style>/gi, async (match) => {
|
||||||
const attrs = match[1] || "";
|
const attrs = match[1] || "";
|
||||||
@@ -203,7 +209,15 @@ export class AssetInliner {
|
|||||||
});
|
});
|
||||||
|
|
||||||
output = await replaceAsync(output, /\sstyle=(["'])([\s\S]*?)\1/gi, async (match) => {
|
output = await replaceAsync(output, /\sstyle=(["'])([\s\S]*?)\1/gi, async (match) => {
|
||||||
const css = await this.inlineCss(htmlDecode(match[2]), effectiveBase);
|
const decoded = htmlDecode(match[2]);
|
||||||
|
if (!hasExternalCssReference(decoded)) {
|
||||||
|
return match[0];
|
||||||
|
}
|
||||||
|
if (decoded.length > this.maxInlineStyleBytes) {
|
||||||
|
this.warnings.push(`Skipped inline style asset rewrite: ${decoded.length} bytes exceeds ${this.maxInlineStyleBytes}`);
|
||||||
|
return match[0];
|
||||||
|
}
|
||||||
|
const css = await this.inlineCss(decoded, effectiveBase);
|
||||||
return ` style=${match[1]}${htmlEscape(css)}${match[1]}`;
|
return ` style=${match[1]}${htmlEscape(css)}${match[1]}`;
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -260,6 +274,9 @@ export class AssetInliner {
|
|||||||
/\b(?:icon|apple-touch-icon|image_src)\b/i.test(rel) ||
|
/\b(?:icon|apple-touch-icon|image_src)\b/i.test(rel) ||
|
||||||
(/\bpreload\b/i.test(rel) && /^(?:font|image|style)$/i.test(asValue));
|
(/\bpreload\b/i.test(rel) && /^(?:font|image|style)$/i.test(asValue));
|
||||||
if (!isInlineableLink) {
|
if (!isInlineableLink) {
|
||||||
|
if (/\bpreload\b/i.test(rel) && /^script$/i.test(asValue)) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
return tag;
|
return tag;
|
||||||
}
|
}
|
||||||
if (/\bpreload\b/i.test(rel) && /^style$/i.test(asValue)) {
|
if (/\bpreload\b/i.test(rel) && /^style$/i.test(asValue)) {
|
||||||
@@ -341,8 +358,8 @@ export class AssetInliner {
|
|||||||
);
|
);
|
||||||
|
|
||||||
output = await replaceAsync(output, /url\(\s*(["']?)([^"')]+)\1\s*\)/gi, async (match) => {
|
output = await replaceAsync(output, /url\(\s*(["']?)([^"')]+)\1\s*\)/gi, async (match) => {
|
||||||
const raw = htmlDecode(match[2].trim());
|
const raw = cleanCssUrl(match[2]);
|
||||||
if (!raw || raw.startsWith("#") || /^(?:data|blob|about|javascript):/i.test(raw)) {
|
if (!raw || raw.startsWith("#") || /^%23/i.test(raw) || /^(?:data|blob|about|javascript):/i.test(raw)) {
|
||||||
return match[0];
|
return match[0];
|
||||||
}
|
}
|
||||||
const dataUri = await this.toDataUri(raw, baseUrl);
|
const dataUri = await this.toDataUri(raw, baseUrl);
|
||||||
@@ -444,7 +461,7 @@ export class AssetInliner {
|
|||||||
|
|
||||||
/**
 * Strip bookkeeping attributes that record where an asset originally came
 * from (old-src, currentSourceUrl, data-original-src, data-archived-src).
 * The attribute is removed whatever its quoted value is, not only when the
 * value is an http(s) URL.
 *
 * @param {string} html
 * @returns {string} HTML without those attributes.
 */
function removeExternalBookkeepingUrls(html) {
  return html.replace(
    /\s(?:old-src|currentSourceUrl|data-original-src|data-archived-src)=(["'])([\s\S]*?)\1/gi,
    ""
  );
}
|
||||||
@@ -505,7 +522,7 @@ function setAttribute(tag, attr, value) {
|
|||||||
if (attrRegex.test(tag)) {
|
if (attrRegex.test(tag)) {
|
||||||
return tag.replace(attrRegex, `${attr}="${escaped}"`);
|
return tag.replace(attrRegex, `${attr}="${escaped}"`);
|
||||||
}
|
}
|
||||||
return tag.replace(/\/?>$/, (end) => ` ${attr}="${escaped}"${end}`);
|
return tag.replace(/^<[^>]*>/, (openingTag) => openingTag.replace(/\/?>$/, (end) => ` ${attr}="${escaped}"${end}`));
|
||||||
}
|
}
|
||||||
|
|
||||||
function removeAttribute(tag, attr) {
|
function removeAttribute(tag, attr) {
|
||||||
@@ -519,3 +536,25 @@ function replaceMissingMediaAttribute(tag, attr) {
|
|||||||
}
|
}
|
||||||
return removeAttribute(tag, attr);
|
return removeAttribute(tag, attr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * True when a CSS fragment references something that would have to be
 * fetched: any @import, or a url() whose target is not empty, not a
 * fragment ("#..." / "%23...") and not a data/blob/about/javascript URL.
 * Used to skip rewriting inline style attributes that have nothing to inline.
 *
 * @param {string} css - Already HTML-decoded CSS text.
 * @returns {boolean}
 */
function hasExternalCssReference(css) {
  if (/@import\b/i.test(css)) {
    return true;
  }
  for (const match of css.matchAll(/url\(\s*(["']?)([^"')]+)\1\s*\)/gi)) {
    const raw = cleanCssUrl(match[2]);
    const isLocal =
      !raw ||
      raw.startsWith("#") ||
      /^%23/i.test(raw) ||
      /^(?:data|blob|about|javascript):/i.test(raw);
    if (!isLocal) {
      return true;
    }
  }
  return false;
}
|
||||||
|
|
||||||
|
/**
 * Normalise a raw CSS url() operand: trim it, HTML-decode it via
 * htmlDecode(), then strip one pair of matching surrounding quotes.
 *
 * @param {*} value - Coerced to a string.
 * @returns {string}
 */
function cleanCssUrl(value) {
  const decoded = htmlDecode(String(value).trim());
  const quote = decoded[0];
  const isQuoted = (quote === '"' || quote === "'") && decoded.at(-1) === quote;
  return isQuoted ? decoded.slice(1, -1).trim() : decoded;
}
|
||||||
|
|||||||
@@ -36,10 +36,13 @@ Options:
|
|||||||
--id <id> Output id/file stem
|
--id <id> Output id/file stem
|
||||||
--static Do not use a browser; transform the input HTML only
|
--static Do not use a browser; transform the input HTML only
|
||||||
--render Force browser rendering for local archive-shell HTML
|
--render Force browser rendering for local archive-shell HTML
|
||||||
|
--freeze-styles Snapshot computed styles into inline style attributes
|
||||||
--strip-archive-shell Remove an archive.ph shell from an already archived HTML file
|
--strip-archive-shell Remove an archive.ph shell from an already archived HTML file
|
||||||
--no-strip-ads Keep ad-like elements
|
--no-strip-ads Keep ad-like elements
|
||||||
--user-agent <ua> User agent to send for page and asset requests
|
--user-agent <ua> User agent to send for page and asset requests
|
||||||
--max-asset-bytes <bytes> Per-asset inline limit
|
--max-asset-bytes <bytes> Per-asset inline limit
|
||||||
|
--max-inline-style-bytes <bytes>
|
||||||
|
Per-style-attribute inline rewrite limit
|
||||||
|
|
||||||
Default user agent:
|
Default user agent:
|
||||||
${DEFAULT_USER_AGENT}`);
|
${DEFAULT_USER_AGENT}`);
|
||||||
@@ -64,12 +67,14 @@ async function main() {
|
|||||||
const result = await archivePage(input, {
|
const result = await archivePage(input, {
|
||||||
archivePath: args["archive-path"],
|
archivePath: args["archive-path"],
|
||||||
id: args.id,
|
id: args.id,
|
||||||
|
freezeStyles: Boolean(args["freeze-styles"]),
|
||||||
render: Boolean(args.render),
|
render: Boolean(args.render),
|
||||||
static: Boolean(args.static),
|
static: Boolean(args.static),
|
||||||
stripArchiveShell: Boolean(args["strip-archive-shell"]),
|
stripArchiveShell: Boolean(args["strip-archive-shell"]),
|
||||||
stripAds: args["strip-ads"] !== false,
|
stripAds: args["strip-ads"] !== false,
|
||||||
userAgent: args["user-agent"] || DEFAULT_USER_AGENT,
|
userAgent: args["user-agent"] || DEFAULT_USER_AGENT,
|
||||||
maxAssetBytes: args["max-asset-bytes"] ? Number(args["max-asset-bytes"]) : undefined
|
maxAssetBytes: args["max-asset-bytes"] ? Number(args["max-asset-bytes"]) : undefined,
|
||||||
|
maxInlineStyleBytes: args["max-inline-style-bytes"] ? Number(args["max-inline-style-bytes"]) : undefined
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(`Archived: ${result.sourceUrl}`);
|
console.log(`Archived: ${result.sourceUrl}`);
|
||||||
|
|||||||
@@ -21,12 +21,14 @@ const server = http.createServer(async (req, res) => {
|
|||||||
const result = await archivePage(body.url, {
|
const result = await archivePage(body.url, {
|
||||||
archivePath,
|
archivePath,
|
||||||
id: body.id,
|
id: body.id,
|
||||||
|
freezeStyles: Boolean(body.freezeStyles),
|
||||||
render: Boolean(body.render),
|
render: Boolean(body.render),
|
||||||
static: Boolean(body.static),
|
static: Boolean(body.static),
|
||||||
stripArchiveShell: Boolean(body.stripArchiveShell),
|
stripArchiveShell: Boolean(body.stripArchiveShell),
|
||||||
stripAds: body.stripAds !== false,
|
stripAds: body.stripAds !== false,
|
||||||
userAgent: body.userAgent || DEFAULT_USER_AGENT,
|
userAgent: body.userAgent || DEFAULT_USER_AGENT,
|
||||||
maxAssetBytes: body.maxAssetBytes
|
maxAssetBytes: body.maxAssetBytes,
|
||||||
|
maxInlineStyleBytes: body.maxInlineStyleBytes
|
||||||
});
|
});
|
||||||
return sendJson(res, 201, {
|
return sendJson(res, 201, {
|
||||||
id: result.id,
|
id: result.id,
|
||||||
|
|||||||
Reference in New Issue
Block a user