import fs from "node:fs/promises"; import path from "node:path"; import { createRequire } from "node:module"; import { fileURLToPath } from "node:url"; import { AssetInliner, DEFAULT_USER_AGENT, defaultArchivePath, findEffectiveBase, inputToUrl, isHttpUrl, splitSrcset, slugForUrl } from "./asset-inliner.mjs"; const require = createRequire(import.meta.url); const __dirname = path.dirname(fileURLToPath(import.meta.url)); const PAGE_TIMEOUT_MS = 60000; const NETWORK_IDLE_TIMEOUT_MS = 5000; const VIEWPORT = { width: 1366, height: 768 }; const COMMON_ANNOYANCE_SELECTORS = [ "[id^=\"sp_message_container_\"]", "iframe[id^=\"sp_message_iframe_\"]", "iframe[title*=\"consent\" i]", "iframe[title*=\"privacy manager\" i]", "#onetrust-consent-sdk", "#onetrust-banner-sdk", "#didomi-host", "#qc-cmp2-container", ".qc-cmp2-container", "#CybotCookiebotDialog", ".iubenda-cs-container", "#cmpwrapper", "[id^=\"cmpbox\"]", ".fc-consent-root", ".fc-dialog-container", "[aria-modal=\"true\"][id*=\"consent\" i]", "[aria-modal=\"true\"][id*=\"cookie\" i]", "[role=\"dialog\"][aria-label*=\"cookie\" i]", "[role=\"dialog\"][aria-label*=\"consent\" i]", "[id*=\"cookie-banner\" i]", "[class*=\"cookie-banner\" i]", "[id*=\"cookie-consent\" i]", "[class*=\"cookie-consent\" i]", "[id*=\"cookie-notice\" i]", "[class*=\"cookie-notice\" i]", "[id*=\"cookie-popup\" i]", "[class*=\"cookie-popup\" i]", "[id*=\"adblock\" i]", "[class*=\"adblock\" i]", "[id*=\"ad-block\" i]", "[class*=\"ad-block\" i]" ]; const COMMON_ANNOYANCE_ROOT_CLASSES = [ "sp-message-open", "didomi-popup-open", "qc-cmp-ui-showing", "ot-sdk-show-settings", "iubenda-cs-visible" ]; const COMMON_ANNOYANCE_CSS = ` ${COMMON_ANNOYANCE_SELECTORS.join(",\n")} { display: none !important; visibility: hidden !important; pointer-events: none !important; } html.sp-message-open, body.sp-message-open, html.didomi-popup-open, body.didomi-popup-open, html.qc-cmp-ui-showing, body.qc-cmp-ui-showing, html.iubenda-cs-visible, body.iubenda-cs-visible { 
overflow: auto !important; position: static !important; } `; export { DEFAULT_USER_AGENT, defaultArchivePath }; // --------------------------------------------------------------------------- // Privacy filters integration // --------------------------------------------------------------------------- const PRIVACY_FILTERS_DIR = path.join(__dirname, "..", "privacy-filters"); const FILTER_LIST_FILES = [ { id: "bpc-paywall", file: "bpc-paywall-filter.txt" }, { id: "easylist", file: path.join("lists", "easylist.txt") }, { id: "ublock-filters", file: path.join("lists", "ublock-filters.txt") }, { id: "easylist-cookie", file: path.join("lists", "easylist-cookie.txt") }, { id: "ublock-annoyances", file: path.join("lists", "ublock-annoyances.txt") }, { id: "ublock-cookies", file: path.join("lists", "ublock-cookies.txt") } ]; let privacyFiltersAvailable = false; let filterRules = emptyFilterRules(); let userScriptData = []; // { file, content, matches, excludes } let userScriptRequireContent = ""; async function loadPrivacyFilters() { try { const filterSets = []; for (const list of FILTER_LIST_FILES) { try { const filterPath = path.join(PRIVACY_FILTERS_DIR, list.file); const filterContent = await fs.readFile(filterPath, "utf8"); filterSets.push(parseFilterRules(filterContent, { source: list.id })); } catch (error) { if (error?.code !== "ENOENT") { throw error; } } } filterRules = mergeFilterRules(filterSets); const userscriptDir = path.join(PRIVACY_FILTERS_DIR, "userscript"); userScriptRequireContent = await fs.readFile(path.join(userscriptDir, "bpc_func.js"), "utf8"); userScriptData = []; const userScriptFiles = [ "bpc.en.user.js", "bpc.de.user.js", "bpc.es.pt.user.js", "bpc.fi.se.user.js", "bpc.fr.user.js", "bpc.it.user.js", "bpc.nl.user.js", "bpc.pl.user.js" ]; for (const file of userScriptFiles) { const content = await fs.readFile(path.join(userscriptDir, file), "utf8"); const meta = parseUserScriptMetadata(content); userScriptData.push({ file, content, ...meta }); } 
privacyFiltersAvailable = filterRules.blockRules.length > 0 || filterRules.importantBlockRules.length > 0 || filterRules.allowRules.length > 0 || filterRules.importantAllowRules.length > 0 || filterRules.cosmeticRules.length > 0 || userScriptData.length > 0; } catch { // Privacy filters directory missing or unreadable; archive without them. } } await loadPrivacyFilters(); // --- Adblock filter parsing ------------------------------------------------ const COSMETIC_SEPARATORS = [ { token: "#@?#", kind: "cosmeticException", extended: true }, { token: "#@$#", kind: "styleException" }, { token: "#@%#", kind: "scriptException" }, { token: "#@^", kind: "htmlException" }, { token: "#@#", kind: "cosmeticException" }, { token: "#?#", kind: "extendedCosmetic", extended: true }, { token: "#$#", kind: "style" }, { token: "#%#", kind: "script" }, { token: "#^", kind: "html" }, { token: "##", kind: "cosmetic" } ]; const RESOURCE_TYPE_ALIASES = new Map([ ["beacon", "ping"], ["css", "stylesheet"], ["doc", "document"], ["document", "document"], ["fetch", "xmlhttprequest"], ["font", "font"], ["frame", "subdocument"], ["image", "image"], ["inline-script", "inline-script"], ["media", "media"], ["object", "object"], ["object-subrequest", "object"], ["other", "other"], ["ping", "ping"], ["script", "script"], ["stylesheet", "stylesheet"], ["subdocument", "subdocument"], ["websocket", "websocket"], ["xhr", "xmlhttprequest"], ["xmlhttprequest", "xmlhttprequest"] ]); const SKIP_NETWORK_OPTION_NAMES = new Set([ "cookie", "csp", "cname", "denyallow", "ehide", "elemhide", "ghide", "genericblock", "generichide", "header", "ipaddress", "jsonprune", "method", "permissions", "popunder", "popup", "queryprune", "redirect", "redirect-rule", "removeparam", "replace", "rewrite", "shide", "specifichide", "uritransform", "urlskip", "webrtc", "xmlprune" ]); const MULTI_PART_PUBLIC_SUFFIXES = new Set([ "ac.uk", "co.jp", "co.nz", "co.uk", "com.au", "com.br", "com.mx", "com.tr", "com.tw", "com.cn", "net.au", 
"net.nz", "org.au", "org.nz", "org.uk" ]); function emptyFilterRules() { return { blockRules: [], importantBlockRules: [], allowRules: [], importantAllowRules: [], cosmeticRules: [], cosmeticExceptionRules: [], badFilterKeys: new Set(), sourceFiles: [], blockRuleIndex: null, importantBlockRuleIndex: null, allowRuleIndex: null, importantAllowRuleIndex: null }; } function mergeFilterRules(filterSets) { const merged = emptyFilterRules(); for (const set of filterSets) { merged.blockRules.push(...set.blockRules); merged.importantBlockRules.push(...set.importantBlockRules); merged.allowRules.push(...set.allowRules); merged.importantAllowRules.push(...set.importantAllowRules); merged.cosmeticRules.push(...set.cosmeticRules); merged.cosmeticExceptionRules.push(...set.cosmeticExceptionRules); merged.sourceFiles.push(...set.sourceFiles); for (const key of set.badFilterKeys) { merged.badFilterKeys.add(key); } } if (merged.badFilterKeys.size > 0) { const isActive = (rule) => !merged.badFilterKeys.has(rule.key); merged.blockRules = merged.blockRules.filter(isActive); merged.importantBlockRules = merged.importantBlockRules.filter(isActive); merged.allowRules = merged.allowRules.filter(isActive); merged.importantAllowRules = merged.importantAllowRules.filter(isActive); } return finalizeFilterRules(merged); } export function parseFilterRules(content, options = {}) { const rules = emptyFilterRules(); if (options.source) { rules.sourceFiles.push(options.source); } let preprocessorDepth = 0; for (const rawLine of content.split("\n")) { const line = rawLine.trim(); if (!line) continue; if (line.startsWith("!#if")) { preprocessorDepth += 1; continue; } if (line.startsWith("!#endif")) { preprocessorDepth = Math.max(0, preprocessorDepth - 1); continue; } if (preprocessorDepth > 0 || line.startsWith("!#") || line.startsWith("!") || line.startsWith("[")) { continue; } const cosmetic = parseCosmeticFilterLine(line, options.source); if (cosmetic) { if (cosmetic.kind === "cosmeticException") 
{ rules.cosmeticExceptionRules.push(cosmetic); } else if (cosmetic.kind === "cosmetic") { rules.cosmeticRules.push(cosmetic); } continue; } if (cosmetic === false) { continue; } const isException = line.startsWith("@@"); const networkLine = isException ? line.slice(2) : line; const rule = parseNetworkRule(networkLine, { exception: isException, source: options.source }); if (!rule) continue; if (rule.badfilter) { rules.badFilterKeys.add(rule.key); } else if (isException && rule.important) { rules.importantAllowRules.push(rule); } else if (isException) { rules.allowRules.push(rule); } else if (rule.important) { rules.importantBlockRules.push(rule); } else { rules.blockRules.push(rule); } } return finalizeFilterRules(rules); } function finalizeFilterRules(rules) { if (rules.badFilterKeys.size > 0) { const isActive = (rule) => !rules.badFilterKeys.has(rule.key); rules.blockRules = rules.blockRules.filter(isActive); rules.importantBlockRules = rules.importantBlockRules.filter(isActive); rules.allowRules = rules.allowRules.filter(isActive); rules.importantAllowRules = rules.importantAllowRules.filter(isActive); } rules.blockRuleIndex = buildNetworkRuleIndex(rules.blockRules); rules.importantBlockRuleIndex = buildNetworkRuleIndex(rules.importantBlockRules); rules.allowRuleIndex = buildNetworkRuleIndex(rules.allowRules); rules.importantAllowRuleIndex = buildNetworkRuleIndex(rules.importantAllowRules); return rules; } function buildNetworkRuleIndex(rules) { const byDomain = new Map(); const wildcardDomainRules = []; const otherRules = []; for (const rule of rules) { if (rule.kind !== "domain") { otherRules.push(rule); continue; } if (rule.domain.includes("*")) { wildcardDomainRules.push(rule); continue; } const bucket = byDomain.get(rule.domain) || []; bucket.push(rule); byDomain.set(rule.domain, bucket); } return { byDomain, wildcardDomainRules, otherRules }; } function parseCosmeticFilterLine(line, source) { const separator = findCosmeticSeparator(line); if (!separator) 
return null; const domains = line.slice(0, separator.index); const body = line.slice(separator.index + separator.token.length).trim(); if (!body) return false; if (separator.kind === "cosmeticException") { return { kind: "cosmeticException", domains, selector: cosmeticSelectorKey(body), source }; } if (separator.kind === "style") { const css = adguardStyleRuleToCss(body); return css ? { kind: "cosmetic", domains, selector: cosmeticSelectorKey(body), css, source } : false; } if (separator.kind !== "cosmetic" || separator.extended) { return false; } const css = cosmeticSelectorToCss(body); return css ? { kind: "cosmetic", domains, selector: cosmeticSelectorKey(body), css, source } : false; } function findCosmeticSeparator(line) { let best = null; for (const separator of COSMETIC_SEPARATORS) { const index = line.indexOf(separator.token); if ( index >= 0 && (!best || index < best.index || (index === best.index && separator.token.length > best.token.length)) ) { best = { ...separator, index }; } } return best; } function cosmeticSelectorKey(selector) { return selector.trim().replace(/\s+/g, " "); } function parseNetworkRule(line, options = {}) { const split = splitNetworkOptions(line); const parsedOptions = parseNetworkOptions(split.options); const key = networkRuleKey(split.pattern, parsedOptions.optionsForKey, options.exception); if (parsedOptions.badfilter) { return { badfilter: true, key }; } if (!split.pattern || parsedOptions.skip) { return null; } const compiled = compileNetworkPattern(split.pattern, parsedOptions.matchCase); if (!compiled) { return null; } return { ...compiled, key, types: parsedOptions.types, excludedTypes: parsedOptions.excludedTypes, isThirdParty: parsedOptions.isThirdParty, isFirstParty: parsedOptions.isFirstParty, includeDomains: parsedOptions.includeDomains, excludeDomains: parsedOptions.excludeDomains, includeTargetDomains: parsedOptions.includeTargetDomains, excludeTargetDomains: parsedOptions.excludeTargetDomains, important: 
parsedOptions.important, source: options.source }; } function splitNetworkOptions(line) { const lastDollar = line.lastIndexOf("$"); if (lastDollar <= 0) { return { pattern: line, options: [] }; } const optionText = line.slice(lastDollar + 1); if (!looksLikeFilterOptions(optionText)) { return { pattern: line, options: [] }; } return { pattern: line.slice(0, lastDollar), options: splitFilterOptions(optionText) }; } function looksLikeFilterOptions(optionText) { if (!optionText || /\s/.test(optionText)) return false; const firstOption = optionText.split(",", 1)[0]; return /^~?[a-z][a-z0-9_-]*(?:=|$)/i.test(firstOption); } function splitFilterOptions(optionText) { return optionText .split(",") .map((option) => option.trim()) .filter(Boolean); } function parseNetworkOptions(options) { const parsed = { types: [], excludedTypes: [], isThirdParty: false, isFirstParty: false, includeDomains: [], excludeDomains: [], includeTargetDomains: [], excludeTargetDomains: [], important: false, matchCase: false, badfilter: false, skip: false, optionsForKey: [] }; for (const rawOption of options) { const option = rawOption.trim(); if (!option) continue; const negated = option.startsWith("~"); const optionBody = negated ? option.slice(1) : option; const eqIndex = optionBody.indexOf("="); const name = (eqIndex >= 0 ? optionBody.slice(0, eqIndex) : optionBody).toLowerCase(); const value = eqIndex >= 0 ? 
optionBody.slice(eqIndex + 1) : ""; if (name !== "badfilter") { parsed.optionsForKey.push(option); } if (name === "badfilter") { parsed.badfilter = true; continue; } if (name === "important") { parsed.important = true; continue; } if (name === "match-case") { parsed.matchCase = true; continue; } if (name === "third-party" || name === "3p" || name === "strict3p") { parsed.isThirdParty = !negated; parsed.isFirstParty = negated; continue; } if (name === "first-party" || name === "1p" || name === "strict1p") { parsed.isFirstParty = !negated; parsed.isThirdParty = negated; continue; } if (name === "domain" || name === "from") { const domains = parseDomainOptionValue(value); parsed.includeDomains.push(...domains.include); parsed.excludeDomains.push(...domains.exclude); continue; } if (name === "to") { const domains = parseDomainOptionValue(value); parsed.includeTargetDomains.push(...domains.include); parsed.excludeTargetDomains.push(...domains.exclude); continue; } const resourceType = RESOURCE_TYPE_ALIASES.get(name); if (resourceType) { if (negated) { parsed.excludedTypes.push(resourceType); } else { parsed.types.push(resourceType); } continue; } if (SKIP_NETWORK_OPTION_NAMES.has(name) || eqIndex >= 0) { parsed.skip = true; } } return parsed; } function parseDomainOptionValue(value) { const include = []; const exclude = []; if (!value) { return { include, exclude }; } for (const rawDomain of value.split("|")) { const domain = rawDomain.trim().toLowerCase(); if (!domain) continue; if (domain.startsWith("~")) { exclude.push(domain.slice(1)); } else { include.push(domain); } } return { include, exclude }; } function networkRuleKey(pattern, options, exception) { return `${exception ? "@@" : ""}${pattern}${options.length > 0 ? `$${options.join(",")}` : ""}`; } function compileNetworkPattern(pattern, matchCase) { const flags = matchCase ? 
"" : "i"; if (pattern.startsWith("||")) { const domainRule = parseDomainAnchoredPattern(pattern, flags); return domainRule; } if (pattern.startsWith("/") && pattern.endsWith("/") && pattern.length > 1) { try { return { kind: "regex", regex: new RegExp(pattern.slice(1, -1), flags) }; } catch { return null; } } try { return { kind: "pattern", regex: new RegExp(adblockPatternToRegex(pattern), flags) }; } catch { return null; } } function parseDomainAnchoredPattern(pattern, flags) { const domainPath = pattern.slice(2); let domainEnd = 0; while ( domainEnd < domainPath.length && domainPath[domainEnd] !== "/" && domainPath[domainEnd] !== "^" ) { domainEnd += 1; } const domain = domainPath.slice(0, domainEnd).toLowerCase(); if (!domain || /[\\[\]{}()]/.test(domain)) { return null; } const suffix = domainPath.slice(domainEnd); let path = ""; if (suffix.startsWith("/")) { path = suffix; } else if (suffix.startsWith("^/")) { path = suffix.slice(1); } else if (suffix && suffix !== "^") { return null; } let pathRegex = null; if (path) { try { pathRegex = new RegExp("^" + adblockPatternToRegex(path), flags); } catch { return null; } } return { kind: "domain", domain, path, pathRegex }; } function adguardStyleRuleToCss(rule) { if (!rule.includes("{") || !rule.includes("}") || /[\r\n]/.test(rule)) { return null; } return rule; } function cosmeticSelectorToCss(selector) { const trimmed = selector.trim(); if (!trimmed || trimmed.startsWith("+js") || trimmed.startsWith("^")) { return null; } if (trimmed.endsWith(":remove()")) { const baseSelector = trimmed.slice(0, -":remove()".length); return isSupportedCosmeticSelector(baseSelector) ? `${baseSelector} { display: none !important; }` : null; } const styleMatch = trimmed.match(/:style\((.+)\)$/); if (styleMatch) { const baseSelector = trimmed.slice(0, trimmed.lastIndexOf(":style(")); return isSupportedCosmeticSelector(baseSelector) ? `${baseSelector} { ${styleMatch[1]} }` : null; } return isSupportedCosmeticSelector(trimmed) ? 
`${trimmed} { display: none !important; }` : null; } function isSupportedCosmeticSelector(selector) { if (!selector || /[\r\n{}]/.test(selector)) return false; const unsupportedTokens = [ ":-abp-contains(", ":-abp-has(", ":-abp-properties(", ":contains(", ":has-text(", ":matches-attr(", ":matches-css", ":matches-media", ":matches-path", ":min-text-length(", ":others()", ":remove()", ":upward(", ":watch-attr(", ":xpath(" ]; const lower = selector.toLowerCase(); return !unsupportedTokens.some((token) => lower.includes(token)); } export function getCosmeticCssForHostname(rules, hostname) { const normalizedHostname = String(hostname || "").toLowerCase(); const exceptionKeys = new Set(); for (const exception of rules.cosmeticExceptionRules || []) { if (matchesCosmeticDomains(exception.domains, normalizedHostname)) { exceptionKeys.add(exception.selector); } } const lines = []; for (const rule of rules.cosmeticRules || []) { if ( matchesCosmeticDomains(rule.domains, normalizedHostname) && !exceptionKeys.has(rule.selector) ) { lines.push(rule.css); } } return lines; } function matchesCosmeticDomains(domainSpec, hostname) { if (!domainSpec || domainSpec === "*") return true; const domains = domainSpec .split(",") .map((domain) => domain.trim().toLowerCase()) .filter(Boolean); const positives = []; const negatives = []; for (const domain of domains) { if (domain.startsWith("~")) { negatives.push(domain.slice(1)); } else { positives.push(domain); } } if (negatives.some((domain) => domainMatchesPattern(hostname, domain))) { return false; } if (positives.length === 0) { return true; } return positives.some((domain) => domainMatchesPattern(hostname, domain)); } function matchesNetworkRule(url, urlObj, hostname, resourceType, sourceHostname, rule) { const normalizedSourceHostname = String(sourceHostname || "").toLowerCase(); if (rule.includeDomains.length > 0) { const ok = rule.includeDomains.some((domain) => domainMatchesPattern(normalizedSourceHostname, domain) ); if (!ok) 
return false; } if (rule.excludeDomains.length > 0) { const blocked = rule.excludeDomains.some((domain) => domainMatchesPattern(normalizedSourceHostname, domain) ); if (blocked) return false; } if (rule.includeTargetDomains.length > 0) { const ok = rule.includeTargetDomains.some((domain) => domainMatchesPattern(hostname, domain) ); if (!ok) return false; } if (rule.excludeTargetDomains.length > 0) { const blocked = rule.excludeTargetDomains.some((domain) => domainMatchesPattern(hostname, domain) ); if (blocked) return false; } if (rule.excludedTypes.some((type) => resourceTypeMatches(type, resourceType))) { return false; } if (rule.types.length > 0) { if (!rule.types.some((type) => resourceTypeMatches(type, resourceType))) { return false; } } const isThirdParty = isThirdPartyRequest(hostname, normalizedSourceHostname); if (rule.isThirdParty && !isThirdParty) { return false; } if (rule.isFirstParty && isThirdParty) { return false; } if (rule.kind === "domain") { if (!domainPatternMatches(hostname, rule.domain)) return false; if (rule.pathRegex && !rule.pathRegex.test(urlObj.pathname + urlObj.search)) return false; return true; } if (rule.kind === "regex" || rule.kind === "pattern") { return rule.regex.test(url); } return false; } function resourceTypeMatches(filterType, resourceType) { const typeMap = { document: ["document"], font: ["font"], image: ["image"], "inline-script": ["script"], media: ["media"], object: ["object"], other: ["other"], ping: ["ping", "fetch"], script: ["script"], stylesheet: ["stylesheet"], subdocument: ["document", "subdocument"], websocket: ["websocket"], xmlhttprequest: ["fetch", "xhr", "xmlhttprequest"] }; const mapped = typeMap[filterType]; return mapped ? mapped.includes(resourceType) : false; } function domainPatternMatches(hostname, pattern) { const normalized = pattern.replace(/\^$/, "").toLowerCase(); if (!normalized) return false; if (!normalized.includes("*")) { return hostname === normalized || hostname.endsWith("." 
+ normalized); } return domainMatchesPattern(hostname, normalized); } function domainMatchesPattern(hostname, pattern) { const normalizedHostname = String(hostname || "").toLowerCase(); const normalizedPattern = String(pattern || "").replace(/\^$/, "").toLowerCase(); if (!normalizedPattern) return false; if (normalizedPattern === "*") return true; if (!normalizedPattern.includes("*")) { return normalizedHostname === normalizedPattern || normalizedHostname.endsWith("." + normalizedPattern); } const source = normalizedPattern .split("*") .map((part) => part.replace(/[|\\{}()[\]^$+?.]/g, "\\$&")) .join(".*"); const re = new RegExp(`${normalizedPattern.startsWith("*") ? "^" : "(?:^|\\.)"}${source}$`, "i"); return re.test(normalizedHostname); } function isThirdPartyRequest(hostname, sourceHostname) { if (!hostname || !sourceHostname) { return hostname !== sourceHostname; } const requestSite = registrableDomain(hostname); const sourceSite = registrableDomain(sourceHostname); if (!requestSite || !sourceSite) { return hostname !== sourceHostname && !hostname.endsWith("." 
+ sourceHostname); } return requestSite !== sourceSite; } function registrableDomain(hostname) { const normalized = String(hostname || "").toLowerCase().replace(/\.$/, ""); if (!normalized || /^\d{1,3}(?:\.\d{1,3}){3}$/.test(normalized) || normalized === "localhost") { return normalized; } const parts = normalized.split(".").filter(Boolean); if (parts.length <= 2) { return normalized; } const suffix2 = parts.slice(-2).join("."); if (MULTI_PART_PUBLIC_SUFFIXES.has(suffix2) && parts.length >= 3) { return parts.slice(-3).join("."); } return parts.slice(-2).join("."); } function adblockPatternToRegex(pattern) { let source = ""; let remaining = pattern; let anchoredStart = false; let anchoredEnd = false; if (remaining.startsWith("|")) { anchoredStart = true; remaining = remaining.slice(1); } if (remaining.endsWith("|")) { anchoredEnd = true; remaining = remaining.slice(0, -1); } for (const ch of remaining) { if (ch === "*") { source += ".*"; } else if (ch === "^") { source += "(?:[^A-Za-z0-9_.%-]|$)"; } else { source += ch.replace(/[|\\{}()[\]^$+?.]/g, "\\$&"); } } return `${anchoredStart ? "^" : ""}${source}${anchoredEnd ? 
"$" : ""}`; } export function shouldBlockRequestWithRules(rules, url, resourceType, sourceHostname) { if (url === sourceHostname || (sourceHostname && url.startsWith(sourceHostname + "/"))) { return false; } let urlObj; try { urlObj = new URL(url); } catch { return false; } const hostname = urlObj.hostname; for (const rule of networkRuleCandidates(rules.importantAllowRules, rules.importantAllowRuleIndex, hostname)) { if (matchesNetworkRule(url, urlObj, hostname, resourceType, sourceHostname, rule)) { return false; } } for (const rule of networkRuleCandidates(rules.importantBlockRules, rules.importantBlockRuleIndex, hostname)) { if (matchesNetworkRule(url, urlObj, hostname, resourceType, sourceHostname, rule)) { return true; } } for (const rule of networkRuleCandidates(rules.allowRules, rules.allowRuleIndex, hostname)) { if (matchesNetworkRule(url, urlObj, hostname, resourceType, sourceHostname, rule)) { return false; } } for (const rule of networkRuleCandidates(rules.blockRules, rules.blockRuleIndex, hostname)) { if (matchesNetworkRule(url, urlObj, hostname, resourceType, sourceHostname, rule)) { return true; } } return false; } function shouldBlockRequest(url, resourceType, sourceHostname) { return shouldBlockRequestWithRules(filterRules, url, resourceType, sourceHostname); } function* networkRuleCandidates(rules = [], index, hostname) { if (!index) { yield* rules; return; } for (const suffix of hostnameSuffixes(hostname)) { const bucket = index.byDomain.get(suffix); if (bucket) { yield* bucket; } } yield* index.wildcardDomainRules; yield* index.otherRules; } function hostnameSuffixes(hostname) { const normalized = String(hostname || "").toLowerCase(); if (!normalized) return [""]; const labels = normalized.split(".").filter(Boolean); const suffixes = []; for (let index = 0; index < labels.length; index += 1) { suffixes.push(labels.slice(index).join(".")); } return suffixes; } // --- Userscript metadata parsing ------------------------------------------- function 
parseUserScriptMetadata(content) { const metaBlock = content.match(/\/\/\s*==UserScript==([\s\S]*?)\/\/\s*==\/UserScript==/); const matches = []; const excludes = []; if (!metaBlock) return { matches, excludes }; const lines = metaBlock[1].split("\n"); for (const line of lines) { const matchMatch = line.match(/@match\s+(.+)/); if (matchMatch) { matches.push(matchMatch[1].trim()); continue; } const excludeMatch = line.match(/@exclude\s+(.+)/); if (excludeMatch) { excludes.push(excludeMatch[1].trim()); } } return { matches, excludes }; } function urlMatchesPattern(url, pattern) { // Simple glob-style pattern matching for userscript @match // Format: *://*.example.com/* or http://example.com/path try { const urlObj = new URL(url); const protocol = urlObj.protocol.slice(0, -1); // "http" or "https" const hostname = urlObj.hostname; const pathname = urlObj.pathname; // Split pattern const protoEnd = pattern.indexOf("://"); if (protoEnd < 0) return false; const patternProto = pattern.slice(0, protoEnd); const rest = pattern.slice(protoEnd + 3); // Protocol match if (patternProto !== "*" && patternProto !== protocol) return false; // Split rest into host and path const slashIdx = rest.indexOf("/"); const patternHost = slashIdx >= 0 ? rest.slice(0, slashIdx) : rest; const patternPath = slashIdx >= 0 ? rest.slice(slashIdx) : "/"; // Host match if (!matchHost(hostname, patternHost)) return false; // Path match if (!matchPath(pathname, patternPath)) return false; return true; } catch { return false; } } function matchHost(hostname, pattern) { if (pattern === "*") return true; if (pattern.startsWith("*.")) { const suffix = pattern.slice(2); return hostname === suffix || hostname.endsWith("." 
+ suffix); } return hostname === pattern; } function matchPath(pathname, pattern) { if (pattern === "/*") return true; // Convert glob pattern to regex const regex = "^" + pattern .replace(/\./g, "\\.") .replace(/\*/g, ".*") .replace(/\?/g, ".") + "$"; return new RegExp(regex, "i").test(pathname); } function shouldInjectUserScript(url, meta) { let matched = false; for (const pattern of meta.matches) { if (urlMatchesPattern(url, pattern)) { matched = true; break; } } if (!matched) return false; for (const pattern of meta.excludes) { if (urlMatchesPattern(url, pattern)) { return false; } } return true; } // --- Browser helpers ------------------------------------------------------- function loadPlaywright() { try { return require("playwright"); } catch (error) { throw new Error( `Playwright is required. Run "npm install" and "npm run install-browsers". Original error: ${error.message}` ); } } // Manual stealth evasions injected into every page before any scripts run. const STEALTH_INIT_SCRIPT = ` (() => { const patchNavigator = () => { try { // Override webdriver getter without using delete (can crash renderer) if (navigator.webdriver !== undefined) { Object.defineProperty(navigator, 'webdriver', { get: () => undefined, configurable: true, enumerable: true }); } } catch (e) {} try { if (!window.chrome) { window.chrome = { runtime: {} }; } else if (!window.chrome.runtime) { window.chrome.runtime = {}; } } catch (e) {} try { const originalQuery = window.navigator.permissions?.query; if (originalQuery) { window.navigator.permissions.query = (parameters) => ( parameters.name === 'notifications' ? 
Promise.resolve({ state: Notification.permission }) : originalQuery(parameters) ); } } catch (e) {} }; if (document.readyState === 'loading') { document.addEventListener('DOMContentLoaded', patchNavigator); } else { patchNavigator(); } })(); `; function buildLaunchArgs(headless) { const args = [ "--disable-blink-features=AutomationControlled", "--disable-web-security", "--disable-features=IsolateOrigins,site-per-process", "--disable-site-isolation-trials", "--disable-infobars", "--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage", "--disable-accelerated-2d-canvas", "--disable-gpu", "--window-size=1366,768" ]; if (headless) { args.push("--headless=new"); } return args; } function buildIgnoreDefaultArgs() { return ["--enable-automation"]; } // --- Page helpers ---------------------------------------------------------- async function setupRequestBlocking(page, sourceHostname) { if ( !privacyFiltersAvailable || (filterRules.blockRules.length === 0 && filterRules.importantBlockRules.length === 0) ) { return; } await page.route("**/*", (route) => { try { const request = route.request(); if (request.isNavigationRequest() && request.frame() === page.mainFrame()) { route.continue(); return; } const url = request.url(); const type = request.resourceType(); if (shouldBlockRequest(url, type, sourceHostname)) { route.abort("blockedbyclient"); } else { route.continue(); } } catch { route.continue(); } }); } async function injectCosmeticFilters(page, hostname) { const lines = [COMMON_ANNOYANCE_CSS]; if (privacyFiltersAvailable && filterRules.cosmeticRules.length > 0) { lines.push(...getCosmeticCssForHostname(filterRules, hostname)); } if (lines.length > 0) { try { await page.addStyleTag({ content: lines.join("\n") }); } catch { // Ignore cosmetic injection failures. 
} } } async function removeCommonAnnoyances(page) { try { await page.evaluate(({ selectors, rootClasses }) => { for (const selector of selectors) { try { document.querySelectorAll(selector).forEach((element) => element.remove()); } catch { // Ignore selectors unsupported by the current browser. } } for (const root of [document.documentElement, document.body].filter(Boolean)) { root.classList.remove(...rootClasses); root.removeAttribute("data-previous-scroll-y"); const overflow = root.style.overflow || ""; const position = root.style.position || ""; if (/hidden|clip/i.test(overflow)) { root.style.removeProperty("overflow"); } if (/fixed/i.test(position)) { root.style.removeProperty("position"); root.style.removeProperty("top"); root.style.removeProperty("left"); root.style.removeProperty("right"); } } }, { selectors: COMMON_ANNOYANCE_SELECTORS, rootClasses: COMMON_ANNOYANCE_ROOT_CLASSES }); } catch { // Ignore cleanup failures; the archive is still useful. } } const GM_MOCK = ` if (typeof GM === "undefined") { window.GM = { xmlHttpRequest: function(details) { fetch(details.url, { method: details.method || "GET", headers: details.headers || {}, body: details.data || null }) .then(response => response.text().then(text => ({ status: response.status, statusText: response.statusText, responseText: text, responseHeaders: Array.from(response.headers.entries()) .map(([k, v]) => k + ": " + v).join("\\r\\n") }))) .then(obj => { if (details.onload) details.onload(obj); }) .catch(err => { if (details.onerror) details.onerror(err); }); } }; } `; async function injectPrivacyUserScripts(page, sourceUrl) { if (!privacyFiltersAvailable || userScriptData.length === 0) return; const matching = userScriptData.filter((us) => shouldInjectUserScript(sourceUrl, us)); if (matching.length === 0) return; // Inject GM API mock first. 
// NOTE(review): the statements down to the first closing brace at column 0 are
// the tail of a function whose beginning lies outside this chunk (presumably
// injectPrivacyUserScripts, which renderPage calls below — confirm).
  try {
    // The mock and the shared helper script are injected before any userscript;
    // if either injection throws, no userscripts are injected at all.
    await page.addScriptTag({ content: GM_MOCK });
    if (userScriptRequireContent) {
      await page.addScriptTag({ content: userScriptRequireContent });
    }
  } catch {
    return;
  }

  // Inject only matching userscripts.
  for (const us of matching) {
    try {
      await page.addScriptTag({ content: us.content });
    } catch {
      // Ignore injection failures for individual scripts.
    }
  }
}

// ---------------------------------------------------------------------------
// Archiving
// ---------------------------------------------------------------------------

/**
 * Renders a page, inlines its assets and writes the result to
 * `<archivePath>/<id>.html`.
 *
 * @param {*} input - Passed to inputToUrl() to obtain the source URL.
 * @param {object} [options] - Also forwarded unchanged to renderPage().
 * @param {string} [options.archivePath] - Output directory; falls back to
 *   defaultArchivePath(). Created recursively if missing.
 * @param {string} [options.id] - Output file stem; falls back to
 *   slugForUrl(sourceUrl).
 * @returns {Promise<object>} `{ id, filePath, sourceUrl, archivePath, warnings,
 *   externalAssets }`, where `warnings` is the inliner's `warnings` property and
 *   `externalAssets` is findExternalAssetRefs() applied to the written HTML.
 */
export async function archivePage(input, options = {}) {
  const sourceUrl = inputToUrl(input);
  const archivePath = options.archivePath || defaultArchivePath();
  const id = options.id || slugForUrl(sourceUrl);
  const filePath = path.join(archivePath, `${id}.html`);
  await fs.mkdir(archivePath, { recursive: true });
  const renderedHtml = await renderPage(sourceUrl, options);
  const baseUrl = findEffectiveBase(renderedHtml, sourceUrl);
  const sourceHostname = new URL(sourceUrl).hostname;
  // The inliner is given the same request-blocking predicate, so assets it
  // fetches are checked with shouldBlockRequest() against the source hostname.
  const inliner = new AssetInliner({
    userAgent: DEFAULT_USER_AGENT,
    referer: isHttpUrl(sourceUrl) ? sourceUrl : undefined,
    shouldBlockAsset: (assetUrl, resourceType) => shouldBlockRequest(assetUrl, resourceType, sourceHostname)
  });
  const inlined = await inliner.inlineHtml(renderedHtml, baseUrl);
  const finalHtml = addArchiveComment(inlined, sourceUrl);
  await fs.writeFile(filePath, finalHtml, "utf8");
  return {
    id,
    filePath,
    sourceUrl,
    archivePath,
    warnings: inliner.warnings,
    externalAssets: findExternalAssetRefs(finalHtml)
  };
}

/**
 * Loads `sourceUrl` in a Playwright Chromium instance, applies the filtering and
 * snapshot steps below, and returns the serialized page.
 *
 * @param {string} sourceUrl - URL to navigate to; also parsed with `new URL()`
 *   for its hostname.
 * @param {object} [options]
 * @param {boolean} [options.headless] - See the `headless` computation below.
 * @param {string} [options.userAgent] - Falls back to DEFAULT_USER_AGENT.
 * @param {string} [options.locale] - Falls back to "en-US".
 * @param {string} [options.timezoneId] - Falls back to "America/New_York".
 * @param {number} [options.userscriptDelay] - Milliseconds to wait after
 *   injecting userscripts; falls back to 2000.
 * @returns {Promise<string>} The value of `page.content()`.
 */
export async function renderPage(sourceUrl, options = {}) {
  const playwright = loadPlaywright();
  const hasDisplay = !!(process.env.DISPLAY || process.env.WAYLAND_DISPLAY);
  // Headless only when the caller did not pass `headless: false` AND neither
  // DISPLAY nor WAYLAND_DISPLAY is set; with a display present the browser is
  // launched headed regardless of options.headless.
  const headless = options.headless !== false && !hasDisplay;
  const browser = await playwright.chromium.launch({
    headless,
    args: buildLaunchArgs(headless),
    ignoreDefaultArgs: buildIgnoreDefaultArgs()
  });
  try {
    const context = await browser.newContext({
      userAgent: options.userAgent || DEFAULT_USER_AGENT,
      viewport: VIEWPORT,
      locale: options.locale || "en-US",
      timezoneId: options.timezoneId || "America/New_York"
    });
    // Inject stealth evasions into every new page before any scripts run.
    await context.addInitScript(STEALTH_INIT_SCRIPT);
    const page = await context.newPage();
    const sourceHostname = new URL(sourceUrl).hostname;
    // Block paywall/tracker requests before the page loads.
    await setupRequestBlocking(page, sourceHostname);
    await page.goto(sourceUrl, { waitUntil: "domcontentloaded", timeout: PAGE_TIMEOUT_MS });
    // Inject cosmetic CSS and userscripts to strip paywalls / ads.
    await injectCosmeticFilters(page, sourceHostname);
    await injectPrivacyUserScripts(page, sourceUrl);
    // Give the userscripts a moment to run their setTimeout callbacks.
    // NOTE(review): `||` means an explicit userscriptDelay of 0 also falls back
    // to 2000 — confirm whether 0 should be honoured.
    const userscriptDelay = options.userscriptDelay || 2000;
    await page.waitForTimeout(userscriptDelay);
    await waitForNetworkIdle(page);
    await removeCommonAnnoyances(page);
    await snapshotLoadedResourceUrls(page);
    await snapshotRuntimeStyles(page);
    return await page.content();
  } finally {
    // The browser is closed whether rendering succeeded or threw.
    await browser.close();
  }
}

/**
 * Waits for the page's "networkidle" load state for at most
 * NETWORK_IDLE_TIMEOUT_MS. Any error from the wait is swallowed so the caller
 * continues either way.
 *
 * @param {object} page - Playwright page.
 * @returns {Promise<void>}
 */
async function waitForNetworkIdle(page) {
  try {
    await page.waitForLoadState("networkidle", { timeout: NETWORK_IDLE_TIMEOUT_MS });
  } catch {
    // Some pages keep sockets open; the DOM snapshot is still useful.
  }
}

/**
 * Runs in the page: rewrites media `src` attributes to the URL the browser
 * actually selected and snapshots readable iframes into `srcdoc`.
 *
 * @param {object} page - Playwright page.
 * @returns {Promise<void>}
 */
async function snapshotLoadedResourceUrls(page) {
  await page.evaluate(() => {
    // `currentSrc` is copied into `src` only when it is non-empty.
    document.querySelectorAll("img").forEach((img) => {
      if (img.currentSrc) {
        img.setAttribute("src", img.currentSrc);
      }
    });
    document.querySelectorAll("video,audio").forEach((media) => {
      if (media.currentSrc) {
        media.setAttribute("src", media.currentSrc);
      }
    });
    document.querySelectorAll("iframe").forEach((frame) => {
      try {
        const doc = frame.contentDocument;
        if (doc?.documentElement) {
          // NOTE(review): the empty-string prefix below is a no-op as written;
          // it may originally have carried markup (e.g. a doctype) that was lost
          // in extraction — confirm against the real file.
          frame.setAttribute("srcdoc", "" + doc.documentElement.outerHTML);
          frame.removeAttribute("src");
        }
      } catch {
        // Cross-origin frames are handled later by the asset inliner when possible.
      }
    });
  });
}

/**
 * Runs in the page: writes stylesheet rules that exist only as CSSOM objects
 * back into the DOM so that they appear in the serialized HTML.
 *
 * @param {object} page - Playwright page.
 * @returns {Promise<void>}
 */
async function snapshotRuntimeStyles(page) {
  await page.evaluate(() => {
    // Joins every rule's cssText with newlines; returns "" when reading
    // `cssRules` throws.
    const serializeRules = (sheet) => {
      try {
        return Array.from(sheet.cssRules || [])
          .map((rule) => rule.cssText)
          .join("\n");
      } catch {
        return "";
      }
    };
    for (const sheet of Array.from(document.styleSheets)) {
      const css = serializeRules(sheet);
      if (!css.trim()) {
        continue;
      }
      const owner = sheet.ownerNode;
      // Only style elements whose text is blank are filled in; elements that
      // already have text, and non-style owners, are left untouched.
      if (owner instanceof HTMLStyleElement && !owner.textContent.trim()) {
        owner.textContent = css;
      }
    }
    // Adopted stylesheets are each materialised as a new style element appended
    // to document.head and tagged with their index in the adopted list.
    const adoptedStyleSheets = Array.from(document.adoptedStyleSheets || []);
    adoptedStyleSheets.forEach((sheet, index) => {
      const css = serializeRules(sheet);
      if (!css.trim()) {
        return;
      }
      const style = document.createElement("style");
      style.setAttribute("data-archiver-adopted-stylesheet", String(index));
      style.textContent = css;
      document.head.appendChild(style);
    });
  });
}

// Builds a comment from the source URL (with every "--" rewritten to "- -") and
// returns the HTML with that comment added.
//
// NOTE(review): this function appears damaged by text extraction. As shown, the
// `comment` template is empty, `safeSource` is never used, the `if` condition is
// a regex followed by an arrow-function callback with no visible test/replace
// call, and the braces do not balance. Markup-like text (comment delimiters, a
// doctype pattern) was presumably stripped. The tokens are kept exactly as they
// appear; restore them from the real file before relying on this function.
function addArchiveComment(html, sourceUrl) {
  const safeSource = String(sourceUrl).replaceAll("--", "- -");
  const comment = ``;
  if (/]*>/i, (doctype) => `${doctype}\n${comment}`); }
  return `\n${comment}\n${html}`;
}

/**
 * Lists asset references in `html` that are not self-contained (see
 * isSelfContainedAssetRef), i.e. that still point outside the document.
 *
 * Three passes are made over the markup: asset-bearing tags (`src`, `srcset`,
 * `poster`, `data` attributes), link tags filtered by `rel`, and CSS `url(...)`
 * occurrences anywhere in the text.
 *
 * @param {string} html - Markup to scan.
 * @returns {string[]} Unique references, sorted with the default Array sort.
 */
export function findExternalAssetRefs(html) {
  const refs = new Set();
  const assetTagPattern = /<(?:img|source|audio|video|track|embed|object|input|iframe)\b[^>]*>/gi;
  for (const match of html.matchAll(assetTagPattern)) {
    const tag = match[0];
    for (const attr of ["src", "srcset", "poster", "data"]) {
      const value = readAttribute(tag, attr);
      if (!value) {
        continue;
      }
      if (attr === "srcset") {
        addSrcsetRefs(refs, value);
        continue;
      }
      if (isSelfContainedAssetRef(value)) {
        continue;
      }
      // NOTE(review): non-srcset values are also split on "," and truncated at
      // the first whitespace, so a single URL that contains a comma is recorded
      // as several partial refs — confirm this is intended.
      for (const part of value.split(",")) {
        const candidate = part.trim().split(/\s+/)[0];
        if (candidate && !isSelfContainedAssetRef(candidate)) {
          refs.add(candidate);
        }
      }
    }
  }
  // NOTE(review): this pattern looks truncated by text extraction (presumably it
  // matched link opening tags, given the rel/href/imagesrcset handling below);
  // as shown it matches any run of "]" followed by ">". Confirm against the
  // real file.
  const linkPattern = /]*>/gi;
  for (const match of html.matchAll(linkPattern)) {
    const tag = match[0];
    const rel = readAttribute(tag, "rel") || "";
    if (!/\b(?:stylesheet|icon|apple-touch-icon|image_src|preload)\b/i.test(rel)) {
      continue;
    }
    const href = readAttribute(tag, "href");
    if (href && !isSelfContainedAssetRef(href)) {
      refs.add(href);
    }
    const imageSrcset = readAttribute(tag, "imagesrcset");
    if (imageSrcset) {
      addSrcsetRefs(refs, imageSrcset);
    }
  }
  // Capture group 2 is the URL text between the optional matching quotes.
  const cssUrlPattern = /url\(\s*(["']?)([^"')]+)\1\s*\)/gi;
  for (const match of html.matchAll(cssUrlPattern)) {
    const candidate = cleanCssUrl(match[2]);
    if (candidate && !isSelfContainedAssetRef(candidate)) {
      refs.add(candidate);
    }
  }
  return Array.from(refs).sort();
}

/**
 * Adds the URL of every srcset candidate that is not self-contained to `refs`.
 *
 * @param {Set<string>} refs - Collected references; mutated in place.
 * @param {string} srcset - Raw srcset value, split with splitSrcset().
 */
function addSrcsetRefs(refs, srcset) {
  for (const part of splitSrcset(srcset)) {
    // The URL is the first whitespace-delimited token; descriptors follow it.
    const candidate = part.trim().split(/\s+/)[0];
    if (candidate && !isSelfContainedAssetRef(candidate)) {
      refs.add(candidate);
    }
  }
}

/**
 * Tells whether a reference needs no external fetch.
 *
 * @param {*} value - Reference text; normalised with cleanCssUrl() first.
 * @returns {boolean} True when the cleaned value is empty, starts with "#" or
 *   "%23" (any case), or starts with one of the schemes data, about, javascript,
 *   mailto or tel (any case).
 */
function isSelfContainedAssetRef(value) {
  const trimmed = cleanCssUrl(value);
  return (
    !trimmed ||
    trimmed.startsWith("#") ||
    /^%23/i.test(trimmed) ||
    /^(?:data|about|javascript|mailto|tel):/i.test(trimmed)
  );
}

/**
 * Returns the value of `attr` in an opening tag, or "" when findAttribute()
 * does not find it.
 *
 * @param {string} tag - Opening tag text.
 * @param {string} attr - Attribute name.
 * @returns {string}
 */
function readAttribute(tag, attr) {
  const match = findAttribute(tag, attr);
  return match ? match.value : "";
}

// Stringifies and trims `value`, applies four literal replacements, then strips
// one pair of matching surrounding quotes (single or double) and trims again.
//
// NOTE(review): the replaceAll arguments below appear damaged by text
// extraction — presumably they were HTML entity names being decoded to "&", a
// double quote and a single quote. As shown, the first, third and fourth calls
// replace a string with itself and the second is not valid syntax. The tokens
// are kept exactly as they appear; restore them from the real file.
function cleanCssUrl(value) {
  const decoded = String(value)
    .trim()
    .replaceAll("&", "&")
    .replaceAll(""", '"')
    .replaceAll("'", "'")
    .replaceAll("'", "'");
  const quote = decoded[0];
  if ((quote === '"' || quote === "'") && decoded.at(-1) === quote) {
    return decoded.slice(1, -1).trim();
  }
  return decoded;
}

/**
 * Scans an opening tag character by character for an attribute.
 *
 * @param {string} openingTag - Tag text beginning with "<" and the tag name.
 * @param {string} attr - Attribute name; compared case-insensitively.
 * @returns {{start: number, end: number, value: string} | null} For the first
 *   attribute whose name matches: `start` is the index of the name's first
 *   character, `end` is the scan position after the attribute (past the closing
 *   quote or unquoted value, or past the whitespace following a valueless
 *   name), and `value` is the raw text without quotes ("" when there is none).
 *   Returns null when the scan reaches ">", "/" or the end of the string without
 *   a match.
 */
function findAttribute(openingTag, attr) {
  const attrLower = attr.toLowerCase();
  // Start scanning just after the tag name; if no name is matched, skip only
  // the first character.
  const nameMatch = openingTag.match(/^<[^\s/>]+/);
  let index = nameMatch ? nameMatch[0].length : 1;
  while (index < openingTag.length) {
    // Skip whitespace between attributes.
    while (index < openingTag.length && /\s/.test(openingTag[index])) {
      index += 1;
    }
    if (index >= openingTag.length || openingTag[index] === ">" || openingTag[index] === "/") {
      return null;
    }
    // Attribute name: runs until whitespace, "=", "/" or ">".
    const start = index;
    while (index < openingTag.length && !/[\s=/>]/.test(openingTag[index])) {
      index += 1;
    }
    const name = openingTag.slice(start, index);
    // Whitespace is allowed on both sides of "=".
    while (index < openingTag.length && /\s/.test(openingTag[index])) {
      index += 1;
    }
    let value = "";
    if (openingTag[index] === "=") {
      index += 1;
      while (index < openingTag.length && /\s/.test(openingTag[index])) {
        index += 1;
      }
      const quote = openingTag[index];
      if (quote === '"' || quote === "'") {
        // Quoted value: everything up to the matching quote, or to the end of
        // the string when the quote is never closed.
        index += 1;
        const valueStart = index;
        while (index < openingTag.length && openingTag[index] !== quote) {
          index += 1;
        }
        value = openingTag.slice(valueStart, index);
        if (openingTag[index] === quote) {
          index += 1;
        }
      } else {
        // Unquoted value: runs until whitespace or ">".
        const valueStart = index;
        while (index < openingTag.length && !/[\s>]/.test(openingTag[index])) {
          index += 1;
        }
        value = openingTag.slice(valueStart, index);
      }
    }
    if (name.toLowerCase() === attrLower) {
      return { start, end: index, value };
    }
  }
  return null;
}