import fs from "node:fs/promises"; import path from "node:path"; import { createRequire } from "node:module"; import { fileURLToPath } from "node:url"; import { AssetInliner, DEFAULT_USER_AGENT, defaultArchivePath, findEffectiveBase, inputToUrl, isHttpUrl, splitSrcset, slugForUrl } from "./asset-inliner.mjs"; const require = createRequire(import.meta.url); const __dirname = path.dirname(fileURLToPath(import.meta.url)); const PAGE_TIMEOUT_MS = 60000; const NETWORK_IDLE_TIMEOUT_MS = 5000; const VIEWPORT = { width: 1366, height: 768 }; const COMMON_ANNOYANCE_SELECTORS = [ "[id^=\"sp_message_container_\"]", "iframe[id^=\"sp_message_iframe_\"]", "iframe[title*=\"consent\" i]", "iframe[title*=\"privacy manager\" i]", "#onetrust-consent-sdk", "#onetrust-banner-sdk", "#didomi-host", "#qc-cmp2-container", ".qc-cmp2-container", "#CybotCookiebotDialog", ".iubenda-cs-container", "#cmpwrapper", "[id^=\"cmpbox\"]", ".fc-consent-root", ".fc-dialog-container", "[aria-modal=\"true\"][id*=\"consent\" i]", "[aria-modal=\"true\"][id*=\"cookie\" i]", "[role=\"dialog\"][aria-label*=\"cookie\" i]", "[role=\"dialog\"][aria-label*=\"consent\" i]", "[id*=\"cookie-banner\" i]", "[class*=\"cookie-banner\" i]", "[id*=\"cookie-consent\" i]", "[class*=\"cookie-consent\" i]", "[id*=\"cookie-notice\" i]", "[class*=\"cookie-notice\" i]", "[id*=\"cookie-popup\" i]", "[class*=\"cookie-popup\" i]", "[id*=\"adblock\" i]", "[class*=\"adblock\" i]", "[id*=\"ad-block\" i]", "[class*=\"ad-block\" i]" ]; const COMMON_ANNOYANCE_ROOT_CLASSES = [ "sp-message-open", "didomi-popup-open", "qc-cmp-ui-showing", "ot-sdk-show-settings", "iubenda-cs-visible" ]; const COMMON_ANNOYANCE_TRIGGER_SELECTORS = [ "a[href*=\"getadmiral.com\" i]", "a[href*=\"%67e%74%61%64mi%72%61l.com\" i]", "a[href*=\"admiraladblock\" i]", "button", "[role=\"dialog\"]", "[aria-modal=\"true\"]" ]; const COMMON_ANNOYANCE_TEXT_PATTERNS = [ "ad blocker detected", "allow ads", "allowlist", "continue using your ad blocker", "disable your ad blocker", "support us by disabling", "support the verge by allowing ads", "turn off your ad blocker", "you are using an ad blocker" ]; const COMMON_ANNOYANCE_CSS = ` ${COMMON_ANNOYANCE_SELECTORS.join(",\n")} { display: none !important; visibility: hidden !important; pointer-events: none !important; } html.sp-message-open, body.sp-message-open, html.didomi-popup-open, body.didomi-popup-open, html.qc-cmp-ui-showing, body.qc-cmp-ui-showing, html.iubenda-cs-visible, body.iubenda-cs-visible { overflow: auto !important; position: static !important; } `; export { DEFAULT_USER_AGENT, defaultArchivePath }; // --------------------------------------------------------------------------- // Privacy filters integration // --------------------------------------------------------------------------- const PRIVACY_FILTERS_DIR = path.join(__dirname, "..", "privacy-filters"); const FILTER_LIST_FILES = [ { id: "bpc-paywall", file: "bpc-paywall-filter.txt" }, { id: "easylist", file: path.join("lists", "easylist.txt") }, { id: "ublock-filters", file: path.join("lists", "ublock-filters.txt") }, { id: "easylist-cookie", file: path.join("lists", "easylist-cookie.txt") }, { id: "ublock-annoyances", file: path.join("lists", "ublock-annoyances.txt") }, { id: "ublock-cookies", file: path.join("lists", "ublock-cookies.txt") } ]; let privacyFiltersAvailable = false; let filterRules = emptyFilterRules(); let userScriptData = []; // { file, content, matches, excludes } let userScriptRequireContent = ""; async function loadPrivacyFilters() { try { const filterSets = []; for (const list of FILTER_LIST_FILES) { try { const filterPath = path.join(PRIVACY_FILTERS_DIR, list.file); const filterContent = await fs.readFile(filterPath, "utf8"); filterSets.push(parseFilterRules(filterContent, { source: list.id })); } catch (error) { if (error?.code !== "ENOENT") { throw error; } } } filterRules = mergeFilterRules(filterSets); const userscriptDir = path.join(PRIVACY_FILTERS_DIR, "userscript"); userScriptRequireContent = await fs.readFile(path.join(userscriptDir, "bpc_func.js"), "utf8"); userScriptData = []; const userScriptFiles = [ "bpc.en.user.js", "bpc.de.user.js", "bpc.es.pt.user.js", "bpc.fi.se.user.js", "bpc.fr.user.js", "bpc.it.user.js", "bpc.nl.user.js", "bpc.pl.user.js" ]; for (const file of userScriptFiles) { const content = await fs.readFile(path.join(userscriptDir, file), "utf8"); const meta = parseUserScriptMetadata(content); userScriptData.push({ file, content, ...meta }); } privacyFiltersAvailable = filterRules.blockRules.length > 0 || filterRules.importantBlockRules.length > 0 || filterRules.allowRules.length > 0 || filterRules.importantAllowRules.length > 0 || filterRules.cosmeticRules.length > 0 || userScriptData.length > 0; } catch { // Privacy filters directory missing or unreadable; archive without them. } } await loadPrivacyFilters(); // --- Adblock filter parsing ------------------------------------------------ const COSMETIC_SEPARATORS = [ { token: "#@?#", kind: "cosmeticException", extended: true }, { token: "#@$#", kind: "styleException" }, { token: "#@%#", kind: "scriptException" }, { token: "#@^", kind: "htmlException" }, { token: "#@#", kind: "cosmeticException" }, { token: "#?#", kind: "extendedCosmetic", extended: true }, { token: "#$#", kind: "style" }, { token: "#%#", kind: "script" }, { token: "#^", kind: "html" }, { token: "##", kind: "cosmetic" } ]; const RESOURCE_TYPE_ALIASES = new Map([ ["beacon", "ping"], ["css", "stylesheet"], ["doc", "document"], ["document", "document"], ["fetch", "xmlhttprequest"], ["font", "font"], ["frame", "subdocument"], ["image", "image"], ["inline-script", "inline-script"], ["media", "media"], ["object", "object"], ["object-subrequest", "object"], ["other", "other"], ["ping", "ping"], ["script", "script"], ["stylesheet", "stylesheet"], ["subdocument", "subdocument"], ["websocket", "websocket"], ["xhr", "xmlhttprequest"], ["xmlhttprequest", "xmlhttprequest"] ]); const SKIP_NETWORK_OPTION_NAMES = new Set([ "cookie", "csp", "cname", "denyallow", "ehide", "elemhide", "ghide", "genericblock", "generichide", "header", "ipaddress", "jsonprune", "method", "permissions", "popunder", "popup", "queryprune", "redirect", "redirect-rule", "removeparam", "replace", "rewrite", "shide", "specifichide", "uritransform", "urlskip", "webrtc", "xmlprune" ]); const MULTI_PART_PUBLIC_SUFFIXES = new Set([ "ac.uk", "co.jp", "co.nz", "co.uk", "com.au", "com.br", "com.mx", "com.tr", "com.tw", "com.cn", "net.au", "net.nz", "org.au", "org.nz", "org.uk" ]); function emptyFilterRules() { return { blockRules: [], importantBlockRules: [], allowRules: [], importantAllowRules: [], cosmeticRules: [], cosmeticExceptionRules: [], badFilterKeys: new Set(), sourceFiles: [], blockRuleIndex: null, importantBlockRuleIndex: null, allowRuleIndex: null, importantAllowRuleIndex: null }; } function mergeFilterRules(filterSets) { const merged = emptyFilterRules(); for (const set of filterSets) { merged.blockRules.push(...set.blockRules); merged.importantBlockRules.push(...set.importantBlockRules); merged.allowRules.push(...set.allowRules); merged.importantAllowRules.push(...set.importantAllowRules); merged.cosmeticRules.push(...set.cosmeticRules); merged.cosmeticExceptionRules.push(...set.cosmeticExceptionRules); merged.sourceFiles.push(...set.sourceFiles); for (const key of set.badFilterKeys) { merged.badFilterKeys.add(key); } } if (merged.badFilterKeys.size > 0) { const isActive = (rule) => !merged.badFilterKeys.has(rule.key); merged.blockRules = merged.blockRules.filter(isActive); merged.importantBlockRules = merged.importantBlockRules.filter(isActive); merged.allowRules = merged.allowRules.filter(isActive); merged.importantAllowRules = merged.importantAllowRules.filter(isActive); } return finalizeFilterRules(merged); } export function parseFilterRules(content, options = {}) { const rules = emptyFilterRules(); if (options.source) { rules.sourceFiles.push(options.source); } let preprocessorDepth = 0; for (const rawLine of content.split("\n")) { const line = rawLine.trim(); if (!line) continue; if (line.startsWith("!#if")) { preprocessorDepth += 1; continue; } if (line.startsWith("!#endif")) { preprocessorDepth = Math.max(0, preprocessorDepth - 1); continue; } if (preprocessorDepth > 0 || line.startsWith("!#") || line.startsWith("!") || line.startsWith("[")) { continue; } const cosmetic = parseCosmeticFilterLine(line, options.source); if (cosmetic) { if (cosmetic.kind === "cosmeticException") { rules.cosmeticExceptionRules.push(cosmetic); } else if (cosmetic.kind === "cosmetic") { rules.cosmeticRules.push(cosmetic); } continue; } if (cosmetic === false) { continue; } const isException = line.startsWith("@@"); const networkLine = isException ? line.slice(2) : line; const rule = parseNetworkRule(networkLine, { exception: isException, source: options.source }); if (!rule) continue; if (rule.badfilter) { rules.badFilterKeys.add(rule.key); } else if (isException && rule.important) { rules.importantAllowRules.push(rule); } else if (isException) { rules.allowRules.push(rule); } else if (rule.important) { rules.importantBlockRules.push(rule); } else { rules.blockRules.push(rule); } } return finalizeFilterRules(rules); } function finalizeFilterRules(rules) { if (rules.badFilterKeys.size > 0) { const isActive = (rule) => !rules.badFilterKeys.has(rule.key); rules.blockRules = rules.blockRules.filter(isActive); rules.importantBlockRules = rules.importantBlockRules.filter(isActive); rules.allowRules = rules.allowRules.filter(isActive); rules.importantAllowRules = rules.importantAllowRules.filter(isActive); } rules.blockRuleIndex = buildNetworkRuleIndex(rules.blockRules); rules.importantBlockRuleIndex = buildNetworkRuleIndex(rules.importantBlockRules); rules.allowRuleIndex = buildNetworkRuleIndex(rules.allowRules); rules.importantAllowRuleIndex = buildNetworkRuleIndex(rules.importantAllowRules); return rules; } function buildNetworkRuleIndex(rules) { const byDomain = new Map(); const wildcardDomainRules = []; const otherRules = []; for (const rule of rules) { if (rule.kind !== "domain") { otherRules.push(rule); continue; } if (rule.domain.includes("*")) { wildcardDomainRules.push(rule); continue; } const bucket = byDomain.get(rule.domain) || []; bucket.push(rule); byDomain.set(rule.domain, bucket); } return { byDomain, wildcardDomainRules, otherRules }; } function parseCosmeticFilterLine(line, source) { const separator = findCosmeticSeparator(line); if (!separator) return null; const domains = line.slice(0, separator.index); const body = line.slice(separator.index + separator.token.length).trim(); if (!body) return false; if (separator.kind === "cosmeticException") { return { kind: "cosmeticException", domains, selector: cosmeticSelectorKey(body), source }; } if (separator.kind === "style") { const css = adguardStyleRuleToCss(body); return css ? { kind: "cosmetic", domains, selector: cosmeticSelectorKey(body), css, source } : false; } if (separator.kind !== "cosmetic" || separator.extended) { return false; } const css = cosmeticSelectorToCss(body); return css ? { kind: "cosmetic", domains, selector: cosmeticSelectorKey(body), css, source } : false; } function findCosmeticSeparator(line) { let best = null; for (const separator of COSMETIC_SEPARATORS) { const index = line.indexOf(separator.token); if ( index >= 0 && (!best || index < best.index || (index === best.index && separator.token.length > best.token.length)) ) { best = { ...separator, index }; } } return best; } function cosmeticSelectorKey(selector) { return selector.trim().replace(/\s+/g, " "); } function parseNetworkRule(line, options = {}) { const split = splitNetworkOptions(line); const parsedOptions = parseNetworkOptions(split.options); const key = networkRuleKey(split.pattern, parsedOptions.optionsForKey, options.exception); if (parsedOptions.badfilter) { return { badfilter: true, key }; } if (!split.pattern || parsedOptions.skip) { return null; } const compiled = compileNetworkPattern(split.pattern, parsedOptions.matchCase); if (!compiled) { return null; } return { ...compiled, key, types: parsedOptions.types, excludedTypes: parsedOptions.excludedTypes, isThirdParty: parsedOptions.isThirdParty, isFirstParty: parsedOptions.isFirstParty, includeDomains: parsedOptions.includeDomains, excludeDomains: parsedOptions.excludeDomains, includeTargetDomains: parsedOptions.includeTargetDomains, excludeTargetDomains: parsedOptions.excludeTargetDomains, important: parsedOptions.important, source: options.source }; } function splitNetworkOptions(line) { const lastDollar = line.lastIndexOf("$"); if (lastDollar <= 0) { return { pattern: line, options: [] }; } const optionText = line.slice(lastDollar + 1); if (!looksLikeFilterOptions(optionText)) { return { pattern: line, options: [] }; } return { pattern: line.slice(0, lastDollar), options: splitFilterOptions(optionText) }; } function looksLikeFilterOptions(optionText) { if (!optionText || /\s/.test(optionText)) return false; const firstOption = optionText.split(",", 1)[0]; return /^~?[a-z][a-z0-9_-]*(?:=|$)/i.test(firstOption); } function splitFilterOptions(optionText) { return optionText .split(",") .map((option) => option.trim()) .filter(Boolean); } function parseNetworkOptions(options) { const parsed = { types: [], excludedTypes: [], isThirdParty: false, isFirstParty: false, includeDomains: [], excludeDomains: [], includeTargetDomains: [], excludeTargetDomains: [], important: false, matchCase: false, badfilter: false, skip: false, optionsForKey: [] }; for (const rawOption of options) { const option = rawOption.trim(); if (!option) continue; const negated = option.startsWith("~"); const optionBody = negated ? option.slice(1) : option; const eqIndex = optionBody.indexOf("="); const name = (eqIndex >= 0 ? optionBody.slice(0, eqIndex) : optionBody).toLowerCase(); const value = eqIndex >= 0 ? optionBody.slice(eqIndex + 1) : ""; if (name !== "badfilter") { parsed.optionsForKey.push(option); } if (name === "badfilter") { parsed.badfilter = true; continue; } if (name === "important") { parsed.important = true; continue; } if (name === "match-case") { parsed.matchCase = true; continue; } if (name === "third-party" || name === "3p" || name === "strict3p") { parsed.isThirdParty = !negated; parsed.isFirstParty = negated; continue; } if (name === "first-party" || name === "1p" || name === "strict1p") { parsed.isFirstParty = !negated; parsed.isThirdParty = negated; continue; } if (name === "domain" || name === "from") { const domains = parseDomainOptionValue(value); parsed.includeDomains.push(...domains.include); parsed.excludeDomains.push(...domains.exclude); continue; } if (name === "to") { const domains = parseDomainOptionValue(value); parsed.includeTargetDomains.push(...domains.include); parsed.excludeTargetDomains.push(...domains.exclude); continue; } const resourceType = RESOURCE_TYPE_ALIASES.get(name); if (resourceType) { if (negated) { parsed.excludedTypes.push(resourceType); } else { parsed.types.push(resourceType); } continue; } if (SKIP_NETWORK_OPTION_NAMES.has(name) || eqIndex >= 0) { parsed.skip = true; } } return parsed; } function parseDomainOptionValue(value) { const include = []; const exclude = []; if (!value) { return { include, exclude }; } for (const rawDomain of value.split("|")) { const domain = rawDomain.trim().toLowerCase(); if (!domain) continue; if (domain.startsWith("~")) { exclude.push(domain.slice(1)); } else { include.push(domain); } } return { include, exclude }; } function networkRuleKey(pattern, options, exception) { return `${exception ? "@@" : ""}${pattern}${options.length > 0 ? `$${options.join(",")}` : ""}`; } function compileNetworkPattern(pattern, matchCase) { const flags = matchCase ? "" : "i"; if (pattern.startsWith("||")) { const domainRule = parseDomainAnchoredPattern(pattern, flags); return domainRule; } if (pattern.startsWith("/") && pattern.endsWith("/") && pattern.length > 1) { try { return { kind: "regex", regex: new RegExp(pattern.slice(1, -1), flags) }; } catch { return null; } } try { return { kind: "pattern", regex: new RegExp(adblockPatternToRegex(pattern), flags) }; } catch { return null; } } function parseDomainAnchoredPattern(pattern, flags) { const domainPath = pattern.slice(2); let domainEnd = 0; while ( domainEnd < domainPath.length && domainPath[domainEnd] !== "/" && domainPath[domainEnd] !== "^" ) { domainEnd += 1; } const domain = domainPath.slice(0, domainEnd).toLowerCase(); if (!domain || /[\\[\]{}()]/.test(domain)) { return null; } const suffix = domainPath.slice(domainEnd); let path = ""; if (suffix.startsWith("/")) { path = suffix; } else if (suffix.startsWith("^/")) { path = suffix.slice(1); } else if (suffix && suffix !== "^") { return null; } let pathRegex = null; if (path) { try { pathRegex = new RegExp("^" + adblockPatternToRegex(path), flags); } catch { return null; } } return { kind: "domain", domain, path, pathRegex }; } function adguardStyleRuleToCss(rule) { if (!rule.includes("{") || !rule.includes("}") || /[\r\n]/.test(rule)) { return null; } return rule; } function cosmeticSelectorToCss(selector) { const trimmed = selector.trim(); if (!trimmed || trimmed.startsWith("+js") || trimmed.startsWith("^")) { return null; } if (trimmed.endsWith(":remove()")) { const baseSelector = trimmed.slice(0, -":remove()".length); return isSupportedCosmeticSelector(baseSelector) ? `${baseSelector} { display: none !important; }` : null; } const styleMatch = trimmed.match(/:style\((.+)\)$/); if (styleMatch) { const baseSelector = trimmed.slice(0, trimmed.lastIndexOf(":style(")); return isSupportedCosmeticSelector(baseSelector) ? `${baseSelector} { ${styleMatch[1]} }` : null; } return isSupportedCosmeticSelector(trimmed) ? `${trimmed} { display: none !important; }` : null; } function isSupportedCosmeticSelector(selector) { if (!selector || /[\r\n{}]/.test(selector)) return false; const unsupportedTokens = [ ":-abp-contains(", ":-abp-has(", ":-abp-properties(", ":contains(", ":has-text(", ":matches-attr(", ":matches-css", ":matches-media", ":matches-path", ":min-text-length(", ":others()", ":remove()", ":upward(", ":watch-attr(", ":xpath(" ]; const lower = selector.toLowerCase(); return !unsupportedTokens.some((token) => lower.includes(token)); } export function getCosmeticCssForHostname(rules, hostname) { const normalizedHostname = String(hostname || "").toLowerCase(); const exceptionKeys = new Set(); for (const exception of rules.cosmeticExceptionRules || []) { if (matchesCosmeticDomains(exception.domains, normalizedHostname)) { exceptionKeys.add(exception.selector); } } const lines = []; for (const rule of rules.cosmeticRules || []) { if ( matchesCosmeticDomains(rule.domains, normalizedHostname) && !exceptionKeys.has(rule.selector) ) { lines.push(rule.css); } } return lines; } function matchesCosmeticDomains(domainSpec, hostname) { if (!domainSpec || domainSpec === "*") return true; const domains = domainSpec .split(",") .map((domain) => domain.trim().toLowerCase()) .filter(Boolean); const positives = []; const negatives = []; for (const domain of domains) { if (domain.startsWith("~")) { negatives.push(domain.slice(1)); } else { positives.push(domain); } } if (negatives.some((domain) => domainMatchesPattern(hostname, domain))) { return false; } if (positives.length === 0) { return true; } return positives.some((domain) => domainMatchesPattern(hostname, domain)); } function matchesNetworkRule(url, urlObj, hostname, resourceType, sourceHostname, rule) { const normalizedSourceHostname = String(sourceHostname || "").toLowerCase(); if (rule.includeDomains.length > 0) { const ok = rule.includeDomains.some((domain) => domainMatchesPattern(normalizedSourceHostname, domain) ); if (!ok) return false; } if (rule.excludeDomains.length > 0) { const blocked = rule.excludeDomains.some((domain) => domainMatchesPattern(normalizedSourceHostname, domain) ); if (blocked) return false; } if (rule.includeTargetDomains.length > 0) { const ok = rule.includeTargetDomains.some((domain) => domainMatchesPattern(hostname, domain) ); if (!ok) return false; } if (rule.excludeTargetDomains.length > 0) { const blocked = rule.excludeTargetDomains.some((domain) => domainMatchesPattern(hostname, domain) ); if (blocked) return false; } if (rule.excludedTypes.some((type) => resourceTypeMatches(type, resourceType))) { return false; } if (rule.types.length > 0) { if (!rule.types.some((type) => resourceTypeMatches(type, resourceType))) { return false; } } const isThirdParty = isThirdPartyRequest(hostname, normalizedSourceHostname); if (rule.isThirdParty && !isThirdParty) { return false; } if (rule.isFirstParty && isThirdParty) { return false; } if (rule.kind === "domain") { if (!domainPatternMatches(hostname, rule.domain)) return false; if (rule.pathRegex && !rule.pathRegex.test(urlObj.pathname + urlObj.search)) return false; return true; } if (rule.kind === "regex" || rule.kind === "pattern") { return rule.regex.test(url); } return false; } function resourceTypeMatches(filterType, resourceType) { const typeMap = { document: ["document"], font: ["font"], image: ["image"], "inline-script": ["script"], media: ["media"], object: ["object"], other: ["other"], ping: ["ping", "fetch"], script: ["script"], stylesheet: ["stylesheet"], subdocument: ["document", "subdocument"], websocket: ["websocket"], xmlhttprequest: ["fetch", "xhr", "xmlhttprequest"] }; const mapped = typeMap[filterType]; return mapped ? mapped.includes(resourceType) : false; } function domainPatternMatches(hostname, pattern) { const normalized = pattern.replace(/\^$/, "").toLowerCase(); if (!normalized) return false; if (!normalized.includes("*")) { return hostname === normalized || hostname.endsWith("." + normalized); } return domainMatchesPattern(hostname, normalized); } function domainMatchesPattern(hostname, pattern) { const normalizedHostname = String(hostname || "").toLowerCase(); const normalizedPattern = String(pattern || "").replace(/\^$/, "").toLowerCase(); if (!normalizedPattern) return false; if (normalizedPattern === "*") return true; if (!normalizedPattern.includes("*")) { return normalizedHostname === normalizedPattern || normalizedHostname.endsWith("." + normalizedPattern); } const source = normalizedPattern .split("*") .map((part) => part.replace(/[|\\{}()[\]^$+?.]/g, "\\$&")) .join(".*"); const re = new RegExp(`${normalizedPattern.startsWith("*") ? "^" : "(?:^|\\.)"}${source}$`, "i"); return re.test(normalizedHostname); } function isThirdPartyRequest(hostname, sourceHostname) { if (!hostname || !sourceHostname) { return hostname !== sourceHostname; } const requestSite = registrableDomain(hostname); const sourceSite = registrableDomain(sourceHostname); if (!requestSite || !sourceSite) { return hostname !== sourceHostname && !hostname.endsWith("." + sourceHostname); } return requestSite !== sourceSite; } function registrableDomain(hostname) { const normalized = String(hostname || "").toLowerCase().replace(/\.$/, ""); if (!normalized || /^\d{1,3}(?:\.\d{1,3}){3}$/.test(normalized) || normalized === "localhost") { return normalized; } const parts = normalized.split(".").filter(Boolean); if (parts.length <= 2) { return normalized; } const suffix2 = parts.slice(-2).join("."); if (MULTI_PART_PUBLIC_SUFFIXES.has(suffix2) && parts.length >= 3) { return parts.slice(-3).join("."); } return parts.slice(-2).join("."); } function adblockPatternToRegex(pattern) { let source = ""; let remaining = pattern; let anchoredStart = false; let anchoredEnd = false; if (remaining.startsWith("|")) { anchoredStart = true; remaining = remaining.slice(1); } if (remaining.endsWith("|")) { anchoredEnd = true; remaining = remaining.slice(0, -1); } for (const ch of remaining) { if (ch === "*") { source += ".*"; } else if (ch === "^") { source += "(?:[^A-Za-z0-9_.%-]|$)"; } else { source += ch.replace(/[|\\{}()[\]^$+?.]/g, "\\$&"); } } return `${anchoredStart ? "^" : ""}${source}${anchoredEnd ? "$" : ""}`; } export function shouldBlockRequestWithRules(rules, url, resourceType, sourceHostname) { if (url === sourceHostname || (sourceHostname && url.startsWith(sourceHostname + "/"))) { return false; } let urlObj; try { urlObj = new URL(url); } catch { return false; } const hostname = urlObj.hostname; for (const rule of networkRuleCandidates(rules.importantAllowRules, rules.importantAllowRuleIndex, hostname)) { if (matchesNetworkRule(url, urlObj, hostname, resourceType, sourceHostname, rule)) { return false; } } for (const rule of networkRuleCandidates(rules.importantBlockRules, rules.importantBlockRuleIndex, hostname)) { if (matchesNetworkRule(url, urlObj, hostname, resourceType, sourceHostname, rule)) { return true; } } for (const rule of networkRuleCandidates(rules.allowRules, rules.allowRuleIndex, hostname)) { if (matchesNetworkRule(url, urlObj, hostname, resourceType, sourceHostname, rule)) { return false; } } for (const rule of networkRuleCandidates(rules.blockRules, rules.blockRuleIndex, hostname)) { if (matchesNetworkRule(url, urlObj, hostname, resourceType, sourceHostname, rule)) { return true; } } return false; } function shouldBlockRequest(url, resourceType, sourceHostname) { return shouldBlockRequestWithRules(filterRules, url, resourceType, sourceHostname); } function* networkRuleCandidates(rules = [], index, hostname) { if (!index) { yield* rules; return; } for (const suffix of hostnameSuffixes(hostname)) { const bucket = index.byDomain.get(suffix); if (bucket) { yield* bucket; } } yield* index.wildcardDomainRules; yield* index.otherRules; } function hostnameSuffixes(hostname) { const normalized = String(hostname || "").toLowerCase(); if (!normalized) return [""]; const labels = normalized.split(".").filter(Boolean); const suffixes = []; for (let index = 0; index < labels.length; index += 1) { suffixes.push(labels.slice(index).join(".")); } return suffixes; } // --- Userscript metadata parsing ------------------------------------------- function parseUserScriptMetadata(content) { const metaBlock = content.match(/\/\/\s*==UserScript==([\s\S]*?)\/\/\s*==\/UserScript==/); const matches = []; const excludes = []; if (!metaBlock) return { matches, excludes }; const lines = metaBlock[1].split("\n"); for (const line of lines) { const matchMatch = line.match(/@match\s+(.+)/); if (matchMatch) { matches.push(matchMatch[1].trim()); continue; } const excludeMatch = line.match(/@exclude\s+(.+)/); if (excludeMatch) { excludes.push(excludeMatch[1].trim()); } } return { matches, excludes }; } function urlMatchesPattern(url, pattern) { // Simple glob-style pattern matching for userscript @match // Format: *://*.example.com/* or http://example.com/path try { const urlObj = new URL(url); const protocol = urlObj.protocol.slice(0, -1); // "http" or "https" const hostname = urlObj.hostname; const pathname = urlObj.pathname; // Split pattern const protoEnd = pattern.indexOf("://"); if (protoEnd < 0) return false; const patternProto = pattern.slice(0, protoEnd); const rest = pattern.slice(protoEnd + 3); // Protocol match if (patternProto !== "*" && patternProto !== protocol) return false; // Split rest into host and path const slashIdx = rest.indexOf("/"); const patternHost = slashIdx >= 0 ? rest.slice(0, slashIdx) : rest; const patternPath = slashIdx >= 0 ? rest.slice(slashIdx) : "/"; // Host match if (!matchHost(hostname, patternHost)) return false; // Path match if (!matchPath(pathname, patternPath)) return false; return true; } catch { return false; } } function matchHost(hostname, pattern) { if (pattern === "*") return true; if (pattern.startsWith("*.")) { const suffix = pattern.slice(2); return hostname === suffix || hostname.endsWith("." + suffix); } return hostname === pattern; } function matchPath(pathname, pattern) { if (pattern === "/*") return true; // Convert glob pattern to regex const regex = "^" + pattern .replace(/\./g, "\\.") .replace(/\*/g, ".*") .replace(/\?/g, ".") + "$"; return new RegExp(regex, "i").test(pathname); } function shouldInjectUserScript(url, meta) { let matched = false; for (const pattern of meta.matches) { if (urlMatchesPattern(url, pattern)) { matched = true; break; } } if (!matched) return false; for (const pattern of meta.excludes) { if (urlMatchesPattern(url, pattern)) { return false; } } return true; } // --- Browser helpers ------------------------------------------------------- function loadPlaywright() { try { return require("playwright"); } catch (error) { throw new Error( `Playwright is required. Run "npm install" and "npm run install-browsers". Original error: ${error.message}` ); } } // Manual stealth evasions injected into every page before any scripts run. const STEALTH_INIT_SCRIPT = ` (() => { const patchNavigator = () => { try { // Override webdriver getter without using delete (can crash renderer) if (navigator.webdriver !== undefined) { Object.defineProperty(navigator, 'webdriver', { get: () => undefined, configurable: true, enumerable: true }); } } catch (e) {} try { if (!window.chrome) { window.chrome = { runtime: {} }; } else if (!window.chrome.runtime) { window.chrome.runtime = {}; } } catch (e) {} try { const originalQuery = window.navigator.permissions?.query; if (originalQuery) { window.navigator.permissions.query = (parameters) => ( parameters.name === 'notifications' ? Promise.resolve({ state: Notification.permission }) : originalQuery(parameters) ); } } catch (e) {} }; if (document.readyState === 'loading') { document.addEventListener('DOMContentLoaded', patchNavigator); } else { patchNavigator(); } })(); `; function buildLaunchArgs(headless) { const args = [ "--disable-blink-features=AutomationControlled", "--disable-web-security", "--disable-features=IsolateOrigins,site-per-process", "--disable-site-isolation-trials", "--disable-infobars", "--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage", "--disable-accelerated-2d-canvas", "--disable-gpu", "--window-size=1366,768" ]; if (headless) { args.push("--headless=new"); } return args; } function buildIgnoreDefaultArgs() { return ["--enable-automation"]; } // --- Page helpers ---------------------------------------------------------- async function setupRequestBlocking(page, sourceHostname) { if ( !privacyFiltersAvailable || (filterRules.blockRules.length === 0 && filterRules.importantBlockRules.length === 0) ) { return; } await page.route("**/*", (route) => { try { const request = route.request(); if (request.isNavigationRequest() && request.frame() === page.mainFrame()) { route.continue(); return; } const url = request.url(); const type = request.resourceType(); if (shouldBlockRequest(url, type, sourceHostname)) { route.abort("blockedbyclient"); } else { route.continue(); } } catch { route.continue(); } }); } async function injectCosmeticFilters(page, hostname) { const lines = [COMMON_ANNOYANCE_CSS]; if (privacyFiltersAvailable && filterRules.cosmeticRules.length > 0) { lines.push(...getCosmeticCssForHostname(filterRules, hostname)); } if (lines.length > 0) { try { await page.addStyleTag({ content: lines.join("\n") }); } catch { // Ignore cosmetic injection failures. } } } async function removeCommonAnnoyances(page) { try { await page.evaluate(({ selectors, rootClasses, triggerSelectors, textPatterns }) => { const annoyanceText = textPatterns.map((pattern) => pattern.toLowerCase()); const textFor = (element) => (element.innerText || element.textContent || "") .replace(/\s+/g, " ") .trim() .toLowerCase(); const isTextTrigger = (element) => { const text = textFor(element); if (!text) return false; if (annoyanceText.some((pattern) => text.includes(pattern))) return true; return text.includes("ad blocker") && /allow|disable|turn off|subscribe|support/.test(text); }; const isLinkTrigger = (element) => { const href = `${element.getAttribute("href") || ""} ${element.href || ""}`.toLowerCase(); return href.includes("getadmiral.com") || href.includes("%67e%74%61%64mi%72%61l.com") || href.includes("admiraladblock"); }; const findModalRoot = (element) => { let node = element; let best = null; while (node && node !== document.body && node !== document.documentElement) { const style = window.getComputedStyle(node); const rect = node.getBoundingClientRect(); const zIndex = Number.parseInt(style.zIndex, 10); const positioned = /fixed|absolute|sticky/i.test(style.position); const wide = rect.width >= window.innerWidth * 0.5; const tall = rect.height >= window.innerHeight * 0.4; const fullScreen = rect.width >= window.innerWidth * 0.9 && rect.height >= window.innerHeight * 0.9; if ((positioned && (wide || tall || zIndex >= 1000)) || fullScreen) { best = node; } node = node.parentElement; } return best || element.closest("[role=\"dialog\"], [aria-modal=\"true\"]") || element; }; for (const selector of triggerSelectors) { try { document.querySelectorAll(selector).forEach((element) => { if (!isLinkTrigger(element) && !isTextTrigger(element)) return; const root = findModalRoot(element); if (root && root !== document.body && root !== document.documentElement) { root.remove(); } }); } catch { // Ignore selectors unsupported by the current browser. } } for (const selector of selectors) { try { document.querySelectorAll(selector).forEach((element) => element.remove()); } catch { // Ignore selectors unsupported by the current browser. } } for (const root of [document.documentElement, document.body].filter(Boolean)) { root.classList.remove(...rootClasses); root.removeAttribute("data-previous-scroll-y"); const overflow = root.style.overflow || ""; const position = root.style.position || ""; if (/hidden|clip/i.test(overflow)) { root.style.removeProperty("overflow"); } if (/fixed/i.test(position)) { root.style.removeProperty("position"); root.style.removeProperty("top"); root.style.removeProperty("left"); root.style.removeProperty("right"); } } }, { selectors: COMMON_ANNOYANCE_SELECTORS, rootClasses: COMMON_ANNOYANCE_ROOT_CLASSES, triggerSelectors: COMMON_ANNOYANCE_TRIGGER_SELECTORS, textPatterns: COMMON_ANNOYANCE_TEXT_PATTERNS }); } catch { // Ignore cleanup failures; the archive is still useful. } } const GM_MOCK = ` if (typeof GM === "undefined") { window.GM = { xmlHttpRequest: function(details) { fetch(details.url, { method: details.method || "GET", headers: details.headers || {}, body: details.data || null }) .then(response => response.text().then(text => ({ status: response.status, statusText: response.statusText, responseText: text, responseHeaders: Array.from(response.headers.entries()) .map(([k, v]) => k + ": " + v).join("\\r\\n") }))) .then(obj => { if (details.onload) details.onload(obj); }) .catch(err => { if (details.onerror) details.onerror(err); }); } }; } `; async function injectPrivacyUserScripts(page, sourceUrl) { if (!privacyFiltersAvailable || userScriptData.length === 0) return; const matching = userScriptData.filter((us) => shouldInjectUserScript(sourceUrl, us)); if (matching.length === 0) return; // Inject GM API mock first. try { await page.addScriptTag({ content: GM_MOCK }); if (userScriptRequireContent) { await page.addScriptTag({ content: userScriptRequireContent }); } } catch { return; } // Inject only matching userscripts. for (const us of matching) { try { await page.addScriptTag({ content: us.content }); } catch { // Ignore injection failures for individual scripts. } } } // --------------------------------------------------------------------------- // Archiving // --------------------------------------------------------------------------- export async function archivePage(input, options = {}) { const sourceUrl = inputToUrl(input); const archivePath = options.archivePath || defaultArchivePath(); const id = options.id || slugForUrl(sourceUrl); const filePath = path.join(archivePath, `${id}.html`); await fs.mkdir(archivePath, { recursive: true }); const renderedHtml = await renderPage(sourceUrl, options); const baseUrl = findEffectiveBase(renderedHtml, sourceUrl); const sourceHostname = new URL(sourceUrl).hostname; const inliner = new AssetInliner({ userAgent: DEFAULT_USER_AGENT, referer: isHttpUrl(sourceUrl) ? sourceUrl : undefined, shouldBlockAsset: (assetUrl, resourceType) => shouldBlockRequest(assetUrl, resourceType, sourceHostname) }); const inlined = await inliner.inlineHtml(renderedHtml, baseUrl); const finalHtml = addArchiveComment(inlined, sourceUrl); await fs.writeFile(filePath, finalHtml, "utf8"); return { id, filePath, sourceUrl, archivePath, warnings: inliner.warnings, externalAssets: findExternalAssetRefs(finalHtml) }; } export async function renderPage(sourceUrl, options = {}) { const playwright = loadPlaywright(); const hasDisplay = !!(process.env.DISPLAY || process.env.WAYLAND_DISPLAY); const headless = options.headless !== false && !hasDisplay; const browser = await playwright.chromium.launch({ headless, args: buildLaunchArgs(headless), ignoreDefaultArgs: buildIgnoreDefaultArgs() }); try { const context = await browser.newContext({ userAgent: options.userAgent || DEFAULT_USER_AGENT, viewport: VIEWPORT, locale: options.locale || "en-US", timezoneId: options.timezoneId || "America/New_York" }); // Inject stealth evasions into every new page before any scripts run. await context.addInitScript(STEALTH_INIT_SCRIPT); const page = await context.newPage(); const sourceHostname = new URL(sourceUrl).hostname; // Block paywall/tracker requests before the page loads. await setupRequestBlocking(page, sourceHostname); await page.goto(sourceUrl, { waitUntil: "domcontentloaded", timeout: PAGE_TIMEOUT_MS }); // Inject cosmetic CSS and userscripts to strip paywalls / ads. await injectCosmeticFilters(page, sourceHostname); await injectPrivacyUserScripts(page, sourceUrl); // Give the userscripts a moment to run their setTimeout callbacks. const userscriptDelay = options.userscriptDelay || 2000; await page.waitForTimeout(userscriptDelay); await waitForNetworkIdle(page); await removeCommonAnnoyances(page); await snapshotLoadedResourceUrls(page); await snapshotRuntimeStyles(page); return await page.content(); } finally { await browser.close(); } } async function waitForNetworkIdle(page) { try { await page.waitForLoadState("networkidle", { timeout: NETWORK_IDLE_TIMEOUT_MS }); } catch { // Some pages keep sockets open; the DOM snapshot is still useful. } } async function snapshotLoadedResourceUrls(page) { await page.evaluate(() => { document.querySelectorAll("img").forEach((img) => { if (img.currentSrc) { img.setAttribute("src", img.currentSrc); } }); document.querySelectorAll("video,audio").forEach((media) => { if (media.currentSrc) { media.setAttribute("src", media.currentSrc); } }); document.querySelectorAll("iframe").forEach((frame) => { try { const doc = frame.contentDocument; if (doc?.documentElement) { frame.setAttribute("srcdoc", "" + doc.documentElement.outerHTML); frame.removeAttribute("src"); } } catch { // Cross-origin frames are handled later by the asset inliner when possible. } }); }); } async function snapshotRuntimeStyles(page) { await page.evaluate(() => { const serializeRules = (sheet) => { try { return Array.from(sheet.cssRules || []) .map((rule) => rule.cssText) .join("\n"); } catch { return ""; } }; for (const sheet of Array.from(document.styleSheets)) { const css = serializeRules(sheet); if (!css.trim()) { continue; } const owner = sheet.ownerNode; if (owner instanceof HTMLStyleElement && !owner.textContent.trim()) { owner.textContent = css; } } const adoptedStyleSheets = Array.from(document.adoptedStyleSheets || []); adoptedStyleSheets.forEach((sheet, index) => { const css = serializeRules(sheet); if (!css.trim()) { return; } const style = document.createElement("style"); style.setAttribute("data-archiver-adopted-stylesheet", String(index)); style.textContent = css; document.head.appendChild(style); }); }); } function addArchiveComment(html, sourceUrl) { const safeSource = String(sourceUrl).replaceAll("--", "- -"); const comment = ``; if (/]*>/i, (doctype) => `${doctype}\n${comment}`); } return `\n${comment}\n${html}`; } export function findExternalAssetRefs(html) { const refs = new Set(); const assetTagPattern = /<(?:img|source|audio|video|track|embed|object|input|iframe)\b[^>]*>/gi; for (const match of html.matchAll(assetTagPattern)) { const tag = match[0]; for (const attr of ["src", "srcset", "poster", "data"]) { const value = readAttribute(tag, attr); if (!value) { continue; } if (attr === "srcset") { addSrcsetRefs(refs, value); continue; } if (isSelfContainedAssetRef(value)) { continue; } for (const part of value.split(",")) { const candidate = part.trim().split(/\s+/)[0]; if (candidate && !isSelfContainedAssetRef(candidate)) { refs.add(candidate); } } } } const linkPattern = /]*>/gi; for (const match of html.matchAll(linkPattern)) { const tag = match[0]; const rel = readAttribute(tag, "rel") || ""; if (!/\b(?:stylesheet|icon|apple-touch-icon|image_src|preload)\b/i.test(rel)) { continue; } const href = readAttribute(tag, "href"); if (href && !isSelfContainedAssetRef(href)) { refs.add(href); } const imageSrcset = readAttribute(tag, "imagesrcset"); if (imageSrcset) { addSrcsetRefs(refs, imageSrcset); } } const cssUrlPattern = /url\(\s*(["']?)([^"')]+)\1\s*\)/gi; for (const match of html.matchAll(cssUrlPattern)) { const candidate = cleanCssUrl(match[2]); if (candidate && !isSelfContainedAssetRef(candidate)) { refs.add(candidate); } } return Array.from(refs).sort(); } function addSrcsetRefs(refs, srcset) { for (const part of splitSrcset(srcset)) { const candidate = part.trim().split(/\s+/)[0]; if (candidate && !isSelfContainedAssetRef(candidate)) { refs.add(candidate); } } } function isSelfContainedAssetRef(value) { const trimmed = cleanCssUrl(value); return ( !trimmed || trimmed.startsWith("#") || /^%23/i.test(trimmed) || /^(?:data|about|javascript|mailto|tel):/i.test(trimmed) ); } function readAttribute(tag, attr) { const match = findAttribute(tag, attr); return match ? match.value : ""; } function cleanCssUrl(value) { const decoded = String(value) .trim() .replaceAll("&", "&") .replaceAll(""", '"') .replaceAll("'", "'") .replaceAll("'", "'"); const quote = decoded[0]; if ((quote === '"' || quote === "'") && decoded.at(-1) === quote) { return decoded.slice(1, -1).trim(); } return decoded; } function findAttribute(openingTag, attr) { const attrLower = attr.toLowerCase(); const nameMatch = openingTag.match(/^<[^\s/>]+/); let index = nameMatch ? nameMatch[0].length : 1; while (index < openingTag.length) { while (index < openingTag.length && /\s/.test(openingTag[index])) { index += 1; } if (index >= openingTag.length || openingTag[index] === ">" || openingTag[index] === "/") { return null; } const start = index; while (index < openingTag.length && !/[\s=/>]/.test(openingTag[index])) { index += 1; } const name = openingTag.slice(start, index); while (index < openingTag.length && /\s/.test(openingTag[index])) { index += 1; } let value = ""; if (openingTag[index] === "=") { index += 1; while (index < openingTag.length && /\s/.test(openingTag[index])) { index += 1; } const quote = openingTag[index]; if (quote === '"' || quote === "'") { index += 1; const valueStart = index; while (index < openingTag.length && openingTag[index] !== quote) { index += 1; } value = openingTag.slice(valueStart, index); if (openingTag[index] === quote) { index += 1; } } else { const valueStart = index; while (index < openingTag.length && !/[\s>]/.test(openingTag[index])) { index += 1; } value = openingTag.slice(valueStart, index); } } if (name.toLowerCase() === attrLower) { return { start, end: index, value }; } } return null; }