import fs from "node:fs/promises";
import path from "node:path";
import { createRequire } from "node:module";
import { fileURLToPath } from "node:url";
import {
  AssetInliner,
  DEFAULT_USER_AGENT,
  defaultArchivePath,
  findEffectiveBase,
  inputToUrl,
  isHttpUrl,
  slugForUrl
} from "./asset-inliner.mjs";

const require = createRequire(import.meta.url);
const __dirname = path.dirname(fileURLToPath(import.meta.url));

const PAGE_TIMEOUT_MS = 60000;
const NETWORK_IDLE_TIMEOUT_MS = 5000;
const VIEWPORT = { width: 1366, height: 768 };

export { DEFAULT_USER_AGENT, defaultArchivePath };

// ---------------------------------------------------------------------------
// Privacy filters integration
// ---------------------------------------------------------------------------

const PRIVACY_FILTERS_DIR = path.join(__dirname, "..", "privacy-filters");

// NOTE: every constant below must be declared before the top-level
// `await loadPrivacyFilters()` call, because the parsers it invokes read them.

/** Userscript bundles read from the `userscript` sub-directory. */
const USER_SCRIPT_FILES = [
  "bpc.en.user.js",
  "bpc.de.user.js",
  "bpc.es.pt.user.js",
  "bpc.fi.se.user.js",
  "bpc.fr.user.js",
  "bpc.it.user.js",
  "bpc.nl.user.js",
  "bpc.pl.user.js"
];

/** Resource-type options that are recognised on a network rule. */
const NETWORK_RULE_TYPES = new Set([
  "document",
  "font",
  "image",
  "inline-script",
  "media",
  "object",
  "other",
  "script",
  "stylesheet",
  "subdocument",
  "xmlhttprequest"
]);

/** Filter type option -> resource type strings it is compared against. */
const RESOURCE_TYPE_MAP = new Map([
  ["document", ["document"]],
  ["font", ["font"]],
  ["image", ["image"]],
  ["inline-script", ["script"]],
  ["media", ["media"]],
  ["object", ["object"]],
  ["other", ["other"]],
  ["script", ["script"]],
  ["stylesheet", ["stylesheet"]],
  ["subdocument", ["document"]],
  ["xmlhttprequest", ["fetch", "xhr"]]
]);

/** Procedural cosmetic operators that cannot be expressed as plain CSS. */
const UNSUPPORTED_COSMETIC_OPERATORS = [
  ":remove()",
  ":matches-css",
  ":matches-media",
  ":xpath(",
  ":upward(",
  ":matches-path",
  ":has-text(",
  ":matches-attr(",
  ":min-text-length(",
  ":watch-attr(",
  ":remove-attr(",
  ":remove-class("
];

/** Characters escaped when turning adblock patterns into regex source (`*` is handled separately). */
const REGEX_SPECIALS = /[|\\{}()[\]^$+?.]/g;

/** Cache of compiled case-insensitive regexes keyed by source; `null` marks an invalid source. */
const regexCache = new Map();

let privacyFiltersAvailable = false;
let filterRules = { blockRules: [], allowRules: [], cosmeticRules: [] };
let userScriptData = []; // { file, content, matches, excludes }
let userScriptRequireContent = "";

/**
 * Loads the filter list and userscripts from PRIVACY_FILTERS_DIR into module state.
 *
 * Best effort: when anything cannot be read or parsed the module state is left
 * untouched (filters stay disabled) and archiving proceeds without them.
 * State is only committed once every file has been read, so a failure never
 * leaves a half-populated configuration behind.
 *
 * @returns {Promise<void>}
 */
async function loadPrivacyFilters() {
  const userscriptDir = path.join(PRIVACY_FILTERS_DIR, "userscript");
  try {
    const [filterContent, requireContent, scripts] = await Promise.all([
      fs.readFile(path.join(PRIVACY_FILTERS_DIR, "bpc-paywall-filter.txt"), "utf8"),
      fs.readFile(path.join(userscriptDir, "bpc_func.js"), "utf8"),
      Promise.all(
        USER_SCRIPT_FILES.map(async (file) => {
          const content = await fs.readFile(path.join(userscriptDir, file), "utf8");
          return { file, content, ...parseUserScriptMetadata(content) };
        })
      )
    ]);
    const parsedRules = parseFilterRules(filterContent);

    filterRules = parsedRules;
    userScriptRequireContent = requireContent;
    userScriptData = scripts;
    privacyFiltersAvailable = true;
  } catch (error) {
    // A missing directory/file is the expected "filters not installed" case.
    // Anything else is still non-fatal but worth surfacing.
    if (error?.code !== "ENOENT") {
      console.warn(`Privacy filters could not be loaded; archiving without them: ${error?.message ?? error}`);
    }
  }
}

await loadPrivacyFilters();

// --- Adblock filter parsing ------------------------------------------------

/**
 * Parses adblock-style filter list text into network and cosmetic rules.
 *
 * Skipped lines: blanks, `!` comments, everything between `!#if` and `!#endif`,
 * `[...]` list headers, cosmetic exceptions (`#@#`) and `##+js` scriptlets.
 *
 * @param {string} content - Raw filter list text.
 * @returns {{ blockRules: object[], allowRules: object[], cosmeticRules: { domains: string, css: string }[] }}
 */
function parseFilterRules(content) {
  const blockRules = [];
  const allowRules = [];
  const cosmeticRules = [];
  let inPreprocessor = false;

  for (const rawLine of content.split("\n")) {
    const line = rawLine.trim();
    if (!line) continue;

    if (line.startsWith("!#if")) {
      inPreprocessor = true;
      continue;
    }
    if (line.startsWith("!#endif")) {
      inPreprocessor = false;
      continue;
    }
    if (inPreprocessor || line.startsWith("!")) continue;

    // List header such as "[Adblock Plus 2.0]" is metadata, not a rule.
    if (line.startsWith("[") && line.endsWith("]")) continue;

    // Cosmetic exception (#@#) – skip.
    if (line.includes("#@#")) continue;

    // Exception network rules
    if (line.startsWith("@@")) {
      const allowRule = parseNetworkRule(line.slice(2));
      if (allowRule) allowRules.push(allowRule);
      continue;
    }

    // Cosmetic filters
    const hashIdx = line.indexOf("##");
    if (hashIdx >= 0) {
      const selector = line.slice(hashIdx + 2);
      if (!selector.startsWith("+js")) {
        const css = cosmeticSelectorToCss(selector);
        if (css) {
          cosmeticRules.push({ domains: line.slice(0, hashIdx), css });
        }
      }
      continue;
    }

    // Network rules
    const blockRule = parseNetworkRule(line);
    if (blockRule) blockRules.push(blockRule);
  }

  return { blockRules, allowRules, cosmeticRules };
}

/**
 * Parses one network rule (without a leading `@@`) into a rule object.
 *
 * The text after the last `$` is treated as an option list only when it looks
 * like one; otherwise the `$` is kept as part of the pattern.
 *
 * @param {string} line - Rule text.
 * @returns {object|null} A rule of kind "domain" (`||host/path`), "regex"
 *   (`/.../`) or "pattern" (anything else); null when the pattern is empty.
 */
function parseNetworkRule(line) {
  let options = [];
  let pattern = line;

  const lastDollar = line.lastIndexOf("$");
  if (lastDollar > 0) {
    const optsStr = line.slice(lastDollar + 1);
    if (/^[a-z,=~_.\-|0-9]+$/i.test(optsStr)) {
      options = optsStr.split(",");
      pattern = line.slice(0, lastDollar);
    }
  }
  if (!pattern) return null;

  const includeDomains = [];
  const excludeDomains = [];
  const domainOpt = options.find((o) => o.startsWith("domain="));
  if (domainOpt) {
    for (const d of domainOpt.slice("domain=".length).split("|")) {
      if (d.startsWith("~")) {
        excludeDomains.push(d.slice(1));
      } else {
        includeDomains.push(d);
      }
    }
  }

  // Fields shared by every rule kind.
  const shared = {
    types: options.filter((o) => NETWORK_RULE_TYPES.has(o)),
    isThirdParty: options.includes("third-party"),
    isFirstParty: options.includes("~third-party"),
    includeDomains,
    excludeDomains,
    important: options.includes("important")
  };

  if (pattern.startsWith("||")) {
    const [domain, ...pathParts] = pattern.slice(2).replace(/\^$/, "").split("/");
    // Named `rulePath` so it does not shadow the imported `path` module.
    const rulePath = pathParts.length > 0 ? `/${pathParts.join("/")}` : "";
    return { kind: "domain", domain, path: rulePath, ...shared };
  }

  if (pattern.startsWith("/") && pattern.endsWith("/") && pattern.length > 1) {
    return { kind: "regex", regex: pattern.slice(1, -1), ...shared };
  }

  return { kind: "pattern", regex: adblockPatternToRegex(pattern), ...shared };
}

/**
 * Converts a cosmetic filter selector into a CSS rule.
 *
 * `sel:remove()` becomes a hide rule, `sel:style(decls)` becomes `sel { decls }`,
 * and selectors using other procedural operators are dropped.
 *
 * @param {string} selector - Text after `##` in a cosmetic filter.
 * @returns {string|null} CSS rule text, or null when unsupported.
 */
function cosmeticSelectorToCss(selector) {
  const removeSuffix = ":remove()";
  if (selector.endsWith(removeSuffix)) {
    const baseSelector = selector.slice(0, -removeSuffix.length);
    return baseSelector ? `${baseSelector} { display: none !important; }` : null;
  }

  const styleMatch = selector.match(/:style\((.+)\)$/);
  if (styleMatch) {
    const baseSelector = selector.slice(0, selector.lastIndexOf(":style("));
    return `${baseSelector} { ${styleMatch[1]} }`;
  }

  if (UNSUPPORTED_COSMETIC_OPERATORS.some((op) => selector.includes(op))) {
    return null;
  }

  return `${selector} { display: none !important; }`;
}

/**
 * True when `hostname` equals `domain` or is a subdomain of it.
 *
 * @param {string} hostname
 * @param {string} domain
 * @returns {boolean}
 */
function hostMatchesDomain(hostname, domain) {
  return hostname === domain || hostname.endsWith(`.${domain}`);
}

/**
 * Decides whether a cosmetic rule's domain list applies to `hostname`.
 *
 * An empty list or `*` applies everywhere. Any matching `~negated` entry
 * excludes the host. When positive entries are present, at least one of them
 * must match; a list made only of negations applies to every other host.
 *
 * @param {string} domainSpec - Comma-separated domain list preceding `##`.
 * @param {string} hostname - Hostname of the page being archived.
 * @returns {boolean}
 */
function matchesCosmeticDomains(domainSpec, hostname) {
  if (!domainSpec || domainSpec === "*") return true;

  const included = [];
  for (const entry of domainSpec.split(",")) {
    if (entry.startsWith("~")) {
      if (hostMatchesDomain(hostname, entry.slice(1))) return false;
    } else {
      included.push(entry);
    }
  }

  if (included.length === 0) return true;
  return included.some((d) => hostMatchesDomain(hostname, d));
}

/**
 * Compiles `source` as a case-insensitive RegExp, memoising the result.
 *
 * @param {string} source - Regex source text.
 * @returns {RegExp|null} The compiled regex, or null when the source is invalid.
 */
function compileRegex(source) {
  if (regexCache.has(source)) return regexCache.get(source);
  let compiled = null;
  try {
    compiled = new RegExp(source, "i");
  } catch {
    compiled = null;
  }
  regexCache.set(source, compiled);
  return compiled;
}

/**
 * Tests one request against one parsed network rule.
 *
 * @param {string} url - Full request URL.
 * @param {URL} urlObj - Parsed form of `url`.
 * @param {string} hostname - Hostname of the request.
 * @param {string} resourceType - Resource type reported for the request.
 * @param {string} sourceHostname - Hostname of the page being archived.
 * @param {object} rule - Rule produced by parseNetworkRule.
 * @returns {boolean}
 */
function matchesNetworkRule(url, urlObj, hostname, resourceType, sourceHostname, rule) {
  if (
    rule.includeDomains.length > 0 &&
    !rule.includeDomains.some((d) => hostMatchesDomain(sourceHostname, d))
  ) {
    return false;
  }
  if (rule.excludeDomains.some((d) => hostMatchesDomain(sourceHostname, d))) {
    return false;
  }
  if (
    rule.types.length > 0 &&
    !rule.types.some((type) => resourceTypeMatches(type, resourceType))
  ) {
    return false;
  }

  if (rule.isThirdParty || rule.isFirstParty) {
    // A request counts as first-party only when its host is the page host or a
    // subdomain of it; sibling subdomains are treated as third-party.
    const isThirdPartyRequest = !hostMatchesDomain(hostname, sourceHostname);
    if (rule.isThirdParty && !isThirdPartyRequest) return false;
    if (rule.isFirstParty && isThirdPartyRequest) return false;
  }

  if (rule.kind === "domain") {
    if (!domainPatternMatches(hostname, rule.domain)) return false;
    if (!rule.path) return true;
    const pathRe = compileRegex(`^${adblockPatternToRegex(rule.path)}`);
    return pathRe !== null && pathRe.test(urlObj.pathname + urlObj.search);
  }

  if (rule.kind === "regex" || rule.kind === "pattern") {
    const re = compileRegex(rule.regex);
    return re !== null && re.test(url);
  }

  return false;
}

/**
 * Tells whether a filter type option covers the given resource type.
 *
 * @param {string} filterType - Type option from a rule (e.g. "xmlhttprequest").
 * @param {string} resourceType - Resource type of the request (e.g. "xhr").
 * @returns {boolean} False for unknown filter types.
 */
function resourceTypeMatches(filterType, resourceType) {
  const mapped = RESOURCE_TYPE_MAP.get(filterType);
  return mapped ? mapped.includes(resourceType) : false;
}

/**
 * Matches a hostname against the domain part of a `||domain` rule.
 *
 * Without `*` the pattern matches the host itself and its subdomains. With
 * `*`, each wildcard stands for characters within a single label (no dots)
 * and the whole hostname must match.
 *
 * @param {string} hostname
 * @param {string} pattern
 * @returns {boolean}
 */
function domainPatternMatches(hostname, pattern) {
  const normalized = pattern.replace(/\^$/, "").toLowerCase();
  if (!normalized) return false;

  if (!normalized.includes("*")) {
    return hostMatchesDomain(hostname, normalized);
  }

  const body = normalized
    .split("*")
    .map((part) => part.replace(REGEX_SPECIALS, "\\$&"))
    .join("[^.]*");
  const re = compileRegex(`^${body}$`);
  return re !== null && re.test(hostname);
}

/**
 * Translates an adblock URL pattern into regex source text.
 *
 * A leading/trailing `|` anchors the start/end, `*` matches anything, and `^`
 * matches a separator character or the end of the URL. Every other character
 * is matched literally.
 *
 * @param {string} pattern
 * @returns {string} Regex source (not compiled).
 */
function adblockPatternToRegex(pattern) {
  let remaining = pattern;

  const anchoredStart = remaining.startsWith("|");
  if (anchoredStart) remaining = remaining.slice(1);

  const anchoredEnd = remaining.endsWith("|");
  if (anchoredEnd) remaining = remaining.slice(0, -1);

  let source = "";
  for (const ch of remaining) {
    if (ch === "*") {
      source += ".*";
    } else if (ch === "^") {
      source += "(?:[^A-Za-z0-9_.%-]|$)";
    } else {
      source += ch.replace(REGEX_SPECIALS, "\\$&");
    }
  }

  return `${anchoredStart ? "^" : ""}${source}${anchoredEnd ? "$" : ""}`;
}

/**
 * Decides whether a request should be aborted according to the loaded rules.
 *
 * Precedence, highest first: `$important` exception rules, `$important` block
 * rules, ordinary exception rules, ordinary block rules. URLs that cannot be
 * parsed are never blocked.
 *
 * @param {string} url - Full request URL.
 * @param {string} resourceType - Resource type of the request.
 * @param {string} sourceHostname - Hostname of the page being archived.
 * @returns {boolean} True when the request should be blocked.
 */
function shouldBlockRequest(url, resourceType, sourceHostname) {
  let urlObj;
  try {
    urlObj = new URL(url);
  } catch {
    return false;
  }

  const hostname = urlObj.hostname;
  const matches = (rule) =>
    matchesNetworkRule(url, urlObj, hostname, resourceType, sourceHostname, rule);
  const { allowRules, blockRules } = filterRules;

  if (allowRules.some((rule) => rule.important && matches(rule))) return false;
  if (blockRules.some((rule) => rule.important && matches(rule))) return true;
  if (allowRules.some((rule) => !rule.important && matches(rule))) return false;
  return blockRules.some((rule) => !rule.important && matches(rule));
}

// --- Userscript metadata parsing -------------------------------------------

/**
 * Extracts `@match` and `@exclude` patterns from a userscript header block.
 *
 * @param {string} content - Full userscript source.
 * @returns {{ matches: string[], excludes: string[] }} Both lists are empty
 *   when no `==UserScript==` block is found.
 */
function parseUserScriptMetadata(content) {
  const matches = [];
  const excludes = [];

  const metaBlock = content.match(/\/\/\s*==UserScript==([\s\S]*?)\/\/\s*==\/UserScript==/);
  if (!metaBlock) return { matches, excludes };

  for (const line of metaBlock[1].split("\n")) {
    const matchDirective = line.match(/@match\s+(.+)/);
    if (matchDirective) {
      matches.push(matchDirective[1].trim());
      continue;
    }
    const excludeDirective = line.match(/@exclude\s+(.+)/);
    if (excludeDirective) {
      excludes.push(excludeDirective[1].trim());
    }
  }

  return { matches, excludes };
}

/**
 * Tests a URL against a userscript match pattern such as `*://*.example.com/*`.
 *
 * The pattern must contain `://`; scheme, host and path are compared
 * separately. Only the URL's pathname takes part in the path comparison.
 *
 * @param {string} url
 * @param {string} pattern
 * @returns {boolean} False for unparsable URLs or malformed patterns.
 */
function urlMatchesPattern(url, pattern) {
  try {
    const urlObj = new URL(url);
    const protocol = urlObj.protocol.slice(0, -1); // "http" or "https"

    const protoEnd = pattern.indexOf("://");
    if (protoEnd < 0) return false;

    const patternProto = pattern.slice(0, protoEnd);
    if (patternProto !== "*" && patternProto !== protocol) return false;

    // Everything after "://" is host followed by an optional path.
    const rest = pattern.slice(protoEnd + 3);
    const slashIdx = rest.indexOf("/");
    const patternHost = slashIdx >= 0 ? rest.slice(0, slashIdx) : rest;
    const patternPath = slashIdx >= 0 ? rest.slice(slashIdx) : "/";

    return matchHost(urlObj.hostname, patternHost) && matchPath(urlObj.pathname, patternPath);
  } catch {
    return false;
  }
}

/**
 * Matches a hostname against the host part of a match pattern.
 *
 * @param {string} hostname
 * @param {string} pattern - `*`, `*.domain` (domain and subdomains) or an exact host.
 * @returns {boolean}
 */
function matchHost(hostname, pattern) {
  if (pattern === "*") return true;
  if (pattern.startsWith("*.")) {
    return hostMatchesDomain(hostname, pattern.slice(2));
  }
  return hostname === pattern;
}

/**
 * Matches a pathname against the path part of a match pattern.
 *
 * `*` matches any run of characters and `?` any single character; every other
 * character is matched literally (case-insensitively).
 *
 * @param {string} pathname
 * @param {string} pattern
 * @returns {boolean}
 */
function matchPath(pathname, pattern) {
  if (pattern === "/*") return true;

  // Escape regex metacharacters first so only the glob wildcards stay special.
  const source = pattern
    .replace(/[|\\{}()[\]^$+.]/g, "\\$&")
    .replace(/\*/g, ".*")
    .replace(/\?/g, ".");
  return new RegExp(`^${source}$`, "i").test(pathname);
}

/**
 * Tells whether a userscript applies to `url`: at least one `@match` pattern
 * must match and no `@exclude` pattern may match.
 *
 * @param {string} url
 * @param {{ matches: string[], excludes: string[] }} meta
 * @returns {boolean}
 */
function shouldInjectUserScript(url, meta) {
  const isMatched = meta.matches.some((pattern) => urlMatchesPattern(url, pattern));
  if (!isMatched) return false;
  return !meta.excludes.some((pattern) => urlMatchesPattern(url, pattern));
}

// --- Browser helpers -------------------------------------------------------

/**
 * Loads the `playwright` package on demand.
 *
 * @returns {object} The playwright module.
 * @throws {Error} With installation instructions when the package cannot be loaded.
 */
function loadPlaywright() {
  try {
    return require("playwright");
  } catch (error) {
    throw new Error(
      `Playwright is required. Run "npm install" and "npm run install-browsers". Original error: ${error.message}`,
      { cause: error }
    );
  }
}

// Manual stealth evasions injected into every page before any scripts run.
// NOTE(review): the patches are deferred to DOMContentLoaded while the document
// is still loading, so page scripts that run earlier see the unpatched values.
// Confirm whether the deferral is intentional.
const STEALTH_INIT_SCRIPT = `
(() => {
  const patchNavigator = () => {
    try {
      // Override webdriver getter without using delete (can crash renderer)
      if (navigator.webdriver !== undefined) {
        Object.defineProperty(navigator, 'webdriver', {
          get: () => undefined,
          configurable: true,
          enumerable: true
        });
      }
    } catch (e) {}
    try {
      if (!window.chrome) {
        window.chrome = { runtime: {} };
      } else if (!window.chrome.runtime) {
        window.chrome.runtime = {};
      }
    } catch (e) {}
    try {
      const permissions = window.navigator.permissions;
      const originalQuery = permissions?.query;
      if (originalQuery) {
        // Call the native method with its receiver; invoking it unbound throws.
        permissions.query = (parameters) => (
          parameters.name === 'notifications'
            ? Promise.resolve({ state: Notification.permission })
            : originalQuery.call(permissions, parameters)
        );
      }
    } catch (e) {}
  };
  if (document.readyState === 'loading') {
    document.addEventListener('DOMContentLoaded', patchNavigator);
  } else {
    patchNavigator();
  }
})();
`;

/**
 * Builds the Chromium command-line arguments used for archiving.
 *
 * @param {boolean} headless - When true, `--headless=new` is appended.
 * @returns {string[]}
 */
function buildLaunchArgs(headless) {
  const args = [
    "--disable-blink-features=AutomationControlled",
    "--disable-web-security",
    "--disable-features=IsolateOrigins,site-per-process",
    "--disable-site-isolation-trials",
    "--disable-infobars",
    "--no-sandbox",
    "--disable-setuid-sandbox",
    "--disable-dev-shm-usage",
    "--disable-accelerated-2d-canvas",
    "--disable-gpu",
    // Keep the window in step with the context viewport.
    `--window-size=${VIEWPORT.width},${VIEWPORT.height}`
  ];
  if (headless) {
    args.push("--headless=new");
  }
  return args;
}

/**
 * Lists the default launch arguments that must be suppressed.
 *
 * @returns {string[]}
 */
function buildIgnoreDefaultArgs() {
  return ["--enable-automation"];
}

// --- Page helpers ----------------------------------------------------------

/**
 * Installs a route handler that aborts requests matched by the block rules.
 * Does nothing when no filters are loaded. Navigation requests of the main
 * frame are always allowed through.
 *
 * @param {object} page - Playwright page.
 * @param {string} sourceHostname - Hostname of the page being archived.
 * @returns {Promise<void>}
 */
async function setupRequestBlocking(page, sourceHostname) {
  if (!privacyFiltersAvailable || filterRules.blockRules.length === 0) return;

  await page.route("**/*", async (route) => {
    let shouldAbort = false;
    try {
      const request = route.request();
      const isMainNavigation =
        request.isNavigationRequest() && request.frame() === page.mainFrame();
      shouldAbort =
        !isMainNavigation &&
        shouldBlockRequest(request.url(), request.resourceType(), sourceHostname);
    } catch {
      // If the decision itself fails, let the request through.
      shouldAbort = false;
    }

    try {
      if (shouldAbort) {
        await route.abort("blockedbyclient");
      } else {
        await route.continue();
      }
    } catch {
      // The request could not be resolved (for example the page is gone);
      // swallow the error so it does not become an unhandled rejection.
    }
  });
}

/**
 * Adds a style tag containing every cosmetic rule that applies to `hostname`.
 * Injection failures are ignored.
 *
 * @param {object} page - Playwright page.
 * @param {string} hostname - Hostname of the page being archived.
 * @returns {Promise<void>}
 */
async function injectCosmeticFilters(page, hostname) {
  if (!privacyFiltersAvailable || filterRules.cosmeticRules.length === 0) return;

  const applicableCss = filterRules.cosmeticRules
    .filter((rule) => matchesCosmeticDomains(rule.domains, hostname))
    .map((rule) => rule.css);
  if (applicableCss.length === 0) return;

  try {
    await page.addStyleTag({ content: applicableCss.join("\n") });
  } catch {
    // Ignore cosmetic injection failures.
  }
}

// Minimal `GM.xmlHttpRequest` stand-in backed by `fetch`, defined only when the
// page has no `GM` global.
const GM_MOCK = `
if (typeof GM === "undefined") {
  window.GM = {
    xmlHttpRequest: function(details) {
      fetch(details.url, {
        method: details.method || "GET",
        headers: details.headers || {},
        body: details.data || null
      })
        .then(response => response.text().then(text => ({
          status: response.status,
          statusText: response.statusText,
          responseText: text,
          responseHeaders: Array.from(response.headers.entries())
            .map(([k, v]) => k + ": " + v).join("\\r\\n")
        })))
        .then(obj => { if (details.onload) details.onload(obj); })
        .catch(err => { if (details.onerror) details.onerror(err); });
    }
  };
}
`;

/**
 * Injects the GM mock, the shared userscript library and every userscript
 * whose metadata matches `sourceUrl`. If the mock or library cannot be
 * injected, no userscript is injected; individual userscript failures are
 * ignored.
 *
 * @param {object} page - Playwright page.
 * @param {string} sourceUrl - URL of the page being archived.
 * @returns {Promise<void>}
 */
async function injectPrivacyUserScripts(page, sourceUrl) {
  if (!privacyFiltersAvailable || userScriptData.length === 0) return;

  const matching = userScriptData.filter((us) => shouldInjectUserScript(sourceUrl, us));
  if (matching.length === 0) return;

  // Inject GM API mock first.
  try {
    await page.addScriptTag({ content: GM_MOCK });
    if (userScriptRequireContent) {
      await page.addScriptTag({ content: userScriptRequireContent });
    }
  } catch {
    return;
  }

  // Inject only matching userscripts.
  for (const { content } of matching) {
    try {
      await page.addScriptTag({ content });
    } catch {
      // Ignore injection failures for individual scripts.
    }
  }
}

// ---------------------------------------------------------------------------
// Archiving
// ---------------------------------------------------------------------------

/**
 * Renders a page, inlines its assets and writes the result to
 * `<archivePath>/<id>.html`.
 *
 * @param {string} input - Value converted to a URL by `inputToUrl`.
 * @param {object} [options]
 * @param {string} [options.archivePath] - Output directory (default: `defaultArchivePath()`).
 * @param {string} [options.id] - File name stem (default: `slugForUrl(sourceUrl)`).
 * @param {string} [options.userAgent] - User agent for rendering and asset fetching.
 * @returns {Promise<{ id: string, filePath: string, sourceUrl: string, archivePath: string, warnings: *, externalAssets: string[] }>}
 */
export async function archivePage(input, options = {}) {
  const sourceUrl = inputToUrl(input);
  const archivePath = options.archivePath || defaultArchivePath();
  const id = options.id || slugForUrl(sourceUrl);
  const filePath = path.join(archivePath, `${id}.html`);

  await fs.mkdir(archivePath, { recursive: true });

  const renderedHtml = await renderPage(sourceUrl, options);
  const baseUrl = findEffectiveBase(renderedHtml, sourceUrl);
  const inliner = new AssetInliner({
    // Fetch assets with the same user agent the page was rendered with.
    userAgent: options.userAgent || DEFAULT_USER_AGENT,
    referer: isHttpUrl(sourceUrl) ? sourceUrl : undefined
  });
  const inlined = await inliner.inlineHtml(renderedHtml, baseUrl);
  const finalHtml = addArchiveComment(inlined, sourceUrl);

  await fs.writeFile(filePath, finalHtml, "utf8");

  return {
    id,
    filePath,
    sourceUrl,
    archivePath,
    warnings: inliner.warnings,
    externalAssets: findExternalAssetRefs(finalHtml)
  };
}

/**
 * Loads `sourceUrl` in Chromium, applies the privacy filters and returns the
 * serialized DOM.
 *
 * The browser runs headless unless `options.headless` is `false` or a display
 * (`DISPLAY` / `WAYLAND_DISPLAY`) is present in the environment.
 *
 * @param {string} sourceUrl - Absolute URL to render.
 * @param {object} [options]
 * @param {boolean} [options.headless]
 * @param {string} [options.userAgent] - Default: DEFAULT_USER_AGENT.
 * @param {string} [options.locale] - Default: "en-US".
 * @param {string} [options.timezoneId] - Default: "America/New_York".
 * @param {number} [options.userscriptDelay] - Milliseconds to wait after
 *   injecting userscripts (default 2000; 0 disables the wait).
 * @returns {Promise<string>} HTML of the rendered page.
 */
export async function renderPage(sourceUrl, options = {}) {
  const playwright = loadPlaywright();
  const hasDisplay = Boolean(process.env.DISPLAY || process.env.WAYLAND_DISPLAY);
  const headless = options.headless !== false && !hasDisplay;

  const browser = await playwright.chromium.launch({
    headless,
    args: buildLaunchArgs(headless),
    ignoreDefaultArgs: buildIgnoreDefaultArgs()
  });

  try {
    const context = await browser.newContext({
      userAgent: options.userAgent || DEFAULT_USER_AGENT,
      viewport: VIEWPORT,
      locale: options.locale || "en-US",
      timezoneId: options.timezoneId || "America/New_York"
    });

    // Register the stealth script for every page created in this context.
    await context.addInitScript(STEALTH_INIT_SCRIPT);

    const page = await context.newPage();
    const sourceHostname = new URL(sourceUrl).hostname;

    // Block paywall/tracker requests before the page loads.
    await setupRequestBlocking(page, sourceHostname);

    await page.goto(sourceUrl, { waitUntil: "domcontentloaded", timeout: PAGE_TIMEOUT_MS });

    // Inject cosmetic CSS and userscripts to strip paywalls / ads.
    await injectCosmeticFilters(page, sourceHostname);
    await injectPrivacyUserScripts(page, sourceUrl);

    // Give the userscripts a moment to run their setTimeout callbacks.
    // `??` keeps an explicit 0 instead of replacing it with the default.
    await page.waitForTimeout(options.userscriptDelay ?? 2000);

    await waitForNetworkIdle(page);
    await snapshotLoadedResourceUrls(page);

    return await page.content();
  } finally {
    await browser.close();
  }
}

/**
 * Waits up to NETWORK_IDLE_TIMEOUT_MS for the "networkidle" load state and
 * ignores a timeout.
 *
 * @param {object} page - Playwright page.
 * @returns {Promise<void>}
 */
async function waitForNetworkIdle(page) {
  try {
    await page.waitForLoadState("networkidle", { timeout: NETWORK_IDLE_TIMEOUT_MS });
  } catch {
    // Some pages keep sockets open; the DOM snapshot is still useful.
  }
}

/**
 * Rewrites the live DOM so the serialized HTML reflects what was loaded:
 * `src` of images and media is set to `currentSrc`, and readable iframes are
 * converted to `srcdoc`.
 *
 * @param {object} page - Playwright page.
 * @returns {Promise<void>}
 */
async function snapshotLoadedResourceUrls(page) {
  await page.evaluate(() => {
    for (const element of document.querySelectorAll("img")) {
      if (element.currentSrc) {
        element.setAttribute("src", element.currentSrc);
      }
    }

    for (const element of document.querySelectorAll("video,audio")) {
      if (element.currentSrc) {
        element.setAttribute("src", element.currentSrc);
      }
    }

    for (const frame of document.querySelectorAll("iframe")) {
      try {
        const doc = frame.contentDocument;
        if (doc?.documentElement) {
          // NOTE(review): the empty-string prefix may be what is left of a
          // stripped doctype literal; confirm against the canonical source.
          frame.setAttribute("srcdoc", "" + doc.documentElement.outerHTML);
          frame.removeAttribute("src");
        }
      } catch {
        // Cross-origin frames are handled later by the asset inliner when possible.
      }
    }
  });
}

/**
 * Inserts an HTML comment naming the source URL right after the doctype, or
 * prepends a doctype plus the comment when the document has none.
 *
 * NOTE(review): the markup literals in this function were reconstructed; the
 * exact wording of the comment should be confirmed against the canonical source.
 *
 * @param {string} html
 * @param {string} sourceUrl
 * @returns {string}
 */
function addArchiveComment(html, sourceUrl) {
  // Break up every "--" (including runs of three or more hyphens) so the URL
  // can never terminate the comment early.
  const safeSource = String(sourceUrl).replace(/-(?=-)/g, "- ");
  const comment = `<!-- Archived from ${safeSource} -->`;
  const doctypePattern = /<!doctype[^>]*>/i;

  if (doctypePattern.test(html)) {
    return html.replace(doctypePattern, (doctype) => `${doctype}\n${comment}`);
  }
  return `<!doctype html>\n${comment}\n${html}`;
}

/**
 * Extracts candidate URLs from a `srcset` value.
 *
 * Each URL is a run of non-whitespace characters; trailing commas end the
 * candidate, otherwise descriptors are skipped up to the next comma. Commas
 * inside a URL (as in `data:` URLs) are therefore preserved.
 *
 * @param {string} value - Raw `srcset` attribute value.
 * @returns {string[]}
 */
function parseSrcsetUrls(value) {
  const urls = [];
  let index = 0;

  while (index < value.length) {
    while (index < value.length && /[\s,]/.test(value[index])) {
      index += 1;
    }
    const start = index;
    while (index < value.length && !/\s/.test(value[index])) {
      index += 1;
    }

    let url = value.slice(start, index);
    if (url.endsWith(",")) {
      url = url.replace(/,+$/, "");
    } else {
      // Skip the descriptors ("2x", "480w") up to the separating comma.
      while (index < value.length && value[index] !== ",") {
        index += 1;
      }
    }
    if (url) urls.push(url);
  }

  return urls;
}

/**
 * Lists references in `html` that still point outside the document: asset
 * tag attributes (`src`, `srcset`, `poster`, `data`), asset-like `<link href>`
 * values and CSS `url(...)` targets.
 *
 * @param {string} html
 * @returns {string[]} Sorted, de-duplicated references.
 */
export function findExternalAssetRefs(html) {
  const refs = new Set();
  const addIfExternal = (candidate) => {
    if (candidate && !isSelfContainedAssetRef(candidate)) {
      refs.add(candidate);
    }
  };

  const assetTagPattern = /<(?:img|source|audio|video|track|embed|object|input|iframe)\b[^>]*>/gi;
  for (const [tag] of html.matchAll(assetTagPattern)) {
    for (const attr of ["src", "srcset", "poster", "data"]) {
      const value = readAttribute(tag, attr);
      if (!value) continue;

      if (attr === "srcset") {
        // Only srcset is a list; each candidate is checked on its own.
        parseSrcsetUrls(value).forEach(addIfExternal);
      } else {
        // Single-URL attributes may legitimately contain commas.
        addIfExternal(value.trim());
      }
    }
  }

  const linkPattern = /<link\b[^>]*>/gi;
  const assetRelPattern = /\b(?:stylesheet|icon|apple-touch-icon|image_src|preload)\b/i;
  for (const [tag] of html.matchAll(linkPattern)) {
    if (assetRelPattern.test(readAttribute(tag, "rel") || "")) {
      addIfExternal(readAttribute(tag, "href"));
    }
  }

  const cssUrlPattern = /url\(\s*(["']?)([^"')]+)\1\s*\)/gi;
  for (const match of html.matchAll(cssUrlPattern)) {
    addIfExternal(cleanCssUrl(match[2]));
  }

  return Array.from(refs).sort();
}

/**
 * Tells whether a reference needs no external fetch: empty values, fragments
 * (`#...` or `%23...`) and `data:`, `about:`, `javascript:`, `mailto:`,
 * `tel:` URLs.
 *
 * @param {string} value
 * @returns {boolean}
 */
function isSelfContainedAssetRef(value) {
  const trimmed = cleanCssUrl(value);
  if (!trimmed) return true;
  if (trimmed.startsWith("#") || /^%23/i.test(trimmed)) return true;
  return /^(?:data|about|javascript|mailto|tel):/i.test(trimmed);
}

/**
 * Returns the value of `attr` in an opening tag, or "" when it is absent.
 *
 * @param {string} tag - Opening tag text.
 * @param {string} attr - Attribute name (case-insensitive).
 * @returns {string}
 */
function readAttribute(tag, attr) {
  return findAttribute(tag, attr)?.value ?? "";
}

/**
 * Normalizes a URL taken from CSS or an attribute: trims it, decodes the
 * `&quot;`, `&#39;`, `&apos;` and `&amp;` entities, and strips one pair of
 * matching surrounding quotes.
 *
 * @param {string} value
 * @returns {string}
 */
function cleanCssUrl(value) {
  // `&amp;` is decoded last so "&amp;quot;" yields the text "&quot;" and not a quote.
  const decoded = String(value)
    .trim()
    .replaceAll("&quot;", '"')
    .replaceAll("&#39;", "'")
    .replaceAll("&apos;", "'")
    .replaceAll("&amp;", "&");

  const quote = decoded[0];
  if ((quote === '"' || quote === "'") && decoded.at(-1) === quote) {
    return decoded.slice(1, -1).trim();
  }
  return decoded;
}

/**
 * Locates an attribute inside an opening tag without using a DOM parser.
 *
 * @param {string} openingTag - Text of the opening tag, e.g. `<img src="a.png">`.
 * @param {string} attr - Attribute name (case-insensitive).
 * @returns {{ start: number, end: number, value: string }|null} Offsets of the
 *   attribute within `openingTag` and its unquoted value, or null when absent.
 */
function findAttribute(openingTag, attr) {
  const wanted = attr.toLowerCase();
  const length = openingTag.length;
  const isSpaceAt = (position) => position < length && /\s/.test(openingTag[position]);

  const tagName = openingTag.match(/^<[^\s/>]+/);
  let index = tagName ? tagName[0].length : 1;

  while (index < length) {
    while (isSpaceAt(index)) index += 1;
    if (index >= length || openingTag[index] === ">") return null;
    if (openingTag[index] === "/") {
      // A stray or self-closing slash separates attributes; keep scanning.
      index += 1;
      continue;
    }

    const start = index;
    while (index < length && !/[\s=/>]/.test(openingTag[index])) index += 1;
    const name = openingTag.slice(start, index);
    while (isSpaceAt(index)) index += 1;

    let value = "";
    if (openingTag[index] === "=") {
      index += 1;
      while (isSpaceAt(index)) index += 1;

      const quote = openingTag[index];
      if (quote === '"' || quote === "'") {
        index += 1;
        const valueStart = index;
        while (index < length && openingTag[index] !== quote) index += 1;
        value = openingTag.slice(valueStart, index);
        if (openingTag[index] === quote) index += 1;
      } else {
        const valueStart = index;
        while (index < length && !/[\s>]/.test(openingTag[index])) index += 1;
        value = openingTag.slice(valueStart, index);
      }
    }

    if (name.toLowerCase() === wanted) {
      return { start, end: index, value };
    }
  }

  return null;
}