This commit is contained in:
2026-05-15 09:12:28 -07:00
parent 55703bb7ed
commit 2e350ce3dc
18 changed files with 14733 additions and 18 deletions

View File

@@ -1,6 +1,7 @@
import fs from "node:fs/promises";
import path from "node:path";
import { createRequire } from "node:module";
import { fileURLToPath } from "node:url";
import {
AssetInliner,
DEFAULT_USER_AGENT,
@@ -12,15 +13,595 @@ import {
} from "./asset-inliner.mjs";
// CommonJS-style `require` bound to this module's URL; loadPlaywright() uses it
// to load "playwright" lazily.
const require = createRequire(import.meta.url);
// Directory containing this module (ES modules have no built-in __dirname).
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Navigation timeout, in milliseconds, passed to page.goto().
const PAGE_TIMEOUT_MS = 60000;
// Timeout, in milliseconds, for the network-idle wait — presumably consumed by
// waitForNetworkIdle(), which is not visible in this part of the file.
const NETWORK_IDLE_TIMEOUT_MS = 5000;
// Browser viewport used for every rendered page. Keep it in sync with the
// "--window-size" launch argument produced by buildLaunchArgs().
// (The literal previously listed `width` twice; only the last value, 1366,
// ever took effect, so the dead 1024 entry is removed.)
const VIEWPORT = {
  width: 1366,
  height: 768
};
export { DEFAULT_USER_AGENT, defaultArchivePath };
// ---------------------------------------------------------------------------
// Privacy filters integration
// ---------------------------------------------------------------------------
// Directory holding the filter list and the userscripts: the "privacy-filters"
// folder located one level above this module's directory.
const PRIVACY_FILTERS_DIR = path.join(__dirname, "..", "privacy-filters");
// Set to true by loadPrivacyFilters() once the filter files were read successfully.
let privacyFiltersAvailable = false;
// Parsed filter list, in the shape returned by parseFilterRules().
let filterRules = { blockRules: [], allowRules: [], cosmeticRules: [] };
// One entry per loaded userscript file.
let userScriptData = []; // { file, content, matches, excludes }
/**
 * Loads the adblock-style filter list and the bundled userscripts from
 * PRIVACY_FILTERS_DIR into the module-level state (`filterRules`,
 * `userScriptData`, `privacyFiltersAvailable`).
 *
 * Loading is best-effort: when any file is missing or unreadable the module
 * state is left untouched and archiving proceeds without privacy filters.
 * Results are built locally and published only after everything succeeded, so
 * a failure never leaves half-loaded data behind and calling the function
 * again never duplicates userscript entries.
 *
 * @returns {Promise<void>} Resolves once loading finished or was abandoned.
 */
async function loadPrivacyFilters() {
  try {
    const filterPath = path.join(PRIVACY_FILTERS_DIR, "bpc-paywall-filter.txt");
    const userscriptDir = path.join(PRIVACY_FILTERS_DIR, "userscript");
    const userScriptFiles = [
      "bpc.en.user.js",
      "bpc.de.user.js",
      "bpc.es.pt.user.js",
      "bpc.fi.se.user.js",
      "bpc.fr.user.js",
      "bpc.it.user.js",
      "bpc.nl.user.js",
      "bpc.pl.user.js"
    ];

    // The files are independent of each other, so read them concurrently.
    // Promise.all keeps the result order identical to userScriptFiles.
    const [filterContent, loadedScripts] = await Promise.all([
      fs.readFile(filterPath, "utf8"),
      Promise.all(
        userScriptFiles.map(async (file) => {
          const content = await fs.readFile(path.join(userscriptDir, file), "utf8");
          return { file, content, ...parseUserScriptMetadata(content) };
        })
      )
    ]);
    const parsedRules = parseFilterRules(filterContent);

    // Publish everything at once, only after every step succeeded.
    filterRules = parsedRules;
    userScriptData = loadedScripts;
    privacyFiltersAvailable = true;
  } catch (error) {
    // A missing privacy-filters directory is an expected setup: archive
    // without filters and stay quiet. Anything else (permissions, a parser
    // failure, ...) is still non-fatal but worth surfacing.
    if (error?.code !== "ENOENT") {
      console.warn(`Privacy filters could not be loaded: ${error?.message ?? error}`);
    }
  }
}

// Load the filters once at import time so they are ready before any page is rendered.
await loadPrivacyFilters();
// --- Adblock filter parsing ------------------------------------------------
/**
 * Splits the text of an adblock-style filter list into three rule groups.
 *
 * Handling per line (after trimming):
 *  - blank lines and "!" comments are ignored;
 *  - everything between "!#if" and "!#endif" is ignored;
 *  - cosmetic exceptions ("#@#") are ignored;
 *  - "@@" lines become allow rules (via parseNetworkRule);
 *  - "##" lines become cosmetic rules (via cosmeticSelectorToCss), except
 *    "+js" scriptlets and selectors that cannot be expressed as CSS;
 *  - every remaining line is parsed as a blocking network rule.
 *
 * @param {string} content Raw filter list text.
 * @returns {{blockRules: object[], allowRules: object[], cosmeticRules: {domains: string, css: string}[]}}
 */
function parseFilterRules(content) {
  const parsed = { blockRules: [], allowRules: [], cosmeticRules: [] };
  let skippingConditional = false;

  content.split("\n").forEach((rawEntry) => {
    const entry = rawEntry.trim();
    if (entry === "") return;

    // Preprocessor blocks are dropped wholesale.
    if (entry.startsWith("!#if")) {
      skippingConditional = true;
      return;
    }
    if (entry.startsWith("!#endif")) {
      skippingConditional = false;
      return;
    }

    // "!" covers plain comments as well as any other "!#" directive.
    if (skippingConditional || entry.startsWith("!")) return;

    // Cosmetic exceptions are not supported.
    if (entry.includes("#@#")) return;

    if (entry.startsWith("@@")) {
      const exception = parseNetworkRule(entry.slice(2));
      if (exception) parsed.allowRules.push(exception);
      return;
    }

    const separatorAt = entry.indexOf("##");
    if (separatorAt >= 0) {
      const selector = entry.slice(separatorAt + 2);
      if (selector.startsWith("+js")) return; // scriptlet injection, not CSS
      const css = cosmeticSelectorToCss(selector);
      if (css) {
        parsed.cosmeticRules.push({ domains: entry.slice(0, separatorAt), css });
      }
      return;
    }

    const blocker = parseNetworkRule(entry);
    if (blocker) parsed.blockRules.push(blocker);
  });

  return parsed;
}
/**
 * Parses one adblock network rule (without a leading "@@") into a rule object.
 *
 * Supported patterns are "||domain/path" (kind "domain") and "/regex/"
 * (kind "regex"); anything else yields null. A trailing "$opt1,opt2" section
 * is split off when it looks like an option list.
 *
 * @param {string} line Rule text.
 * @returns {?{kind: string, type: (string|undefined), isThirdParty: boolean,
 *   isFirstParty: boolean, includeDomains: string[], excludeDomains: string[],
 *   important: boolean}} Rule object (plus `domain`/`path` or `regex`), or
 *   null when the pattern is unsupported.
 */
function parseNetworkRule(line) {
  let options = [];
  let pattern = line;

  const lastDollar = line.lastIndexOf("$");
  if (lastDollar > 0) {
    const optsStr = line.slice(lastDollar + 1);
    // "." (plus "_" and "*") must be accepted here: "domain=" values are host
    // names, so without the dot virtually every "$domain=example.com" option
    // was rejected and silently became part of the pattern.
    if (/^[a-z0-9,=~|._*-]+$/i.test(optsStr)) {
      options = optsStr.split(",");
      pattern = line.slice(0, lastDollar);
    }
  }
  if (!pattern) return null;

  const knownTypes = ["script", "stylesheet", "image", "media", "xmlhttprequest", "other", "inline-script"];
  const type = options.find((option) => knownTypes.includes(option));
  const isThirdParty = options.includes("third-party");
  const isFirstParty = options.includes("~third-party");
  const important = options.includes("important");

  const includeDomains = [];
  const excludeDomains = [];
  const domainOpt = options.find((option) => option.startsWith("domain="));
  if (domainOpt) {
    for (const entry of domainOpt.slice("domain=".length).split("|")) {
      if (entry === "" || entry === "~") continue; // stray separator
      if (entry.startsWith("~")) {
        excludeDomains.push(entry.slice(1));
      } else {
        includeDomains.push(entry);
      }
    }
  }

  if (pattern.startsWith("||")) {
    // Drop the trailing separator marker, then split host from path.
    const [domain, ...pathParts] = pattern.slice(2).replace(/\^$/, "").split("/");
    // Named rulePath so the imported `path` module is not shadowed.
    const rulePath = pathParts.length > 0 ? `/${pathParts.join("/")}` : "";
    return {
      kind: "domain",
      domain,
      path: rulePath,
      type,
      isThirdParty,
      isFirstParty,
      includeDomains,
      excludeDomains,
      important
    };
  }

  if (pattern.startsWith("/")) {
    const lastSlash = pattern.lastIndexOf("/");
    if (lastSlash > 0) {
      return {
        kind: "regex",
        regex: pattern.slice(1, lastSlash),
        type,
        isThirdParty,
        isFirstParty,
        includeDomains,
        excludeDomains,
        important
      };
    }
  }

  return null;
}
/**
 * Converts the selector part of a cosmetic filter into a CSS rule.
 *
 *  - "sel:style(decls)" becomes "sel { decls }";
 *  - selectors using procedural operators that CSS cannot express yield null;
 *  - any other selector is hidden with "display: none !important".
 *
 * @param {string} selector Text following "##" in the filter line.
 * @returns {?string} CSS rule text, or null when the selector is unsupported.
 */
function cosmeticSelectorToCss(selector) {
  const styled = /:style\((.+)\)$/.exec(selector);
  if (styled !== null) {
    const target = selector.slice(0, selector.lastIndexOf(":style("));
    return `${target} { ${styled[1]} }`;
  }

  const unsupportedOperators = [
    ":remove()",
    ":matches-css",
    ":matches-media",
    ":xpath(",
    ":upward(",
    ":matches-path"
  ];
  if (unsupportedOperators.some((operator) => selector.includes(operator))) {
    return null;
  }

  return `${selector} { display: none !important; }`;
}
/**
 * Decides whether a cosmetic rule's domain list applies to `hostname`.
 *
 * The list is comma separated; entries prefixed with "~" exclude a domain.
 * A host matches an entry when it equals the domain or is a subdomain of it.
 *
 *  - empty list or "*": applies everywhere;
 *  - a matching "~" entry always wins and disables the rule;
 *  - when positive entries exist, at least one of them must match (previously
 *    a list mixing positive and "~" entries applied to every non-excluded
 *    host, including completely unrelated sites);
 *  - a list made only of "~" entries applies to every non-excluded host.
 *
 * @param {string} domainSpec Text preceding "##" in the filter line.
 * @param {string} hostname Host name of the page being archived.
 * @returns {boolean}
 */
function matchesCosmeticDomains(domainSpec, hostname) {
  if (!domainSpec || domainSpec === "*") return true;

  const covers = (domain) => hostname === domain || hostname.endsWith(`.${domain}`);

  const included = [];
  const excluded = [];
  for (const rawEntry of domainSpec.split(",")) {
    const entry = rawEntry.trim();
    if (entry === "" || entry === "~") continue;
    if (entry.startsWith("~")) {
      excluded.push(entry.slice(1));
    } else {
      included.push(entry);
    }
  }

  if (excluded.some(covers)) return false;
  if (included.length > 0) return included.some(covers);
  // Only exclusions were listed and none matched.
  return excluded.length > 0;
}
/** Escapes every regular-expression metacharacter in `text`. */
function escapeFilterRegExp(text) {
  return text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/** True when `hostname` equals `domain` or is one of its subdomains. */
function isSameOrSubdomain(hostname, domain) {
  return hostname === domain || hostname.endsWith(`.${domain}`);
}

/**
 * Tests a single parsed network rule against one request.
 *
 * @param {string} url Full request URL (used by regex rules).
 * @param {URL} urlObj Parsed form of `url` (its pathname is used by domain rules).
 * @param {string} hostname Host name of the request.
 * @param {string} resourceType Resource type as reported by Playwright.
 * @param {string} sourceHostname Host name of the page being archived.
 * @param {object} rule Rule produced by parseNetworkRule().
 * @returns {boolean} True when the rule applies to the request.
 */
function matchesNetworkRule(url, urlObj, hostname, resourceType, sourceHostname, rule) {
  // "$domain=" restrictions refer to the page issuing the request.
  if (
    rule.includeDomains.length > 0 &&
    !rule.includeDomains.some((domain) => isSameOrSubdomain(sourceHostname, domain))
  ) {
    return false;
  }
  if (rule.excludeDomains.some((domain) => isSameOrSubdomain(sourceHostname, domain))) {
    return false;
  }

  if (rule.type) {
    // Filter-list type -> Playwright resource types. Playwright reports
    // fetch() calls as "fetch", which filter lists also file under
    // "xmlhttprequest".
    const typeMap = {
      script: ["script"],
      stylesheet: ["stylesheet"],
      image: ["image"],
      media: ["media"],
      xmlhttprequest: ["xhr", "fetch"],
      other: ["other"],
      "inline-script": ["script"]
    };
    const accepted = typeMap[rule.type];
    if (accepted && !accepted.includes(resourceType)) {
      return false;
    }
  }

  if (rule.isThirdParty || rule.isFirstParty) {
    const isThirdParty = !isSameOrSubdomain(hostname, sourceHostname);
    if (rule.isThirdParty && !isThirdParty) return false;
    if (rule.isFirstParty && isThirdParty) return false;
  }

  if (rule.kind === "domain") {
    // "||example.com" anchors at a domain-label boundary, so it covers the
    // domain itself and all of its subdomains. "*" spans part of one label.
    const domainSource = rule.domain.split("*").map(escapeFilterRegExp).join("[^.]*");
    const domainRe = new RegExp(`^(?:[^.]+\\.)*${domainSource}$`, "i");
    if (!domainRe.test(hostname)) return false;

    if (rule.path) {
      // "^" separator markers are dropped (prefix match); "*" matches any
      // run of characters; everything else is taken literally.
      const pathSource = rule.path
        .replace(/\^/g, "")
        .split("*")
        .map(escapeFilterRegExp)
        .join(".*");
      if (!new RegExp(`^${pathSource}`, "i").test(urlObj.pathname)) return false;
    }
    return true;
  }

  if (rule.kind === "regex") {
    try {
      return new RegExp(rule.regex, "i").test(url);
    } catch {
      // Regex dialect not supported by JavaScript: treat as non-matching.
      return false;
    }
  }

  return false;
}
/**
 * Decides whether a sub-resource request should be aborted according to the
 * loaded network filter rules.
 *
 * Precedence follows the usual adblock semantics:
 *  1. a matching block rule flagged "$important" blocks unconditionally
 *     (the flag was parsed before but never consulted);
 *  2. otherwise a matching exception ("@@") rule allows the request;
 *  3. otherwise any matching block rule blocks it.
 *
 * @param {string} url Absolute request URL; unparsable URLs are never blocked.
 * @param {string} resourceType Resource type as reported by Playwright.
 * @param {string} sourceHostname Host name of the page being archived.
 * @returns {boolean} True when the request should be blocked.
 */
function shouldBlockRequest(url, resourceType, sourceHostname) {
  // The former `url === sourceHostname` / `startsWith(sourceHostname + "/")`
  // guard compared a scheme-less host with a URL; any input it accepted also
  // fails URL parsing below and returns false, so it was redundant.
  let urlObj;
  try {
    urlObj = new URL(url);
  } catch {
    return false;
  }
  const hostname = urlObj.hostname;
  const applies = (rule) =>
    matchesNetworkRule(url, urlObj, hostname, resourceType, sourceHostname, rule);

  const { allowRules, blockRules } = filterRules;
  if (blockRules.some((rule) => rule.important && applies(rule))) return true;
  if (allowRules.some((rule) => applies(rule))) return false;
  return blockRules.some((rule) => applies(rule));
}
// --- Userscript metadata parsing -------------------------------------------
/**
 * Extracts the @match and @exclude patterns from a userscript's
 * "==UserScript==" metadata header.
 *
 * Only the first header block is inspected; a script without a header yields
 * two empty lists. Each header line contributes at most one pattern, with
 * @match taking precedence over @exclude.
 *
 * @param {string} content Full userscript source.
 * @returns {{matches: string[], excludes: string[]}}
 */
function parseUserScriptMetadata(content) {
  const result = { matches: [], excludes: [] };

  const header = /\/\/\s*==UserScript==([\s\S]*?)\/\/\s*==\/UserScript==/.exec(content);
  if (header === null) return result;

  header[1].split("\n").forEach((row) => {
    const included = /@match\s+(.+)/.exec(row);
    if (included) {
      result.matches.push(included[1].trim());
      return;
    }
    const excluded = /@exclude\s+(.+)/.exec(row);
    if (excluded) {
      result.excludes.push(excluded[1].trim());
    }
  });

  return result;
}
/**
 * Checks a URL against a userscript match pattern of the form
 * "scheme://host/path", e.g. "*://*.example.com/*" or
 * "http://example.com/path".
 *
 * The scheme must be "*" or equal the URL's scheme; host and path are
 * delegated to matchHost() and matchPath(). A pattern without a path part is
 * treated as having the path "/". Any error (unparsable URL, malformed
 * pattern) counts as "no match".
 *
 * @param {string} url Absolute URL to test.
 * @param {string} pattern Match pattern.
 * @returns {boolean}
 */
function urlMatchesPattern(url, pattern) {
  try {
    const { protocol, hostname, pathname } = new URL(url);

    const schemeEnd = pattern.indexOf("://");
    if (schemeEnd < 0) return false;

    // URL#protocol carries a trailing ":" that the pattern scheme lacks.
    const wantedScheme = pattern.slice(0, schemeEnd);
    if (wantedScheme !== "*" && wantedScheme !== protocol.slice(0, -1)) return false;

    const hostAndPath = pattern.slice(schemeEnd + 3);
    const pathStart = hostAndPath.indexOf("/");
    const hasPath = pathStart >= 0;
    const wantedHost = hasPath ? hostAndPath.slice(0, pathStart) : hostAndPath;
    const wantedPath = hasPath ? hostAndPath.slice(pathStart) : "/";

    return matchHost(hostname, wantedHost) && matchPath(pathname, wantedPath);
  } catch {
    return false;
  }
}
/**
 * Matches a host name against the host part of a userscript match pattern.
 *
 *  - "*" matches every host;
 *  - "*.example.com" matches example.com and any of its subdomains;
 *  - anything else must equal the host name exactly.
 *
 * @param {string} hostname Host name of the URL under test.
 * @param {string} pattern Host part of the match pattern.
 * @returns {boolean}
 */
function matchHost(hostname, pattern) {
  if (pattern === "*") return true;
  if (!pattern.startsWith("*.")) return hostname === pattern;

  const baseDomain = pattern.slice(2);
  return hostname === baseDomain || hostname.endsWith(`.${baseDomain}`);
}
/**
 * Matches a URL path against the path part of a userscript match pattern.
 *
 * The pattern is a glob: "*" matches any run of characters and "?" matches
 * exactly one character; every other character is taken literally.
 * Previously only "." was escaped, so characters such as "+", "(" or "["
 * were interpreted as regex syntax and either changed the meaning of the
 * pattern or made the RegExp constructor throw. Matching is case-insensitive
 * and must cover the whole path.
 *
 * @param {string} pathname Path of the URL under test.
 * @param {string} pattern Path part of the match pattern.
 * @returns {boolean}
 */
function matchPath(pathname, pattern) {
  if (pattern === "/*") return true;

  const source = pattern.replace(/[.*+?^${}()|[\]\\]/g, (char) => {
    if (char === "*") return ".*";
    if (char === "?") return ".";
    return `\\${char}`;
  });
  return new RegExp(`^${source}$`, "i").test(pathname);
}
/**
 * Decides whether a userscript applies to `url`: at least one of its @match
 * patterns must match and none of its @exclude patterns may match. The
 * exclude list is only consulted after a @match pattern has matched.
 *
 * @param {string} url URL of the page being archived.
 * @param {{matches: string[], excludes: string[]}} meta Parsed userscript metadata.
 * @returns {boolean}
 */
function shouldInjectUserScript(url, meta) {
  const hits = (pattern) => urlMatchesPattern(url, pattern);
  return meta.matches.some(hits) && !meta.excludes.some(hits);
}
// --- Browser helpers -------------------------------------------------------
/**
 * Loads the "playwright" package on demand.
 *
 * @returns {object} The Playwright module.
 * @throws {Error} With installation instructions when the package cannot be
 *   loaded; the underlying failure is preserved as `error.cause`.
 */
function loadPlaywright() {
  try {
    return require("playwright");
  } catch (error) {
    throw new Error(
      `Playwright is required. Run "npm install" and "npm run install-browsers". Original error: ${error.message}`,
      // Keep the original error (and its stack) reachable for debugging.
      { cause: error }
    );
  }
}
// Manual stealth evasions injected into every page before any scripts run.
// Source text evaluated inside every page through context.addInitScript().
//
// The patches run immediately. Init scripts execute while the document is
// still "loading", so the previous version deferred everything to
// DOMContentLoaded, i.e. until after the page's own scripts had already run.
//
// The permissions wrapper calls the original query() with the Permissions
// object as receiver; calling the detached method fails with an
// "Illegal invocation" TypeError.
//
// Each patch sits in its own try/catch so one failure cannot prevent the others.
const STEALTH_INIT_SCRIPT = `
(() => {
  try {
    // Override webdriver getter without using delete (can crash renderer)
    if (navigator.webdriver !== undefined) {
      Object.defineProperty(navigator, 'webdriver', {
        get: () => undefined,
        configurable: true,
        enumerable: true
      });
    }
  } catch (e) {}

  try {
    if (!window.chrome) {
      window.chrome = { runtime: {} };
    } else if (!window.chrome.runtime) {
      window.chrome.runtime = {};
    }
  } catch (e) {}

  try {
    const permissions = window.navigator.permissions;
    const originalQuery = permissions?.query;
    if (originalQuery) {
      permissions.query = (parameters) => (
        parameters?.name === 'notifications'
          ? Promise.resolve({ state: Notification.permission })
          : originalQuery.call(permissions, parameters)
      );
    }
  } catch (e) {}
})();
`;
/**
 * Builds the Chromium command-line arguments for a browser launch.
 *
 * NOTE(review): "--disable-web-security", "--no-sandbox" and the
 * site-isolation switches weaken the browser's isolation while it loads
 * untrusted pages — confirm that this trade-off is intended.
 *
 * @param {boolean} headless When truthy, "--headless=new" is appended.
 * @returns {string[]} A fresh argument array on every call.
 */
function buildLaunchArgs(headless) {
  const baseArgs = [
    "--disable-blink-features=AutomationControlled",
    "--disable-web-security",
    "--disable-features=IsolateOrigins,site-per-process",
    "--disable-site-isolation-trials",
    "--disable-infobars",
    "--no-sandbox",
    "--disable-setuid-sandbox",
    "--disable-dev-shm-usage",
    "--disable-accelerated-2d-canvas",
    "--disable-gpu",
    "--window-size=1366,768"
  ];
  return headless ? [...baseArgs, "--headless=new"] : baseArgs;
}
/**
 * Lists the default launch arguments that should be suppressed for the
 * browser launch (passed as `ignoreDefaultArgs`).
 *
 * @returns {string[]} A fresh array on every call.
 */
function buildIgnoreDefaultArgs() {
  return Array.of("--enable-automation");
}
// --- Page helpers ----------------------------------------------------------
/**
 * Installs a route handler on `page` that aborts sub-resource requests
 * matched by the loaded block rules. Does nothing when the privacy filters
 * are unavailable or contain no block rules. Navigation requests are always
 * let through.
 *
 * @param {object} page Playwright page.
 * @param {string} sourceHostname Host name of the page being archived.
 * @returns {Promise<void>}
 */
async function setupRequestBlocking(page, sourceHostname) {
  if (!privacyFiltersAvailable || filterRules.blockRules.length === 0) return;

  await page.route("**/*", async (route) => {
    // Decide first, resolve the route afterwards: a route can be resolved
    // only once, so the fallback must never run after abort()/continue()
    // has already been called.
    let block = false;
    try {
      const request = route.request();
      block =
        !request.isNavigationRequest() &&
        shouldBlockRequest(request.url(), request.resourceType(), sourceHostname);
    } catch {
      // A failing filter must never break page loading: let the request pass.
      block = false;
    }

    try {
      // abort()/continue() return promises; await them so a rejection is
      // handled here instead of surfacing as an unhandled rejection.
      if (block) {
        await route.abort("blockedbyclient");
      } else {
        await route.continue();
      }
    } catch {
      // The page or context was closed while the request was in flight;
      // there is nothing left to resolve.
    }
  });
}
/**
 * Adds one style tag to `page` containing the CSS of every cosmetic rule
 * whose domain list applies to `hostname`. Does nothing when the privacy
 * filters are unavailable or no rule applies; injection failures are ignored.
 *
 * @param {object} page Playwright page.
 * @param {string} hostname Host name of the page being archived.
 * @returns {Promise<void>}
 */
async function injectCosmeticFilters(page, hostname) {
  if (!privacyFiltersAvailable) return;
  if (filterRules.cosmeticRules.length === 0) return;

  const applicableCss = filterRules.cosmeticRules
    .filter(({ domains }) => matchesCosmeticDomains(domains, hostname))
    .map(({ css }) => css);
  if (applicableCss.length === 0) return;

  try {
    await page.addStyleTag({ content: applicableCss.join("\n") });
  } catch {
    // Cosmetic filtering is optional; a failed injection is not an error.
  }
}
// Page-side shim that provides a minimal `GM.xmlHttpRequest` built on
// `fetch`, installed only when no global `GM` exists. It forwards
// url/method/headers/data from `details`, then passes an object with status,
// statusText, responseText and responseHeaders ("name: value" lines joined
// with CRLF) to `details.onload`; failures are passed to `details.onerror`.
// injectPrivacyUserScripts() injects it before the userscripts themselves.
const GM_MOCK = `
if (typeof GM === "undefined") {
window.GM = {
xmlHttpRequest: function(details) {
fetch(details.url, {
method: details.method || "GET",
headers: details.headers || {},
body: details.data || null
})
.then(response => response.text().then(text => ({
status: response.status,
statusText: response.statusText,
responseText: text,
responseHeaders: Array.from(response.headers.entries())
.map(([k, v]) => k + ": " + v).join("\\r\\n")
})))
.then(obj => {
if (details.onload) details.onload(obj);
})
.catch(err => {
if (details.onerror) details.onerror(err);
});
}
};
}
`;
/**
 * Injects the GM API shim followed by every loaded userscript whose metadata
 * matches `sourceUrl`. Does nothing when the privacy filters are unavailable
 * or no userscript matches. If the shim cannot be injected, no userscript is
 * injected; a failure of an individual userscript is ignored.
 *
 * @param {object} page Playwright page.
 * @param {string} sourceUrl URL of the page being archived.
 * @returns {Promise<void>}
 */
async function injectPrivacyUserScripts(page, sourceUrl) {
  if (!privacyFiltersAvailable || userScriptData.length === 0) return;

  const applicable = userScriptData.filter((script) => shouldInjectUserScript(sourceUrl, script));
  if (applicable.length === 0) return;

  // Adds one script tag and reports whether that worked.
  const inject = async (content) => {
    try {
      await page.addScriptTag({ content });
      return true;
    } catch {
      return false;
    }
  };

  // The userscripts rely on the GM shim, so it has to go in first.
  if (!(await inject(GM_MOCK))) return;

  // Sequential on purpose: scripts are added in their load order.
  for (const { content } of applicable) {
    await inject(content);
  }
}
// ---------------------------------------------------------------------------
// Archiving
// ---------------------------------------------------------------------------
export async function archivePage(input, options = {}) {
const sourceUrl = inputToUrl(input);
const archivePath = options.archivePath || defaultArchivePath();
@@ -29,7 +610,7 @@ export async function archivePage(input, options = {}) {
await fs.mkdir(archivePath, { recursive: true });
const renderedHtml = await renderPage(sourceUrl);
const renderedHtml = await renderPage(sourceUrl, options);
const baseUrl = findEffectiveBase(renderedHtml, sourceUrl);
const inliner = new AssetInliner({
userAgent: DEFAULT_USER_AGENT,
@@ -50,21 +631,48 @@ export async function archivePage(input, options = {}) {
};
}
export async function renderPage(sourceUrl) {
export async function renderPage(sourceUrl, options = {}) {
const playwright = loadPlaywright();
const browser = await playwright.chromium.launch({ headless: true });
const hasDisplay = !!(process.env.DISPLAY || process.env.WAYLAND_DISPLAY);
const headless = options.headless !== false && !hasDisplay;
const browser = await playwright.chromium.launch({
headless,
args: buildLaunchArgs(headless),
ignoreDefaultArgs: buildIgnoreDefaultArgs()
});
try {
const context = await browser.newContext({
userAgent: DEFAULT_USER_AGENT,
viewport: VIEWPORT
userAgent: options.userAgent || DEFAULT_USER_AGENT,
viewport: VIEWPORT,
locale: options.locale || "en-US",
timezoneId: options.timezoneId || "America/New_York"
});
// Inject stealth evasions into every new page before any scripts run.
await context.addInitScript(STEALTH_INIT_SCRIPT);
const page = await context.newPage();
const sourceHostname = new URL(sourceUrl).hostname;
// Block paywall/tracker requests before the page loads.
await setupRequestBlocking(page, sourceHostname);
await page.goto(sourceUrl, {
waitUntil: "domcontentloaded",
timeout: PAGE_TIMEOUT_MS
});
// Inject cosmetic CSS and userscripts to strip paywalls / ads.
await injectCosmeticFilters(page, sourceHostname);
await injectPrivacyUserScripts(page, sourceUrl);
// Give the userscripts a moment to run their setTimeout callbacks.
const userscriptDelay = options.userscriptDelay || 2000;
await page.waitForTimeout(userscriptDelay);
await waitForNetworkIdle(page);
await snapshotLoadedResourceUrls(page);
@@ -112,16 +720,6 @@ async function snapshotLoadedResourceUrls(page) {
});
}
/**
 * Loads the "playwright" package on demand.
 *
 * @returns {object} The Playwright module.
 * @throws {Error} With installation instructions when the package cannot be
 *   loaded; the underlying failure is preserved as `error.cause`.
 */
function loadPlaywright() {
  try {
    return require("playwright");
  } catch (error) {
    throw new Error(
      `Playwright is required. Run "npm install" and "npm run install-browsers". Original error: ${error.message}`,
      // Keep the original error (and its stack) reachable for debugging.
      { cause: error }
    );
  }
}
function addArchiveComment(html, sourceUrl) {
const safeSource = String(sourceUrl).replaceAll("--", "- -");
const comment = `<!-- Archived locally. Source: ${safeSource}. Created: ${new Date().toISOString()}. -->`;