2026-05-14 08:12:13 -07:00
|
|
|
|
import fs from "node:fs/promises";
|
|
|
|
|
|
import path from "node:path";
|
|
|
|
|
|
import { createRequire } from "node:module";
|
2026-05-15 09:12:28 -07:00
|
|
|
|
import { fileURLToPath } from "node:url";
|
2026-05-14 08:12:13 -07:00
|
|
|
|
import {
|
|
|
|
|
|
AssetInliner,
|
|
|
|
|
|
DEFAULT_USER_AGENT,
|
|
|
|
|
|
defaultArchivePath,
|
|
|
|
|
|
findEffectiveBase,
|
|
|
|
|
|
inputToUrl,
|
|
|
|
|
|
isHttpUrl,
|
2026-05-15 01:00:27 -07:00
|
|
|
|
slugForUrl
|
2026-05-14 08:12:13 -07:00
|
|
|
|
} from "./asset-inliner.mjs";
|
|
|
|
|
|
|
|
|
|
|
|
// CommonJS-style loader; loadPlaywright() uses it to resolve "playwright" on demand.
const require = createRequire(import.meta.url);

// Directory that contains this module (ES modules have no built-in __dirname).
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Upper bound, in milliseconds, for the initial page navigation.
const PAGE_TIMEOUT_MS = 60_000;

// How long, in milliseconds, to wait for the network to settle before snapshotting anyway.
const NETWORK_IDLE_TIMEOUT_MS = 5_000;

// Viewport applied to every browser context created by renderPage().
const VIEWPORT = { width: 1366, height: 768 };
|
2026-05-14 08:12:13 -07:00
|
|
|
|
|
|
|
|
|
|
export { DEFAULT_USER_AGENT, defaultArchivePath };
|
|
|
|
|
|
|
2026-05-15 09:12:28 -07:00
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
// Privacy filters integration
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
// Folder holding the filter list and userscripts: "privacy-filters", one level above this module.
const PRIVACY_FILTERS_DIR = path.join(__dirname, "..", "privacy-filters");

// Module-level state filled in by loadPrivacyFilters().
// Becomes true only once the filter list and every userscript have been read.
let privacyFiltersAvailable = false;
// Parsed filter list, split into network block rules, network exceptions, and cosmetic CSS rules.
let filterRules = { blockRules: [], allowRules: [], cosmeticRules: [] };
// One entry per loaded userscript: { file, content, matches, excludes }.
let userScriptData = [];
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Reads the bundled filter list and the bypass userscripts from
 * PRIVACY_FILTERS_DIR and publishes them to module state.
 *
 * All files are read in parallel and parsed into locals first; module state is
 * only replaced once everything succeeded. A failure therefore never leaves a
 * half-loaded rule set behind, and calling this function again replaces the
 * previously loaded userscripts instead of appending duplicates.
 *
 * Loading is best-effort: if any file is missing or unreadable the function
 * resolves normally, module state is left untouched, and archiving proceeds
 * without privacy filters.
 *
 * @returns {Promise<void>}
 */
async function loadPrivacyFilters() {
  const filterPath = path.join(PRIVACY_FILTERS_DIR, "bpc-paywall-filter.txt");
  const userscriptDir = path.join(PRIVACY_FILTERS_DIR, "userscript");
  const userScriptFiles = [
    "bpc.en.user.js",
    "bpc.de.user.js",
    "bpc.es.pt.user.js",
    "bpc.fi.se.user.js",
    "bpc.fr.user.js",
    "bpc.it.user.js",
    "bpc.nl.user.js",
    "bpc.pl.user.js"
  ];

  try {
    const [filterContent, ...scriptContents] = await Promise.all([
      fs.readFile(filterPath, "utf8"),
      ...userScriptFiles.map((file) => fs.readFile(path.join(userscriptDir, file), "utf8"))
    ]);

    const parsedRules = parseFilterRules(filterContent);
    // Promise.all preserves order, so contents line up with userScriptFiles by index.
    const loadedScripts = scriptContents.map((content, index) => ({
      file: userScriptFiles[index],
      content,
      ...parseUserScriptMetadata(content)
    }));

    // Commit only after every read and parse succeeded.
    filterRules = parsedRules;
    userScriptData = loadedScripts;
    privacyFiltersAvailable = true;
  } catch {
    // Privacy filters directory missing or unreadable; archive without them.
  }
}

await loadPrivacyFilters();
|
|
|
|
|
|
|
|
|
|
|
|
// --- Adblock filter parsing ------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Splits an adblock-style filter list into network block rules, network
 * exception rules, and cosmetic CSS rules.
 *
 * Skipped lines: blanks, `!` comments, everything between `!#if` and
 * `!#endif`, cosmetic exceptions (`#@#`), and `##+js` scriptlet rules.
 * Lines that parseNetworkRule() or cosmeticSelectorToCss() cannot translate
 * are dropped silently.
 *
 * @param {string} content - Raw text of the filter list.
 * @returns {{blockRules: object[], allowRules: object[], cosmeticRules: {domains: string, css: string}[]}}
 */
function parseFilterRules(content) {
  const parsed = { blockRules: [], allowRules: [], cosmeticRules: [] };
  let skippingConditional = false;

  // Parses a network rule and files it in the given bucket when it is usable.
  const addNetworkRule = (bucket, text) => {
    const parsedRule = parseNetworkRule(text);
    if (parsedRule) bucket.push(parsedRule);
  };

  content.split("\n").forEach((rawLine) => {
    const text = rawLine.trim();
    if (text === "") return;

    if (text.startsWith("!#if")) {
      skippingConditional = true;
      return;
    }
    if (text.startsWith("!#endif")) {
      skippingConditional = false;
      return;
    }
    // Any other "!" line is a comment or an unsupported directive.
    if (skippingConditional || text.startsWith("!")) return;

    // Cosmetic exception (#@#) – skip.
    if (text.includes("#@#")) return;

    // Exception network rules
    if (text.startsWith("@@")) {
      addNetworkRule(parsed.allowRules, text.slice(2));
      return;
    }

    // Network rules (anything without the "##" cosmetic separator)
    const separatorAt = text.indexOf("##");
    if (separatorAt < 0) {
      addNetworkRule(parsed.blockRules, text);
      return;
    }

    // Cosmetic filters
    const selector = text.slice(separatorAt + 2);
    if (selector.startsWith("+js")) return;
    const css = cosmeticSelectorToCss(selector);
    if (css) {
      parsed.cosmeticRules.push({ domains: text.slice(0, separatorAt), css });
    }
  });

  return parsed;
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Parses one adblock network filter (without any leading `@@`) into a rule
 * object understood by matchesNetworkRule().
 *
 * Supported patterns:
 *   - `||host/path^`  -> `{ kind: "domain", domain, path, ... }`
 *   - `/regex/`       -> `{ kind: "regex", regex, ... }`
 * Anything else returns `null`.
 *
 * Options after the last `$` are recognised when they consist only of option
 * characters. The accepted set includes `.`, `_` and `*` so that
 * `$domain=example.com|~shop.example.com` is parsed; without `.` every
 * `domain=` option would be left glued onto the pattern and the rule could
 * never match. A `$` inside a regex body is not mistaken for options because
 * the remainder then contains the closing `/`.
 *
 * @param {string} line - Filter text.
 * @returns {object|null} Parsed rule, or null when the syntax is unsupported.
 */
function parseNetworkRule(line) {
  let options = [];
  let pattern = line;

  const lastDollar = line.lastIndexOf("$");
  if (lastDollar > 0) {
    const optsStr = line.slice(lastDollar + 1);
    if (/^[a-z0-9,=~|._*-]+$/i.test(optsStr)) {
      options = optsStr.split(",");
      pattern = line.slice(0, lastDollar);
    }
  }

  if (!pattern) return null;

  const knownTypes = ["script", "stylesheet", "image", "media", "xmlhttprequest", "other", "inline-script"];
  const type = options.find((o) => knownTypes.includes(o));
  const isThirdParty = options.includes("third-party");
  const isFirstParty = options.includes("~third-party");
  const important = options.includes("important");

  const includeDomains = [];
  const excludeDomains = [];
  const domainOpt = options.find((o) => o.startsWith("domain="));
  if (domainOpt) {
    for (const entry of domainOpt.slice("domain=".length).split("|")) {
      if (entry.startsWith("~")) {
        // Ignore a bare "~" so an empty name never ends up in the list.
        if (entry.length > 1) excludeDomains.push(entry.slice(1));
      } else if (entry) {
        includeDomains.push(entry);
      }
    }
  }

  const shared = { type, isThirdParty, isFirstParty, includeDomains, excludeDomains, important };

  if (pattern.startsWith("||")) {
    // Drop the trailing separator marker, then split host from path at the first "/".
    const hostAndPath = pattern.slice(2).replace(/\^$/, "");
    const slashAt = hostAndPath.indexOf("/");
    const domain = slashAt < 0 ? hostAndPath : hostAndPath.slice(0, slashAt);
    const rulePath = slashAt < 0 ? "" : hostAndPath.slice(slashAt);
    if (!domain) return null;
    return { kind: "domain", domain, path: rulePath, ...shared };
  }

  // A regex filter is delimited by "/" on BOTH ends. Requiring the closing
  // slash keeps plain path fragments such as "/ads/banner" from being
  // misread as the (far too broad) regex "ads".
  if (pattern.length > 2 && pattern.startsWith("/") && pattern.endsWith("/")) {
    return { kind: "regex", regex: pattern.slice(1, -1), ...shared };
  }

  return null;
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Translates the selector part of a cosmetic filter into a CSS rule.
 *
 * - `sel:style(decls)` becomes `sel { decls }`.
 * - Selectors using procedural operators that plain CSS cannot express
 *   yield `null`.
 * - Everything else becomes a hiding rule.
 *
 * @param {string} selector - Text after the `##` separator.
 * @returns {string|null} A CSS rule, or null when the selector is unsupported.
 */
function cosmeticSelectorToCss(selector) {
  const styled = /:style\((.+)\)$/.exec(selector);
  if (styled !== null) {
    const declarations = styled[1];
    const target = selector.slice(0, selector.lastIndexOf(":style("));
    return `${target} { ${declarations} }`;
  }

  const proceduralMarkers = [
    ":remove()",
    ":matches-css",
    ":matches-media",
    ":xpath(",
    ":upward(",
    ":matches-path"
  ];
  const isProcedural = proceduralMarkers.some((marker) => selector.includes(marker));

  return isProcedural ? null : `${selector} { display: none !important; }`;
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Decides whether a cosmetic rule's domain list applies to a hostname.
 *
 * The list is comma-separated; entries prefixed with `~` are exclusions.
 * An entry covers the named host and all of its subdomains.
 *
 *   - An empty spec or `*` applies everywhere.
 *   - A host covered by any exclusion never matches.
 *   - If the list names positive domains, the host must be covered by one of
 *     them. This holds for mixed lists too: `a.com,~shop.a.com` must not
 *     apply to an unrelated site.
 *   - A list made only of exclusions applies to every other host.
 *
 * @param {string} domainSpec - Domain part of the filter (text before `##`).
 * @param {string} hostname - Hostname of the page being rendered.
 * @returns {boolean}
 */
function matchesCosmeticDomains(domainSpec, hostname) {
  if (!domainSpec || domainSpec === "*") return true;

  const covers = (domain) => hostname === domain || hostname.endsWith("." + domain);

  const included = [];
  const excluded = [];
  for (const raw of domainSpec.split(",")) {
    const entry = raw.trim();
    if (entry.startsWith("~")) {
      if (entry.length > 1) excluded.push(entry.slice(1));
    } else if (entry) {
      included.push(entry);
    }
  }

  if (excluded.some(covers)) return false;
  return included.length === 0 || included.some(covers);
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Tests a single request against a parsed network rule.
 *
 * Checks, in order: the rule's `domain=` include/exclude lists (against the
 * page hostname), the resource type, the first/third-party constraint, and
 * finally the pattern itself.
 *
 * Pattern semantics for `kind: "domain"` rules:
 *   - The host matches the named domain AND any of its subdomains, as `||`
 *     anchors do in adblock syntax. `*` in the host matches within one label.
 *   - The path is matched as a prefix of `pathname + search`, so rules that
 *     mention a query string can match. `*` is a wildcard, `^` is a separator
 *     (any character other than a letter, digit, `_`, `-`, `.`, `%`, or the
 *     end of the address), and a trailing `|` anchors the end.
 *   - All other characters are matched literally; regex metacharacters in a
 *     rule can neither change its meaning nor make RegExp construction throw.
 *
 * `kind: "regex"` rules are tested against the full URL; an invalid regex
 * simply never matches.
 *
 * @param {string} url - Full request URL.
 * @param {URL} urlObj - Parsed form of `url`.
 * @param {string} hostname - Hostname of the request.
 * @param {string} resourceType - Resource type reported by the browser
 *   (e.g. "script", "xhr", "fetch").
 * @param {string} sourceHostname - Hostname of the page being archived.
 * @param {object} rule - Rule produced by parseNetworkRule().
 * @returns {boolean} True when the rule applies to the request.
 */
function matchesNetworkRule(url, urlObj, hostname, resourceType, sourceHostname, rule) {
  const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const isSameOrSubdomain = (host, domain) => host === domain || host.endsWith("." + domain);

  if (
    rule.includeDomains.length > 0 &&
    !rule.includeDomains.some((d) => isSameOrSubdomain(sourceHostname, d))
  ) {
    return false;
  }
  if (rule.excludeDomains.some((d) => isSameOrSubdomain(sourceHostname, d))) {
    return false;
  }

  if (rule.type) {
    // Filter-list type names mapped to the browser's resource types. Both
    // XMLHttpRequest and fetch() traffic count as "xmlhttprequest".
    const acceptedTypes = {
      script: ["script"],
      stylesheet: ["stylesheet"],
      image: ["image"],
      media: ["media"],
      xmlhttprequest: ["xhr", "fetch"],
      other: ["other"],
      "inline-script": ["script"]
    }[rule.type];
    if (acceptedTypes && !acceptedTypes.includes(resourceType)) {
      return false;
    }
  }

  if (rule.isThirdParty || rule.isFirstParty) {
    const isThirdPartyRequest = !isSameOrSubdomain(hostname, sourceHostname);
    if (rule.isThirdParty && !isThirdPartyRequest) return false;
    if (rule.isFirstParty && isThirdPartyRequest) return false;
  }

  if (rule.kind === "domain") {
    const hostSource = escapeRegExp(rule.domain).replace(/\\\*/g, "[^.]*");
    const hostPattern = new RegExp(`^(?:[^.]+\\.)*${hostSource}$`, "i");
    if (!hostPattern.test(hostname)) return false;

    if (rule.path) {
      let rawPath = rule.path;
      let anchorEnd = false;
      if (rawPath.endsWith("|")) {
        anchorEnd = true;
        rawPath = rawPath.slice(0, -1);
      }
      const pathSource = escapeRegExp(rawPath)
        .replace(/\\\*/g, ".*")
        .replace(/\\\^/g, () => "(?:[^\\w.%-]|$)");
      const pathPattern = new RegExp("^" + pathSource + (anchorEnd ? "$" : ""), "i");
      if (!pathPattern.test(urlObj.pathname + urlObj.search)) return false;
    }
    return true;
  }

  if (rule.kind === "regex") {
    try {
      return new RegExp(rule.regex, "i").test(url);
    } catch {
      // Unsupported or malformed regex in the filter list: treat as no match.
      return false;
    }
  }

  return false;
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Decides whether a sub-resource request should be aborted.
 *
 * Precedence:
 *   1. A matching block rule flagged `important` blocks unconditionally.
 *      (The parser records that flag; it was previously never consulted.)
 *   2. Otherwise a matching exception rule lets the request through.
 *   3. Otherwise any matching block rule blocks it.
 *
 * The former first guard compared the full request URL with the bare page
 * hostname, which can never be equal, so it was dead code and has been
 * removed. First-party requests are still evaluated against the rules,
 * exactly as before.
 *
 * @param {string} url - Full request URL.
 * @param {string} resourceType - Resource type reported by the browser.
 * @param {string} sourceHostname - Hostname of the page being archived.
 * @returns {boolean} True when the request should be blocked. URLs that cannot
 *   be parsed are never blocked.
 */
function shouldBlockRequest(url, resourceType, sourceHostname) {
  let urlObj;
  try {
    urlObj = new URL(url);
  } catch {
    return false;
  }
  const hostname = urlObj.hostname;
  const matches = (rule) =>
    matchesNetworkRule(url, urlObj, hostname, resourceType, sourceHostname, rule);

  let blocked = false;
  for (const rule of filterRules.blockRules) {
    if (!matches(rule)) continue;
    if (rule.important) return true;
    blocked = true;
  }
  if (!blocked) return false;

  // Exceptions only matter once some ordinary block rule has matched.
  return !filterRules.allowRules.some(matches);
}
|
|
|
|
|
|
|
|
|
|
|
|
// --- Userscript metadata parsing -------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Extracts the `@match` and `@exclude` URL patterns from a userscript's
 * `// ==UserScript== ... // ==/UserScript==` metadata block.
 *
 * @param {string} content - Full userscript source.
 * @returns {{matches: string[], excludes: string[]}} Patterns in file order;
 *   both lists are empty when the script has no metadata block.
 */
function parseUserScriptMetadata(content) {
  const result = { matches: [], excludes: [] };

  const header = /\/\/\s*==UserScript==([\s\S]*?)\/\/\s*==\/UserScript==/.exec(content);
  if (header === null) return result;

  header[1].split("\n").forEach((entry) => {
    const include = /@match\s+(.+)/.exec(entry);
    if (include !== null) {
      result.matches.push(include[1].trim());
      return;
    }
    const exclude = /@exclude\s+(.+)/.exec(entry);
    if (exclude !== null) {
      result.excludes.push(exclude[1].trim());
    }
  });

  return result;
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Checks a URL against a userscript `@match`-style pattern such as
 * `*://*.example.com/*` or `http://example.com/path`.
 *
 * The pattern is split into scheme, host and path; the scheme must be `*` or
 * equal the URL's scheme, and host and path are delegated to matchHost() and
 * matchPath(). A pattern without a path part is treated as `/`.
 *
 * @param {string} url - URL to test.
 * @param {string} pattern - Match pattern.
 * @returns {boolean} False for anything that fails to parse or match.
 */
function urlMatchesPattern(url, pattern) {
  try {
    const { protocol, hostname, pathname } = new URL(url);

    const schemeEnd = pattern.indexOf("://");
    if (schemeEnd === -1) return false;

    // URL.protocol carries a trailing ":" ("https:"); patterns do not.
    const scheme = pattern.slice(0, schemeEnd);
    if (scheme !== "*" && scheme !== protocol.slice(0, -1)) return false;

    const hostAndPath = pattern.slice(schemeEnd + 3);
    const pathStart = hostAndPath.indexOf("/");
    const hostGlob = pathStart === -1 ? hostAndPath : hostAndPath.slice(0, pathStart);
    const pathGlob = pathStart === -1 ? "/" : hostAndPath.slice(pathStart);

    return matchHost(hostname, hostGlob) && matchPath(pathname, pathGlob);
  } catch {
    return false;
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Matches a hostname against the host part of a match pattern.
 *
 * `*` matches any host, `*.example.com` matches example.com and every
 * subdomain, and anything else must be an exact match.
 *
 * @param {string} hostname - Hostname to test.
 * @param {string} pattern - Host part of the pattern.
 * @returns {boolean}
 */
function matchHost(hostname, pattern) {
  if (pattern === "*") return true;
  if (!pattern.startsWith("*.")) return hostname === pattern;

  const baseDomain = pattern.slice(2);
  return hostname === baseDomain || hostname.endsWith(`.${baseDomain}`);
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Matches a URL path against the path part of a match pattern.
 *
 * `*` matches any run of characters and `?` matches exactly one character;
 * every other character is literal. Regex metacharacters in the pattern are
 * escaped first, so a pattern such as `/a+b` or `/(draft)/*` matches the text
 * it spells out and an unbalanced `[` or `(` cannot make RegExp construction
 * throw. Matching is case-insensitive and covers the whole path.
 *
 * @param {string} pathname - Path to test (as in `URL.pathname`).
 * @param {string} pattern - Path part of the pattern, starting with `/`.
 * @returns {boolean}
 */
function matchPath(pathname, pattern) {
  if (pattern === "/*") return true;

  // Escape everything except the two glob characters, then expand those.
  const source = pattern
    .replace(/[.+^${}()|[\]\\]/g, "\\$&")
    .replace(/\*/g, ".*")
    .replace(/\?/g, ".");

  return new RegExp(`^${source}$`, "i").test(pathname);
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Decides whether a userscript applies to a URL: at least one `@match`
 * pattern must match and no `@exclude` pattern may match.
 *
 * @param {string} url - Page URL.
 * @param {{matches: string[], excludes: string[]}} meta - Parsed metadata.
 * @returns {boolean}
 */
function shouldInjectUserScript(url, meta) {
  const hits = (pattern) => urlMatchesPattern(url, pattern);
  return meta.matches.some(hits) && !meta.excludes.some(hits);
}
|
|
|
|
|
|
|
|
|
|
|
|
// --- Browser helpers -------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Resolves the Playwright package on demand.
 *
 * @returns {object} The loaded `playwright` module.
 * @throws {Error} With installation instructions when the package cannot be
 *   loaded. The underlying failure is kept as `error.cause`, so its stack and
 *   error code are not lost.
 */
function loadPlaywright() {
  try {
    return require("playwright");
  } catch (error) {
    throw new Error(
      `Playwright is required. Run "npm install" and "npm run install-browsers". Original error: ${error.message}`,
      { cause: error }
    );
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
// Manual stealth evasions, registered as a context init script by renderPage().
//
// The script hides `navigator.webdriver`, makes sure `window.chrome.runtime`
// exists, and answers `permissions.query({ name: "notifications" })` from
// `Notification.permission`. Each patch sits in its own try/catch so one
// failure cannot prevent the others.
//
// The original `permissions.query` is bound to its receiver before being
// wrapped. Calling it detached throws "Illegal invocation", which broke every
// non-notification permission query on the page.
//
// NOTE(review): while the document is still loading, the patches are deferred
// to DOMContentLoaded, so page scripts that run earlier see unpatched values.
// Confirm that this deferral is intentional.
const STEALTH_INIT_SCRIPT = `
(() => {
  const patchNavigator = () => {
    try {
      // Override webdriver getter without using delete (can crash renderer)
      if (navigator.webdriver !== undefined) {
        Object.defineProperty(navigator, 'webdriver', {
          get: () => undefined,
          configurable: true,
          enumerable: true
        });
      }
    } catch (e) {}

    try {
      if (!window.chrome) {
        window.chrome = { runtime: {} };
      } else if (!window.chrome.runtime) {
        window.chrome.runtime = {};
      }
    } catch (e) {}

    try {
      const permissions = window.navigator.permissions;
      // Keep the receiver: the native method throws when called detached.
      const originalQuery = permissions?.query?.bind(permissions);
      if (originalQuery) {
        permissions.query = (parameters) => (
          parameters.name === 'notifications'
            ? Promise.resolve({ state: Notification.permission })
            : originalQuery(parameters)
        );
      }
    } catch (e) {}
  };

  if (document.readyState === 'loading') {
    document.addEventListener('DOMContentLoaded', patchNavigator);
  } else {
    patchNavigator();
  }
})();
`;
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Builds the Chromium command-line switches used when launching the browser.
 *
 * @param {boolean} headless - When truthy, `--headless=new` is appended.
 * @returns {string[]} A fresh array of switches.
 */
function buildLaunchArgs(headless) {
  const baseArgs = [
    "--disable-blink-features=AutomationControlled",
    "--disable-web-security",
    "--disable-features=IsolateOrigins,site-per-process",
    "--disable-site-isolation-trials",
    "--disable-infobars",
    "--no-sandbox",
    "--disable-setuid-sandbox",
    "--disable-dev-shm-usage",
    "--disable-accelerated-2d-canvas",
    "--disable-gpu",
    "--window-size=1366,768"
  ];

  return headless ? [...baseArgs, "--headless=new"] : baseArgs;
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Lists the default launch switches that should be dropped.
 *
 * @returns {string[]} A fresh array containing `--enable-automation`.
 */
function buildIgnoreDefaultArgs() {
  const suppressedDefaults = ["--enable-automation"];
  return suppressedDefaults;
}
|
|
|
|
|
|
|
|
|
|
|
|
// --- Page helpers ----------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Installs a route handler that aborts sub-resource requests matched by the
 * loaded block rules. Does nothing when no filters are loaded.
 *
 * Navigation requests always continue. If evaluating the rules throws, the
 * request continues as well, so a bad rule can never stall a page load.
 *
 * The handler awaits `route.abort()` / `route.continue()` and swallows their
 * rejection. Those promises reject when the page or browser closes while a
 * request is still in flight; left floating, that surfaces as an unhandled
 * rejection in the Node process.
 *
 * @param {object} page - Playwright page to intercept.
 * @param {string} sourceHostname - Hostname of the page being archived.
 * @returns {Promise<void>}
 */
async function setupRequestBlocking(page, sourceHostname) {
  if (!privacyFiltersAvailable || filterRules.blockRules.length === 0) return;

  await page.route("**/*", async (route) => {
    let block = false;
    try {
      const request = route.request();
      block =
        !request.isNavigationRequest() &&
        shouldBlockRequest(request.url(), request.resourceType(), sourceHostname);
    } catch {
      // Rule evaluation failed; fall through and let the request proceed.
      block = false;
    }

    try {
      if (block) {
        await route.abort("blockedbyclient");
      } else {
        await route.continue();
      }
    } catch {
      // Page or context went away mid-request; there is nothing left to resolve.
    }
  });
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Adds a style tag containing every cosmetic rule whose domain list applies
 * to the given hostname. Does nothing when no filters are loaded or no rule
 * applies; injection failures are ignored.
 *
 * @param {object} page - Playwright page to style.
 * @param {string} hostname - Hostname of the page being archived.
 * @returns {Promise<void>}
 */
async function injectCosmeticFilters(page, hostname) {
  const { cosmeticRules } = filterRules;
  if (!privacyFiltersAvailable || cosmeticRules.length === 0) return;

  const applicableCss = cosmeticRules
    .filter((rule) => matchesCosmeticDomains(rule.domains, hostname))
    .map((rule) => rule.css);
  if (applicableCss.length === 0) return;

  try {
    await page.addStyleTag({ content: applicableCss.join("\n") });
  } catch {
    // Ignore cosmetic injection failures.
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
// Page-side shim for the `GM.xmlHttpRequest` userscript API, implemented with
// fetch(). It is only installed when the page has no `GM` global.
//
// The object handed to `details.onload` carries `status`, `statusText`,
// `responseText` and `responseHeaders`, plus `readyState` (always 4),
// `finalUrl` (the URL after redirects) and `response` (same text as
// `responseText`), which userscript managers also provide.
//
// `onerror` is wired as the rejection handler of the request itself rather
// than a trailing catch, so an exception thrown inside a script's own
// `onload` callback is no longer reported to that script as a network error.
const GM_MOCK = `
if (typeof GM === "undefined") {
  window.GM = {
    xmlHttpRequest: function(details) {
      fetch(details.url, {
        method: details.method || "GET",
        headers: details.headers || {},
        body: details.data || null
      })
        .then(response => response.text().then(text => ({
          status: response.status,
          statusText: response.statusText,
          readyState: 4,
          finalUrl: response.url,
          response: text,
          responseText: text,
          responseHeaders: Array.from(response.headers.entries())
            .map(([k, v]) => k + ": " + v).join("\\r\\n")
        })))
        .then(
          obj => {
            if (details.onload) details.onload(obj);
          },
          err => {
            if (details.onerror) details.onerror(err);
          }
        );
    }
  };
}
`;
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Injects the GM API shim followed by every loaded userscript whose metadata
 * matches the page URL.
 *
 * Nothing is injected when filters are unavailable or no script matches. If
 * the shim itself cannot be injected the function stops; a failure of an
 * individual userscript is ignored and the remaining ones are still tried.
 *
 * @param {object} page - Playwright page to inject into.
 * @param {string} sourceUrl - URL of the page being archived.
 * @returns {Promise<void>}
 */
async function injectPrivacyUserScripts(page, sourceUrl) {
  if (!privacyFiltersAvailable || userScriptData.length === 0) return;

  const applicable = userScriptData.filter((script) => shouldInjectUserScript(sourceUrl, script));
  if (applicable.length === 0) return;

  // The shim goes first so the scripts find `GM` defined; scripts follow in load order.
  const sources = [GM_MOCK, ...applicable.map((script) => script.content)];
  for (const [position, content] of sources.entries()) {
    try {
      await page.addScriptTag({ content });
    } catch {
      // Without the shim the userscripts cannot work; individual script failures are tolerated.
      if (position === 0) return;
    }
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
// Archiving
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
|
2026-05-14 08:12:13 -07:00
|
|
|
|
/**
 * Renders a page in a browser, inlines its assets, and writes the result as a
 * single HTML file.
 *
 * @param {string} input - URL or other input accepted by inputToUrl().
 * @param {object} [options]
 * @param {string} [options.archivePath] - Output directory; defaults to
 *   defaultArchivePath(). Created if missing.
 * @param {string} [options.id] - File name stem; defaults to slugForUrl(sourceUrl).
 * @param {string} [options.userAgent] - User agent for asset downloads;
 *   defaults to DEFAULT_USER_AGENT. renderPage() honours the same option, so
 *   the page and its assets are now fetched with the same identity.
 *   The whole `options` object is also forwarded to renderPage().
 * @returns {Promise<{id: string, filePath: string, sourceUrl: string,
 *   archivePath: string, warnings: *, externalAssets: string[]}>}
 *   `warnings` is whatever the inliner collected; `externalAssets` lists
 *   references in the final HTML that still point outside the file.
 */
export async function archivePage(input, options = {}) {
  const sourceUrl = inputToUrl(input);
  const archivePath = options.archivePath || defaultArchivePath();
  const id = options.id || slugForUrl(sourceUrl);
  const filePath = path.join(archivePath, `${id}.html`);

  await fs.mkdir(archivePath, { recursive: true });

  const renderedHtml = await renderPage(sourceUrl, options);
  const baseUrl = findEffectiveBase(renderedHtml, sourceUrl);
  const inliner = new AssetInliner({
    userAgent: options.userAgent || DEFAULT_USER_AGENT,
    // Only HTTP(S) sources are sent as a referer.
    referer: isHttpUrl(sourceUrl) ? sourceUrl : undefined
  });

  const inlined = await inliner.inlineHtml(renderedHtml, baseUrl);
  const finalHtml = addArchiveComment(inlined, sourceUrl);
  await fs.writeFile(filePath, finalHtml, "utf8");

  return {
    id,
    filePath,
    sourceUrl,
    archivePath,
    warnings: inliner.warnings,
    externalAssets: findExternalAssetRefs(finalHtml)
  };
}
|
|
|
|
|
|
|
2026-05-15 09:12:28 -07:00
|
|
|
|
/**
 * Loads a URL in Chromium and returns the rendered DOM as HTML.
 *
 * Steps: launch the browser, create a context with the stealth init script,
 * install request blocking, navigate, inject cosmetic CSS and userscripts,
 * wait for them to act, wait for the network to settle, pin the resource URLs
 * the browser actually chose, and serialise the page. The browser is always
 * closed, even when a step throws.
 *
 * @param {string} sourceUrl - Absolute URL to render.
 * @param {object} [options]
 * @param {boolean} [options.headless] - Pass `false` to force a visible browser.
 * @param {string} [options.userAgent] - Defaults to DEFAULT_USER_AGENT.
 * @param {string} [options.locale] - Defaults to "en-US".
 * @param {string} [options.timezoneId] - Defaults to "America/New_York".
 * @param {number} [options.userscriptDelay] - Milliseconds to wait after
 *   injecting userscripts; defaults to 2000. An explicit `0` is honoured.
 * @returns {Promise<string>} Serialised HTML of the rendered page.
 * @throws {Error} When Playwright cannot be loaded, `sourceUrl` is not a valid
 *   URL, or navigation fails or times out.
 */
export async function renderPage(sourceUrl, options = {}) {
  const playwright = loadPlaywright();

  // NOTE(review): when a display is available the browser runs headed even if
  // the caller passed `headless: true` — confirm that this is intended.
  const hasDisplay = !!(process.env.DISPLAY || process.env.WAYLAND_DISPLAY);
  const headless = options.headless !== false && !hasDisplay;

  const browser = await playwright.chromium.launch({
    headless,
    args: buildLaunchArgs(headless),
    ignoreDefaultArgs: buildIgnoreDefaultArgs()
  });

  try {
    const context = await browser.newContext({
      userAgent: options.userAgent || DEFAULT_USER_AGENT,
      viewport: VIEWPORT,
      locale: options.locale || "en-US",
      timezoneId: options.timezoneId || "America/New_York"
    });

    // Register the stealth evasions for every page created in this context.
    await context.addInitScript(STEALTH_INIT_SCRIPT);

    const page = await context.newPage();
    const sourceHostname = new URL(sourceUrl).hostname;

    // Block paywall/tracker requests before the page loads.
    await setupRequestBlocking(page, sourceHostname);

    await page.goto(sourceUrl, {
      waitUntil: "domcontentloaded",
      timeout: PAGE_TIMEOUT_MS
    });

    // Inject cosmetic CSS and userscripts to strip paywalls / ads.
    await injectCosmeticFilters(page, sourceHostname);
    await injectPrivacyUserScripts(page, sourceUrl);

    // Give the userscripts a moment to run their setTimeout callbacks.
    // `??` rather than `||` so that a caller can disable the wait with 0.
    const userscriptDelay = options.userscriptDelay ?? 2000;
    await page.waitForTimeout(userscriptDelay);

    await waitForNetworkIdle(page);
    await snapshotLoadedResourceUrls(page);

    return await page.content();
  } finally {
    await browser.close();
  }
}
|
|
|
|
|
|
|
2026-05-15 01:00:27 -07:00
|
|
|
|
/**
 * Waits until the page reports the "networkidle" load state, giving up
 * quietly after NETWORK_IDLE_TIMEOUT_MS.
 *
 * @param {object} page - Playwright page.
 * @returns {Promise<void>} Always resolves; a timeout is not an error.
 */
async function waitForNetworkIdle(page) {
  try {
    const waitOptions = { timeout: NETWORK_IDLE_TIMEOUT_MS };
    await page.waitForLoadState("networkidle", waitOptions);
  } catch {
    // Some pages keep sockets open; the DOM snapshot is still useful.
  }
}
|
|
|
|
|
|
|
2026-05-15 01:00:27 -07:00
|
|
|
|
/**
 * Rewrites the live DOM so that the serialised HTML points at the resources
 * the browser actually picked:
 *
 *   - `<img>`, `<video>` and `<audio>` get `src` set to their `currentSrc`;
 *   - iframes whose document is readable are converted to `srcdoc` and lose
 *     their `src`.
 *
 * The callback runs inside the page, so it must not reference anything from
 * this module's scope.
 *
 * @param {object} page - Playwright page.
 * @returns {Promise<void>}
 */
async function snapshotLoadedResourceUrls(page) {
  await page.evaluate(() => {
    const pinCurrentSource = (element) => {
      if (element.currentSrc) {
        element.setAttribute("src", element.currentSrc);
      }
    };

    for (const img of document.querySelectorAll("img")) {
      pinCurrentSource(img);
    }
    for (const media of document.querySelectorAll("video,audio")) {
      pinCurrentSource(media);
    }

    for (const frame of document.querySelectorAll("iframe")) {
      try {
        const root = frame.contentDocument?.documentElement;
        if (root) {
          frame.setAttribute("srcdoc", "<!doctype html>" + root.outerHTML);
          frame.removeAttribute("src");
        }
      } catch {
        // Cross-origin frames are handled later by the asset inliner when possible.
      }
    }
  });
}
|
|
|
|
|
|
|
2026-05-15 01:00:27 -07:00
|
|
|
|
/**
 * Stamps the archive with an HTML comment recording source URL and creation
 * time, placed directly after the document's doctype. If the document has no
 * leading doctype, `<!doctype html>` and the comment are prepended.
 *
 * Two details:
 *   - Every run of hyphens in the URL is broken up (`---` becomes `- - -`),
 *     so the comment body never contains `--`.
 *   - Only a doctype at the very start of the document counts (leading
 *     whitespace and comments are allowed). A `<!doctype` occurring later,
 *     for instance inside an iframe's `srcdoc` attribute, is ignored, so the
 *     comment cannot be injected into an attribute value.
 *
 * @param {string} html - Document markup.
 * @param {string} sourceUrl - URL the page was archived from.
 * @returns {string} Markup with the archive comment added.
 */
function addArchiveComment(html, sourceUrl) {
  // Insert a space after every hyphen that is followed by another hyphen.
  const safeSource = String(sourceUrl).replace(/-(?=-)/g, "- ");
  const comment = `<!-- Archived locally. Source: ${safeSource}. Created: ${new Date().toISOString()}. -->`;

  const leadingDoctype = /^(?:\s|<!--[\s\S]*?-->)*<!doctype[^>]*>/i.exec(html);
  if (leadingDoctype) {
    const prefix = leadingDoctype[0];
    return `${prefix}\n${comment}${html.slice(prefix.length)}`;
  }
  return `<!doctype html>\n${comment}\n${html}`;
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Lists the asset references in an HTML document that still point outside the
 * document, i.e. everything not recognised by isSelfContainedAssetRef().
 *
 * Three sources are scanned:
 *   - quoted `src`, `srcset`, `poster` and `data` attributes;
 *   - `href` of `<link>` tags whose `rel` names a stylesheet, icon,
 *     apple-touch-icon, image_src or preload;
 *   - CSS `url(...)` references anywhere in the markup.
 *
 * Only `srcset` is treated as a comma-separated candidate list. The other
 * attributes hold a single URL and are kept whole, so a URL that contains a
 * comma is no longer split into bogus fragments.
 *
 * @param {string} html - Document markup.
 * @returns {string[]} Unique references, sorted.
 */
export function findExternalAssetRefs(html) {
  const refs = new Set();
  const addRef = (candidate) => {
    if (candidate && !isSelfContainedAssetRef(candidate)) {
      refs.add(candidate);
    }
  };

  const attrPattern = /\s(src|srcset|poster|data)\s*=\s*(["'])([\s\S]*?)\2/gi;
  for (const match of html.matchAll(attrPattern)) {
    const attrName = match[1].toLowerCase();
    const value = match[3];
    if (isSelfContainedAssetRef(value)) {
      continue;
    }
    if (attrName === "srcset") {
      // Each candidate is "<url> <descriptor>"; keep the URL part only.
      for (const part of value.split(",")) {
        addRef(part.trim().split(/\s+/)[0]);
      }
    } else {
      addRef(value.trim());
    }
  }

  const linkPattern = /<link\b[^>]*>/gi;
  for (const match of html.matchAll(linkPattern)) {
    const tag = match[0];
    const rel = readAttribute(tag, "rel") || "";
    if (!/\b(?:stylesheet|icon|apple-touch-icon|image_src|preload)\b/i.test(rel)) {
      continue;
    }
    addRef(readAttribute(tag, "href"));
  }

  const cssUrlPattern = /url\(\s*(["']?)([^"')]+)\1\s*\)/gi;
  for (const match of html.matchAll(cssUrlPattern)) {
    addRef(cleanCssUrl(match[2]));
  }

  return Array.from(refs).sort();
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Tells whether a reference needs no external fetch: it is empty, a fragment
 * (`#...` or its percent-encoded form `%23...`), or uses one of the `data:`,
 * `about:`, `javascript:`, `mailto:` or `tel:` schemes.
 *
 * @param {string} value - Raw attribute or CSS url() value.
 * @returns {boolean}
 */
function isSelfContainedAssetRef(value) {
  const ref = cleanCssUrl(value);
  if (!ref) return true;
  if (ref.startsWith("#") || /^%23/i.test(ref)) return true;
  return /^(?:data|about|javascript|mailto|tel):/i.test(ref);
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Reads one attribute value from the text of a single HTML tag.
 *
 * Double-quoted, single-quoted and unquoted values are supported and the name
 * is matched case-insensitively. The name must not be the tail of a longer
 * attribute name, so asking for `href` does not return the value of
 * `data-href`. The name is escaped before being placed in the pattern.
 *
 * @param {string} tag - Tag source, e.g. `<link rel="icon" href="/x.ico">`.
 * @param {string} attr - Attribute name.
 * @returns {string} The value, or "" when the attribute is absent or empty.
 */
function readAttribute(tag, attr) {
  const name = String(attr).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const pattern = new RegExp(
    `(?<![\\w:.-])${name}\\s*=\\s*(?:"([^"]*)"|'([^']*)'|([^\\s>]+))`,
    "i"
  );
  const match = pattern.exec(tag);
  return match ? match[1] ?? match[2] ?? match[3] ?? "" : "";
}
|
2026-05-14 09:11:05 -07:00
|
|
|
|
|
|
|
|
|
|
/**
 * Normalises a URL taken from an HTML attribute or a CSS `url(...)`:
 * trims it, decodes the HTML entities for ampersand, double quote and
 * apostrophe (named, decimal and hex forms), and strips one pair of matching
 * surrounding quotes.
 *
 * Entities are decoded in a single pass, so an escaped ampersand followed by
 * `quot;` yields the literal text of the quote entity rather than being
 * decoded a second time into a quote character.
 *
 * @param {*} value - Raw value; coerced to a string.
 * @returns {string} The cleaned reference.
 */
function cleanCssUrl(value) {
  // Keyed by entity name (without the leading "&" and trailing ";").
  const replacements = { amp: "&", quot: '"', apos: "'", "#39": "'", "#x27": "'" };
  const decoded = String(value)
    .trim()
    .replace(/&(amp|quot|apos|#39|#x27);/gi, (entity, name) => replacements[name.toLowerCase()]);

  const quote = decoded[0];
  if ((quote === '"' || quote === "'") && decoded.at(-1) === quote) {
    return decoded.slice(1, -1).trim();
  }
  return decoded;
}
|