Files
sigilbox/src/archiver.mjs

1666 lines
45 KiB
JavaScript
Raw Normal View History

2026-05-14 08:12:13 -07:00
import fs from "node:fs/promises";
import path from "node:path";
import { createRequire } from "node:module";
2026-05-15 09:12:28 -07:00
import { fileURLToPath } from "node:url";
2026-05-14 08:12:13 -07:00
import {
AssetInliner,
DEFAULT_USER_AGENT,
defaultArchivePath,
findEffectiveBase,
inputToUrl,
isHttpUrl,
2026-05-16 16:05:32 -07:00
splitSrcset,
2026-05-15 01:00:27 -07:00
slugForUrl
2026-05-14 08:12:13 -07:00
} from "./asset-inliner.mjs";
// CommonJS-style `require` bound to this module's URL (ES modules have none built in).
const require = createRequire(import.meta.url);

// Directory that contains this module file.
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Time budgets in milliseconds.
// NOTE(review): their consumers are outside this section — names suggest page load and
// network-idle waits; confirm at the call sites.
const PAGE_TIMEOUT_MS = 60000;
const NETWORK_IDLE_TIMEOUT_MS = 5000;

// Browser viewport size in pixels.
const VIEWPORT = {
  width: 1366,
  height: 768
};
2026-05-14 08:12:13 -07:00
2026-05-16 22:32:30 -07:00
// CSS selectors for consent-management, cookie-banner and adblock-notice elements.
// COMMON_ANNOYANCE_CSS joins them into one rule that hides every match.
// The trailing ` i` inside attribute selectors makes the match case-insensitive.
const COMMON_ANNOYANCE_SELECTORS = [
"[id^=\"sp_message_container_\"]",
"iframe[id^=\"sp_message_iframe_\"]",
"iframe[title*=\"consent\" i]",
"iframe[title*=\"privacy manager\" i]",
"#onetrust-consent-sdk",
"#onetrust-banner-sdk",
"#didomi-host",
"#qc-cmp2-container",
".qc-cmp2-container",
"#CybotCookiebotDialog",
".iubenda-cs-container",
"#cmpwrapper",
"[id^=\"cmpbox\"]",
".fc-consent-root",
".fc-dialog-container",
"[aria-modal=\"true\"][id*=\"consent\" i]",
"[aria-modal=\"true\"][id*=\"cookie\" i]",
"[role=\"dialog\"][aria-label*=\"cookie\" i]",
"[role=\"dialog\"][aria-label*=\"consent\" i]",
"[id*=\"cookie-banner\" i]",
"[class*=\"cookie-banner\" i]",
"[id*=\"cookie-consent\" i]",
"[class*=\"cookie-consent\" i]",
"[id*=\"cookie-notice\" i]",
"[class*=\"cookie-notice\" i]",
"[id*=\"cookie-popup\" i]",
"[class*=\"cookie-popup\" i]",
"[id*=\"adblock\" i]",
"[class*=\"adblock\" i]",
"[id*=\"ad-block\" i]",
"[class*=\"ad-block\" i]"
];
// Class names associated with open consent dialogs.
// NOTE(review): the consumer of this list is outside this section — presumably these
// classes are stripped from <html>/<body>; confirm at the call site.
const COMMON_ANNOYANCE_ROOT_CLASSES = [
"sp-message-open",
"didomi-popup-open",
"qc-cmp-ui-showing",
"ot-sdk-show-settings",
"iubenda-cs-visible"
];
// Stylesheet text with two rules:
//   1. hide every element matched by COMMON_ANNOYANCE_SELECTORS;
//   2. restore scrolling/positioning on <html>/<body> while a consent-dialog class is set.
// NOTE(review): "ot-sdk-show-settings" appears in COMMON_ANNOYANCE_ROOT_CLASSES but has no
// html./body. reset rule here — confirm whether that omission is intentional.
const COMMON_ANNOYANCE_CSS = `
${COMMON_ANNOYANCE_SELECTORS.join(",\n")} {
display: none !important;
visibility: hidden !important;
pointer-events: none !important;
}
html.sp-message-open,
body.sp-message-open,
html.didomi-popup-open,
body.didomi-popup-open,
html.qc-cmp-ui-showing,
body.qc-cmp-ui-showing,
html.iubenda-cs-visible,
body.iubenda-cs-visible {
overflow: auto !important;
position: static !important;
}
`;
2026-05-14 08:12:13 -07:00
export { DEFAULT_USER_AGENT, defaultArchivePath };
2026-05-15 09:12:28 -07:00
// ---------------------------------------------------------------------------
// Privacy filters integration
// ---------------------------------------------------------------------------

// Directory holding the filter lists and userscripts: "../privacy-filters" relative to
// this module's directory.
const PRIVACY_FILTERS_DIR = path.join(__dirname, "..", "privacy-filters");

// Filter lists read by loadPrivacyFilters(), in merge order. `id` is recorded as the rule
// source; `file` is relative to PRIVACY_FILTERS_DIR.
const FILTER_LIST_FILES = [
  { id: "bpc-paywall", file: "bpc-paywall-filter.txt" },
  { id: "easylist", file: path.join("lists", "easylist.txt") },
  { id: "ublock-filters", file: path.join("lists", "ublock-filters.txt") },
  { id: "easylist-cookie", file: path.join("lists", "easylist-cookie.txt") },
  { id: "ublock-annoyances", file: path.join("lists", "ublock-annoyances.txt") },
  { id: "ublock-cookies", file: path.join("lists", "ublock-cookies.txt") }
];

// Module state filled in by loadPrivacyFilters().
let privacyFiltersAvailable = false; // true once any rule or userscript was loaded
let filterRules = emptyFilterRules(); // merged rule set used for request/cosmetic filtering
let userScriptData = []; // { file, content, matches, excludes }
let userScriptRequireContent = ""; // contents of userscript/bpc_func.js
2026-05-15 09:12:28 -07:00
/**
 * Loads the filter lists and paywall userscripts from PRIVACY_FILTERS_DIR into the
 * module-level state (`filterRules`, `userScriptData`, `userScriptRequireContent`,
 * `privacyFiltersAvailable`).
 *
 * Loading is best-effort and never throws:
 *  - a filter list that does not exist is skipped silently;
 *  - any other failure is reported with `console.warn` instead of being swallowed, so
 *    genuine defects (parse errors, permission problems) stay visible;
 *  - the userscript bundle is all-or-nothing, but its absence no longer disables filter
 *    lists that loaded successfully.
 *
 * @returns {Promise<void>}
 */
async function loadPrivacyFilters() {
  const reportUnlessMissing = (what, error) => {
    if (error?.code === "ENOENT" || error?.code === "ENOTDIR") {
      return; // Not installed: the expected "archive without filters" case.
    }
    console.warn(`Privacy filters: could not load ${what}: ${error?.message ?? error}`);
  };

  // --- Filter lists ---------------------------------------------------------
  const filterSets = [];
  for (const list of FILTER_LIST_FILES) {
    try {
      const filterPath = path.join(PRIVACY_FILTERS_DIR, list.file);
      const filterContent = await fs.readFile(filterPath, "utf8");
      filterSets.push(parseFilterRules(filterContent, { source: list.id }));
    } catch (error) {
      reportUnlessMissing(`filter list "${list.id}"`, error);
    }
  }
  try {
    filterRules = mergeFilterRules(filterSets);
  } catch (error) {
    filterRules = emptyFilterRules();
    reportUnlessMissing("merged filter rules", error);
  }

  // --- Userscripts ----------------------------------------------------------
  const userscriptDir = path.join(PRIVACY_FILTERS_DIR, "userscript");
  const userScriptFiles = [
    "bpc.en.user.js",
    "bpc.de.user.js",
    "bpc.es.pt.user.js",
    "bpc.fi.se.user.js",
    "bpc.fr.user.js",
    "bpc.it.user.js",
    "bpc.nl.user.js",
    "bpc.pl.user.js"
  ];
  try {
    const requireContent = await fs.readFile(path.join(userscriptDir, "bpc_func.js"), "utf8");
    const loadedScripts = [];
    for (const file of userScriptFiles) {
      const content = await fs.readFile(path.join(userscriptDir, file), "utf8");
      loadedScripts.push({ file, content, ...parseUserScriptMetadata(content) });
    }
    // Publish only after every file was read so no partial bundle is ever exposed.
    userScriptRequireContent = requireContent;
    userScriptData = loadedScripts;
  } catch (error) {
    userScriptRequireContent = "";
    userScriptData = [];
    reportUnlessMissing("userscripts", error);
  }

  privacyFiltersAvailable =
    filterRules.blockRules.length > 0 ||
    filterRules.importantBlockRules.length > 0 ||
    filterRules.allowRules.length > 0 ||
    filterRules.importantAllowRules.length > 0 ||
    filterRules.cosmeticRules.length > 0 ||
    userScriptData.length > 0;
}
// --- Adblock filter parsing ------------------------------------------------

// Cosmetic-rule separators. `kind` classifies the rule; `extended` marks procedural
// (non-plain-CSS) selector syntax. findCosmeticSeparator() picks the earliest match and,
// on a tie, the longest token.
const COSMETIC_SEPARATORS = [
  { token: "#@?#", kind: "cosmeticException", extended: true },
  { token: "#@$#", kind: "styleException" },
  { token: "#@%#", kind: "scriptException" },
  { token: "#@^", kind: "htmlException" },
  { token: "#@#", kind: "cosmeticException" },
  { token: "#?#", kind: "extendedCosmetic", extended: true },
  { token: "#$#", kind: "style" },
  { token: "#%#", kind: "script" },
  { token: "#^", kind: "html" },
  { token: "##", kind: "cosmetic" }
];

// Maps a `$option` resource-type name (including aliases) to its canonical filter type.
const RESOURCE_TYPE_ALIASES = new Map([
  ["beacon", "ping"],
  ["css", "stylesheet"],
  ["doc", "document"],
  ["document", "document"],
  ["fetch", "xmlhttprequest"],
  ["font", "font"],
  ["frame", "subdocument"],
  ["image", "image"],
  ["inline-script", "inline-script"],
  ["media", "media"],
  ["object", "object"],
  ["object-subrequest", "object"],
  ["other", "other"],
  ["ping", "ping"],
  ["script", "script"],
  ["stylesheet", "stylesheet"],
  ["subdocument", "subdocument"],
  ["websocket", "websocket"],
  ["xhr", "xmlhttprequest"],
  ["xmlhttprequest", "xmlhttprequest"]
]);

// Network options this engine does not implement; a rule carrying any of them is skipped
// by parseNetworkOptions() rather than applied with the wrong meaning.
const SKIP_NETWORK_OPTION_NAMES = new Set([
  "cookie",
  "csp",
  "cname",
  "denyallow",
  "ehide",
  "elemhide",
  "ghide",
  "genericblock",
  "generichide",
  "header",
  "ipaddress",
  "jsonprune",
  "method",
  "permissions",
  "popunder",
  "popup",
  "queryprune",
  "redirect",
  "redirect-rule",
  "removeparam",
  "replace",
  "rewrite",
  "shide",
  "specifichide",
  "uritransform",
  "urlskip",
  "webrtc",
  "xmlprune"
]);

// Two-label public suffixes recognised by registrableDomain() (a small hand-picked
// subset, not the full Public Suffix List).
const MULTI_PART_PUBLIC_SUFFIXES = new Set([
  "ac.uk",
  "co.jp",
  "co.nz",
  "co.uk",
  "com.au",
  "com.br",
  "com.mx",
  "com.tr",
  "com.tw",
  "com.cn",
  "net.au",
  "net.nz",
  "org.au",
  "org.nz",
  "org.uk"
]);

// Load filters only AFTER the parser tables above are initialised. Function declarations
// are hoisted, but these `const` tables are not: awaiting the load before them made the
// first parsed rule throw a temporal-dead-zone ReferenceError inside loadPrivacyFilters().
await loadPrivacyFilters();
/**
 * Creates a fresh, empty rule container.
 *
 * @returns {object} Rule lists (all empty), an empty `badFilterKeys` Set, an empty
 *   `sourceFiles` list, and the four lookup indexes set to null until
 *   finalizeFilterRules() builds them.
 */
function emptyFilterRules() {
  const ruleLists = {
    blockRules: [],
    importantBlockRules: [],
    allowRules: [],
    importantAllowRules: [],
    cosmeticRules: [],
    cosmeticExceptionRules: []
  };
  const bookkeeping = {
    badFilterKeys: new Set(),
    sourceFiles: []
  };
  const lookupIndexes = {
    blockRuleIndex: null,
    importantBlockRuleIndex: null,
    allowRuleIndex: null,
    importantAllowRuleIndex: null
  };
  return { ...ruleLists, ...bookkeeping, ...lookupIndexes };
}
/**
 * Merges several parsed rule sets into one finalized rule set.
 *
 * Entries are appended one at a time rather than with `push(...list)`: spreading a very
 * large list (full community filter lists hold tens of thousands of rules) into call
 * arguments can exceed the engine's argument limit and throw a RangeError.
 *
 * `$badfilter` keys from every set are combined; finalizeFilterRules() then removes the
 * disabled network rules across all lists and builds the lookup indexes.
 *
 * @param {object[]} filterSets - Rule sets as returned by parseFilterRules().
 * @returns {object} The merged, finalized rule set.
 */
function mergeFilterRules(filterSets) {
  const merged = emptyFilterRules();
  const listNames = [
    "blockRules",
    "importantBlockRules",
    "allowRules",
    "importantAllowRules",
    "cosmeticRules",
    "cosmeticExceptionRules",
    "sourceFiles"
  ];
  for (const set of filterSets) {
    for (const listName of listNames) {
      const target = merged[listName];
      for (const entry of set[listName]) {
        target.push(entry);
      }
    }
    for (const key of set.badFilterKeys) {
      merged.badFilterKeys.add(key);
    }
  }
  return finalizeFilterRules(merged);
}
/**
 * Parses the text of an Adblock-style filter list into a finalized rule set.
 *
 * Line handling:
 *  - blank lines, `!` comments and `[` header lines are ignored;
 *  - everything between `!#if` and its matching `!#endif` is ignored (conditions are not
 *    evaluated);
 *  - cosmetic lines become cosmetic rules or cosmetic exceptions; unsupported cosmetic
 *    variants are dropped;
 *  - remaining lines are network rules (`@@` prefix = exception), routed by `$important`;
 *    `$badfilter` lines only record the key of the rule they disable.
 *
 * @param {string} content - Filter list text; null/undefined is treated as empty.
 * @param {{ source?: string }} [options] - `source` is recorded on every rule and in
 *   `sourceFiles`.
 * @returns {object} Rule set with lookup indexes built.
 */
export function parseFilterRules(content, options = {}) {
  const rules = emptyFilterRules();
  if (options.source) {
    rules.sourceFiles.push(options.source);
  }
  let preprocessorDepth = 0;
  for (const rawLine of String(content ?? "").split("\n")) {
    const line = rawLine.trim(); // also strips the "\r" of CRLF files
    if (!line) continue;
    if (line.startsWith("!#if")) {
      preprocessorDepth += 1;
      continue;
    }
    if (line.startsWith("!#endif")) {
      preprocessorDepth = Math.max(0, preprocessorDepth - 1);
      continue;
    }
    if (preprocessorDepth > 0 || line.startsWith("!") || line.startsWith("[")) {
      continue;
    }

    // null = not a cosmetic line, false = cosmetic but unsupported, object = parsed rule.
    const cosmetic = parseCosmeticFilterLine(line, options.source);
    if (cosmetic !== null) {
      if (cosmetic && cosmetic.kind === "cosmeticException") {
        rules.cosmeticExceptionRules.push(cosmetic);
      } else if (cosmetic && cosmetic.kind === "cosmetic") {
        rules.cosmeticRules.push(cosmetic);
      }
      continue;
    }

    const isException = line.startsWith("@@");
    const rule = parseNetworkRule(isException ? line.slice(2) : line, {
      exception: isException,
      source: options.source
    });
    if (!rule) continue;
    if (rule.badfilter) {
      rules.badFilterKeys.add(rule.key);
    } else if (isException) {
      (rule.important ? rules.importantAllowRules : rules.allowRules).push(rule);
    } else {
      (rule.important ? rules.importantBlockRules : rules.blockRules).push(rule);
    }
  }
  return finalizeFilterRules(rules);
}
2026-05-15 09:12:28 -07:00
2026-05-16 22:07:39 -07:00
/**
 * Completes a rule set in place: removes network rules disabled by a `$badfilter` key and
 * (re)builds the lookup index of each network rule list.
 *
 * @param {object} rules - Rule set to finalize (mutated).
 * @returns {object} The same `rules` object.
 */
function finalizeFilterRules(rules) {
  const disabledKeys = rules.badFilterKeys;
  const listsAndIndexes = [
    ["blockRules", "blockRuleIndex"],
    ["importantBlockRules", "importantBlockRuleIndex"],
    ["allowRules", "allowRuleIndex"],
    ["importantAllowRules", "importantAllowRuleIndex"]
  ];
  for (const [listName, indexName] of listsAndIndexes) {
    if (disabledKeys.size > 0) {
      rules[listName] = rules[listName].filter((rule) => !disabledKeys.has(rule.key));
    }
    rules[indexName] = buildNetworkRuleIndex(rules[listName]);
  }
  return rules;
}
2026-05-15 09:12:28 -07:00
2026-05-16 22:07:39 -07:00
/**
 * Groups network rules so a request only has to be tested against plausible candidates.
 *
 * @param {object[]} rules - Compiled network rules.
 * @returns {{ byDomain: Map<string, object[]>, wildcardDomainRules: object[], otherRules: object[] }}
 *   `byDomain` holds "domain" rules keyed by their exact domain; `wildcardDomainRules`
 *   holds "domain" rules whose domain contains `*`; `otherRules` holds every other kind.
 */
function buildNetworkRuleIndex(rules) {
  const byDomain = new Map();
  const wildcardDomainRules = [];
  const otherRules = [];
  for (const rule of rules) {
    if (rule.kind !== "domain") {
      otherRules.push(rule);
    } else if (rule.domain.includes("*")) {
      wildcardDomainRules.push(rule);
    } else {
      let bucket = byDomain.get(rule.domain);
      if (!bucket) {
        bucket = [];
        byDomain.set(rule.domain, bucket);
      }
      bucket.push(rule);
    }
  }
  return { byDomain, wildcardDomainRules, otherRules };
}
2026-05-15 09:12:28 -07:00
2026-05-16 22:07:39 -07:00
/**
 * Parses one filter line as a cosmetic rule.
 *
 * @param {string} line - Trimmed filter line.
 * @param {string} [source] - Identifier of the originating list, copied onto the rule.
 * @returns {object|false|null}
 *   - `null`: the line contains no cosmetic separator (not a cosmetic rule);
 *   - `false`: a cosmetic line that is empty or uses an unsupported variant;
 *   - `{ kind: "cosmeticException", domains, selector, source }` for exceptions;
 *   - `{ kind: "cosmetic", domains, selector, css, source }` for hide/style rules.
 */
function parseCosmeticFilterLine(line, source) {
  const separator = findCosmeticSeparator(line);
  if (!separator) return null;

  const domains = line.slice(0, separator.index);
  const body = line.slice(separator.index + separator.token.length).trim();
  if (!body) return false;

  const selector = cosmeticSelectorKey(body);
  if (separator.kind === "cosmeticException") {
    return { kind: "cosmeticException", domains, selector, source };
  }

  // Only style rules ("#$#") and plain cosmetic rules ("##") can be turned into CSS.
  let css = null;
  if (separator.kind === "style") {
    css = adguardStyleRuleToCss(body);
  } else if (separator.kind === "cosmetic" && !separator.extended) {
    css = cosmeticSelectorToCss(body);
  }
  return css ? { kind: "cosmetic", domains, selector, css, source } : false;
}
2026-05-15 09:12:28 -07:00
2026-05-16 22:07:39 -07:00
/**
 * Finds the cosmetic separator that starts earliest in `line`; when several start at the
 * same position the longest token wins (so "#@#" beats "##"-style partial overlaps).
 *
 * @param {string} line - Filter line.
 * @returns {object|null} A copy of the COSMETIC_SEPARATORS entry plus its `index` in the
 *   line, or null when the line contains no separator.
 */
function findCosmeticSeparator(line) {
  let best = null;
  for (const separator of COSMETIC_SEPARATORS) {
    const index = line.indexOf(separator.token);
    if (index < 0) continue;
    const startsEarlier = !best || index < best.index;
    const longerAtSameSpot =
      Boolean(best) && index === best.index && separator.token.length > best.token.length;
    if (startsEarlier || longerAtSameSpot) {
      best = { ...separator, index };
    }
  }
  return best;
}
2026-05-15 09:12:28 -07:00
2026-05-16 22:07:39 -07:00
/**
 * Normalizes a selector for comparison: trims it and collapses each whitespace run to a
 * single space. Cosmetic rules and their exceptions are matched on this key.
 *
 * @param {string} selector
 * @returns {string}
 */
function cosmeticSelectorKey(selector) {
  return selector.trim().replace(/\s+/g, " ");
}
2026-05-16 22:07:39 -07:00
/**
 * Parses one network filter line (without any leading "@@") into a compiled rule.
 *
 * @param {string} line - Pattern optionally followed by `$options`.
 * @param {{ exception?: boolean, source?: string }} [options] - Whether the line was an
 *   exception rule, and the identifier of the originating list.
 * @returns {object|null} `{ badfilter: true, key }` for `$badfilter` lines, a compiled
 *   rule object otherwise, or null when the rule has no pattern, carries an unsupported
 *   option, or its pattern cannot be compiled.
 */
function parseNetworkRule(line, options = {}) {
  const { pattern, options: rawOptions } = splitNetworkOptions(line);
  const parsed = parseNetworkOptions(rawOptions);
  const key = networkRuleKey(pattern, parsed.optionsForKey, options.exception);

  if (parsed.badfilter) {
    return { badfilter: true, key };
  }

  const unusable = !pattern || parsed.skip;
  const compiled = unusable ? null : compileNetworkPattern(pattern, parsed.matchCase);
  if (!compiled) {
    return null;
  }

  const {
    types,
    excludedTypes,
    isThirdParty,
    isFirstParty,
    includeDomains,
    excludeDomains,
    includeTargetDomains,
    excludeTargetDomains,
    important
  } = parsed;
  return {
    ...compiled,
    key,
    types,
    excludedTypes,
    isThirdParty,
    isFirstParty,
    includeDomains,
    excludeDomains,
    includeTargetDomains,
    excludeTargetDomains,
    important,
    source: options.source
  };
}
2026-05-15 09:12:28 -07:00
2026-05-16 22:07:39 -07:00
/**
 * Splits a network filter into its URL pattern and its `$option` list.
 *
 * The text after the LAST "$" is treated as options only when it looks like an option
 * list; otherwise the "$" is taken to be part of the pattern (e.g. a regex end anchor).
 * A "$" at position 0 is likewise kept as part of the pattern.
 *
 * @param {string} line - Network filter without any leading "@@".
 * @returns {{ pattern: string, options: string[] }}
 */
function splitNetworkOptions(line) {
  const lastDollar = line.lastIndexOf("$");
  const optionText = lastDollar > 0 ? line.slice(lastDollar + 1) : "";
  if (lastDollar <= 0 || !looksLikeFilterOptions(optionText)) {
    return { pattern: line, options: [] };
  }
  return {
    pattern: line.slice(0, lastDollar),
    options: splitFilterOptions(optionText)
  };
}
/**
 * Heuristic: does the text after a "$" look like a filter option list?
 *
 * The first comma-separated item must be an option name — optionally negated with "~" and
 * optionally followed by "=value". Names start with a letter, except the party shorthands
 * "1p" and "3p", which parseNetworkOptions() supports and which must therefore be
 * accepted here too (otherwise rules such as `||example.com^$3p` are not split and get
 * discarded).
 *
 * @param {string} optionText - Text following the last "$" of a filter line.
 * @returns {boolean}
 */
function looksLikeFilterOptions(optionText) {
  if (!optionText || /\s/.test(optionText)) return false;
  const firstOption = optionText.split(",", 1)[0];
  return /^~?(?:[a-z][a-z0-9_-]*|[13]p)(?:=|$)/i.test(firstOption);
}
/**
 * Splits a comma-separated option list into trimmed, non-empty option strings.
 *
 * @param {string} optionText
 * @returns {string[]}
 */
function splitFilterOptions(optionText) {
  const options = [];
  for (const piece of optionText.split(",")) {
    const option = piece.trim();
    if (option) {
      options.push(option);
    }
  }
  return options;
}
/**
 * Interprets the `$option` list of a network filter.
 *
 * @param {string[]} options - Option strings such as "script", "~3p" or "domain=a.com".
 * @returns {object} Parsed flags and lists:
 *   - `types` / `excludedTypes`: canonical resource types (plain / "~"-negated);
 *   - `isThirdParty` / `isFirstParty`: party restriction (a negated option flips it);
 *   - `includeDomains` / `excludeDomains`: from `domain=` / `from=`;
 *   - `includeTargetDomains` / `excludeTargetDomains`: from `to=`;
 *   - `important`, `matchCase`, `badfilter`: presence flags;
 *   - `skip`: true when an unsupported option (listed in SKIP_NETWORK_OPTION_NAMES, or any
 *     unrecognised `name=value`) means the rule must not be applied;
 *   - `optionsForKey`: every option except `badfilter`, used to build the rule key.
 */
function parseNetworkOptions(options) {
  const parsed = {
    types: [],
    excludedTypes: [],
    isThirdParty: false,
    isFirstParty: false,
    includeDomains: [],
    excludeDomains: [],
    includeTargetDomains: [],
    excludeTargetDomains: [],
    important: false,
    matchCase: false,
    badfilter: false,
    skip: false,
    optionsForKey: []
  };

  for (const rawOption of options) {
    const option = rawOption.trim();
    if (!option) continue;

    const negated = option.startsWith("~");
    const optionBody = negated ? option.slice(1) : option;
    const eqIndex = optionBody.indexOf("=");
    const hasValue = eqIndex >= 0;
    const name = (hasValue ? optionBody.slice(0, eqIndex) : optionBody).toLowerCase();
    const value = hasValue ? optionBody.slice(eqIndex + 1) : "";

    // `badfilter` is excluded from the key so the key equals that of the rule it disables.
    if (name === "badfilter") {
      parsed.badfilter = true;
      continue;
    }
    parsed.optionsForKey.push(option);

    switch (name) {
      case "important":
        parsed.important = true;
        continue;
      case "match-case":
        parsed.matchCase = true;
        continue;
      case "third-party":
      case "3p":
      case "strict3p":
        parsed.isThirdParty = !negated;
        parsed.isFirstParty = negated;
        continue;
      case "first-party":
      case "1p":
      case "strict1p":
        parsed.isFirstParty = !negated;
        parsed.isThirdParty = negated;
        continue;
      case "domain":
      case "from": {
        const domains = parseDomainOptionValue(value);
        parsed.includeDomains.push(...domains.include);
        parsed.excludeDomains.push(...domains.exclude);
        continue;
      }
      case "to": {
        const domains = parseDomainOptionValue(value);
        parsed.includeTargetDomains.push(...domains.include);
        parsed.excludeTargetDomains.push(...domains.exclude);
        continue;
      }
      default:
        break;
    }

    const resourceType = RESOURCE_TYPE_ALIASES.get(name);
    if (resourceType) {
      (negated ? parsed.excludedTypes : parsed.types).push(resourceType);
      continue;
    }

    // Unknown flag-style options are ignored; unsupported or valued ones disable the rule.
    if (SKIP_NETWORK_OPTION_NAMES.has(name) || hasValue) {
      parsed.skip = true;
    }
  }
  return parsed;
}
/**
 * Parses the value of a `domain=` / `from=` / `to=` option: a "|"-separated list in which
 * a "~" prefix marks an excluded domain. Entries are trimmed and lowercased; empty
 * entries (including a bare "~") are dropped.
 *
 * @param {string} value - e.g. "a.com|~b.a.com".
 * @returns {{ include: string[], exclude: string[] }}
 */
function parseDomainOptionValue(value) {
  const include = [];
  const exclude = [];
  for (const rawDomain of (value || "").split("|")) {
    const entry = rawDomain.trim().toLowerCase();
    const negated = entry.startsWith("~");
    const domain = negated ? entry.slice(1) : entry;
    if (!domain) continue;
    (negated ? exclude : include).push(domain);
  }
  return { include, exclude };
}
/**
 * Builds the canonical text key of a network rule: optional "@@" prefix, the pattern, and
 * "$" plus the comma-joined options when there are any. `$badfilter` rules are matched
 * against other rules through this key.
 *
 * @param {string} pattern
 * @param {string[]} options
 * @param {boolean} [exception]
 * @returns {string}
 */
function networkRuleKey(pattern, options, exception) {
  const prefix = exception ? "@@" : "";
  const suffix = options.length > 0 ? `$${options.join(",")}` : "";
  return `${prefix}${pattern}${suffix}`;
}
/**
 * Compiles the pattern part of a network filter.
 *
 *  - "||…" patterns are delegated to parseDomainAnchoredPattern();
 *  - "/…/" patterns are used as regular expressions verbatim;
 *  - anything else is translated with adblockPatternToRegex().
 *
 * @param {string} pattern
 * @param {boolean} matchCase - When false the resulting regexes are case-insensitive.
 * @returns {object|null} A "domain", "regex" or "pattern" rule core, or null when the
 *   pattern is unsupported or does not form a valid regular expression.
 */
function compileNetworkPattern(pattern, matchCase) {
  const flags = matchCase ? "" : "i";
  if (pattern.startsWith("||")) {
    return parseDomainAnchoredPattern(pattern, flags);
  }

  const isRegexLiteral = pattern.length > 1 && pattern.startsWith("/") && pattern.endsWith("/");
  try {
    return isRegexLiteral
      ? { kind: "regex", regex: new RegExp(pattern.slice(1, -1), flags) }
      : { kind: "pattern", regex: new RegExp(adblockPatternToRegex(pattern), flags) };
  } catch {
    // An invalid expression simply means this rule cannot be used.
    return null;
  }
}
/**
 * Parses a domain-anchored pattern ("||domain", "||domain^", "||domain/path",
 * "||domain^/path").
 *
 * @param {string} pattern - Pattern starting with "||".
 * @param {string} flags - RegExp flags for the optional path expression.
 * @returns {{ kind: "domain", domain: string, path: string, pathRegex: RegExp|null }|null}
 *   null when the domain is empty or contains bracket/backslash characters, when
 *   something other than a path follows the domain (e.g. "||a.com^foo"), or when the path
 *   cannot be compiled.
 */
function parseDomainAnchoredPattern(pattern, flags) {
  const domainPath = pattern.slice(2);

  // The domain runs up to the first "/" or "^" (or the end of the pattern).
  const boundary = domainPath.search(/[/^]/);
  const domainEnd = boundary < 0 ? domainPath.length : boundary;
  const domain = domainPath.slice(0, domainEnd).toLowerCase();
  if (!domain || /[\\[\]{}()]/.test(domain)) {
    return null;
  }

  // Named `pathPattern` locally so it does not shadow the imported `path` module.
  const suffix = domainPath.slice(domainEnd);
  let pathPattern = "";
  if (suffix.startsWith("/")) {
    pathPattern = suffix;
  } else if (suffix.startsWith("^/")) {
    pathPattern = suffix.slice(1);
  } else if (suffix && suffix !== "^") {
    return null;
  }

  let pathRegex = null;
  if (pathPattern) {
    try {
      pathRegex = new RegExp(`^${adblockPatternToRegex(pathPattern)}`, flags);
    } catch {
      return null;
    }
  }
  return { kind: "domain", domain, path: pathPattern, pathRegex };
}
2026-05-16 22:07:39 -07:00
/**
 * Accepts the body of a "#$#" style rule as CSS when it is a single line that contains
 * both "{" and "}"; the text is returned unchanged.
 *
 * @param {string} rule
 * @returns {string|null} The rule text, or null when it does not qualify.
 */
function adguardStyleRuleToCss(rule) {
  const hasDeclarationBlock = rule.includes("{") && rule.includes("}");
  const isSingleLine = !/[\r\n]/.test(rule);
  return hasDeclarationBlock && isSingleLine ? rule : null;
}
2026-05-15 09:12:28 -07:00
/**
 * Converts the body of a "##" cosmetic rule into a CSS rule.
 *
 *  - "sel"            -> `sel { display: none !important; }`
 *  - "sel:remove()"   -> same (the element is hidden, not removed)
 *  - "sel:style(css)" -> `sel { css }`
 *
 * @param {string} selector - Rule body.
 * @returns {string|null} CSS text, or null for scriptlets ("+js…"), HTML filters ("^…"),
 *   unsupported procedural selectors, and `:style()` bodies containing braces (which
 *   could close the rule and smuggle in further rules).
 */
function cosmeticSelectorToCss(selector) {
  const trimmed = selector.trim();
  if (!trimmed || trimmed.startsWith("+js") || trimmed.startsWith("^")) {
    return null;
  }

  const removeSuffix = ":remove()";
  if (trimmed.endsWith(removeSuffix)) {
    const baseSelector = trimmed.slice(0, -removeSuffix.length);
    return isSupportedCosmeticSelector(baseSelector)
      ? `${baseSelector} { display: none !important; }`
      : null;
  }

  const styleMatch = trimmed.match(/:style\((.+)\)$/);
  if (styleMatch) {
    const declarations = styleMatch[1];
    if (/[{}]/.test(declarations)) {
      return null;
    }
    const baseSelector = trimmed.slice(0, trimmed.lastIndexOf(":style("));
    return isSupportedCosmeticSelector(baseSelector)
      ? `${baseSelector} { ${declarations} }`
      : null;
  }

  return isSupportedCosmeticSelector(trimmed)
    ? `${trimmed} { display: none !important; }`
    : null;
}
/**
 * Reports whether a selector can be emitted as plain CSS: it must be non-empty, contain
 * no braces or line breaks, and use none of the procedural pseudo-classes listed below
 * (compared case-insensitively).
 *
 * @param {string} selector
 * @returns {boolean}
 */
function isSupportedCosmeticSelector(selector) {
  if (!selector) return false;
  if (/[\r\n{}]/.test(selector)) return false;

  const proceduralTokens = [
    ":-abp-contains(",
    ":-abp-has(",
    ":-abp-properties(",
    ":contains(",
    ":has-text(",
    ":matches-attr(",
    ":matches-css",
    ":matches-media",
    ":matches-path",
    ":min-text-length(",
    ":others()",
    ":remove()",
    ":upward(",
    ":watch-attr(",
    ":xpath("
  ];
  const haystack = selector.toLowerCase();
  return proceduralTokens.every((token) => !haystack.includes(token));
}
/**
 * Collects the CSS of every cosmetic rule that applies to `hostname`.
 *
 * A rule is suppressed when a cosmetic exception that also applies to the hostname has
 * the same normalized selector.
 *
 * @param {object} rules - Rule set (`cosmeticRules`, `cosmeticExceptionRules`; either may
 *   be missing).
 * @param {string} hostname - Page hostname; compared case-insensitively.
 * @returns {string[]} CSS rule texts in list order.
 */
export function getCosmeticCssForHostname(rules, hostname) {
  const normalizedHostname = String(hostname || "").toLowerCase();

  const suppressedSelectors = new Set();
  for (const exception of rules.cosmeticExceptionRules || []) {
    if (matchesCosmeticDomains(exception.domains, normalizedHostname)) {
      suppressedSelectors.add(exception.selector);
    }
  }

  const cssRules = [];
  for (const rule of rules.cosmeticRules || []) {
    const applies =
      matchesCosmeticDomains(rule.domains, normalizedHostname) &&
      !suppressedSelectors.has(rule.selector);
    if (applies) {
      cssRules.push(rule.css);
    }
  }
  return cssRules;
}
/**
 * Tests the domain list of a cosmetic rule against a hostname.
 *
 * The list is comma-separated; "~domain" entries exclude. An empty list or "*" matches
 * every host. Exclusions always win; when only exclusions are given, every other host
 * matches.
 *
 * @param {string} domainSpec - Text before the cosmetic separator.
 * @param {string} hostname
 * @returns {boolean}
 */
function matchesCosmeticDomains(domainSpec, hostname) {
  if (!domainSpec || domainSpec === "*") return true;

  const positives = [];
  const negatives = [];
  for (const rawDomain of domainSpec.split(",")) {
    const domain = rawDomain.trim().toLowerCase();
    if (!domain) continue;
    if (domain.startsWith("~")) {
      negatives.push(domain.slice(1));
    } else {
      positives.push(domain);
    }
  }

  if (negatives.some((domain) => domainMatchesPattern(hostname, domain))) {
    return false;
  }
  return positives.length === 0 || positives.some((domain) => domainMatchesPattern(hostname, domain));
}
/**
 * Tests one compiled network rule against a request.
 *
 * Checks, in order: source-domain include/exclude lists, target-domain include/exclude
 * lists, excluded and required resource types, first/third-party restriction, and finally
 * the rule's own pattern ("domain" rules compare the hostname and, if present, the path
 * regex against pathname + query; "regex"/"pattern" rules test the full URL).
 *
 * @param {string} url - Full request URL.
 * @param {URL} urlObj - Parsed form of `url`.
 * @param {string} hostname - Request hostname.
 * @param {string} resourceType - Resource type reported for the request.
 * @param {string} sourceHostname - Hostname of the page issuing the request.
 * @param {object} rule - Compiled rule from parseNetworkRule().
 * @returns {boolean}
 */
function matchesNetworkRule(url, urlObj, hostname, resourceType, sourceHostname, rule) {
  const normalizedSourceHostname = String(sourceHostname || "").toLowerCase();
  const anyDomainMatches = (host, domains) =>
    domains.some((domain) => domainMatchesPattern(host, domain));
  const anyTypeMatches = (types) =>
    types.some((type) => resourceTypeMatches(type, resourceType));

  if (rule.includeDomains.length > 0 && !anyDomainMatches(normalizedSourceHostname, rule.includeDomains)) {
    return false;
  }
  if (anyDomainMatches(normalizedSourceHostname, rule.excludeDomains)) {
    return false;
  }
  if (rule.includeTargetDomains.length > 0 && !anyDomainMatches(hostname, rule.includeTargetDomains)) {
    return false;
  }
  if (anyDomainMatches(hostname, rule.excludeTargetDomains)) {
    return false;
  }
  if (anyTypeMatches(rule.excludedTypes)) {
    return false;
  }
  if (rule.types.length > 0 && !anyTypeMatches(rule.types)) {
    return false;
  }

  const isThirdParty = isThirdPartyRequest(hostname, normalizedSourceHostname);
  if ((rule.isThirdParty && !isThirdParty) || (rule.isFirstParty && isThirdParty)) {
    return false;
  }

  if (rule.kind === "domain") {
    if (!domainPatternMatches(hostname, rule.domain)) return false;
    return !rule.pathRegex || rule.pathRegex.test(urlObj.pathname + urlObj.search);
  }
  if (rule.kind === "regex" || rule.kind === "pattern") {
    return rule.regex.test(url);
  }
  return false;
}
2026-05-15 09:25:19 -07:00
/**
 * Tests whether a canonical filter resource type covers the resource type reported for a
 * request.
 *
 * Implemented as a `switch` so that no lookup table is allocated per call and unknown or
 * prototype-named filter types (e.g. "constructor") yield false instead of throwing.
 *
 * @param {string} filterType - Canonical type from RESOURCE_TYPE_ALIASES.
 * @param {string} resourceType - Type reported for the request.
 * @returns {boolean}
 */
function resourceTypeMatches(filterType, resourceType) {
  switch (filterType) {
    case "document":
    case "font":
    case "image":
    case "media":
    case "object":
    case "other":
    case "script":
    case "stylesheet":
    case "websocket":
      return resourceType === filterType;
    case "inline-script":
      return resourceType === "script";
    case "ping":
      return resourceType === "ping" || resourceType === "fetch";
    case "subdocument":
      return resourceType === "document" || resourceType === "subdocument";
    case "xmlhttprequest":
      return resourceType === "fetch" || resourceType === "xhr" || resourceType === "xmlhttprequest";
    default:
      return false;
  }
}
/**
 * Tests a request hostname against the domain of a "||domain" rule.
 *
 * A trailing "^" on the pattern is ignored. Without wildcards the hostname must equal the
 * domain or be a subdomain of it; wildcard domains are delegated to
 * domainMatchesPattern(). The comparison is case-insensitive on both sides.
 *
 * @param {string} hostname
 * @param {string} pattern
 * @returns {boolean}
 */
function domainPatternMatches(hostname, pattern) {
  const normalized = pattern.replace(/\^$/, "").toLowerCase();
  if (!normalized) return false;
  if (normalized.includes("*")) {
    return domainMatchesPattern(hostname, normalized);
  }
  const host = String(hostname || "").toLowerCase();
  return host === normalized || host.endsWith(`.${normalized}`);
}
/**
 * Tests a hostname against a domain pattern from a rule's domain list.
 *
 * Both sides are lowercased and a trailing "^" on the pattern is dropped. An empty
 * pattern never matches; "*" always matches. A plain domain matches itself and its
 * subdomains. In a wildcard pattern each "*" matches any run of characters; the pattern
 * must reach the end of the hostname and, unless it begins with "*", must start at the
 * beginning of the hostname or right after a ".".
 *
 * @param {string} hostname
 * @param {string} pattern
 * @returns {boolean}
 */
function domainMatchesPattern(hostname, pattern) {
  const host = String(hostname || "").toLowerCase();
  const wanted = String(pattern || "").replace(/\^$/, "").toLowerCase();

  if (wanted === "") return false;
  if (wanted === "*") return true;

  if (wanted.includes("*")) {
    const literalParts = [];
    for (const part of wanted.split("*")) {
      literalParts.push(part.replace(/[|\\{}()[\]^$+?.]/g, "\\$&"));
    }
    const leftAnchor = wanted.startsWith("*") ? "^" : "(?:^|\\.)";
    const matcher = new RegExp(`${leftAnchor}${literalParts.join(".*")}$`, "i");
    return matcher.test(host);
  }

  return host === wanted || host.endsWith("." + wanted);
}
/**
 * Decides whether a request is third-party relative to the page that issued it.
 *
 * When both hostnames are known their registrable domains are compared. If a registrable
 * domain cannot be determined, the request is first-party only when it targets the page's
 * host or one of its subdomains. When either hostname is missing, the two values are
 * simply compared for equality.
 *
 * @param {string} hostname - Request hostname.
 * @param {string} sourceHostname - Page hostname.
 * @returns {boolean}
 */
function isThirdPartyRequest(hostname, sourceHostname) {
  const bothKnown = Boolean(hostname) && Boolean(sourceHostname);
  if (!bothKnown) {
    return hostname !== sourceHostname;
  }

  const requestSite = registrableDomain(hostname);
  const sourceSite = registrableDomain(sourceHostname);
  if (requestSite && sourceSite) {
    return requestSite !== sourceSite;
  }

  const isSameHost = hostname === sourceHostname;
  const isSubdomain = hostname.endsWith("." + sourceHostname);
  return !isSameHost && !isSubdomain;
}
/**
 * Approximates the registrable domain ("site") of a hostname.
 *
 * The hostname is lowercased and a trailing dot removed. Empty values, dotted IPv4
 * addresses, "localhost" and names of at most two labels are returned as-is. Otherwise
 * the last two labels are returned, or the last three when the last two form one of the
 * MULTI_PART_PUBLIC_SUFFIXES (e.g. "co.uk").
 *
 * @param {string} hostname
 * @returns {string}
 */
function registrableDomain(hostname) {
  const normalized = String(hostname || "").toLowerCase().replace(/\.$/, "");
  const isIpv4 = /^\d{1,3}(?:\.\d{1,3}){3}$/.test(normalized);
  if (!normalized || isIpv4 || normalized === "localhost") {
    return normalized;
  }

  const labels = normalized.split(".").filter(Boolean);
  if (labels.length <= 2) {
    return normalized;
  }

  // From here on there are at least three labels.
  const lastTwo = labels.slice(-2).join(".");
  return MULTI_PART_PUBLIC_SUFFIXES.has(lastTwo) ? labels.slice(-3).join(".") : lastTwo;
}
/**
 * Translates an Adblock URL pattern into regular-expression source text.
 *
 *  - a leading "|" anchors at the start, a trailing "|" at the end;
 *  - "*" matches any run of characters;
 *  - "^" matches a separator: any character other than a letter, digit or one of
 *    "_ . % -", or the end of the input;
 *  - every other character is matched literally.
 *
 * @param {string} pattern
 * @returns {string} Source for `new RegExp(...)`.
 */
function adblockPatternToRegex(pattern) {
  const anchoredStart = pattern.startsWith("|");
  const withoutStart = anchoredStart ? pattern.slice(1) : pattern;
  const anchoredEnd = withoutStart.endsWith("|");
  const core = anchoredEnd ? withoutStart.slice(0, -1) : withoutStart;

  const pieces = Array.from(core, (ch) => {
    switch (ch) {
      case "*":
        return ".*";
      case "^":
        return "(?:[^A-Za-z0-9_.%-]|$)";
      default:
        return ch.replace(/[|\\{}()[\]^$+?.]/g, "\\$&");
    }
  });

  const head = anchoredStart ? "^" : "";
  const tail = anchoredEnd ? "$" : "";
  return `${head}${pieces.join("")}${tail}`;
}
2026-05-16 22:07:39 -07:00
/**
 * Decides whether a request should be blocked under the given rule set.
 *
 * Rule tiers are consulted in priority order and the first matching rule decides:
 * important exceptions (allow), important blocks (block), exceptions (allow), blocks
 * (block). Without any match the request is allowed.
 *
 * @param {object} rules - Finalized rule set.
 * @param {string} url - Full request URL; non-strings and unparsable URLs are never blocked.
 * @param {string} resourceType - Resource type reported for the request.
 * @param {string} sourceHostname - Hostname of the page issuing the request.
 * @returns {boolean} true to block.
 */
export function shouldBlockRequestWithRules(rules, url, resourceType, sourceHostname) {
  if (typeof url !== "string") {
    return false;
  }
  // Never block the source itself. NOTE(review): with a bare hostname as `sourceHostname`
  // an absolute URL cannot satisfy this check; kept for callers that may pass an origin.
  if (url === sourceHostname || (sourceHostname && url.startsWith(sourceHostname + "/"))) {
    return false;
  }

  let urlObj;
  try {
    urlObj = new URL(url);
  } catch {
    return false;
  }
  const hostname = urlObj.hostname;

  // [rule list, lookup index, verdict when a rule of the tier matches]
  const tiers = [
    [rules.importantAllowRules, rules.importantAllowRuleIndex, false],
    [rules.importantBlockRules, rules.importantBlockRuleIndex, true],
    [rules.allowRules, rules.allowRuleIndex, false],
    [rules.blockRules, rules.blockRuleIndex, true]
  ];
  for (const [ruleList, ruleIndex, blocks] of tiers) {
    for (const rule of networkRuleCandidates(ruleList, ruleIndex, hostname)) {
      if (matchesNetworkRule(url, urlObj, hostname, resourceType, sourceHostname, rule)) {
        return blocks;
      }
    }
  }
  return false;
}
2026-05-16 22:07:39 -07:00
/**
 * Applies the module's currently loaded filter rules to a request.
 *
 * @param {string} url - Full request URL.
 * @param {string} resourceType - Resource type reported for the request.
 * @param {string} sourceHostname - Hostname of the page issuing the request.
 * @returns {boolean} true when the request should be blocked.
 */
function shouldBlockRequest(url, resourceType, sourceHostname) {
  const activeRules = filterRules;
  return shouldBlockRequestWithRules(activeRules, url, resourceType, sourceHostname);
}
/**
 * Yields the rules that could match a request to `hostname`.
 *
 * With an index: the domain buckets of the hostname and each of its parent suffixes
 * (most specific first), then wildcard-domain rules, then all non-domain rules.
 * Without an index: every rule of `rules`.
 *
 * @param {object[]} [rules] - Full rule list, used only when no index is available.
 * @param {object|null} index - Index from buildNetworkRuleIndex(), if any.
 * @param {string} hostname - Request hostname.
 * @yields {object} Candidate rule.
 */
function* networkRuleCandidates(rules = [], index, hostname) {
  if (index) {
    for (const suffix of hostnameSuffixes(hostname)) {
      yield* index.byDomain.get(suffix) || [];
    }
    yield* index.wildcardDomainRules;
    yield* index.otherRules;
  } else {
    yield* rules;
  }
}
/**
 * Lists a hostname followed by each of its parent domains, most specific first
 * ("a.b.com" -> "a.b.com", "b.com", "com"). Input is lowercased and empty labels are
 * ignored; an empty hostname yields [""].
 *
 * @param {string} hostname
 * @returns {string[]}
 */
function hostnameSuffixes(hostname) {
  const host = String(hostname || "").toLowerCase();
  if (host === "") {
    return [""];
  }
  const labels = host.split(".").filter((label) => label !== "");
  return labels.map((_, start) => labels.slice(start).join("."));
}
2026-05-15 09:12:28 -07:00
// --- Userscript metadata parsing -------------------------------------------
/**
 * Extracts the `@match` and `@exclude` patterns from a userscript's
 * `// ==UserScript== … // ==/UserScript==` header block.
 *
 * @param {string} content - Full userscript source.
 * @returns {{ matches: string[], excludes: string[] }} Patterns in header order; both
 *   lists are empty when the script has no header block.
 */
function parseUserScriptMetadata(content) {
  const metadata = { matches: [], excludes: [] };
  const header = content.match(/\/\/\s*==UserScript==([\s\S]*?)\/\/\s*==\/UserScript==/);
  if (!header) {
    return metadata;
  }

  header[1].split("\n").forEach((line) => {
    // A line contributes to at most one list; `@match` takes precedence.
    const include = line.match(/@match\s+(.+)/);
    if (include) {
      metadata.matches.push(include[1].trim());
      return;
    }
    const exclude = line.match(/@exclude\s+(.+)/);
    if (exclude) {
      metadata.excludes.push(exclude[1].trim());
    }
  });
  return metadata;
}
/**
 * Tests a URL against a userscript match pattern of the form `scheme://host/path`
 * (for example `*://*.example.com/*`).
 *
 * The scheme must equal the URL's scheme or be "*"; host and path are checked with
 * matchHost() and matchPath(). A pattern without a path is treated as "/". Only the URL's
 * pathname takes part in the path comparison.
 *
 * @param {string} url
 * @param {string} pattern
 * @returns {boolean} false for non-matching or malformed input; never throws.
 */
function urlMatchesPattern(url, pattern) {
  try {
    const target = new URL(url);
    const scheme = target.protocol.slice(0, -1); // drop the trailing ":"

    const schemeEnd = pattern.indexOf("://");
    if (schemeEnd < 0) {
      return false;
    }
    const patternScheme = pattern.slice(0, schemeEnd);
    if (patternScheme !== "*" && patternScheme !== scheme) {
      return false;
    }

    const hostAndPath = pattern.slice(schemeEnd + 3);
    const firstSlash = hostAndPath.indexOf("/");
    const hasPath = firstSlash >= 0;
    const patternHost = hasPath ? hostAndPath.slice(0, firstSlash) : hostAndPath;
    const patternPath = hasPath ? hostAndPath.slice(firstSlash) : "/";

    return matchHost(target.hostname, patternHost) && matchPath(target.pathname, patternPath);
  } catch {
    return false;
  }
}
/**
 * Tests a hostname against the host part of a userscript match pattern:
 * "*" matches any host, "*.domain" matches the domain and its subdomains, anything else
 * must be equal.
 *
 * @param {string} hostname
 * @param {string} pattern
 * @returns {boolean}
 */
function matchHost(hostname, pattern) {
  if (pattern === "*") {
    return true;
  }
  if (!pattern.startsWith("*.")) {
    return hostname === pattern;
  }
  const baseDomain = pattern.slice(2);
  return hostname === baseDomain || hostname.endsWith(`.${baseDomain}`);
}
/**
 * Tests a URL pathname against the path part of a userscript match pattern.
 *
 * "*" matches any run of characters and "?" any single character; every other character
 * — including regex metacharacters such as "+", "(", ")" and "[" — is matched literally.
 * The whole pathname must match; comparison is case-insensitive.
 *
 * @param {string} pathname
 * @param {string} pattern
 * @returns {boolean}
 */
function matchPath(pathname, pattern) {
  if (pattern === "/*") return true;
  const source = pattern
    .replace(/[.+^${}()|[\]\\]/g, "\\$&") // escape everything except the "*" / "?" wildcards
    .replace(/\*/g, ".*")
    .replace(/\?/g, ".");
  return new RegExp(`^${source}$`, "i").test(pathname);
}
/**
 * Decides whether a userscript applies to a URL: at least one `matches` pattern must
 * match and no `excludes` pattern may match.
 *
 * @param {string} url
 * @param {{ matches: string[], excludes: string[] }} meta - From parseUserScriptMetadata().
 * @returns {boolean}
 */
function shouldInjectUserScript(url, meta) {
  const isIncluded = meta.matches.some((pattern) => urlMatchesPattern(url, pattern));
  if (!isIncluded) {
    return false;
  }
  const isExcluded = meta.excludes.some((pattern) => urlMatchesPattern(url, pattern));
  return !isExcluded;
}
// --- Browser helpers -------------------------------------------------------
/**
 * Loads the `playwright` package on demand through the module-level `require`.
 *
 * @returns {object} The Playwright module.
 * @throws {Error} With installation instructions when the package cannot be loaded; the
 *   original failure is attached as `cause` so its stack trace is not lost.
 */
function loadPlaywright() {
  try {
    return require("playwright");
  } catch (error) {
    throw new Error(
      `Playwright is required. Run "npm install" and "npm run install-browsers". Original error: ${error.message}`,
      { cause: error }
    );
  }
}
// Manual stealth evasions, kept as source text to be evaluated inside the page.
// The script (1) makes navigator.webdriver read as undefined, (2) ensures
// window.chrome.runtime exists, and (3) wraps navigator.permissions.query so that
// "notifications" reports Notification.permission.
// The original query function is invoked with `permissions` as its receiver: calling a
// detached native method throws "Illegal invocation".
// NOTE(review): while the document is still loading the patches are deferred until
// DOMContentLoaded, so page scripts that run earlier can observe unpatched values —
// confirm that this timing is intended.
const STEALTH_INIT_SCRIPT = `
(() => {
  const patchNavigator = () => {
    try {
      // Override webdriver getter without using delete (can crash renderer)
      if (navigator.webdriver !== undefined) {
        Object.defineProperty(navigator, 'webdriver', {
          get: () => undefined,
          configurable: true,
          enumerable: true
        });
      }
    } catch (e) {}
    try {
      if (!window.chrome) {
        window.chrome = { runtime: {} };
      } else if (!window.chrome.runtime) {
        window.chrome.runtime = {};
      }
    } catch (e) {}
    try {
      const permissions = window.navigator.permissions;
      const originalQuery = permissions?.query;
      if (originalQuery) {
        permissions.query = (parameters) => (
          parameters.name === 'notifications'
            ? Promise.resolve({ state: Notification.permission })
            : originalQuery.call(permissions, parameters)
        );
      }
    } catch (e) {}
  };
  if (document.readyState === 'loading') {
    document.addEventListener('DOMContentLoaded', patchNavigator);
  } else {
    patchNavigator();
  }
})();
`;
/**
 * Assemble the Chromium command-line flags used when launching the browser.
 *
 * @param {boolean} headless - When truthy, "--headless=new" is appended.
 * @returns {string[]} A fresh array of flags on every call.
 */
function buildLaunchArgs(headless) {
  const baseFlags = [
    "--disable-blink-features=AutomationControlled",
    "--disable-web-security",
    "--disable-features=IsolateOrigins,site-per-process",
    "--disable-site-isolation-trials",
    "--disable-infobars",
    "--no-sandbox",
    "--disable-setuid-sandbox",
    "--disable-dev-shm-usage",
    "--disable-accelerated-2d-canvas",
    "--disable-gpu",
    "--window-size=1366,768"
  ];
  return headless ? [...baseFlags, "--headless=new"] : baseFlags;
}
/**
 * List the launcher default flags that should be dropped at launch.
 *
 * @returns {string[]} A fresh array containing "--enable-automation".
 */
function buildIgnoreDefaultArgs() {
  const droppedDefaults = ["--enable-automation"];
  return droppedDefaults;
}
// --- Page helpers ----------------------------------------------------------
/**
 * Install a catch-all route handler that aborts requests rejected by the
 * privacy filter rules.
 *
 * Nothing is installed when the filters are unavailable or contain no block
 * rules. Navigation requests of the main frame are always let through.
 * Any failure while classifying a request fails open (the request continues).
 *
 * @param {object} page - Playwright page to attach the route to.
 * @param {string} sourceHostname - Hostname of the page being archived, passed
 *   through to shouldBlockRequest.
 * @returns {Promise<void>}
 */
async function setupRequestBlocking(page, sourceHostname) {
  if (
    !privacyFiltersAvailable ||
    (filterRules.blockRules.length === 0 && filterRules.importantBlockRules.length === 0)
  ) {
    return;
  }
  await page.route("**/*", async (route) => {
    let block = false;
    try {
      const request = route.request();
      const isMainNavigation =
        request.isNavigationRequest() && request.frame() === page.mainFrame();
      block =
        !isMainNavigation &&
        shouldBlockRequest(request.url(), request.resourceType(), sourceHostname);
    } catch {
      // Classification failed; let the request through rather than break the page.
      block = false;
    }
    try {
      // Await the route action so a rejection (for example the page closing
      // while the request is in flight) is handled here instead of surfacing
      // as an unhandled promise rejection.
      await (block ? route.abort("blockedbyclient") : route.continue());
    } catch {
      // The route can no longer be fulfilled; nothing left to do.
    }
  });
}
/**
 * Add a <style> tag that hides common annoyances plus any hostname-specific
 * cosmetic filter rules.
 *
 * COMMON_ANNOYANCE_CSS is always included; rules from
 * getCosmeticCssForHostname are appended only when the privacy filters are
 * available and contain cosmetic rules. A failure to add the style tag is
 * ignored.
 *
 * @param {object} page - Playwright page to style.
 * @param {string} hostname - Hostname used to select cosmetic rules.
 * @returns {Promise<void>}
 */
async function injectCosmeticFilters(page, hostname) {
  const lines = [COMMON_ANNOYANCE_CSS];
  if (privacyFiltersAvailable && filterRules.cosmeticRules.length > 0) {
    // Append one by one: spreading a very large rule list into push() can
    // exceed the engine's argument limit.
    for (const rule of getCosmeticCssForHostname(filterRules, hostname)) {
      lines.push(rule);
    }
  }
  try {
    await page.addStyleTag({ content: lines.join("\n") });
  } catch {
    // Ignore cosmetic injection failures.
  }
}
2026-05-16 22:32:30 -07:00
/**
 * Strip well-known consent / cookie / ad-block overlays from the live page
 * and undo the scroll locking they tend to leave on <html> and <body>.
 *
 * Best effort: any failure of the in-page cleanup is ignored.
 *
 * @param {object} page - Playwright page to clean up.
 * @returns {Promise<void>}
 */
async function removeCommonAnnoyances(page) {
  const cleanupArgs = {
    selectors: COMMON_ANNOYANCE_SELECTORS,
    rootClasses: COMMON_ANNOYANCE_ROOT_CLASSES
  };

  // Runs inside the page, so it must not reference anything from this module.
  const scrubDocument = ({ selectors, rootClasses }) => {
    selectors.forEach((selector) => {
      try {
        for (const node of document.querySelectorAll(selector)) {
          node.remove();
        }
      } catch {
        // Ignore selectors unsupported by the current browser.
      }
    });

    for (const root of [document.documentElement, document.body]) {
      if (!root) {
        continue;
      }
      root.classList.remove(...rootClasses);
      root.removeAttribute("data-previous-scroll-y");

      const { style } = root;
      const locksScroll = /hidden|clip/i.test(style.overflow || "");
      const isPinned = /fixed/i.test(style.position || "");
      if (locksScroll) {
        style.removeProperty("overflow");
      }
      if (isPinned) {
        for (const property of ["position", "top", "left", "right"]) {
          style.removeProperty(property);
        }
      }
    }
  };

  try {
    await page.evaluate(scrubDocument, cleanupArgs);
  } catch {
    // Ignore cleanup failures; the archive is still useful.
  }
}
2026-05-15 09:12:28 -07:00
// Script text that defines a minimal `window.GM` object when the page has no
// global `GM`. It is added to the page with page.addScriptTag before the
// userscripts themselves.
//
// Only `GM.xmlHttpRequest(details)` is provided. It is implemented on top of
// fetch() using details.url, details.method (default "GET"), details.headers
// and details.data, and reports back through callbacks:
//   details.onload({ status, statusText, responseText, responseHeaders })
//   details.onerror(err) when the fetch or body read rejects
// responseHeaders is a single string of "name: value" pairs joined by CRLF;
// the doubled backslashes below are needed so the page receives the escape
// sequence "\r\n" rather than a raw line break inside a string literal.
const GM_MOCK = `
if (typeof GM === "undefined") {
window.GM = {
xmlHttpRequest: function(details) {
fetch(details.url, {
method: details.method || "GET",
headers: details.headers || {},
body: details.data || null
})
.then(response => response.text().then(text => ({
status: response.status,
statusText: response.statusText,
responseText: text,
responseHeaders: Array.from(response.headers.entries())
.map(([k, v]) => k + ": " + v).join("\\r\\n")
})))
.then(obj => {
if (details.onload) details.onload(obj);
})
.catch(err => {
if (details.onerror) details.onerror(err);
});
}
};
}
`;
/**
 * Inject the userscripts whose match metadata applies to `sourceUrl`.
 *
 * Order of injection: the GM API mock, then the shared @require content (when
 * present), then each matching userscript. If the mock or the shared content
 * cannot be injected, no userscript is injected at all; a failure of a single
 * userscript only skips that script.
 *
 * @param {object} page - Playwright page to inject into.
 * @param {string} sourceUrl - URL used to select matching userscripts.
 * @returns {Promise<void>}
 */
async function injectPrivacyUserScripts(page, sourceUrl) {
  if (!privacyFiltersAvailable || userScriptData.length === 0) return;
  const matching = userScriptData.filter((us) => shouldInjectUserScript(sourceUrl, us));
  if (matching.length === 0) return;
  // Inject GM API mock first.
  try {
    await page.addScriptTag({ content: GM_MOCK });
    if (userScriptRequireContent) {
      await page.addScriptTag({ content: userScriptRequireContent });
    }
  } catch {
    return;
  }
  // Inject only matching userscripts, one at a time so their order is kept.
  for (const us of matching) {
    try {
      await page.addScriptTag({ content: us.content });
    } catch {
      // Ignore injection failures for individual scripts.
    }
  }
}
// ---------------------------------------------------------------------------
// Archiving
// ---------------------------------------------------------------------------
2026-05-14 08:12:13 -07:00
/**
 * Render a page, inline its assets and write the result as a single HTML file.
 *
 * @param {string} input - URL-like input; normalised with inputToUrl.
 * @param {object} [options] - Also forwarded to renderPage.
 * @param {string} [options.archivePath] - Target directory; defaults to
 *   defaultArchivePath(). Created if missing.
 * @param {string} [options.id] - File name stem; defaults to slugForUrl(sourceUrl).
 * @returns {Promise<{id: string, filePath: string, sourceUrl: string,
 *   archivePath: string, warnings: *, externalAssets: string[]}>}
 *   `warnings` is taken from the asset inliner; `externalAssets` lists the
 *   references in the written HTML that are still external.
 */
export async function archivePage(input, options = {}) {
  const sourceUrl = inputToUrl(input);
  const archivePath = options.archivePath || defaultArchivePath();
  const id = options.id || slugForUrl(sourceUrl);
  const filePath = path.join(archivePath, `${id}.html`);
  await fs.mkdir(archivePath, { recursive: true });

  const renderedHtml = await renderPage(sourceUrl, options);
  const baseUrl = findEffectiveBase(renderedHtml, sourceUrl);
  const sourceHostname = new URL(sourceUrl).hostname;
  const inliner = new AssetInliner({
    userAgent: DEFAULT_USER_AGENT,
    referer: isHttpUrl(sourceUrl) ? sourceUrl : undefined,
    // Apply the same block rules to asset downloads as to live page requests.
    shouldBlockAsset: (assetUrl, resourceType) =>
      shouldBlockRequest(assetUrl, resourceType, sourceHostname)
  });

  const inlined = await inliner.inlineHtml(renderedHtml, baseUrl);
  const finalHtml = addArchiveComment(inlined, sourceUrl);
  await fs.writeFile(filePath, finalHtml, "utf8");
  return {
    id,
    filePath,
    sourceUrl,
    archivePath,
    warnings: inliner.warnings,
    externalAssets: findExternalAssetRefs(finalHtml)
  };
}
2026-05-15 09:12:28 -07:00
/**
 * Load a URL in Chromium, apply the privacy / cleanup steps and return the
 * serialized DOM.
 *
 * The browser runs headless only when no display is detected (neither DISPLAY
 * nor WAYLAND_DISPLAY is set) and `options.headless` is not `false`. The
 * browser is always closed, including when a step throws.
 *
 * @param {string} sourceUrl - Absolute URL to load.
 * @param {object} [options]
 * @param {boolean} [options.headless] - Pass `false` to force a headed browser.
 * @param {string} [options.userAgent] - Defaults to DEFAULT_USER_AGENT.
 * @param {string} [options.locale] - Defaults to "en-US".
 * @param {string} [options.timezoneId] - Defaults to "America/New_York".
 * @param {number} [options.userscriptDelay] - Milliseconds to wait after
 *   injecting userscripts; defaults to 2000. An explicit 0 is honoured.
 * @returns {Promise<string>} The page HTML as returned by page.content().
 */
export async function renderPage(sourceUrl, options = {}) {
  const playwright = loadPlaywright();
  const hasDisplay = Boolean(process.env.DISPLAY || process.env.WAYLAND_DISPLAY);
  const headless = options.headless !== false && !hasDisplay;
  const browser = await playwright.chromium.launch({
    headless,
    args: buildLaunchArgs(headless),
    ignoreDefaultArgs: buildIgnoreDefaultArgs()
  });

  try {
    const context = await browser.newContext({
      userAgent: options.userAgent || DEFAULT_USER_AGENT,
      viewport: VIEWPORT,
      locale: options.locale || "en-US",
      timezoneId: options.timezoneId || "America/New_York"
    });
    // Register the stealth evasions for every page created in this context.
    await context.addInitScript(STEALTH_INIT_SCRIPT);
    const page = await context.newPage();
    const sourceHostname = new URL(sourceUrl).hostname;
    // Block paywall/tracker requests before the page loads.
    await setupRequestBlocking(page, sourceHostname);

    await page.goto(sourceUrl, {
      waitUntil: "domcontentloaded",
      timeout: PAGE_TIMEOUT_MS
    });
    // Inject cosmetic CSS and userscripts to strip paywalls / ads.
    await injectCosmeticFilters(page, sourceHostname);
    await injectPrivacyUserScripts(page, sourceUrl);
    // Give the userscripts a moment to run their setTimeout callbacks.
    // `??` rather than `||` so callers can pass 0 to skip the delay.
    const userscriptDelay = options.userscriptDelay ?? 2000;
    await page.waitForTimeout(userscriptDelay);
    await waitForNetworkIdle(page);
    await removeCommonAnnoyances(page);
    await snapshotLoadedResourceUrls(page);
    await snapshotRuntimeStyles(page);
    return await page.content();
  } finally {
    await browser.close();
  }
}
2026-05-15 01:00:27 -07:00
/**
 * Wait for the page to reach the "networkidle" load state, giving up quietly
 * after NETWORK_IDLE_TIMEOUT_MS.
 *
 * @param {object} page - Playwright page to wait on.
 * @returns {Promise<void>} Resolves whether or not idle was reached.
 */
async function waitForNetworkIdle(page) {
  try {
    await page.waitForLoadState("networkidle", {
      timeout: NETWORK_IDLE_TIMEOUT_MS
    });
  } catch {
    // Some pages keep sockets open; the DOM snapshot is still useful.
  }
}
2026-05-15 01:00:27 -07:00
/**
 * Freeze the resource URLs the browser actually chose into the DOM so they
 * survive serialization.
 *
 *   - <img>, <video>, <audio>: `src` is set to the element's `currentSrc`
 *     when one is available.
 *   - <iframe>: when the frame's document is readable, it is serialized into
 *     `srcdoc` and `src` is removed; unreadable frames are left as they are.
 *
 * @param {object} page - Playwright page to rewrite in place.
 * @returns {Promise<void>}
 */
async function snapshotLoadedResourceUrls(page) {
  await page.evaluate(() => {
    document.querySelectorAll("img").forEach((img) => {
      if (img.currentSrc) {
        img.setAttribute("src", img.currentSrc);
      }
    });

    document.querySelectorAll("video,audio").forEach((media) => {
      if (media.currentSrc) {
        media.setAttribute("src", media.currentSrc);
      }
    });

    document.querySelectorAll("iframe").forEach((frame) => {
      try {
        const doc = frame.contentDocument;
        if (doc?.documentElement) {
          frame.setAttribute("srcdoc", "<!doctype html>" + doc.documentElement.outerHTML);
          frame.removeAttribute("src");
        }
      } catch {
        // Cross-origin frames are handled later by the asset inliner when possible.
      }
    });
  });
}
2026-05-16 16:05:32 -07:00
/**
 * Copy CSS that exists only in the CSSOM back into the DOM so it is part of
 * the serialized page.
 *
 *   - A <style> element whose text is blank but whose sheet has rules gets
 *     those rules written into its textContent.
 *   - Each non-empty adopted stylesheet of the document is appended to <head>
 *     as a new <style data-archiver-adopted-stylesheet="<index>"> element.
 *
 * Sheets whose rules cannot be read are treated as empty.
 *
 * @param {object} page - Playwright page to rewrite in place.
 * @returns {Promise<void>}
 */
async function snapshotRuntimeStyles(page) {
  await page.evaluate(() => {
    // Returns "" when the rules are unreadable.
    function cssTextOf(sheet) {
      try {
        const texts = [];
        for (const rule of Array.from(sheet.cssRules || [])) {
          texts.push(rule.cssText);
        }
        return texts.join("\n");
      } catch {
        return "";
      }
    }

    Array.from(document.styleSheets).forEach((sheet) => {
      const cssText = cssTextOf(sheet);
      if (cssText.trim() === "") {
        return;
      }
      const ownerNode = sheet.ownerNode;
      const isBlankStyleTag =
        ownerNode instanceof HTMLStyleElement && !ownerNode.textContent.trim();
      if (isBlankStyleTag) {
        ownerNode.textContent = cssText;
      }
    });

    const adopted = Array.from(document.adoptedStyleSheets || []);
    for (const [position, sheet] of adopted.entries()) {
      const cssText = cssTextOf(sheet);
      if (cssText.trim() === "") {
        continue;
      }
      const styleTag = document.createElement("style");
      styleTag.setAttribute("data-archiver-adopted-stylesheet", String(position));
      styleTag.textContent = cssText;
      document.head.appendChild(styleTag);
    }
  });
}
2026-05-15 01:00:27 -07:00
/**
 * Stamp the archived HTML with a provenance comment.
 *
 * The comment is placed right after the first doctype declaration; when the
 * document has none, "<!doctype html>" and the comment are prepended.
 *
 * @param {string} html - Archived document.
 * @param {string} sourceUrl - Original location, recorded in the comment.
 * @returns {string} The document including the comment.
 */
function addArchiveComment(html, sourceUrl) {
  // Break up every run of hyphens ("---" -> "- - -") so the URL can never
  // contain "--" and therefore can never terminate the comment early.
  const safeSource = String(sourceUrl).replace(/-(?=-)/g, "- ");
  const comment = `<!-- Archived locally. Source: ${safeSource}. Created: ${new Date().toISOString()}. -->`;
  const doctypePattern = /<!doctype[^>]*>/i;
  if (doctypePattern.test(html)) {
    // Function replacement: "$" sequences in the comment stay literal.
    return html.replace(doctypePattern, (doctype) => `${doctype}\n${comment}`);
  }
  return `<!doctype html>\n${comment}\n${html}`;
}
/**
 * List the asset references in an HTML string that still point outside the
 * document (anything isSelfContainedAssetRef does not accept).
 *
 * Sources scanned:
 *   - src / srcset / poster / data on img, source, audio, video, track,
 *     embed, object, input and iframe tags
 *   - href and imagesrcset on <link> tags whose rel is stylesheet, icon,
 *     apple-touch-icon, image_src or preload
 *   - every CSS url(...) occurrence anywhere in the markup
 *
 * @param {string} html - Markup to scan.
 * @returns {string[]} Unique references, sorted.
 */
export function findExternalAssetRefs(html) {
  const refs = new Set();

  const assetTagPattern = /<(?:img|source|audio|video|track|embed|object|input|iframe)\b[^>]*>/gi;
  for (const match of html.matchAll(assetTagPattern)) {
    const tag = match[0];
    for (const attr of ["src", "srcset", "poster", "data"]) {
      const value = readAttribute(tag, attr);
      if (!value) {
        continue;
      }
      if (attr === "srcset") {
        addSrcsetRefs(refs, value);
        continue;
      }
      if (isSelfContainedAssetRef(value)) {
        continue;
      }
      // src / poster / data hold exactly one URL. It must not be split on
      // commas or whitespace, both of which are legal inside a URL.
      refs.add(value.trim());
    }
  }

  const linkPattern = /<link\b[^>]*>/gi;
  for (const match of html.matchAll(linkPattern)) {
    const tag = match[0];
    const rel = readAttribute(tag, "rel") || "";
    if (!/\b(?:stylesheet|icon|apple-touch-icon|image_src|preload)\b/i.test(rel)) {
      continue;
    }
    const href = readAttribute(tag, "href");
    if (href && !isSelfContainedAssetRef(href)) {
      refs.add(href);
    }
    const imageSrcset = readAttribute(tag, "imagesrcset");
    if (imageSrcset) {
      addSrcsetRefs(refs, imageSrcset);
    }
  }

  const cssUrlPattern = /url\(\s*(["']?)([^"')]+)\1\s*\)/gi;
  for (const match of html.matchAll(cssUrlPattern)) {
    const candidate = cleanCssUrl(match[2]);
    if (candidate && !isSelfContainedAssetRef(candidate)) {
      refs.add(candidate);
    }
  }

  return Array.from(refs).sort();
}
2026-05-16 16:05:32 -07:00
/**
 * Add every external URL found in a srcset-style attribute value to `refs`.
 *
 * Each candidate produced by splitSrcset contributes its first
 * whitespace-delimited token (the URL); empty and self-contained URLs are
 * skipped.
 *
 * @param {Set<string>} refs - Collection that receives the URLs (mutated).
 * @param {string} srcset - Raw attribute value.
 * @returns {void}
 */
function addSrcsetRefs(refs, srcset) {
  Array.from(splitSrcset(srcset))
    .map((entry) => entry.trim().split(/\s+/)[0])
    .filter((url) => url && !isSelfContainedAssetRef(url))
    .forEach((url) => {
      refs.add(url);
    });
}
2026-05-14 08:12:13 -07:00
/**
 * Tell whether a reference needs no external fetch.
 *
 * After normalisation with cleanCssUrl, a reference counts as self-contained
 * when it is empty, is a fragment ("#..." or its percent-encoded form
 * "%23..."), or uses one of the data:, about:, javascript:, mailto: or tel:
 * schemes.
 *
 * @param {string} value - Raw attribute or CSS url() value.
 * @returns {boolean} True when the reference is self-contained.
 */
function isSelfContainedAssetRef(value) {
  const trimmed = cleanCssUrl(value);
  return (
    !trimmed ||
    trimmed.startsWith("#") ||
    /^%23/i.test(trimmed) ||
    /^(?:data|about|javascript|mailto|tel):/i.test(trimmed)
  );
}
/**
 * Read the value of an attribute from an opening tag.
 *
 * @param {string} tag - Text of the opening tag, e.g. '<img src="a.png">'.
 * @param {string} attr - Attribute name to look up.
 * @returns {string} The attribute value, or "" when findAttribute finds none.
 */
function readAttribute(tag, attr) {
  const match = findAttribute(tag, attr);
  return match ? match.value : "";
}
2026-05-14 09:11:05 -07:00
/**
 * Normalise a URL taken from CSS url(...) or an HTML attribute.
 *
 * The value is stringified and trimmed, the entities &amp;, &quot;, &#39;
 * and &apos; are decoded (in that order), and one pair of matching wrapping
 * quotes is removed together with whitespace just inside it.
 *
 * @param {*} value - Raw value; converted with String().
 * @returns {string} The cleaned URL text.
 */
function cleanCssUrl(value) {
  const entityTable = [
    ["&amp;", "&"],
    ["&quot;", '"'],
    ["&#39;", "'"],
    ["&apos;", "'"]
  ];
  let text = String(value).trim();
  for (const [entity, character] of entityTable) {
    text = text.replaceAll(entity, character);
  }

  const opener = text[0];
  const isQuoted = (opener === '"' || opener === "'") && text.at(-1) === opener;
  return isQuoted ? text.slice(1, -1).trim() : text;
}
2026-05-15 09:25:19 -07:00
/**
 * Locate an attribute inside the text of an opening tag.
 *
 * Attributes are scanned left to right after the tag name. Names compare
 * case-insensitively; values may be double-quoted, single-quoted, unquoted
 * or absent (value ""). Scanning stops at the first ">" or "/" found where
 * an attribute name would start.
 *
 * @param {string} openingTag - Tag text beginning with "<".
 * @param {string} attr - Attribute name to find.
 * @returns {{start: number, end: number, value: string} | null} Offsets of
 *   the attribute within `openingTag` (`start` at the first character of the
 *   name, `end` at the position where scanning of that attribute stopped) and
 *   its raw value, or null when the attribute is not present.
 */
function findAttribute(openingTag, attr) {
  const wanted = attr.toLowerCase();
  const length = openingTag.length;
  const tagName = openingTag.match(/^<[^\s/>]+/);
  let cursor = tagName ? tagName[0].length : 1;

  const skipWhitespace = () => {
    while (cursor < length && /\s/.test(openingTag[cursor])) {
      cursor += 1;
    }
  };
  // Advance to the first character accepted by `stop`; return where we began.
  const scanUntil = (stop) => {
    const from = cursor;
    while (cursor < length && !stop.test(openingTag[cursor])) {
      cursor += 1;
    }
    return from;
  };

  while (cursor < length) {
    skipWhitespace();
    if (cursor >= length) {
      return null;
    }
    const current = openingTag[cursor];
    if (current === ">" || current === "/") {
      return null;
    }

    const start = scanUntil(/[\s=/>]/);
    const name = openingTag.slice(start, cursor);
    skipWhitespace();

    let value = "";
    if (openingTag[cursor] === "=") {
      cursor += 1;
      skipWhitespace();
      const quote = openingTag[cursor];
      if (quote === '"' || quote === "'") {
        cursor += 1;
        const closing = openingTag.indexOf(quote, cursor);
        if (closing === -1) {
          // Unterminated quote: the value runs to the end of the tag text.
          value = openingTag.slice(cursor);
          cursor = length;
        } else {
          value = openingTag.slice(cursor, closing);
          cursor = closing + 1;
        }
      } else {
        const valueStart = scanUntil(/[\s>]/);
        value = openingTag.slice(valueStart, cursor);
      }
    }

    if (name.toLowerCase() === wanted) {
      return { start, end: cursor, value };
    }
  }
  return null;
}