fixes
This commit is contained in:
255
src/archiver.mjs
255
src/archiver.mjs
@@ -32,6 +32,7 @@ const PRIVACY_FILTERS_DIR = path.join(__dirname, "..", "privacy-filters");
|
||||
let privacyFiltersAvailable = false;
|
||||
let filterRules = { blockRules: [], allowRules: [], cosmeticRules: [] };
|
||||
let userScriptData = []; // { file, content, matches, excludes }
|
||||
let userScriptRequireContent = "";
|
||||
|
||||
async function loadPrivacyFilters() {
|
||||
try {
|
||||
@@ -40,6 +41,7 @@ async function loadPrivacyFilters() {
|
||||
filterRules = parseFilterRules(filterContent);
|
||||
|
||||
const userscriptDir = path.join(PRIVACY_FILTERS_DIR, "userscript");
|
||||
userScriptRequireContent = await fs.readFile(path.join(userscriptDir, "bpc_func.js"), "utf8");
|
||||
const userScriptFiles = [
|
||||
"bpc.en.user.js",
|
||||
"bpc.de.user.js",
|
||||
@@ -126,7 +128,7 @@ function parseNetworkRule(line) {
|
||||
const lastDollar = line.lastIndexOf("$");
|
||||
if (lastDollar > 0) {
|
||||
const optsStr = line.slice(lastDollar + 1);
|
||||
if (/^[a-z,=~\-|0-9]+$/i.test(optsStr)) {
|
||||
if (/^[a-z,=~_.\-|0-9]+$/i.test(optsStr)) {
|
||||
options = optsStr.split(",");
|
||||
pattern = line.slice(0, lastDollar);
|
||||
}
|
||||
@@ -134,8 +136,20 @@ function parseNetworkRule(line) {
|
||||
|
||||
if (!pattern) return null;
|
||||
|
||||
const type = options.find((o) =>
|
||||
["script", "stylesheet", "image", "media", "xmlhttprequest", "other", "inline-script"].includes(o)
|
||||
const types = options.filter((o) =>
|
||||
[
|
||||
"document",
|
||||
"font",
|
||||
"image",
|
||||
"inline-script",
|
||||
"media",
|
||||
"object",
|
||||
"other",
|
||||
"script",
|
||||
"stylesheet",
|
||||
"subdocument",
|
||||
"xmlhttprequest"
|
||||
].includes(o)
|
||||
);
|
||||
const isThirdParty = options.includes("third-party");
|
||||
const isFirstParty = options.includes("~third-party");
|
||||
@@ -162,7 +176,7 @@ function parseNetworkRule(line) {
|
||||
kind: "domain",
|
||||
domain,
|
||||
path,
|
||||
type,
|
||||
types,
|
||||
isThirdParty,
|
||||
isFirstParty,
|
||||
includeDomains,
|
||||
@@ -171,27 +185,38 @@ function parseNetworkRule(line) {
|
||||
};
|
||||
}
|
||||
|
||||
if (pattern.startsWith("/")) {
|
||||
const lastSlash = pattern.lastIndexOf("/");
|
||||
if (lastSlash > 0) {
|
||||
const regex = pattern.slice(1, lastSlash);
|
||||
return {
|
||||
kind: "regex",
|
||||
regex,
|
||||
type,
|
||||
isThirdParty,
|
||||
isFirstParty,
|
||||
includeDomains,
|
||||
excludeDomains,
|
||||
important
|
||||
};
|
||||
}
|
||||
if (pattern.startsWith("/") && pattern.endsWith("/") && pattern.length > 1) {
|
||||
const regex = pattern.slice(1, -1);
|
||||
return {
|
||||
kind: "regex",
|
||||
regex,
|
||||
types,
|
||||
isThirdParty,
|
||||
isFirstParty,
|
||||
includeDomains,
|
||||
excludeDomains,
|
||||
important
|
||||
};
|
||||
}
|
||||
|
||||
return null;
|
||||
return {
|
||||
kind: "pattern",
|
||||
regex: adblockPatternToRegex(pattern),
|
||||
types,
|
||||
isThirdParty,
|
||||
isFirstParty,
|
||||
includeDomains,
|
||||
excludeDomains,
|
||||
important
|
||||
};
|
||||
}
|
||||
|
||||
function cosmeticSelectorToCss(selector) {
|
||||
if (selector.endsWith(":remove()")) {
|
||||
const baseSelector = selector.slice(0, -":remove()".length);
|
||||
return baseSelector ? `${baseSelector} { display: none !important; }` : null;
|
||||
}
|
||||
|
||||
const styleMatch = selector.match(/:style\((.+)\)$/);
|
||||
if (styleMatch) {
|
||||
const baseSelector = selector.slice(0, selector.lastIndexOf(":style("));
|
||||
@@ -246,17 +271,8 @@ function matchesNetworkRule(url, urlObj, hostname, resourceType, sourceHostname,
|
||||
if (blocked) return false;
|
||||
}
|
||||
|
||||
if (rule.type) {
|
||||
const typeMap = {
|
||||
script: "script",
|
||||
stylesheet: "stylesheet",
|
||||
image: "image",
|
||||
media: "media",
|
||||
xmlhttprequest: "xhr",
|
||||
other: "other",
|
||||
"inline-script": "script"
|
||||
};
|
||||
if (typeMap[rule.type] && resourceType !== typeMap[rule.type]) {
|
||||
if (rule.types.length > 0) {
|
||||
if (!rule.types.some((type) => resourceTypeMatches(type, resourceType))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -271,18 +287,11 @@ function matchesNetworkRule(url, urlObj, hostname, resourceType, sourceHostname,
|
||||
}
|
||||
|
||||
if (rule.kind === "domain") {
|
||||
const domainRe = new RegExp(
|
||||
"^" + rule.domain.replace(/\./g, "\\.").replace(/\*/g, "[^.]*") + "$",
|
||||
"i"
|
||||
);
|
||||
if (!domainRe.test(hostname)) return false;
|
||||
if (!domainPatternMatches(hostname, rule.domain)) return false;
|
||||
|
||||
if (rule.path) {
|
||||
const pathRe = new RegExp(
|
||||
"^" + rule.path.replace(/\./g, "\\.").replace(/\*/g, ".*").replace(/\?/g, "\\?").replace(/\^/g, ""),
|
||||
"i"
|
||||
);
|
||||
if (!pathRe.test(urlObj.pathname)) return false;
|
||||
const pathRe = new RegExp("^" + adblockPatternToRegex(rule.path), "i");
|
||||
if (!pathRe.test(urlObj.pathname + urlObj.search)) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@@ -296,9 +305,84 @@ function matchesNetworkRule(url, urlObj, hostname, resourceType, sourceHostname,
|
||||
}
|
||||
}
|
||||
|
||||
if (rule.kind === "pattern") {
|
||||
try {
|
||||
const re = new RegExp(rule.regex, "i");
|
||||
return re.test(url);
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// Adblock-style `$type` options mapped to the request resource-type strings
// they are allowed to match. A Map is used instead of a plain object so that
// option names such as "toString" or "__proto__" cannot resolve to inherited
// Object.prototype members.
const RESOURCE_TYPES_BY_FILTER_TYPE = new Map([
  ["document", ["document"]],
  ["font", ["font"]],
  ["image", ["image"]],
  ["inline-script", ["script"]],
  ["media", ["media"]],
  ["object", ["object"]],
  ["other", ["other"]],
  ["script", ["script"]],
  ["stylesheet", ["stylesheet"]],
  ["subdocument", ["document"]],
  ["xmlhttprequest", ["fetch", "xhr"]],
]);

/**
 * Tells whether a filter `$type` option covers the given request resource type.
 *
 * @param {string} filterType - Type option taken from a filter rule (e.g. "script").
 * @param {string} resourceType - Resource type reported for the request (e.g. "xhr").
 * @returns {boolean} True when the option is known and lists `resourceType`;
 *   false for unknown options.
 */
function resourceTypeMatches(filterType, resourceType) {
  const mapped = RESOURCE_TYPES_BY_FILTER_TYPE.get(filterType);
  return mapped !== undefined && mapped.includes(resourceType);
}
|
||||
|
||||
/**
 * Checks a hostname against the domain part of a filter rule.
 *
 * A single trailing `^` separator is ignored. The comparison is
 * case-insensitive, and a pattern matches the named host as well as any of its
 * subdomains. Each `*` in the pattern matches any run of characters within one
 * DNS label (it does not cross a dot).
 *
 * @param {string} hostname - Hostname of the request being tested.
 * @param {string} pattern - Domain pattern from the rule, e.g. "example.com^"
 *   or "ads*.example.com".
 * @returns {boolean} True when the hostname is covered by the pattern; false
 *   for an empty pattern.
 */
function domainPatternMatches(hostname, pattern) {
  const normalized = pattern.replace(/\^$/, "").toLowerCase();
  if (!normalized) {
    return false;
  }

  // Lower-case the host once so both branches below compare case-insensitively.
  const host = hostname.toLowerCase();

  if (!normalized.includes("*")) {
    return host === normalized || host.endsWith(`.${normalized}`);
  }

  const body = normalized
    .split("*")
    .map((part) => part.replace(/[|\\{}()[\]^$+?.]/g, "\\$&"))
    .join("[^.]*");

  // The optional "(?:.*\.)?" prefix gives wildcard patterns the same
  // subdomain coverage that literal patterns get from endsWith() above.
  return new RegExp(`^(?:.*\\.)?${body}$`).test(host);
}
|
||||
|
||||
/**
 * Translates an Adblock-style URL pattern into regular-expression source text.
 *
 * Supported syntax:
 * - leading `||` anchors at the start of the host name (any scheme, any subdomain);
 * - leading `|` / trailing `|` anchor at the very start / end of the URL;
 * - `*` matches any run of characters (consecutive `*` collapse into one);
 * - `^` matches a separator character (anything except letters, digits and
 *   `_ . % -`) or the end of the URL.
 * Every other character is matched literally.
 *
 * @param {string} pattern - Filter pattern without its `$options` suffix.
 * @returns {string} Source suitable for `new RegExp(source, flags)`.
 */
function adblockPatternToRegex(pattern) {
  let remaining = pattern;
  let prefix = "";
  let suffix = "";

  if (remaining.startsWith("||")) {
    // Domain anchor: scheme, then an optional chain of subdomain labels.
    // Without this branch the second "|" would be escaped as a literal
    // character and the resulting expression could never match a URL.
    prefix = "^[A-Za-z][A-Za-z0-9+.-]*://(?:[^/?#]*\\.)?";
    remaining = remaining.slice(2);
  } else if (remaining.startsWith("|")) {
    prefix = "^";
    remaining = remaining.slice(1);
  }

  if (remaining.endsWith("|")) {
    suffix = "$";
    remaining = remaining.slice(0, -1);
  }

  let source = "";
  let previousWasWildcard = false;
  for (const ch of remaining) {
    if (ch === "*") {
      // "**" is equivalent to "*"; emitting ".*" once avoids stacked
      // quantifiers that only add backtracking work.
      if (!previousWasWildcard) {
        source += ".*";
      }
      previousWasWildcard = true;
      continue;
    }
    previousWasWildcard = false;
    if (ch === "^") {
      source += "(?:[^A-Za-z0-9_.%-]|$)";
    } else {
      source += ch.replace(/[|\\{}()[\]^$+?.]/g, "\\$&");
    }
  }

  return `${prefix}${source}${suffix}`;
}
|
||||
|
||||
function shouldBlockRequest(url, resourceType, sourceHostname) {
|
||||
if (url === sourceHostname || url.startsWith(sourceHostname + "/")) {
|
||||
return false;
|
||||
@@ -512,7 +596,7 @@ async function setupRequestBlocking(page, sourceHostname) {
|
||||
await page.route("**/*", (route) => {
|
||||
try {
|
||||
const request = route.request();
|
||||
if (request.isNavigationRequest()) {
|
||||
if (request.isNavigationRequest() && request.frame() === page.mainFrame()) {
|
||||
route.continue();
|
||||
return;
|
||||
}
|
||||
@@ -584,6 +668,9 @@ async function injectPrivacyUserScripts(page, sourceUrl) {
|
||||
// Inject GM API mock first.
|
||||
try {
|
||||
await page.addScriptTag({ content: GM_MOCK });
|
||||
if (userScriptRequireContent) {
|
||||
await page.addScriptTag({ content: userScriptRequireContent });
|
||||
}
|
||||
} catch {
|
||||
return;
|
||||
}
|
||||
@@ -731,15 +818,19 @@ function addArchiveComment(html, sourceUrl) {
|
||||
|
||||
export function findExternalAssetRefs(html) {
|
||||
const refs = new Set();
|
||||
const attrPattern = /\s(?:src|srcset|poster|data)\s*=\s*(["'])([\s\S]*?)\1/gi;
|
||||
for (const match of html.matchAll(attrPattern)) {
|
||||
if (isSelfContainedAssetRef(match[2])) {
|
||||
continue;
|
||||
}
|
||||
for (const part of match[2].split(",")) {
|
||||
const candidate = part.trim().split(/\s+/)[0];
|
||||
if (candidate && !isSelfContainedAssetRef(candidate)) {
|
||||
refs.add(candidate);
|
||||
const assetTagPattern = /<(?:img|source|audio|video|track|embed|object|input|iframe)\b[^>]*>/gi;
|
||||
for (const match of html.matchAll(assetTagPattern)) {
|
||||
const tag = match[0];
|
||||
for (const attr of ["src", "srcset", "poster", "data"]) {
|
||||
const value = readAttribute(tag, attr);
|
||||
if (!value || isSelfContainedAssetRef(value)) {
|
||||
continue;
|
||||
}
|
||||
for (const part of value.split(",")) {
|
||||
const candidate = part.trim().split(/\s+/)[0];
|
||||
if (candidate && !isSelfContainedAssetRef(candidate)) {
|
||||
refs.add(candidate);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -779,8 +870,8 @@ function isSelfContainedAssetRef(value) {
|
||||
}
|
||||
|
||||
/**
 * Reads the raw (undecoded) value of an attribute from an opening tag.
 *
 * Delegates to findAttribute() so that only real attribute names are matched,
 * not text that merely looks like `name=` inside another attribute's value.
 *
 * @param {string} tag - Opening tag markup, e.g. `<img src="a.png">`.
 * @param {string} attr - Attribute name to look up (case-insensitive).
 * @returns {string} The attribute value, or "" when the attribute is absent
 *   or has no value.
 */
function readAttribute(tag, attr) {
  const match = findAttribute(tag, attr);
  return match ? match.value : "";
}
|
||||
|
||||
function cleanCssUrl(value) {
|
||||
@@ -796,3 +887,61 @@ function cleanCssUrl(value) {
|
||||
}
|
||||
return decoded;
|
||||
}
|
||||
|
||||
/**
 * Locates an attribute inside a single opening tag by scanning it left to
 * right, attribute by attribute, so that text inside another attribute's
 * value is never mistaken for an attribute name.
 *
 * @param {string} openingTag - Markup of one opening tag, starting at "<".
 * @param {string} attr - Attribute name to find (compared case-insensitively).
 * @returns {{start: number, end: number, value: string} | null} For the first
 *   matching attribute: `start` is the index of its name, `end` is the index
 *   just past its value (or just past its name when it has no value), and
 *   `value` is the raw text without surrounding quotes. Null when not found.
 */
function findAttribute(openingTag, attr) {
  const attrLower = attr.toLowerCase();
  const nameMatch = openingTag.match(/^<[^\s/>]+/);
  // Start right after the tag name; fall back to just after "<".
  let index = nameMatch ? nameMatch[0].length : 1;

  while (index < openingTag.length) {
    while (index < openingTag.length && /\s/.test(openingTag[index])) {
      index += 1;
    }
    if (index >= openingTag.length || openingTag[index] === ">") {
      return null;
    }
    if (openingTag[index] === "/") {
      // A slash here is either the self-closing marker (the ">" that follows
      // ends the scan on the next pass) or a stray character between
      // attributes; in both cases step over it instead of giving up.
      index += 1;
      continue;
    }

    const start = index;
    while (index < openingTag.length && !/[\s=/>]/.test(openingTag[index])) {
      index += 1;
    }
    const name = openingTag.slice(start, index);
    // For a valueless attribute the span ends at the name; whitespace skipped
    // below belongs to the separator before the next attribute.
    let end = index;

    while (index < openingTag.length && /\s/.test(openingTag[index])) {
      index += 1;
    }

    let value = "";
    if (openingTag[index] === "=") {
      index += 1;
      while (index < openingTag.length && /\s/.test(openingTag[index])) {
        index += 1;
      }

      const quote = openingTag[index];
      if (quote === '"' || quote === "'") {
        index += 1;
        const valueStart = index;
        while (index < openingTag.length && openingTag[index] !== quote) {
          index += 1;
        }
        value = openingTag.slice(valueStart, index);
        if (openingTag[index] === quote) {
          index += 1;
        }
      } else {
        const valueStart = index;
        while (index < openingTag.length && !/[\s>]/.test(openingTag[index])) {
          index += 1;
        }
        value = openingTag.slice(valueStart, index);
      }
      end = index;
    }

    if (name.toLowerCase() === attrLower) {
      return { start, end, value };
    }
  }

  return null;
}
|
||||
|
||||
@@ -197,11 +197,6 @@ export class AssetInliner {
|
||||
async (match) => this.rewriteMediaAttributes(match[0], effectiveBase)
|
||||
);
|
||||
|
||||
output = await replaceAsync(output, /srcset=(["'])([\s\S]*?)\1/gi, async (match) => {
|
||||
const rewritten = await this.inlineSrcset(match[2], effectiveBase);
|
||||
return `srcset=${match[1]}${htmlEscape(rewritten)}${match[1]}`;
|
||||
});
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
@@ -259,12 +254,28 @@ export class AssetInliner {
|
||||
output = replaceMissingMediaAttribute(output, attr);
|
||||
}
|
||||
}
|
||||
const srcset = getAttribute(output, "srcset");
|
||||
if (srcset) {
|
||||
const rewritten = await this.inlineSrcset(srcset, baseUrl);
|
||||
output = setAttribute(output, "srcset", rewritten);
|
||||
}
|
||||
return output;
|
||||
}
|
||||
|
||||
async rewriteIframeTag(tag, baseUrl, depth) {
|
||||
const srcdoc = getAttribute(tag, "srcdoc");
|
||||
if (srcdoc) {
|
||||
let rewritten = removeAttribute(tag, "src");
|
||||
if (depth >= 2) {
|
||||
return rewritten;
|
||||
}
|
||||
const inlined = await this.inlineHtml(srcdoc, baseUrl, { depth: depth + 1 });
|
||||
rewritten = setAttribute(rewritten, "srcdoc", inlined);
|
||||
return rewritten;
|
||||
}
|
||||
|
||||
const src = getAttribute(tag, "src");
|
||||
if (!src || getAttribute(tag, "srcdoc")) {
|
||||
if (!src) {
|
||||
return this.rewriteMediaAttributes(tag, baseUrl);
|
||||
}
|
||||
const absolute = resolveUrl(src, baseUrl);
|
||||
@@ -425,24 +436,42 @@ function mimeFromUrl(rawUrl) {
|
||||
}
|
||||
|
||||
/**
 * Returns the HTML-decoded value of an attribute on the first opening tag of
 * the given markup.
 *
 * @param {string} tag - Markup that begins with an opening tag.
 * @param {string} attr - Attribute name to read.
 * @returns {string | null} The decoded value, or null when the markup has no
 *   complete opening tag or the attribute is not present.
 */
function getAttribute(tag, attr) {
  const openingTag = getOpeningTag(tag);
  if (!openingTag) {
    return null;
  }
  const match = findAttribute(openingTag, attr);
  return match ? htmlDecode(match.value) : null;
}
|
||||
|
||||
/**
 * Sets an attribute on the first opening tag of the given markup, replacing
 * an existing occurrence or appending a new one before the closing ">".
 *
 * The value is HTML-escaped and always written in double quotes. Markup after
 * the opening tag is left untouched.
 *
 * @param {string} tag - Markup that begins with an opening tag.
 * @param {string} attr - Attribute name to write.
 * @param {string} value - Unescaped attribute value.
 * @returns {string} The markup with the attribute set; unchanged when no
 *   complete opening tag is found.
 */
function setAttribute(tag, attr, value) {
  const escaped = htmlEscape(value);
  return replaceOpeningTag(tag, (openingTag) => {
    const match = findAttribute(openingTag, attr);
    if (match) {
      return `${openingTag.slice(0, match.start)}${attr}="${escaped}"${openingTag.slice(match.end)}`;
    }

    // Append before the terminator, keeping a self-closing "/>" if present.
    const selfClosing = /\/\s*>$/.test(openingTag);
    const closeIndex = openingTag.lastIndexOf(">");
    const beforeClose = openingTag.slice(0, closeIndex).replace(/\s*\/\s*$/, "");
    return `${beforeClose} ${attr}="${escaped}"${selfClosing ? " /" : ""}>`;
  });
}
|
||||
|
||||
/**
 * Removes the first occurrence of an attribute, together with the whitespace
 * in front of it, from the first opening tag of the given markup.
 *
 * @param {string} tag - Markup that begins with an opening tag.
 * @param {string} attr - Attribute name to remove.
 * @returns {string} The markup without the attribute; unchanged when the
 *   attribute (or a complete opening tag) is not found.
 */
function removeAttribute(tag, attr) {
  return replaceOpeningTag(tag, (openingTag) => {
    const match = findAttribute(openingTag, attr);
    if (!match) {
      return openingTag;
    }

    // Also drop the whitespace that separated this attribute from the
    // previous token so no double spaces are left behind.
    let start = match.start;
    while (start > 0 && /\s/.test(openingTag[start - 1])) {
      start -= 1;
    }
    return `${openingTag.slice(0, start)}${openingTag.slice(match.end)}`;
  });
}
|
||||
|
||||
function replaceMissingMediaAttribute(tag, attr) {
|
||||
@@ -474,3 +503,95 @@ function cleanCssUrl(value) {
|
||||
}
|
||||
return decoded;
|
||||
}
|
||||
|
||||
/**
 * Extracts the leading opening tag from a piece of markup.
 *
 * @param {string} markup - Markup expected to begin with an opening tag.
 * @returns {string | null} Everything up to and including the ">" reported by
 *   openingTagEndIndex(), or null when no terminator is found.
 */
function getOpeningTag(markup) {
  const end = openingTagEndIndex(markup);
  return end >= 0 ? markup.slice(0, end + 1) : null;
}
|
||||
|
||||
/**
 * Rewrites only the leading opening tag of a piece of markup.
 *
 * @param {string} markup - Markup expected to begin with an opening tag.
 * @param {(openingTag: string) => string} replacer - Receives the opening tag
 *   (including its ">") and returns its replacement.
 * @returns {string} The replaced opening tag followed by the untouched rest of
 *   the markup; the original markup when no terminator is found.
 */
function replaceOpeningTag(markup, replacer) {
  const end = openingTagEndIndex(markup);
  if (end < 0) {
    return markup;
  }
  return `${replacer(markup.slice(0, end + 1))}${markup.slice(end + 1)}`;
}
|
||||
|
||||
/**
 * Finds the index of the ">" that terminates the first opening tag in the
 * markup, ignoring any ">" that appears inside a quoted attribute value.
 *
 * A quote character only opens a quoted value when it directly follows an
 * "=" (optionally separated by whitespace). Quotes elsewhere, such as an
 * apostrophe inside an unquoted value like `href=it's`, are ordinary
 * characters and must not hide the closing ">".
 *
 * @param {string} markup - Markup expected to begin with an opening tag.
 * @returns {number} Index of the terminating ">", or -1 when there is none.
 */
function openingTagEndIndex(markup) {
  let quote = "";
  // True while the last non-whitespace character seen was "=".
  let afterEquals = false;

  for (let i = 0; i < markup.length; i += 1) {
    const ch = markup[i];

    if (quote) {
      if (ch === quote) {
        quote = "";
      }
      continue;
    }
    if (ch === ">") {
      return i;
    }
    if (afterEquals && (ch === '"' || ch === "'")) {
      quote = ch;
      afterEquals = false;
      continue;
    }
    if (!/\s/.test(ch)) {
      afterEquals = ch === "=";
    }
  }

  return -1;
}
|
||||
|
||||
/**
 * Locates an attribute inside a single opening tag by scanning it left to
 * right, attribute by attribute, so that text inside another attribute's
 * value is never mistaken for an attribute name.
 *
 * @param {string} openingTag - Markup of one opening tag, starting at "<".
 * @param {string} attr - Attribute name to find (compared case-insensitively).
 * @returns {{start: number, end: number, value: string} | null} For the first
 *   matching attribute: `start` is the index of its name, `end` is the index
 *   just past its value (or just past its name when it has no value), and
 *   `value` is the raw text without surrounding quotes. Null when not found.
 */
function findAttribute(openingTag, attr) {
  const attrLower = attr.toLowerCase();
  const nameMatch = openingTag.match(/^<[^\s/>]+/);
  // Start right after the tag name; fall back to just after "<".
  let index = nameMatch ? nameMatch[0].length : 1;

  while (index < openingTag.length) {
    while (index < openingTag.length && /\s/.test(openingTag[index])) {
      index += 1;
    }
    if (index >= openingTag.length || openingTag[index] === ">") {
      return null;
    }
    if (openingTag[index] === "/") {
      // A slash here is either the self-closing marker (the ">" that follows
      // ends the scan on the next pass) or a stray character between
      // attributes; in both cases step over it instead of giving up.
      index += 1;
      continue;
    }

    const start = index;
    while (index < openingTag.length && !/[\s=/>]/.test(openingTag[index])) {
      index += 1;
    }
    const name = openingTag.slice(start, index);
    // For a valueless attribute the span ends at the name; whitespace skipped
    // below belongs to the separator before the next attribute, and including
    // it would make setAttribute/removeAttribute glue neighbouring tokens.
    let end = index;

    while (index < openingTag.length && /\s/.test(openingTag[index])) {
      index += 1;
    }

    let value = "";
    if (openingTag[index] === "=") {
      index += 1;
      while (index < openingTag.length && /\s/.test(openingTag[index])) {
        index += 1;
      }

      const quote = openingTag[index];
      if (quote === '"' || quote === "'") {
        index += 1;
        const valueStart = index;
        while (index < openingTag.length && openingTag[index] !== quote) {
          index += 1;
        }
        value = openingTag.slice(valueStart, index);
        if (openingTag[index] === quote) {
          index += 1;
        }
      } else {
        const valueStart = index;
        while (index < openingTag.length && !/[\s>]/.test(openingTag[index])) {
          index += 1;
        }
        value = openingTag.slice(valueStart, index);
      }
      end = index;
    }

    if (name.toLowerCase() === attrLower) {
      return { start, end, value };
    }
  }

  return null;
}
|
||||
|
||||
Reference in New Issue
Block a user