Add EasyList filter support
This commit is contained in:
@@ -16,6 +16,7 @@
|
||||
"start:frontend": "node src/frontend-server.mjs",
|
||||
"start:worker": "node src/worker-server.mjs",
|
||||
"test": "node --test test/*.test.mjs",
|
||||
"update-filter-lists": "node scripts/update-filter-lists.mjs",
|
||||
"install-browsers": "playwright install chromium"
|
||||
},
|
||||
"dependencies": {
|
||||
|
||||
@@ -1,3 +1,27 @@
|
||||
# Privacy filter lists
|
||||
|
||||
This directory contains the static filters and userscripts applied before the
|
||||
archiver snapshots a page. `bpc-paywall-filter.txt` and `userscript/` come from
|
||||
Bypass Paywalls Clean. `lists/` contains bundled ad, annoyance, and cookie
|
||||
notice lists for offline/container use:
|
||||
|
||||
- `easylist.txt` from `https://easylist.to/easylist/easylist.txt`
|
||||
- `ublock-filters.txt` from `https://raw.githubusercontent.com/uBlockOrigin/uAssets/master/filters/filters.txt`
|
||||
- `easylist-cookie.txt` from `https://easylist-downloads.adblockplus.org/fanboy-cookiemonster.txt`
|
||||
- `ublock-annoyances.txt` from `https://raw.githubusercontent.com/uBlockOrigin/uAssets/master/filters/annoyances.txt`
|
||||
- `ublock-cookies.txt` from `https://raw.githubusercontent.com/uBlockOrigin/uAssets/master/filters/annoyances-cookies.txt`
|
||||
|
||||
Refresh the bundled lists with:
|
||||
|
||||
```sh
|
||||
npm run update-filter-lists
|
||||
```
|
||||
|
||||
The parser supports common ABP/uBO network rules, exceptions, domain/type
|
||||
modifiers, cosmetic hiding rules, `:remove()` and `:style(...)` downgrades, and
|
||||
AdGuard CSS injection rules. Unsupported procedural filters, HTML filters, and
|
||||
scriptlets are skipped.
|
||||
|
||||
# Bypass Paywalls Clean filters
|
||||
|
||||
Adblocker list which allows you to read articles from (supported) sites that implement a paywall (for a lot of sites you also need to install a userscript).\
|
||||
|
||||
25891
privacy-filters/lists/easylist-cookie.txt
Normal file
25891
privacy-filters/lists/easylist-cookie.txt
Normal file
File diff suppressed because one or more lines are too long
89772
privacy-filters/lists/easylist.txt
Normal file
89772
privacy-filters/lists/easylist.txt
Normal file
File diff suppressed because it is too large
Load Diff
8079
privacy-filters/lists/ublock-annoyances.txt
Normal file
8079
privacy-filters/lists/ublock-annoyances.txt
Normal file
File diff suppressed because it is too large
Load Diff
5570
privacy-filters/lists/ublock-cookies.txt
Normal file
5570
privacy-filters/lists/ublock-cookies.txt
Normal file
File diff suppressed because one or more lines are too long
41456
privacy-filters/lists/ublock-filters.txt
Normal file
41456
privacy-filters/lists/ublock-filters.txt
Normal file
File diff suppressed because one or more lines are too long
98
scripts/update-filter-lists.mjs
Normal file
98
scripts/update-filter-lists.mjs
Normal file
@@ -0,0 +1,98 @@
|
||||
#!/usr/bin/env node
|
||||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
|
||||
// ES modules have no built-in __dirname, so derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// The repository root is the parent of the directory holding this script.
const repoRoot = path.resolve(__dirname, "..");

// Destination directory for the bundled filter lists.
const outputDir = path.resolve(repoRoot, "privacy-filters", "lists");
|
||||
|
||||
// Lists to bundle. `file` is the name written under outputDir and `url` is the
// upstream source. Entries with `expandIncludes: true` are downloaded through
// fetchListWithIncludes so that their `!#include` lines are inlined; the others
// are downloaded verbatim.
const FILTER_LISTS = [
  { file: "easylist.txt", url: "https://easylist.to/easylist/easylist.txt" },
  { file: "ublock-filters.txt", url: "https://raw.githubusercontent.com/uBlockOrigin/uAssets/master/filters/filters.txt", expandIncludes: true },
  { file: "easylist-cookie.txt", url: "https://easylist-downloads.adblockplus.org/fanboy-cookiemonster.txt" },
  { file: "ublock-annoyances.txt", url: "https://raw.githubusercontent.com/uBlockOrigin/uAssets/master/filters/annoyances.txt", expandIncludes: true },
  { file: "ublock-cookies.txt", url: "https://raw.githubusercontent.com/uBlockOrigin/uAssets/master/filters/annoyances-cookies.txt", expandIncludes: true }
];
|
||||
|
||||
/**
 * Download every entry of FILTER_LISTS and write it into outputDir.
 *
 * Each output file starts with a two-line `!` comment header recording the
 * source URL and the bundling time, followed by the list text with line
 * endings normalized to "\n" and exactly one trailing newline.
 *
 * Lists are processed one after another; the first failed download or write
 * rejects the returned promise and leaves the remaining lists untouched.
 *
 * @returns {Promise<void>}
 */
async function main() {
  await fs.mkdir(outputDir, { recursive: true });

  for (const list of FILTER_LISTS) {
    const content = list.expandIncludes
      ? await fetchListWithIncludes(list.url)
      : await fetchText(list.url);
    const header = [
      `! Bundled by scripts/update-filter-lists.mjs from ${list.url}`,
      `! Bundled at ${new Date().toISOString()}`,
      ""
    ].join("\n");
    // Collapse any trailing whitespace (or none) into a single final newline.
    const output = header + normalizeLineEndings(content).replace(/\s*$/, "\n");
    const outputPath = path.join(outputDir, list.file);
    await fs.writeFile(outputPath, output, "utf8");
    // String length counts UTF-16 code units, not bytes; report the size of
    // the UTF-8 encoding that was actually written to disk.
    const byteCount = Buffer.byteLength(output, "utf8");
    console.log(`wrote ${path.relative(repoRoot, outputPath)} (${byteCount} bytes)`);
  }
}
|
||||
|
||||
/**
 * Download a filter list and recursively inline its `!#include` directives.
 *
 * Include targets are resolved relative to the URL of the list that names
 * them. Each inlined file is wrapped in `! >>> begin include` /
 * `! <<< end include` comment lines. A URL that was already visited in this
 * expansion contributes an empty string, which guards against include cycles.
 *
 * @param {string} url - Absolute URL of the list to download.
 * @param {Set<string>} [seen] - URLs already expanded during this run.
 * @returns {Promise<string>} The list text with includes expanded, joined by "\n".
 */
async function fetchListWithIncludes(url, seen = new Set()) {
  if (seen.has(url)) {
    return "";
  }
  seen.add(url);

  const body = normalizeLineEndings(await fetchText(url));
  const expanded = [];

  for (const line of body.split("\n")) {
    const directive = line.match(/^!#include\s+(.+)$/);
    if (directive) {
      const target = directive[1].trim();
      const targetUrl = new URL(target, url).href;
      const nested = await fetchListWithIncludes(targetUrl, seen);
      expanded.push(`! >>> begin include ${target}`, nested, `! <<< end include ${target}`);
    } else {
      expanded.push(line);
    }
  }

  return expanded.join("\n");
}
|
||||
|
||||
/**
 * Download a URL and return its body as text.
 *
 * Redirects are followed and the request identifies itself with a fixed
 * user-agent string.
 *
 * @param {string} url - Address to download.
 * @returns {Promise<string>} The response body.
 * @throws {Error} When the response status is not in the 2xx range.
 */
async function fetchText(url) {
  const requestOptions = {
    headers: {
      "user-agent": "local-page-archiver filter-list updater"
    },
    redirect: "follow"
  };

  const response = await fetch(url, requestOptions);
  if (response.ok) {
    return response.text();
  }
  throw new Error(`Failed to fetch ${url}: HTTP ${response.status}`);
}
|
||||
|
||||
/**
 * Convert CRLF and lone CR line endings to LF.
 *
 * @param {*} value - Input; coerced with String() before conversion.
 * @returns {string} The text with every "\r\n" or "\r" replaced by "\n".
 */
function normalizeLineEndings(value) {
  const text = String(value);
  return text.replaceAll(/\r\n?/g, "\n");
}
|
||||
|
||||
// Script entry point: run the update and, on failure, print only the error
// message and flag a non-zero exit code without terminating the process
// abruptly.
main().catch(function reportFailure(error) {
  console.error(error.message);
  process.exitCode = 1;
});
|
||||
940
src/archiver.mjs
940
src/archiver.mjs
File diff suppressed because it is too large
Load Diff
@@ -144,6 +144,7 @@ export class AssetInliner {
|
||||
this.referer = options.referer;
|
||||
this.maxAssetBytes = options.maxAssetBytes || 30 * 1024 * 1024;
|
||||
this.maxInlineStyleBytes = options.maxInlineStyleBytes || 128 * 1024;
|
||||
this.shouldBlockAsset = options.shouldBlockAsset || null;
|
||||
this.cache = new Map();
|
||||
this.warnings = [];
|
||||
}
|
||||
@@ -214,7 +215,7 @@ export class AssetInliner {
|
||||
if (!absolute || absolute.startsWith("data:")) {
|
||||
return "";
|
||||
}
|
||||
const css = await this.fetchText(absolute, baseUrl);
|
||||
const css = await this.fetchText(absolute, baseUrl, "stylesheet");
|
||||
if (css == null) {
|
||||
return "";
|
||||
}
|
||||
@@ -242,7 +243,7 @@ export class AssetInliner {
|
||||
}
|
||||
}
|
||||
|
||||
const dataUri = await this.toDataUri(href, baseUrl);
|
||||
const dataUri = await this.toDataUri(href, baseUrl, linkResourceType(asValue));
|
||||
if (!dataUri) {
|
||||
return "";
|
||||
}
|
||||
@@ -251,12 +252,13 @@ export class AssetInliner {
|
||||
|
||||
async rewriteMediaAttributes(tag, baseUrl) {
|
||||
let output = tag;
|
||||
const tagName = getTagName(output);
|
||||
for (const attr of ["src", "poster", "data"]) {
|
||||
const value = getAttribute(output, attr);
|
||||
if (!value) {
|
||||
continue;
|
||||
}
|
||||
const dataUri = await this.toDataUri(value, baseUrl);
|
||||
const dataUri = await this.toDataUri(value, baseUrl, mediaResourceType(tagName, attr));
|
||||
if (dataUri) {
|
||||
output = setAttribute(output, attr, dataUri);
|
||||
} else {
|
||||
@@ -291,7 +293,7 @@ export class AssetInliner {
|
||||
if (!absolute || absolute.startsWith("data:")) {
|
||||
return tag;
|
||||
}
|
||||
const text = await this.fetchText(absolute, baseUrl);
|
||||
const text = await this.fetchText(absolute, baseUrl, "subdocument");
|
||||
if (text != null) {
|
||||
const inlined = await this.inlineHtml(text, absolute, { depth: depth + 1 });
|
||||
let rewritten = removeAttribute(tag, "src");
|
||||
@@ -308,7 +310,7 @@ export class AssetInliner {
|
||||
const rewritten = [];
|
||||
for (const candidate of candidates) {
|
||||
const [urlPart, ...descriptor] = candidate.split(/\s+/);
|
||||
const dataUri = await this.toDataUri(urlPart, baseUrl);
|
||||
const dataUri = await this.toDataUri(urlPart, baseUrl, "image");
|
||||
rewritten.push([dataUri || TRANSPARENT_IMAGE_DATA_URI, ...descriptor].join(" "));
|
||||
}
|
||||
return rewritten.join(", ");
|
||||
@@ -323,7 +325,7 @@ export class AssetInliner {
|
||||
if (!absolute || absolute.startsWith("data:")) {
|
||||
return "";
|
||||
}
|
||||
const imported = await this.fetchText(absolute, baseUrl);
|
||||
const imported = await this.fetchText(absolute, baseUrl, "stylesheet");
|
||||
if (imported == null) {
|
||||
return "";
|
||||
}
|
||||
@@ -336,33 +338,34 @@ export class AssetInliner {
|
||||
if (!raw || raw.startsWith("#") || /^%23/i.test(raw) || /^(?:data|blob|about|javascript):/i.test(raw)) {
|
||||
return match[0];
|
||||
}
|
||||
const dataUri = await this.toDataUri(raw, baseUrl);
|
||||
const dataUri = await this.toDataUri(raw, baseUrl, cssResourceType(raw, baseUrl));
|
||||
return dataUri ? `url("${dataUri}")` : "url(about:blank)";
|
||||
});
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
async toDataUri(rawUrl, baseUrl) {
|
||||
async toDataUri(rawUrl, baseUrl, resourceType = "other") {
|
||||
const absolute = resolveUrl(rawUrl, baseUrl);
|
||||
if (!absolute || absolute.startsWith("data:")) {
|
||||
return absolute;
|
||||
}
|
||||
if (this.cache.has(absolute)) {
|
||||
return this.cache.get(absolute);
|
||||
const cacheKey = `${resourceType}:${absolute}`;
|
||||
if (this.cache.has(cacheKey)) {
|
||||
return this.cache.get(cacheKey);
|
||||
}
|
||||
const asset = await this.fetchAsset(absolute, baseUrl);
|
||||
const asset = await this.fetchAsset(absolute, baseUrl, resourceType);
|
||||
if (!asset) {
|
||||
this.cache.set(absolute, null);
|
||||
this.cache.set(cacheKey, null);
|
||||
return null;
|
||||
}
|
||||
const dataUri = `data:${asset.contentType};base64,${asset.bytes.toString("base64")}`;
|
||||
this.cache.set(absolute, dataUri);
|
||||
this.cache.set(cacheKey, dataUri);
|
||||
return dataUri;
|
||||
}
|
||||
|
||||
async fetchText(rawUrl, baseUrl) {
|
||||
const asset = await this.fetchAsset(rawUrl, baseUrl);
|
||||
async fetchText(rawUrl, baseUrl, resourceType = "other") {
|
||||
const asset = await this.fetchAsset(rawUrl, baseUrl, resourceType);
|
||||
if (!asset) {
|
||||
return null;
|
||||
}
|
||||
@@ -373,12 +376,15 @@ export class AssetInliner {
|
||||
return asset.bytes.toString("utf8");
|
||||
}
|
||||
|
||||
async fetchAsset(rawUrl, baseUrl) {
|
||||
async fetchAsset(rawUrl, baseUrl, resourceType = "other") {
|
||||
const absolute = resolveUrl(rawUrl, baseUrl);
|
||||
if (!absolute || absolute.startsWith("data:")) {
|
||||
return null;
|
||||
}
|
||||
try {
|
||||
if (this.shouldBlockAsset?.(absolute, resourceType)) {
|
||||
return null;
|
||||
}
|
||||
if (isFileUrl(absolute)) {
|
||||
const filePath = fileURLToPath(absolute);
|
||||
const bytes = await fs.readFile(filePath);
|
||||
@@ -512,6 +518,43 @@ function mimeFromUrl(rawUrl) {
|
||||
return MIME_BY_EXT.get(path.extname(pathname).toLowerCase()) || "application/octet-stream";
|
||||
}
|
||||
|
||||
// Resource type for each recognised `as` value of a <link> element.
const LINK_AS_RESOURCE_TYPES = new Map([
  ["font", "font"],
  ["script", "script"],
  ["style", "stylesheet"],
  ["document", "subdocument"],
  ["audio", "media"],
  ["video", "media"]
]);

/**
 * Translate a <link> element's `as` value into a filter resource type.
 *
 * Matching is case-insensitive. A missing, empty, or unrecognised value is
 * treated as "image".
 *
 * @param {*} asValue - Raw `as` attribute value, possibly null/undefined.
 * @returns {string} One of "font", "script", "stylesheet", "subdocument",
 *   "media", or "image".
 */
function linkResourceType(asValue) {
  const key = String(asValue || "").toLowerCase();
  return LINK_AS_RESOURCE_TYPES.has(key) ? LINK_AS_RESOURCE_TYPES.get(key) : "image";
}
|
||||
|
||||
/**
 * Pick the filter resource type for a URL-bearing attribute on a media-like
 * element.
 *
 * @param {string} tagName - Lower-case tag name of the element.
 * @param {string} attr - Attribute holding the URL ("src", "poster", "data").
 * @returns {string} "subdocument", "object", "image", "media", or "other".
 */
function mediaResourceType(tagName, attr) {
  if (tagName === "iframe") return "subdocument";
  if (tagName === "object" || tagName === "embed") return "object";
  // A poster is a still image even though it sits on <video>. This check must
  // precede the audio/video branch, otherwise it is unreachable for <video>
  // and the poster would be classified as "media".
  if (attr === "poster") return "image";
  if (tagName === "audio" || tagName === "video") return "media";
  if (tagName === "track") return "other";
  return "image";
}
|
||||
|
||||
/**
 * Guess the filter resource type of a CSS `url(...)` reference from its file
 * extension.
 *
 * The reference is resolved against baseUrl first; if that yields nothing, or
 * the result cannot be parsed as a URL, the raw string is inspected instead.
 *
 * @param {string} rawUrl - URL text taken from the stylesheet.
 * @param {string} baseUrl - URL the reference is relative to.
 * @returns {string} "font", "media", or "image" (the fallback).
 */
function cssResourceType(rawUrl, baseUrl) {
  const target = resolveUrl(rawUrl, baseUrl) || rawUrl;

  let locator;
  try {
    locator = new URL(target).pathname;
  } catch {
    // Not a parseable absolute URL: fall back to the string as given.
    locator = target;
  }

  switch (path.extname(locator).toLowerCase()) {
    case ".woff":
    case ".woff2":
    case ".ttf":
    case ".otf":
      return "font";
    case ".mp4":
    case ".webm":
    case ".mp3":
    case ".m4a":
      return "media";
    default:
      return "image";
  }
}
|
||||
|
||||
/**
 * Extract the lower-cased tag name from markup that begins with an opening tag.
 *
 * @param {string} markup - Markup expected to start with "<name".
 * @returns {string} The tag name, or "" when the markup does not start with
 *   "<" followed by at least one letter, digit, ":" or "-".
 */
function getTagName(markup) {
  const found = markup.match(/^<([a-z0-9:-]+)/i);
  if (found === null) {
    return "";
  }
  return found[1].toLowerCase();
}
|
||||
|
||||
function getAttribute(tag, attr) {
|
||||
const openingTag = getOpeningTag(tag);
|
||||
if (!openingTag) {
|
||||
|
||||
@@ -1,6 +1,82 @@
|
||||
import assert from "node:assert/strict";
|
||||
import test from "node:test";
|
||||
import { renderPage } from "../src/archiver.mjs";
|
||||
import {
|
||||
getCosmeticCssForHostname,
|
||||
parseFilterRules,
|
||||
renderPage,
|
||||
shouldBlockRequestWithRules
|
||||
} from "../src/archiver.mjs";
|
||||
|
||||
test("parses EasyList-style network rules, exceptions, and badfilter entries", () => {
|
||||
const rules = parseFilterRules(`
|
||||
[Adblock Plus 2.0]
|
||||
||ads.example.com^$script,third-party
|
||||
@@||ads.example.com/allowed.js$script,domain=publisher.test
|
||||
banner$~image
|
||||
||disabled.example^$script
|
||||
||disabled.example^$script,badfilter
|
||||
`);
|
||||
|
||||
assert.equal(
|
||||
shouldBlockRequestWithRules(
|
||||
rules,
|
||||
"https://ads.example.com/banner.js",
|
||||
"script",
|
||||
"www.publisher.test"
|
||||
),
|
||||
true
|
||||
);
|
||||
assert.equal(
|
||||
shouldBlockRequestWithRules(
|
||||
rules,
|
||||
"https://ads.example.com/banner.png",
|
||||
"image",
|
||||
"www.publisher.test"
|
||||
),
|
||||
false
|
||||
);
|
||||
assert.equal(
|
||||
shouldBlockRequestWithRules(
|
||||
rules,
|
||||
"https://ads.example.com/allowed.js",
|
||||
"script",
|
||||
"www.publisher.test"
|
||||
),
|
||||
false
|
||||
);
|
||||
assert.equal(
|
||||
shouldBlockRequestWithRules(
|
||||
rules,
|
||||
"https://disabled.example/ad.js",
|
||||
"script",
|
||||
"www.publisher.test"
|
||||
),
|
||||
false
|
||||
);
|
||||
});
|
||||
|
||||
test("applies cosmetic filters with domain exceptions and skips unsupported procedural selectors", () => {
|
||||
const rules = parseFilterRules(`
|
||||
##.generic-ad
|
||||
example.com##.site-ad
|
||||
example.com#@#.generic-ad
|
||||
~news.example.com,example.com##.except-news
|
||||
example.*##.entity-ad
|
||||
bad.example##div:has-text(ad)
|
||||
foo.com#$#.adguard { display: none !important; }
|
||||
`);
|
||||
|
||||
const exampleCss = getCosmeticCssForHostname(rules, "www.example.com").join("\n");
|
||||
assert.doesNotMatch(exampleCss, /\.generic-ad/);
|
||||
assert.match(exampleCss, /\.site-ad/);
|
||||
assert.match(exampleCss, /\.except-news/);
|
||||
assert.match(exampleCss, /\.entity-ad/);
|
||||
|
||||
const newsCss = getCosmeticCssForHostname(rules, "news.example.com").join("\n");
|
||||
assert.doesNotMatch(newsCss, /\.except-news/);
|
||||
assert.equal(getCosmeticCssForHostname(rules, "bad.example").length, 1);
|
||||
assert.match(getCosmeticCssForHostname(rules, "foo.com").join("\n"), /\.adguard/);
|
||||
});
|
||||
|
||||
test("renderPage serializes CSSOM-inserted style rules", async () => {
|
||||
const html = `<!doctype html>
|
||||
|
||||
@@ -81,3 +81,25 @@ test("external asset reporting parses srcset-like attributes without splitting U
|
||||
"https://media.example/photos/id/master/w_240,c_limit/photo.jpg"
|
||||
]);
|
||||
});
|
||||
|
||||
test("asset inliner skips URLs blocked by the filter hook", async () => {
|
||||
const blocked = [];
|
||||
const inliner = new AssetInliner({
|
||||
shouldBlockAsset: (url, resourceType) => {
|
||||
blocked.push([url, resourceType]);
|
||||
return true;
|
||||
}
|
||||
});
|
||||
|
||||
const output = await inliner.inlineHtml(`
|
||||
<link rel="stylesheet" href="https://ads.example/ad.css">
|
||||
<img src="https://ads.example/ad.png">
|
||||
`, "https://publisher.example/article");
|
||||
|
||||
assert.doesNotMatch(output, /ad\.css/);
|
||||
assert.match(output, /data:image\/gif;base64/);
|
||||
assert.deepEqual(blocked, [
|
||||
["https://ads.example/ad.css", "stylesheet"],
|
||||
["https://ads.example/ad.png", "image"]
|
||||
]);
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user