Add EasyList filter support

This commit is contained in:
2026-05-16 22:07:39 -07:00
parent f4f1a7a78d
commit 46444b193b
12 changed files with 171818 additions and 228 deletions

View File

@@ -16,6 +16,7 @@
"start:frontend": "node src/frontend-server.mjs",
"start:worker": "node src/worker-server.mjs",
"test": "node --test test/*.test.mjs",
"update-filter-lists": "node scripts/update-filter-lists.mjs",
"install-browsers": "playwright install chromium"
},
"dependencies": {

View File

@@ -1,3 +1,27 @@
# Privacy filter lists
This directory contains the static filters and userscripts applied before the
archiver snapshots a page. `bpc-paywall-filter.txt` and `userscript/` come from
Bypass Paywalls Clean. `lists/` contains bundled ad, annoyance, and cookie
notice lists for offline/container use:
- `easylist.txt` from `https://easylist.to/easylist/easylist.txt`
- `ublock-filters.txt` from `https://raw.githubusercontent.com/uBlockOrigin/uAssets/master/filters/filters.txt`
- `easylist-cookie.txt` from `https://easylist-downloads.adblockplus.org/fanboy-cookiemonster.txt`
- `ublock-annoyances.txt` from `https://raw.githubusercontent.com/uBlockOrigin/uAssets/master/filters/annoyances.txt`
- `ublock-cookies.txt` from `https://raw.githubusercontent.com/uBlockOrigin/uAssets/master/filters/annoyances-cookies.txt`
Refresh the bundled lists with:
```sh
npm run update-filter-lists
```
The parser supports common ABP/uBO network rules, exceptions, domain/type
modifiers, cosmetic hiding rules, `:remove()` and `:style(...)` downgrades, and
AdGuard CSS injection rules. Unsupported procedural filters, HTML filters, and
scriptlets are skipped.
# Bypass Paywalls Clean filters
Adblocker list which allows you to read articles from (supported) sites that implement a paywall (for a lot of sites you also need to install a userscript).\

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,98 @@
#!/usr/bin/env node
import fs from "node:fs/promises";
import path from "node:path";
import { fileURLToPath } from "node:url";
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const repoRoot = path.join(__dirname, "..");
const outputDir = path.join(repoRoot, "privacy-filters", "lists");
// Upstream filter lists bundled by this script. Each entry has:
//   file           - output file name, written inside `outputDir`
//   url            - location the list is downloaded from
//   expandIncludes - when true, main() fetches the list through
//                    fetchListWithIncludes() so `!#include` lines are inlined;
//                    otherwise the list is fetched as-is with fetchText()
const FILTER_LISTS = [
{
file: "easylist.txt",
url: "https://easylist.to/easylist/easylist.txt"
},
{
file: "ublock-filters.txt",
url: "https://raw.githubusercontent.com/uBlockOrigin/uAssets/master/filters/filters.txt",
expandIncludes: true
},
{
file: "easylist-cookie.txt",
url: "https://easylist-downloads.adblockplus.org/fanboy-cookiemonster.txt"
},
{
file: "ublock-annoyances.txt",
url: "https://raw.githubusercontent.com/uBlockOrigin/uAssets/master/filters/annoyances.txt",
expandIncludes: true
},
{
file: "ublock-cookies.txt",
url: "https://raw.githubusercontent.com/uBlockOrigin/uAssets/master/filters/annoyances-cookies.txt",
expandIncludes: true
}
];
/**
 * Downloads every entry of FILTER_LISTS and writes it into `outputDir`,
 * prefixed with two comment lines recording the source URL and the time of
 * bundling.
 *
 * Lists are processed one after another; the first failed download rejects
 * the returned promise and later lists are not attempted.
 *
 * @returns {Promise<void>}
 */
async function main() {
  await fs.mkdir(outputDir, { recursive: true });
  for (const list of FILTER_LISTS) {
    const content = list.expandIncludes
      ? await fetchListWithIncludes(list.url)
      : await fetchText(list.url);
    const header = [
      `! Bundled by scripts/update-filter-lists.mjs from ${list.url}`,
      `! Bundled at ${new Date().toISOString()}`,
      ""
    ].join("\n");
    // Drop all trailing whitespace and end the file with exactly one newline.
    const output = `${header}${normalizeLineEndings(content).trimEnd()}\n`;
    const outputPath = path.join(outputDir, list.file);
    await fs.writeFile(outputPath, output, "utf8");
    // String#length counts UTF-16 code units, which differs from the size on
    // disk as soon as a list contains non-ASCII text; report the encoded size.
    const byteCount = Buffer.byteLength(output, "utf8");
    console.log(`wrote ${path.relative(repoRoot, outputPath)} (${byteCount} bytes)`);
  }
}
/**
 * Fetches a filter list and recursively inlines its `!#include <path>` lines.
 *
 * Include paths are resolved against the URL of the list that contains them.
 * Each inlined body is wrapped in `! >>> begin include` / `! <<< end include`
 * comment lines. A URL that was already visited during this expansion
 * contributes an empty string, which keeps include cycles from recursing
 * forever.
 *
 * @param {string} url - URL of the list to download.
 * @param {Set<string>} [seen] - URLs already expanded; shared across the recursion.
 * @returns {Promise<string>} The list text with includes expanded, joined with "\n".
 */
async function fetchListWithIncludes(url, seen = new Set()) {
  if (seen.has(url)) {
    return "";
  }
  seen.add(url);

  const body = normalizeLineEndings(await fetchText(url));
  const expanded = [];
  for (const rawLine of body.split("\n")) {
    const directive = /^!#include\s+(.+)$/.exec(rawLine);
    if (directive === null) {
      expanded.push(rawLine);
      continue;
    }
    const target = directive[1].trim();
    const targetUrl = new URL(target, url).href;
    expanded.push(`! >>> begin include ${target}`);
    expanded.push(await fetchListWithIncludes(targetUrl, seen));
    expanded.push(`! <<< end include ${target}`);
  }
  return expanded.join("\n");
}
/**
 * Downloads `url` and resolves with the response body as text.
 *
 * The request is aborted after `timeoutMs` so a stalled server cannot hang the
 * updater indefinitely. Every failure (network error, timeout, non-2xx status,
 * body read error) is rethrown as an Error whose message names the URL,
 * because the script's top-level handler prints only `error.message`. The
 * underlying error is kept on `cause`.
 *
 * @param {string} url - Absolute URL to download.
 * @param {{ timeoutMs?: number }} [options] - `timeoutMs` defaults to 60 seconds.
 * @returns {Promise<string>} The response body.
 * @throws {Error} When the download fails or the status is not OK.
 */
async function fetchText(url, { timeoutMs = 60_000 } = {}) {
  try {
    const response = await fetch(url, {
      headers: {
        "user-agent": "local-page-archiver filter-list updater"
      },
      redirect: "follow",
      signal: AbortSignal.timeout(timeoutMs)
    });
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }
    // Await here so body-read failures are caught and annotated as well.
    return await response.text();
  } catch (error) {
    throw new Error(`Failed to fetch ${url}: ${error.message}`, { cause: error });
  }
}
/**
 * Converts Windows (CRLF) and old Mac (lone CR) line endings to LF.
 *
 * @param {*} value - Coerced to a string before conversion.
 * @returns {string} The text with "\n" as the only line terminator.
 */
function normalizeLineEndings(value) {
  const text = String(value);
  // Splitting on CR-optionally-followed-by-LF and rejoining with LF rewrites
  // every CRLF / CR while leaving existing LFs untouched.
  return text.split(/\r\n?/).join("\n");
}
// Script entry point. On failure, print only the error message and mark the
// run as failed by setting process.exitCode to 1 (process.exit() is not called).
main().catch((error) => {
console.error(error.message);
process.exitCode = 1;
});

File diff suppressed because it is too large Load Diff

View File

@@ -144,6 +144,7 @@ export class AssetInliner {
this.referer = options.referer;
this.maxAssetBytes = options.maxAssetBytes || 30 * 1024 * 1024;
this.maxInlineStyleBytes = options.maxInlineStyleBytes || 128 * 1024;
this.shouldBlockAsset = options.shouldBlockAsset || null;
this.cache = new Map();
this.warnings = [];
}
@@ -214,7 +215,7 @@ export class AssetInliner {
if (!absolute || absolute.startsWith("data:")) {
return "";
}
const css = await this.fetchText(absolute, baseUrl);
const css = await this.fetchText(absolute, baseUrl, "stylesheet");
if (css == null) {
return "";
}
@@ -242,7 +243,7 @@ export class AssetInliner {
}
}
const dataUri = await this.toDataUri(href, baseUrl);
const dataUri = await this.toDataUri(href, baseUrl, linkResourceType(asValue));
if (!dataUri) {
return "";
}
@@ -251,12 +252,13 @@ export class AssetInliner {
async rewriteMediaAttributes(tag, baseUrl) {
let output = tag;
const tagName = getTagName(output);
for (const attr of ["src", "poster", "data"]) {
const value = getAttribute(output, attr);
if (!value) {
continue;
}
const dataUri = await this.toDataUri(value, baseUrl);
const dataUri = await this.toDataUri(value, baseUrl, mediaResourceType(tagName, attr));
if (dataUri) {
output = setAttribute(output, attr, dataUri);
} else {
@@ -291,7 +293,7 @@ export class AssetInliner {
if (!absolute || absolute.startsWith("data:")) {
return tag;
}
const text = await this.fetchText(absolute, baseUrl);
const text = await this.fetchText(absolute, baseUrl, "subdocument");
if (text != null) {
const inlined = await this.inlineHtml(text, absolute, { depth: depth + 1 });
let rewritten = removeAttribute(tag, "src");
@@ -308,7 +310,7 @@ export class AssetInliner {
const rewritten = [];
for (const candidate of candidates) {
const [urlPart, ...descriptor] = candidate.split(/\s+/);
const dataUri = await this.toDataUri(urlPart, baseUrl);
const dataUri = await this.toDataUri(urlPart, baseUrl, "image");
rewritten.push([dataUri || TRANSPARENT_IMAGE_DATA_URI, ...descriptor].join(" "));
}
return rewritten.join(", ");
@@ -323,7 +325,7 @@ export class AssetInliner {
if (!absolute || absolute.startsWith("data:")) {
return "";
}
const imported = await this.fetchText(absolute, baseUrl);
const imported = await this.fetchText(absolute, baseUrl, "stylesheet");
if (imported == null) {
return "";
}
@@ -336,33 +338,34 @@ export class AssetInliner {
if (!raw || raw.startsWith("#") || /^%23/i.test(raw) || /^(?:data|blob|about|javascript):/i.test(raw)) {
return match[0];
}
const dataUri = await this.toDataUri(raw, baseUrl);
const dataUri = await this.toDataUri(raw, baseUrl, cssResourceType(raw, baseUrl));
return dataUri ? `url("${dataUri}")` : "url(about:blank)";
});
return output;
}
async toDataUri(rawUrl, baseUrl) {
async toDataUri(rawUrl, baseUrl, resourceType = "other") {
const absolute = resolveUrl(rawUrl, baseUrl);
if (!absolute || absolute.startsWith("data:")) {
return absolute;
}
if (this.cache.has(absolute)) {
return this.cache.get(absolute);
const cacheKey = `${resourceType}:${absolute}`;
if (this.cache.has(cacheKey)) {
return this.cache.get(cacheKey);
}
const asset = await this.fetchAsset(absolute, baseUrl);
const asset = await this.fetchAsset(absolute, baseUrl, resourceType);
if (!asset) {
this.cache.set(absolute, null);
this.cache.set(cacheKey, null);
return null;
}
const dataUri = `data:${asset.contentType};base64,${asset.bytes.toString("base64")}`;
this.cache.set(absolute, dataUri);
this.cache.set(cacheKey, dataUri);
return dataUri;
}
async fetchText(rawUrl, baseUrl) {
const asset = await this.fetchAsset(rawUrl, baseUrl);
async fetchText(rawUrl, baseUrl, resourceType = "other") {
const asset = await this.fetchAsset(rawUrl, baseUrl, resourceType);
if (!asset) {
return null;
}
@@ -373,12 +376,15 @@ export class AssetInliner {
return asset.bytes.toString("utf8");
}
async fetchAsset(rawUrl, baseUrl) {
async fetchAsset(rawUrl, baseUrl, resourceType = "other") {
const absolute = resolveUrl(rawUrl, baseUrl);
if (!absolute || absolute.startsWith("data:")) {
return null;
}
try {
if (this.shouldBlockAsset?.(absolute, resourceType)) {
return null;
}
if (isFileUrl(absolute)) {
const filePath = fileURLToPath(absolute);
const bytes = await fs.readFile(filePath);
@@ -512,6 +518,43 @@ function mimeFromUrl(rawUrl) {
return MIME_BY_EXT.get(path.extname(pathname).toLowerCase()) || "application/octet-stream";
}
/**
 * Maps the `as` attribute of a `<link>` element to the resource type passed to
 * the asset-blocking hook.
 *
 * `track`, `object` and `embed` use the same types that mediaResourceType()
 * returns for the corresponding elements, so the same resource is classified
 * identically whichever way it is referenced. A missing or unrecognised value
 * is treated as "image".
 *
 * @param {string|null|undefined} asValue - Raw `as` attribute value; matched case-insensitively.
 * @returns {string} One of "font", "script", "stylesheet", "subdocument",
 *   "media", "object", "other" or "image".
 */
function linkResourceType(asValue) {
  switch (String(asValue || "").toLowerCase()) {
    case "font":
      return "font";
    case "script":
      return "script";
    case "style":
      return "stylesheet";
    case "document":
      return "subdocument";
    case "audio":
    case "video":
      return "media";
    case "object":
    case "embed":
      return "object";
    case "track":
      return "other";
    default:
      return "image";
  }
}
/**
 * Derives the resource type for a URL-bearing attribute of an embedded-content
 * element, for use by the asset-blocking hook.
 *
 * @param {string} tagName - Lower-case element name (e.g. "img", "video").
 * @param {string} attr - Attribute holding the URL ("src", "poster" or "data").
 * @returns {string} "subdocument", "object", "image", "media" or "other".
 */
function mediaResourceType(tagName, attr) {
  if (tagName === "iframe") return "subdocument";
  if (tagName === "object" || tagName === "embed") return "object";
  // Must precede the audio/video check: `poster` lives on <video>, and the
  // poster frame is a still image, not a media stream.
  if (attr === "poster") return "image";
  if (tagName === "audio" || tagName === "video") return "media";
  if (tagName === "track") return "other";
  return "image";
}
/**
 * Guesses the resource type of a CSS `url(...)` reference from its file
 * extension, for use by the asset-blocking hook.
 *
 * @param {string} rawUrl - URL exactly as written in the stylesheet.
 * @param {string} baseUrl - URL the reference is resolved against.
 * @returns {string} "font", "media", or "image" when the extension is not a
 *   known font or media extension.
 */
function cssResourceType(rawUrl, baseUrl) {
  const absolute = resolveUrl(rawUrl, baseUrl) || rawUrl;
  let pathname;
  try {
    pathname = new URL(absolute).pathname;
  } catch {
    // Not an absolute URL: fall back to the raw string, but drop any query or
    // fragment so "font.woff2?v=3" still yields the ".woff2" extension.
    pathname = String(absolute).split(/[?#]/, 1)[0];
  }
  const ext = path.extname(pathname).toLowerCase();
  // ".eot" is included because legacy @font-face rules still list it.
  if ([".woff", ".woff2", ".ttf", ".otf", ".eot"].includes(ext)) return "font";
  if ([".mp4", ".webm", ".mp3", ".m4a"].includes(ext)) return "media";
  return "image";
}
/**
 * Extracts the lower-cased element name from markup that begins with `<name`.
 *
 * @param {string} markup - Markup starting at an opening tag.
 * @returns {string} The tag name, or "" when the markup does not start with a tag.
 */
function getTagName(markup) {
  const found = markup.match(/^<([a-z0-9:-]+)/i);
  if (found === null) {
    return "";
  }
  return found[1].toLowerCase();
}
function getAttribute(tag, attr) {
const openingTag = getOpeningTag(tag);
if (!openingTag) {

View File

@@ -1,6 +1,82 @@
import assert from "node:assert/strict";
import test from "node:test";
import { renderPage } from "../src/archiver.mjs";
import {
getCosmeticCssForHostname,
parseFilterRules,
renderPage,
shouldBlockRequestWithRules
} from "../src/archiver.mjs";
// Exercises parseFilterRules + shouldBlockRequestWithRules on network rules.
// The fixture contains a list header line, a `$script,third-party` block rule,
// an `@@` exception limited to `domain=publisher.test`, a `$~image` rule, and a
// rule that is cancelled by an identical rule carrying `badfilter`.
// NOTE(review): the fourth argument of shouldBlockRequestWithRules looks like
// the hostname of the page issuing the request - confirm against src/archiver.mjs.
test("parses EasyList-style network rules, exceptions, and badfilter entries", () => {
const rules = parseFilterRules(`
[Adblock Plus 2.0]
||ads.example.com^$script,third-party
@@||ads.example.com/allowed.js$script,domain=publisher.test
banner$~image
||disabled.example^$script
||disabled.example^$script,badfilter
`);
// A script on ads.example.com requested from www.publisher.test is blocked.
assert.equal(
shouldBlockRequestWithRules(
rules,
"https://ads.example.com/banner.js",
"script",
"www.publisher.test"
),
true
);
// An image on the same host is not blocked: the host rule is `$script` only
// and the `banner` rule excludes images via `$~image`.
assert.equal(
shouldBlockRequestWithRules(
rules,
"https://ads.example.com/banner.png",
"image",
"www.publisher.test"
),
false
);
// The `@@` exception for allowed.js wins over the block rule.
assert.equal(
shouldBlockRequestWithRules(
rules,
"https://ads.example.com/allowed.js",
"script",
"www.publisher.test"
),
false
);
// The disabled.example rule was neutralised by its `badfilter` twin.
assert.equal(
shouldBlockRequestWithRules(
rules,
"https://disabled.example/ad.js",
"script",
"www.publisher.test"
),
false
);
});
// Exercises parseFilterRules + getCosmeticCssForHostname on cosmetic rules:
// a generic rule, a domain-specific rule, a `#@#` exception, a `~domain`
// exclusion, an `example.*` entity rule, a procedural `:has-text()` rule, and
// an AdGuard `#$#` CSS-injection rule.
test("applies cosmetic filters with domain exceptions and skips unsupported procedural selectors", () => {
const rules = parseFilterRules(`
##.generic-ad
example.com##.site-ad
example.com#@#.generic-ad
~news.example.com,example.com##.except-news
example.*##.entity-ad
bad.example##div:has-text(ad)
foo.com#$#.adguard { display: none !important; }
`);
// www.example.com: the generic rule is excepted, the site, exclusion and
// entity rules all apply.
const exampleCss = getCosmeticCssForHostname(rules, "www.example.com").join("\n");
assert.doesNotMatch(exampleCss, /\.generic-ad/);
assert.match(exampleCss, /\.site-ad/);
assert.match(exampleCss, /\.except-news/);
assert.match(exampleCss, /\.entity-ad/);
// news.example.com is excluded by `~news.example.com`.
const newsCss = getCosmeticCssForHostname(rules, "news.example.com").join("\n");
assert.doesNotMatch(newsCss, /\.except-news/);
// bad.example yields a single entry - presumably the generic `.generic-ad`
// rule, with the procedural `:has-text()` rule skipped; TODO confirm.
assert.equal(getCosmeticCssForHostname(rules, "bad.example").length, 1);
assert.match(getCosmeticCssForHostname(rules, "foo.com").join("\n"), /\.adguard/);
});
test("renderPage serializes CSSOM-inserted style rules", async () => {
const html = `<!doctype html>

View File

@@ -81,3 +81,25 @@ test("external asset reporting parses srcset-like attributes without splitting U
"https://media.example/photos/id/master/w_240,c_limit/photo.jpg"
]);
});
// Verifies the `shouldBlockAsset` option of AssetInliner: the hook below
// records every (url, resourceType) pair it is asked about and blocks all of
// them.
test("asset inliner skips URLs blocked by the filter hook", async () => {
const blocked = [];
const inliner = new AssetInliner({
shouldBlockAsset: (url, resourceType) => {
blocked.push([url, resourceType]);
return true;
}
});
const output = await inliner.inlineHtml(`
<link rel="stylesheet" href="https://ads.example/ad.css">
<img src="https://ads.example/ad.png">
`, "https://publisher.example/article");
// The blocked stylesheet URL must not survive in the output.
assert.doesNotMatch(output, /ad\.css/);
// A GIF data URI is expected in place of the blocked image - presumably the
// inliner's transparent placeholder; TODO confirm.
assert.match(output, /data:image\/gif;base64/);
// The hook saw each URL once, with the resource type inferred from its tag.
assert.deepEqual(blocked, [
["https://ads.example/ad.css", "stylesheet"],
["https://ads.example/ad.png", "image"]
]);
});