diff --git a/README.md b/README.md
index a522386..a59eb64 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,6 @@
# Local Page Archiver
-This project saves self-contained HTML archives for pages the operator is authorized to access. It sends a real browser user agent, renders web URLs with Playwright, strips ad/tracker-like elements, normalizes the captured DOM, and inlines page requisites as `data:` URLs.
-
-It intentionally does not execute paywall-bypass rules. The bundled `bypass-paywalls-clean-filters` files are treated as reference material only; paywall selectors and scripts are not applied.
+This project saves self-contained HTML archives. It opens the input with Playwright, captures the rendered HTML, and inlines external resources as `data:` URLs.
## CLI
@@ -15,35 +13,7 @@ node src/cli.mjs archive "https://example.com/article"
For an existing HTML file:
```sh
-node src/cli.mjs archive ./page.html --static
+node src/cli.mjs archive ./page.html
```
-For an `archive.ph` HTML export where you want the captured page without the archive shell:
-
-```sh
-node src/cli.mjs archive ./bloomberg-archive.html --static --strip-archive-shell
-```
-
-Local `archive.ph` HTML inputs with `--strip-archive-shell` use the static extractor by default because those files already contain the rendered page. Add `--render` only when you explicitly want Chromium to load the local HTML first.
-
-Computed-style freezing is off by default for live web pages because it can inflate modern article pages into very large HTML files. Add `--freeze-styles` only when stylesheet inlining is not enough to preserve layout.
-
Archives are written to `ARCHIVE_PATH`, or to a development directory under the system temp directory when `ARCHIVE_PATH` is not set.
-
-## API
-
-```sh
-ARCHIVE_PATH=/tmp/local-page-archives npm run serve
-```
-
-Archive a page:
-
-```sh
-curl -X POST http://127.0.0.1:8787/archive \
- -H 'content-type: application/json' \
- -d '{"url":"https://example.com/article"}'
-```
-
-The response includes the archived file path and a local `viewUrl`.
-
-Set `PORT` to choose a port other than the default `8787`.
diff --git a/package.json b/package.json
index b1d1165..9b26266 100644
--- a/package.json
+++ b/package.json
@@ -3,13 +3,12 @@
"version": "0.1.0",
"private": true,
"type": "module",
- "description": "Render and save self-contained HTML archives for pages the operator is authorized to access.",
+ "description": "Render and save self-contained HTML archives.",
"bin": {
"archive-page": "./src/cli.mjs"
},
"scripts": {
"archive": "node src/cli.mjs archive",
- "serve": "node src/server.mjs",
"install-browsers": "playwright install chromium"
},
"dependencies": {
diff --git a/src/archiver.mjs b/src/archiver.mjs
index dcf510c..94f98cc 100644
--- a/src/archiver.mjs
+++ b/src/archiver.mjs
@@ -6,115 +6,38 @@ import {
DEFAULT_USER_AGENT,
defaultArchivePath,
findEffectiveBase,
- htmlEscape,
inputToUrl,
- isFileUrl,
isHttpUrl,
- slugForUrl,
- stripArchiveShell
+ slugForUrl
} from "./asset-inliner.mjs";
const require = createRequire(import.meta.url);
+const PAGE_TIMEOUT_MS = 60000;
+const NETWORK_IDLE_TIMEOUT_MS = 5000;
+const VIEWPORT = {
+ width: 1024,
+ height: 768
+};
export { DEFAULT_USER_AGENT, defaultArchivePath };
-const AD_SELECTORS = [
- "[data-ad-status]",
- "[data-ad-type]",
- "[aria-label*='advertisement' i]",
- "[id^='leaderboard']",
- "[class*='LeaderboardAd_']",
- "[class*='FullWidthAd_']",
- "[class*='BaseAd_']",
- ".adWrapper",
- ".dvz-v0-ad",
- "amp-ad",
- "iframe[src*='doubleclick']",
- "iframe[src*='googletagmanager']",
- "iframe[src*='googlesyndication']"
-];
-
-const TRACKER_HOST_PATTERNS = [
- "doubleclick.net",
- "googletagmanager.com",
- "googlesyndication.com",
- "google-analytics.com",
- "amazon-adsystem.com",
- "pub.doubleverify.com",
- "securepubads.g.doubleclick.net",
- "s10.histats.com",
- "sstatic1.histats.com"
-];
-
-const ANTI_ADBLOCK_PROVIDER_HOST_PATTERNS = [
- "getadmiral.com"
-];
-
-const BLOCKED_HOST_PATTERNS = [
- ...TRACKER_HOST_PATTERNS,
- ...ANTI_ADBLOCK_PROVIDER_HOST_PATTERNS
-];
-
-const ANTI_ADBLOCK_TEXT_PATTERNS = [
- "\\bad\\s*block(?:er|ing)?\\b",
- "\\bad[-\\s]?block\\b",
- "\\badblock(?:er|ing)?\\b",
- "\\badvertis(?:e|ing)\\s+block(?:er|ing)?\\b",
- "\\bblock(?:ing)?\\s+(?:ads|advertis(?:ements?|ing))\\b",
- "\\b(?:disable|turn\\s+off|pause)\\s+(?:your\\s+)?ad[-\\s]?block(?:er|ing)?\\b",
- "\\b(?:allowlist|whitelist)\\s+(?:our\\s+|this\\s+)?(?:site|website|domain)\\b",
- "\\ballow(?:ing)?\\s+(?:our\\s+)?(?:ads|advertis(?:ements?|ing))\\b",
- "\\bads?\\s+(?:are\\s+)?blocked\\b"
-];
-
-const BLOCKED_CAPTURE_PATTERNS = [
- {
- reason: "DataDome CAPTCHA/bot challenge",
- any: [
- /DataDome CAPTCHA/i,
- /captcha-delivery\.com/i
- ]
- },
- {
- reason: "blocked/CAPTCHA challenge",
- any: [
- /<title[^>]*>\s*You have been blocked\s*<\/title>/i,
- /<title[^>]*>\s*Access Denied\s*<\/title>/i,
- /\bunusual traffic\b/i
- ]
- },
- {
- reason: "human verification challenge",
- all: [
- /\bverify you are (?:a )?human\b/i,
- /\b(?:captcha|challenge|g-recaptcha|hcaptcha|turnstile)\b/i
- ]
- }
-];
-
export async function archivePage(input, options = {}) {
const sourceUrl = inputToUrl(input);
const archivePath = options.archivePath || defaultArchivePath();
const id = options.id || slugForUrl(sourceUrl);
const filePath = path.join(archivePath, `${id}.html`);
+
await fs.mkdir(archivePath, { recursive: true });
- const rawHtml = await readInputHtml(sourceUrl, options);
- const baseUrl = rawHtml ? findEffectiveBase(rawHtml, sourceUrl) : sourceUrl;
- const useStatic = options.static || (isFileUrl(sourceUrl) && options.stripArchiveShell && options.render !== true);
- const renderedHtml = useStatic
- ? prepareStaticHtml(rawHtml, options)
- : await renderPage(sourceUrl, { ...options, rawHtml, baseUrl });
- assertNotBlockedCapture(renderedHtml, sourceUrl);
-
+ const renderedHtml = await renderPage(sourceUrl);
+ const baseUrl = findEffectiveBase(renderedHtml, sourceUrl);
const inliner = new AssetInliner({
- userAgent: options.userAgent || DEFAULT_USER_AGENT,
- referer: isHttpUrl(sourceUrl) ? sourceUrl : undefined,
- maxAssetBytes: options.maxAssetBytes,
- maxInlineStyleBytes: options.maxInlineStyleBytes
+ userAgent: DEFAULT_USER_AGENT,
+ referer: isHttpUrl(sourceUrl) ? sourceUrl : undefined
});
+
const inlined = await inliner.inlineHtml(renderedHtml, baseUrl);
- const finalHtml = addArchiveComment(inlined, sourceUrl, options);
+ const finalHtml = addArchiveComment(inlined, sourceUrl);
await fs.writeFile(filePath, finalHtml, "utf8");
return {
@@ -127,409 +50,66 @@ export async function archivePage(input, options = {}) {
};
}
-export async function readInputHtml(sourceUrl, options = {}) {
- if (isFileUrl(sourceUrl)) {
- return fs.readFile(new URL(sourceUrl), "utf8");
- }
- if (!isHttpUrl(sourceUrl) || !options.static) {
- return null;
- }
- const response = await fetch(sourceUrl, {
- headers: {
- "user-agent": options.userAgent || DEFAULT_USER_AGENT,
- accept: "text/html,application/xhtml+xml"
- },
- redirect: "follow"
- });
- if (!response.ok) {
- throw new Error(`Failed to fetch ${sourceUrl}: HTTP ${response.status}`);
- }
- return response.text();
-}
-
-function prepareStaticHtml(rawHtml, options = {}) {
- if (!rawHtml) {
- throw new Error("Static mode requires an HTML input file or fetched HTML document.");
- }
- return options.stripArchiveShell ? stripArchiveShell(rawHtml) : rawHtml;
-}
-
-export async function renderPage(sourceUrl, options = {}) {
+export async function renderPage(sourceUrl) {
const playwright = loadPlaywright();
- const browser = await playwright.chromium.launch({
- headless: true
- });
+ const browser = await playwright.chromium.launch({ headless: true });
+
try {
const context = await browser.newContext({
- javaScriptEnabled: options.javaScriptEnabled ?? !(options.rawHtml && isFileUrl(sourceUrl)),
- userAgent: options.userAgent || DEFAULT_USER_AGENT,
- viewport: {
- width: options.viewportWidth || 1024,
- height: options.viewportHeight || 768
- }
+ userAgent: DEFAULT_USER_AGENT,
+ viewport: VIEWPORT
});
const page = await context.newPage();
- if (options.stripAds !== false) {
- await page.route("**/*", (route) => {
- const url = route.request().url();
- if (isTrackerUrl(url)) {
- return route.abort();
- }
- return route.continue();
- });
- }
+ await page.goto(sourceUrl, {
+ waitUntil: "domcontentloaded",
+ timeout: PAGE_TIMEOUT_MS
+ });
+ await waitForNetworkIdle(page);
+ await snapshotLoadedResourceUrls(page);
- if (options.rawHtml && isFileUrl(sourceUrl)) {
- const content = prepareRenderInputHtml(options.rawHtml, options);
- await page.setContent(content, { waitUntil: "domcontentloaded", timeout: options.timeoutMs || 60000 });
- } else {
- await page.goto(sourceUrl, { waitUntil: "domcontentloaded", timeout: options.timeoutMs || 60000 });
- }
-
- await settlePage(page, options);
- await cleanupAndFreezePage(page, options);
return await page.content();
} finally {
await browser.close();
}
}
-function assertNotBlockedCapture(html, sourceUrl) {
- const detected = detectBlockedCapture(html);
- if (!detected) {
- return;
- }
- throw new Error(
- `Archive capture failed for ${sourceUrl}: ${detected}. The renderer received a challenge page instead of the requested content, so no archive was written.`
- );
-}
-
-function detectBlockedCapture(html) {
- for (const { reason, any, all } of BLOCKED_CAPTURE_PATTERNS) {
- const anyMatched = !any || any.some((pattern) => pattern.test(html));
- const allMatched = !all || all.every((pattern) => pattern.test(html));
- if (anyMatched && allMatched) {
- return reason;
- }
- }
- return null;
-}
-
-async function settlePage(page, options) {
+async function waitForNetworkIdle(page) {
try {
- await page.waitForLoadState("networkidle", { timeout: options.networkIdleTimeoutMs || 15000 });
- } catch {
- // Dynamic pages often keep long-lived connections open; DOM capture can still proceed.
- }
-
- if (options.scroll !== false) {
- await page.evaluate(async () => {
- await new Promise((resolve) => {
- let total = 0;
- const step = Math.max(400, Math.floor(window.innerHeight * 0.8));
- const timer = setInterval(() => {
- const previous = document.scrollingElement?.scrollTop || window.scrollY;
- window.scrollBy(0, step);
- total += step;
- const current = document.scrollingElement?.scrollTop || window.scrollY;
- if (current === previous || total > Math.max(document.body.scrollHeight, 20000)) {
- clearInterval(timer);
- window.scrollTo(0, 0);
- resolve();
- }
- }, 120);
- });
+ await page.waitForLoadState("networkidle", {
+ timeout: NETWORK_IDLE_TIMEOUT_MS
});
+ } catch {
+ // Some pages keep sockets open; the DOM snapshot is still useful.
}
}
-async function cleanupAndFreezePage(page, options) {
- await page.evaluate(
- ({
- adSelectors,
- antiAdblockProviderHostPatterns,
- antiAdblockTextPatterns,
- freezeStyles,
- maxFreezeElements,
- maxSanitizeElements,
- stripAds,
- stripArchiveShell: shouldStripArchiveShell
- }) => {
- function removeAll(selector) {
- document.querySelectorAll(selector).forEach((node) => node.remove());
+async function snapshotLoadedResourceUrls(page) {
+ await page.evaluate(() => {
+ document.querySelectorAll("img").forEach((img) => {
+ if (img.currentSrc) {
+ img.setAttribute("src", img.currentSrc);
}
+ });
- if (shouldStripArchiveShell) {
- const content = document.querySelector("#CONTENT .html1") || document.querySelector("#CONTENT");
- if (content) {
- document.body.innerHTML = "";
- document.body.appendChild(content.cloneNode(true));
- document.documentElement.removeAttribute("prefix");
- document.documentElement.removeAttribute("itemscope");
- document.documentElement.removeAttribute("itemtype");
+ document.querySelectorAll("video,audio").forEach((media) => {
+ if (media.currentSrc) {
+ media.setAttribute("src", media.currentSrc);
+ }
+ });
+
+ document.querySelectorAll("iframe").forEach((frame) => {
+ try {
+ const doc = frame.contentDocument;
+ if (doc?.documentElement) {
+ frame.setAttribute("srcdoc", "<!doctype html>" + doc.documentElement.outerHTML);
+ frame.removeAttribute("src");
}
+ } catch {
+ // Cross-origin frames are handled later by the asset inliner when possible.
}
-
- removeAll("script");
- removeAll("noscript");
- removeAll("link[rel='preconnect'],link[rel='dns-prefetch'],link[rel='modulepreload']");
- removeAll("meta[name='next-head-count']");
-
- if (stripAds) {
- for (const selector of adSelectors) {
- try {
- removeAll(selector);
- } catch {
- // Ignore unsupported selectors in older browser engines.
- }
- }
- removeAntiAdblockOverlays();
- }
-
- document.querySelectorAll("img").forEach((img) => {
- if (img.currentSrc) {
- img.setAttribute("data-original-src", img.getAttribute("src") || "");
- img.setAttribute("src", img.currentSrc);
- }
- img.removeAttribute("srcset");
- img.removeAttribute("sizes");
- img.setAttribute("loading", "lazy");
- });
-
- document.querySelectorAll("source").forEach((source) => {
- source.removeAttribute("srcset");
- });
-
- document.querySelectorAll("video,audio").forEach((media) => {
- if (media.currentSrc) {
- media.setAttribute("src", media.currentSrc);
- }
- });
-
- document.querySelectorAll("iframe").forEach((frame) => {
- const src = frame.getAttribute("src");
- if (src) {
- frame.setAttribute("data-archived-src", src);
- }
- try {
- const doc = frame.contentDocument;
- if (doc?.documentElement) {
- frame.setAttribute("srcdoc", "<!doctype html>" + doc.documentElement.outerHTML);
- frame.removeAttribute("src");
- }
- } catch {
- // Cross-origin iframe sources are handled in the Node-side inliner when possible.
- }
- });
-
- function removeAntiAdblockOverlays() {
- const textPatterns = antiAdblockTextPatterns.map((pattern) => new RegExp(pattern, "i"));
- const candidates = new Set();
- const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_ELEMENT);
- let node = walker.currentNode;
- let visited = 0;
-
- while (node && visited < maxSanitizeElements) {
- if (node !== document.body && node !== document.documentElement && hasAntiAdblockSignal(node, textPatterns)) {
- candidates.add(node);
- }
- visited += 1;
- node = walker.nextNode();
- }
-
- let removed = 0;
- candidates.forEach((node) => {
- const container = findRoadblockContainer(node, textPatterns);
- if (container && isLikelyAntiAdblockRoadblock(container, textPatterns)) {
- container.remove();
- removed += 1;
- }
- });
-
- if (removed > 0) {
- for (const element of [document.documentElement, document.body]) {
- element.style.removeProperty("overflow");
- element.style.removeProperty("position");
- element.style.removeProperty("inset");
- }
- }
- }
-
- function findRoadblockContainer(node, textPatterns) {
- let current = node;
- let best = null;
- while (current?.parentElement && current.parentElement !== document.body) {
- if (isLikelyAntiAdblockRoadblock(current, textPatterns)) {
- best = current;
- }
- const parentTextLength = normalizeText(current.parentElement.textContent || "").length;
- if (parentTextLength > 8000) {
- break;
- }
- current = current.parentElement;
- }
- return best || (isLikelyAntiAdblockRoadblock(node, textPatterns) ? node : null);
- }
-
- function normalizeText(text) {
- return text.replace(/\s+/g, " ").trim();
- }
-
- function hasAntiAdblockSignal(node, textPatterns) {
- return hasAntiAdblockText(node, textPatterns) || hasAntiAdblockProviderUrl(node) || hasAntiAdblockProviderDescendant(node);
- }
-
- function isLikelyAntiAdblockRoadblock(node, textPatterns) {
- if (!node || node === document.body || node === document.documentElement) {
- return false;
- }
- const hasSignal =
- hasAntiAdblockText(node, textPatterns) ||
- hasAntiAdblockProviderUrl(node) ||
- hasAntiAdblockProviderDescendant(node);
- if (!hasSignal) {
- return false;
- }
- const looksBlocking =
- isOverlayLike(node) ||
- hasDialogSemantics(node) ||
- hasBlockingClassName(node) ||
- hasActionControl(node) ||
- hasAntiAdblockProviderDescendant(node);
- if (!looksBlocking) {
- return false;
- }
- const embeddedInContent = node.closest("article, main");
- return !embeddedInContent || isOverlayLike(node) || hasDialogSemantics(node) || hasBlockingClassName(node);
- }
-
- function hasAntiAdblockText(node, textPatterns) {
- const text = normalizeText(node.textContent || "");
- return text.length > 0 && text.length <= 2000 && textPatterns.some((pattern) => pattern.test(text));
- }
-
- function hasAntiAdblockProviderDescendant(node) {
- return Array.from(node.querySelectorAll?.("a[href], iframe[src], script[src]") || []).some((descendant) =>
- hasAntiAdblockProviderUrl(descendant)
- );
- }
-
- function hasAntiAdblockProviderUrl(node) {
- for (const attr of ["href", "src", "data-src"]) {
- const value = node.getAttribute?.(attr) || node[attr] || "";
- if (value && antiAdblockProviderHostPatterns.some((host) => decodedText(value).includes(host))) {
- return true;
- }
- }
- return false;
- }
-
- function isOverlayLike(node) {
- const style = window.getComputedStyle(node);
- const rect = node.getBoundingClientRect();
- const viewportArea = Math.max(1, window.innerWidth * window.innerHeight);
- const area = Math.max(0, rect.width) * Math.max(0, rect.height);
- const zIndex = Number.parseInt(style.zIndex, 10);
- const hasHighZIndex = Number.isFinite(zIndex) && zIndex >= 10;
- const positionIsBlocking = style.position === "fixed" || style.position === "sticky";
- const coversMeaningfulArea = area / viewportArea >= 0.15;
- const coversMostViewport = area / viewportArea >= 0.45;
- return (
- (positionIsBlocking && (hasHighZIndex || coversMeaningfulArea)) ||
- (hasHighZIndex && coversMostViewport)
- );
- }
-
- function hasDialogSemantics(node) {
- const role = node.getAttribute?.("role");
- return node.tagName === "DIALOG" || role === "dialog" || role === "alertdialog" || node.getAttribute?.("aria-modal") === "true";
- }
-
- function hasBlockingClassName(node) {
- return /(?:ad[-_ ]?block|adblock|allow[-_ ]?ads|overlay|modal|interstitial|backdrop|roadblock)/i.test(
- `${node.id || ""} ${node.className || ""}`
- );
- }
-
- function hasActionControl(node) {
- return Boolean(node.querySelector?.('button, [role="button"], a[href], input[type="button"], input[type="submit"]'));
- }
-
- function decodedText(value) {
- try {
- return decodeURIComponent(value).toLowerCase();
- } catch {
- return value.toLowerCase();
- }
- }
-
- const walkedElements = [];
- const walker = document.createTreeWalker(document.documentElement, NodeFilter.SHOW_ELEMENT);
- let element = document.documentElement;
- let visited = 0;
- while (element && visited < maxSanitizeElements) {
- walkedElements.push(element);
- for (const attr of Array.from(element.attributes)) {
- if (/^on/i.test(attr.name) || attr.name === "integrity" || attr.name === "nonce") {
- element.removeAttribute(attr.name);
- }
- }
- visited += 1;
- element = walker.nextNode();
- }
-
- if (!freezeStyles || element || walkedElements.length > maxFreezeElements) {
- return;
- }
-
- for (const element of walkedElements) {
- if (element.tagName === "SCRIPT" || element.tagName === "STYLE") {
- continue;
- }
- const computed = window.getComputedStyle(element);
- const declarations = [];
- for (let i = 0; i < computed.length; i += 1) {
- const property = computed[i];
- const value = computed.getPropertyValue(property);
- if (value) {
- declarations.push(`${property}:${value}`);
- }
- }
- if (declarations.length) {
- element.setAttribute("style", declarations.join(";"));
- }
- }
- },
- {
- adSelectors: AD_SELECTORS,
- antiAdblockProviderHostPatterns: ANTI_ADBLOCK_PROVIDER_HOST_PATTERNS,
- antiAdblockTextPatterns: ANTI_ADBLOCK_TEXT_PATTERNS,
- freezeStyles: options.freezeStyles === true,
- maxFreezeElements: options.maxFreezeElements || 2500,
- maxSanitizeElements: options.maxSanitizeElements || 5000,
- stripAds: options.stripAds !== false,
- stripArchiveShell: Boolean(options.stripArchiveShell)
- }
- );
-}
-
-function prepareRenderInputHtml(rawHtml, options) {
- let html = options.stripArchiveShell ? stripArchiveShell(rawHtml) : rawHtml;
- html = html
- .replace(/