diff --git a/README.md b/README.md index a522386..a59eb64 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,6 @@ # Local Page Archiver -This project saves self-contained HTML archives for pages the operator is authorized to access. It sends a real browser user agent, renders web URLs with Playwright, strips ad/tracker-like elements, normalizes the captured DOM, and inlines page requisites as `data:` URLs. - -It intentionally does not execute paywall-bypass rules. The bundled `bypass-paywalls-clean-filters` files are treated as reference material only; paywall selectors and scripts are not applied. +This project saves self-contained HTML archives. It opens the input with Playwright, captures the rendered HTML, and inlines external resources as `data:` URLs. ## CLI @@ -15,35 +13,7 @@ node src/cli.mjs archive "https://example.com/article" For an existing HTML file: ```sh -node src/cli.mjs archive ./page.html --static +node src/cli.mjs archive ./page.html ``` -For an `archive.ph` HTML export where you want the captured page without the archive shell: - -```sh -node src/cli.mjs archive ./bloomberg-archive.html --static --strip-archive-shell -``` - -Local `archive.ph` HTML inputs with `--strip-archive-shell` use the static extractor by default because those files already contain the rendered page. Add `--render` only when you explicitly want Chromium to load the local HTML first. - -Computed-style freezing is off by default for live web pages because it can inflate modern article pages into very large HTML files. Add `--freeze-styles` only when stylesheet inlining is not enough to preserve layout. - Archives are written to `ARCHIVE_PATH`, or to a development directory under the system temp directory when `ARCHIVE_PATH` is not set. - -## API - -```sh -ARCHIVE_PATH=/tmp/local-page-archives npm run serve -``` - -Archive a page: - -```sh -curl -X POST http://127.0.0.1:8787/archive \ - -H 'content-type: application/json' \ - -d '{"url":"https://example.com/article"}' -``` - -The response includes the archived file path and a local `viewUrl`. - -Set `PORT` to choose a port other than the default `8787`. diff --git a/package.json b/package.json index b1d1165..9b26266 100644 --- a/package.json +++ b/package.json @@ -3,13 +3,12 @@ "version": "0.1.0", "private": true, "type": "module", - "description": "Render and save self-contained HTML archives for pages the operator is authorized to access.", + "description": "Render and save self-contained HTML archives.", "bin": { "archive-page": "./src/cli.mjs" }, "scripts": { "archive": "node src/cli.mjs archive", - "serve": "node src/server.mjs", "install-browsers": "playwright install chromium" }, "dependencies": { diff --git a/src/archiver.mjs b/src/archiver.mjs index dcf510c..94f98cc 100644 --- a/src/archiver.mjs +++ b/src/archiver.mjs @@ -6,115 +6,38 @@ import { DEFAULT_USER_AGENT, defaultArchivePath, findEffectiveBase, - htmlEscape, inputToUrl, - isFileUrl, isHttpUrl, - slugForUrl, - stripArchiveShell + slugForUrl } from "./asset-inliner.mjs"; const require = createRequire(import.meta.url); +const PAGE_TIMEOUT_MS = 60000; +const NETWORK_IDLE_TIMEOUT_MS = 5000; +const VIEWPORT = { + width: 1024, + height: 768 +}; export { DEFAULT_USER_AGENT, defaultArchivePath }; -const AD_SELECTORS = [ - "[data-ad-status]", - "[data-ad-type]", - "[aria-label*='advertisement' i]", - "[id^='leaderboard']", - "[class*='LeaderboardAd_']", - "[class*='FullWidthAd_']", - "[class*='BaseAd_']", - ".adWrapper", - ".dvz-v0-ad", - "amp-ad", - "iframe[src*='doubleclick']", - "iframe[src*='googletagmanager']", - "iframe[src*='googlesyndication']" -]; - -const TRACKER_HOST_PATTERNS = [ - "doubleclick.net", - "googletagmanager.com", - "googlesyndication.com", - "google-analytics.com", - "amazon-adsystem.com", - "pub.doubleverify.com", - "securepubads.g.doubleclick.net", - "s10.histats.com", - "sstatic1.histats.com" -]; - -const ANTI_ADBLOCK_PROVIDER_HOST_PATTERNS = [ - "getadmiral.com" -]; - -const BLOCKED_HOST_PATTERNS = [ - ...TRACKER_HOST_PATTERNS, - ...ANTI_ADBLOCK_PROVIDER_HOST_PATTERNS -]; - -const ANTI_ADBLOCK_TEXT_PATTERNS = [ - "\\bad\\s*block(?:er|ing)?\\b", - "\\bad[-\\s]?block\\b", - "\\badblock(?:er|ing)?\\b", - "\\badvertis(?:e|ing)\\s+block(?:er|ing)?\\b", - "\\bblock(?:ing)?\\s+(?:ads|advertis(?:ements?|ing))\\b", - "\\b(?:disable|turn\\s+off|pause)\\s+(?:your\\s+)?ad[-\\s]?block(?:er|ing)?\\b", - "\\b(?:allowlist|whitelist)\\s+(?:our\\s+|this\\s+)?(?:site|website|domain)\\b", - "\\ballow(?:ing)?\\s+(?:our\\s+)?(?:ads|advertis(?:ements?|ing))\\b", - "\\bads?\\s+(?:are\\s+)?blocked\\b" -]; - -const BLOCKED_CAPTURE_PATTERNS = [ - { - reason: "DataDome CAPTCHA/bot challenge", - any: [ - /DataDome CAPTCHA/i, - /captcha-delivery\.com/i - ] - }, - { - reason: "blocked/CAPTCHA challenge", - any: [ - /]*>\s*You have been blocked\s*<\/title>/i, - /]*>\s*Access Denied\s*<\/title>/i, - /\bunusual traffic\b/i - ] - }, - { - reason: "human verification challenge", - all: [ - /\bverify you are (?:a )?human\b/i, - /\b(?:captcha|challenge|g-recaptcha|hcaptcha|turnstile)\b/i - ] - } -]; - export async function archivePage(input, options = {}) { const sourceUrl = inputToUrl(input); const archivePath = options.archivePath || defaultArchivePath(); const id = options.id || slugForUrl(sourceUrl); const filePath = path.join(archivePath, `${id}.html`); + await fs.mkdir(archivePath, { recursive: true }); - const rawHtml = await readInputHtml(sourceUrl, options); - const baseUrl = rawHtml ? findEffectiveBase(rawHtml, sourceUrl) : sourceUrl; - const useStatic = options.static || (isFileUrl(sourceUrl) && options.stripArchiveShell && options.render !== true); - const renderedHtml = useStatic - ? prepareStaticHtml(rawHtml, options) - : await renderPage(sourceUrl, { ...options, rawHtml, baseUrl }); - assertNotBlockedCapture(renderedHtml, sourceUrl); - + const renderedHtml = await renderPage(sourceUrl); + const baseUrl = findEffectiveBase(renderedHtml, sourceUrl); const inliner = new AssetInliner({ - userAgent: options.userAgent || DEFAULT_USER_AGENT, - referer: isHttpUrl(sourceUrl) ? sourceUrl : undefined, - maxAssetBytes: options.maxAssetBytes, - maxInlineStyleBytes: options.maxInlineStyleBytes + userAgent: DEFAULT_USER_AGENT, + referer: isHttpUrl(sourceUrl) ? sourceUrl : undefined }); + const inlined = await inliner.inlineHtml(renderedHtml, baseUrl); - const finalHtml = addArchiveComment(inlined, sourceUrl, options); + const finalHtml = addArchiveComment(inlined, sourceUrl); await fs.writeFile(filePath, finalHtml, "utf8"); return { @@ -127,409 +50,66 @@ export async function archivePage(input, options = {}) { }; } -export async function readInputHtml(sourceUrl, options = {}) { - if (isFileUrl(sourceUrl)) { - return fs.readFile(new URL(sourceUrl), "utf8"); - } - if (!isHttpUrl(sourceUrl) || !options.static) { - return null; - } - const response = await fetch(sourceUrl, { - headers: { - "user-agent": options.userAgent || DEFAULT_USER_AGENT, - accept: "text/html,application/xhtml+xml" - }, - redirect: "follow" - }); - if (!response.ok) { - throw new Error(`Failed to fetch ${sourceUrl}: HTTP ${response.status}`); - } - return response.text(); -} - -function prepareStaticHtml(rawHtml, options = {}) { - if (!rawHtml) { - throw new Error("Static mode requires an HTML input file or fetched HTML document."); - } - return options.stripArchiveShell ? stripArchiveShell(rawHtml) : rawHtml; -} - -export async function renderPage(sourceUrl, options = {}) { +export async function renderPage(sourceUrl) { const playwright = loadPlaywright(); - const browser = await playwright.chromium.launch({ - headless: true - }); + const browser = await playwright.chromium.launch({ headless: true }); + try { const context = await browser.newContext({ - javaScriptEnabled: options.javaScriptEnabled ?? !(options.rawHtml && isFileUrl(sourceUrl)), - userAgent: options.userAgent || DEFAULT_USER_AGENT, - viewport: { - width: options.viewportWidth || 1024, - height: options.viewportHeight || 768 - } + userAgent: DEFAULT_USER_AGENT, + viewport: VIEWPORT }); const page = await context.newPage(); - if (options.stripAds !== false) { - await page.route("**/*", (route) => { - const url = route.request().url(); - if (isTrackerUrl(url)) { - return route.abort(); - } - return route.continue(); - }); - } + await page.goto(sourceUrl, { + waitUntil: "domcontentloaded", + timeout: PAGE_TIMEOUT_MS + }); + await waitForNetworkIdle(page); + await snapshotLoadedResourceUrls(page); - if (options.rawHtml && isFileUrl(sourceUrl)) { - const content = prepareRenderInputHtml(options.rawHtml, options); - await page.setContent(content, { waitUntil: "domcontentloaded", timeout: options.timeoutMs || 60000 }); - } else { - await page.goto(sourceUrl, { waitUntil: "domcontentloaded", timeout: options.timeoutMs || 60000 }); - } - - await settlePage(page, options); - await cleanupAndFreezePage(page, options); return await page.content(); } finally { await browser.close(); } } -function assertNotBlockedCapture(html, sourceUrl) { - const detected = detectBlockedCapture(html); - if (!detected) { - return; - } - throw new Error( - `Archive capture failed for ${sourceUrl}: ${detected}. The renderer received a challenge page instead of the requested content, so no archive was written.` - ); -} - -function detectBlockedCapture(html) { - for (const { reason, any, all } of BLOCKED_CAPTURE_PATTERNS) { - const anyMatched = !any || any.some((pattern) => pattern.test(html)); - const allMatched = !all || all.every((pattern) => pattern.test(html)); - if (anyMatched && allMatched) { - return reason; - } - } - return null; -} - -async function settlePage(page, options) { +async function waitForNetworkIdle(page) { try { - await page.waitForLoadState("networkidle", { timeout: options.networkIdleTimeoutMs || 15000 }); - } catch { - // Dynamic pages often keep long-lived connections open; DOM capture can still proceed. - } - - if (options.scroll !== false) { - await page.evaluate(async () => { - await new Promise((resolve) => { - let total = 0; - const step = Math.max(400, Math.floor(window.innerHeight * 0.8)); - const timer = setInterval(() => { - const previous = document.scrollingElement?.scrollTop || window.scrollY; - window.scrollBy(0, step); - total += step; - const current = document.scrollingElement?.scrollTop || window.scrollY; - if (current === previous || total > Math.max(document.body.scrollHeight, 20000)) { - clearInterval(timer); - window.scrollTo(0, 0); - resolve(); - } - }, 120); - }); + await page.waitForLoadState("networkidle", { + timeout: NETWORK_IDLE_TIMEOUT_MS }); + } catch { + // Some pages keep sockets open; the DOM snapshot is still useful. } } -async function cleanupAndFreezePage(page, options) { - await page.evaluate( - ({ - adSelectors, - antiAdblockProviderHostPatterns, - antiAdblockTextPatterns, - freezeStyles, - maxFreezeElements, - maxSanitizeElements, - stripAds, - stripArchiveShell: shouldStripArchiveShell - }) => { - function removeAll(selector) { - document.querySelectorAll(selector).forEach((node) => node.remove()); +async function snapshotLoadedResourceUrls(page) { + await page.evaluate(() => { + document.querySelectorAll("img").forEach((img) => { + if (img.currentSrc) { + img.setAttribute("src", img.currentSrc); } + }); - if (shouldStripArchiveShell) { - const content = document.querySelector("#CONTENT .html1") || document.querySelector("#CONTENT"); - if (content) { - document.body.innerHTML = ""; - document.body.appendChild(content.cloneNode(true)); - document.documentElement.removeAttribute("prefix"); - document.documentElement.removeAttribute("itemscope"); - document.documentElement.removeAttribute("itemtype"); + document.querySelectorAll("video,audio").forEach((media) => { + if (media.currentSrc) { + media.setAttribute("src", media.currentSrc); + } + }); + + document.querySelectorAll("iframe").forEach((frame) => { + try { + const doc = frame.contentDocument; + if (doc?.documentElement) { + frame.setAttribute("srcdoc", "" + doc.documentElement.outerHTML); + frame.removeAttribute("src"); } + } catch { + // Cross-origin frames are handled later by the asset inliner when possible. } - - removeAll("script"); - removeAll("noscript"); - removeAll("link[rel='preconnect'],link[rel='dns-prefetch'],link[rel='modulepreload']"); - removeAll("meta[name='next-head-count']"); - - if (stripAds) { - for (const selector of adSelectors) { - try { - removeAll(selector); - } catch { - // Ignore unsupported selectors in older browser engines. - } - } - removeAntiAdblockOverlays(); - } - - document.querySelectorAll("img").forEach((img) => { - if (img.currentSrc) { - img.setAttribute("data-original-src", img.getAttribute("src") || ""); - img.setAttribute("src", img.currentSrc); - } - img.removeAttribute("srcset"); - img.removeAttribute("sizes"); - img.setAttribute("loading", "lazy"); - }); - - document.querySelectorAll("source").forEach((source) => { - source.removeAttribute("srcset"); - }); - - document.querySelectorAll("video,audio").forEach((media) => { - if (media.currentSrc) { - media.setAttribute("src", media.currentSrc); - } - }); - - document.querySelectorAll("iframe").forEach((frame) => { - const src = frame.getAttribute("src"); - if (src) { - frame.setAttribute("data-archived-src", src); - } - try { - const doc = frame.contentDocument; - if (doc?.documentElement) { - frame.setAttribute("srcdoc", "" + doc.documentElement.outerHTML); - frame.removeAttribute("src"); - } - } catch { - // Cross-origin iframe sources are handled in the Node-side inliner when possible. - } - }); - - function removeAntiAdblockOverlays() { - const textPatterns = antiAdblockTextPatterns.map((pattern) => new RegExp(pattern, "i")); - const candidates = new Set(); - const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_ELEMENT); - let node = walker.currentNode; - let visited = 0; - - while (node && visited < maxSanitizeElements) { - if (node !== document.body && node !== document.documentElement && hasAntiAdblockSignal(node, textPatterns)) { - candidates.add(node); - } - visited += 1; - node = walker.nextNode(); - } - - let removed = 0; - candidates.forEach((node) => { - const container = findRoadblockContainer(node, textPatterns); - if (container && isLikelyAntiAdblockRoadblock(container, textPatterns)) { - container.remove(); - removed += 1; - } - }); - - if (removed > 0) { - for (const element of [document.documentElement, document.body]) { - element.style.removeProperty("overflow"); - element.style.removeProperty("position"); - element.style.removeProperty("inset"); - } - } - } - - function findRoadblockContainer(node, textPatterns) { - let current = node; - let best = null; - while (current?.parentElement && current.parentElement !== document.body) { - if (isLikelyAntiAdblockRoadblock(current, textPatterns)) { - best = current; - } - const parentTextLength = normalizeText(current.parentElement.textContent || "").length; - if (parentTextLength > 8000) { - break; - } - current = current.parentElement; - } - return best || (isLikelyAntiAdblockRoadblock(node, textPatterns) ? node : null); - } - - function normalizeText(text) { - return text.replace(/\s+/g, " ").trim(); - } - - function hasAntiAdblockSignal(node, textPatterns) { - return hasAntiAdblockText(node, textPatterns) || hasAntiAdblockProviderUrl(node) || hasAntiAdblockProviderDescendant(node); - } - - function isLikelyAntiAdblockRoadblock(node, textPatterns) { - if (!node || node === document.body || node === document.documentElement) { - return false; - } - const hasSignal = - hasAntiAdblockText(node, textPatterns) || - hasAntiAdblockProviderUrl(node) || - hasAntiAdblockProviderDescendant(node); - if (!hasSignal) { - return false; - } - const looksBlocking = - isOverlayLike(node) || - hasDialogSemantics(node) || - hasBlockingClassName(node) || - hasActionControl(node) || - hasAntiAdblockProviderDescendant(node); - if (!looksBlocking) { - return false; - } - const embeddedInContent = node.closest("article, main"); - return !embeddedInContent || isOverlayLike(node) || hasDialogSemantics(node) || hasBlockingClassName(node); - } - - function hasAntiAdblockText(node, textPatterns) { - const text = normalizeText(node.textContent || ""); - return text.length > 0 && text.length <= 2000 && textPatterns.some((pattern) => pattern.test(text)); - } - - function hasAntiAdblockProviderDescendant(node) { - return Array.from(node.querySelectorAll?.("a[href], iframe[src], script[src]") || []).some((descendant) => - hasAntiAdblockProviderUrl(descendant) - ); - } - - function hasAntiAdblockProviderUrl(node) { - for (const attr of ["href", "src", "data-src"]) { - const value = node.getAttribute?.(attr) || node[attr] || ""; - if (value && antiAdblockProviderHostPatterns.some((host) => decodedText(value).includes(host))) { - return true; - } - } - return false; - } - - function isOverlayLike(node) { - const style = window.getComputedStyle(node); - const rect = node.getBoundingClientRect(); - const viewportArea = Math.max(1, window.innerWidth * window.innerHeight); - const area = Math.max(0, rect.width) * Math.max(0, rect.height); - const zIndex = Number.parseInt(style.zIndex, 10); - const hasHighZIndex = Number.isFinite(zIndex) && zIndex >= 10; - const positionIsBlocking = style.position === "fixed" || style.position === "sticky"; - const coversMeaningfulArea = area / viewportArea >= 0.15; - const coversMostViewport = area / viewportArea >= 0.45; - return ( - (positionIsBlocking && (hasHighZIndex || coversMeaningfulArea)) || - (hasHighZIndex && coversMostViewport) - ); - } - - function hasDialogSemantics(node) { - const role = node.getAttribute?.("role"); - return node.tagName === "DIALOG" || role === "dialog" || role === "alertdialog" || node.getAttribute?.("aria-modal") === "true"; - } - - function hasBlockingClassName(node) { - return /(?:ad[-_ ]?block|adblock|allow[-_ ]?ads|overlay|modal|interstitial|backdrop|roadblock)/i.test( - `${node.id || ""} ${node.className || ""}` - ); - } - - function hasActionControl(node) { - return Boolean(node.querySelector?.('button, [role="button"], a[href], input[type="button"], input[type="submit"]')); - } - - function decodedText(value) { - try { - return decodeURIComponent(value).toLowerCase(); - } catch { - return value.toLowerCase(); - } - } - - const walkedElements = []; - const walker = document.createTreeWalker(document.documentElement, NodeFilter.SHOW_ELEMENT); - let element = document.documentElement; - let visited = 0; - while (element && visited < maxSanitizeElements) { - walkedElements.push(element); - for (const attr of Array.from(element.attributes)) { - if (/^on/i.test(attr.name) || attr.name === "integrity" || attr.name === "nonce") { - element.removeAttribute(attr.name); - } - } - visited += 1; - element = walker.nextNode(); - } - - if (!freezeStyles || element || walkedElements.length > maxFreezeElements) { - return; - } - - for (const element of walkedElements) { - if (element.tagName === "SCRIPT" || element.tagName === "STYLE") { - continue; - } - const computed = window.getComputedStyle(element); - const declarations = []; - for (let i = 0; i < computed.length; i += 1) { - const property = computed[i]; - const value = computed.getPropertyValue(property); - if (value) { - declarations.push(`${property}:${value}`); - } - } - if (declarations.length) { - element.setAttribute("style", declarations.join(";")); - } - } - }, - { - adSelectors: AD_SELECTORS, - antiAdblockProviderHostPatterns: ANTI_ADBLOCK_PROVIDER_HOST_PATTERNS, - antiAdblockTextPatterns: ANTI_ADBLOCK_TEXT_PATTERNS, - freezeStyles: options.freezeStyles === true, - maxFreezeElements: options.maxFreezeElements || 2500, - maxSanitizeElements: options.maxSanitizeElements || 5000, - stripAds: options.stripAds !== false, - stripArchiveShell: Boolean(options.stripArchiveShell) - } - ); -} - -function prepareRenderInputHtml(rawHtml, options) { - let html = options.stripArchiveShell ? stripArchiveShell(rawHtml) : rawHtml; - html = html - .replace(//gi, "") - .replace(//gi, ""); - if (!options.baseUrl) { - return html; - } - if (/`; - if (/]*>/i.test(html)) { - return html.replace(/]*>/i, (match) => `${match}${baseTag}`); - } - return `${baseTag}${html}`; + }); + }); } function loadPlaywright() { @@ -537,23 +117,14 @@ function loadPlaywright() { return require("playwright"); } catch (error) { throw new Error( - `Playwright is required for render mode. Run "npm install" and "npm run install-browsers", or use --static for HTML input files. Original error: ${error.message}` + `Playwright is required. Run "npm install" and "npm run install-browsers". Original error: ${error.message}` ); } } -function isTrackerUrl(rawUrl) { - let host = ""; - try { - host = new URL(rawUrl).hostname; - } catch { - return false; - } - return BLOCKED_HOST_PATTERNS.some((pattern) => host === pattern || host.endsWith(`.${pattern}`)); -} - -function addArchiveComment(html, sourceUrl, options) { - const comment = ``; +function addArchiveComment(html, sourceUrl) { + const safeSource = String(sourceUrl).replaceAll("--", "- -"); + const comment = ``; if (/]*>/i, (doctype) => `${doctype}\n${comment}`); } @@ -574,6 +145,7 @@ export function findExternalAssetRefs(html) { } } } + const linkPattern = /]*>/gi; for (const match of html.matchAll(linkPattern)) { const tag = match[0]; @@ -586,6 +158,7 @@ export function findExternalAssetRefs(html) { refs.add(href); } } + const cssUrlPattern = /url\(\s*(["']?)([^"')]+)\1\s*\)/gi; for (const match of html.matchAll(cssUrlPattern)) { const candidate = cleanCssUrl(match[2]); @@ -593,6 +166,7 @@ export function findExternalAssetRefs(html) { refs.add(candidate); } } + return Array.from(refs).sort(); } diff --git a/src/asset-inliner.mjs b/src/asset-inliner.mjs index fdcc82a..1dc4cfc 100644 --- a/src/asset-inliner.mjs +++ b/src/asset-inliner.mjs @@ -125,47 +125,6 @@ export function htmlDecode(value) { .replaceAll(">", ">"); } -export function stripArchiveShell(html) { - if (!html.includes('id="CONTENT"') && !html.includes("id='CONTENT'")) { - return html; - } - const contentStart = html.search(/]*\bid=(["'])CONTENT\1[^>]*>/i); - const marker = html.search( - /]*>\s*]*\bid=(["'])hashtags\1/i - ); - if (contentStart === -1 || marker === -1 || marker <= contentStart) { - return html; - } - const title = html.match(/]*>[\s\S]*?<\/title>/i)?.[0] || "Archived page"; - const fontStyle = html.match(/]*type=(["'])text\/css\1[^>]*>[\s\S]*?<\/style>/i)?.[0] || ""; - const capturedStart = html.slice(contentStart, marker).search(/]*\bclass=(["'])html1\1[^>]*>/i); - const fragmentStart = capturedStart === -1 ? contentStart : contentStart + capturedStart; - const fragmentEnd = findMatchingDivEnd(html, fragmentStart) || marker; - const content = html.slice(fragmentStart, Math.min(fragmentEnd, marker)); - return `${title}${fontStyle}${content}`; -} - -function findMatchingDivEnd(html, startIndex) { - const tags = /<\/?div\b[^>]*>/gi; - tags.lastIndex = startIndex; - let depth = 0; - for (const match of html.matchAll(tags)) { - const tag = match[0]; - if (match.index < startIndex) { - continue; - } - if (/^$/.test(tag)) { - depth += 1; - } else if (/^<\/div/i.test(tag)) { - depth -= 1; - if (depth === 0) { - return match.index + tag.length; - } - } - } - return null; -} - export async function replaceAsync(input, regex, replacer) { const parts = []; let lastIndex = 0; @@ -238,9 +197,6 @@ export class AssetInliner { async (match) => this.rewriteMediaAttributes(match[0], effectiveBase) ); - output = removeExternalBookkeepingUrls(output); - output = restoreArchiveProxyLinks(output); - output = await replaceAsync(output, /srcset=(["'])([\s\S]*?)\1/gi, async (match) => { const rewritten = await this.inlineSrcset(match[2], effectiveBase); return `srcset=${match[1]}${htmlEscape(rewritten)}${match[1]}`; @@ -267,7 +223,7 @@ export class AssetInliner { return ""; } const inlinedCss = await this.inlineCss(css, absolute); - return ``; + return ``; } const isInlineableLink = @@ -320,7 +276,6 @@ export class AssetInliner { const inlined = await this.inlineHtml(text, absolute, { depth: depth + 1 }); let rewritten = removeAttribute(tag, "src"); rewritten = setAttribute(rewritten, "srcdoc", inlined); - rewritten = setAttribute(rewritten, "data-archived-src", absolute); return rewritten; } return this.rewriteMediaAttributes(tag, baseUrl); @@ -459,45 +414,6 @@ export class AssetInliner { } } -function removeExternalBookkeepingUrls(html) { - return html.replace( - /\s(?:old-src|currentSourceUrl|data-original-src|data-archived-src)=(["'])([\s\S]*?)\1/gi, - "" - ); -} - -function restoreArchiveProxyLinks(html) { - return html.replace(/\s(href|action)=(["'])([\s\S]*?)\2/gi, (full, attr, quote, rawValue) => { - const restored = restoreArchiveProxyUrl(htmlDecode(rawValue)); - if (restored === rawValue) { - return full; - } - return ` ${attr}=${quote}${htmlEscape(restored)}${quote}`; - }); -} - -function restoreArchiveProxyUrl(rawValue) { - const value = rawValue.trim(); - const archiveHost = "archive\\.(?:ph|today|is|li|md|fo|vn|pm)"; - const proxied = value.match(new RegExp(`^https?://${archiveHost}/o/[^/]+/(https?://.+)$`, "i")); - if (proxied) { - return safeDecodeUrl(proxied[1]); - } - const samePage = value.match(new RegExp(`^https?://${archiveHost}/[^/#?]+(#.+)$`, "i")); - if (samePage) { - return samePage[1]; - } - return rawValue; -} - -function safeDecodeUrl(value) { - try { - return decodeURIComponent(value); - } catch { - return value; - } -} - function mimeFromUrl(rawUrl) { let pathname = rawUrl; try { diff --git a/src/cli.mjs b/src/cli.mjs index 2bbcfff..c99812d 100644 --- a/src/cli.mjs +++ b/src/cli.mjs @@ -1,5 +1,5 @@ #!/usr/bin/env node -import { archivePage, DEFAULT_USER_AGENT, defaultArchivePath } from "./archiver.mjs"; +import { archivePage, defaultArchivePath } from "./archiver.mjs"; function parseArgs(argv) { const args = { @@ -33,19 +33,7 @@ function usage() { Options: --archive-path Output directory. Defaults to ARCHIVE_PATH or ${defaultArchivePath()} - --id Output id/file stem - --static Do not use a browser; transform the input HTML only - --render Force browser rendering for local archive-shell HTML - --freeze-styles Snapshot computed styles into inline style attributes - --strip-archive-shell Remove an archive.ph shell from an already archived HTML file - --no-strip-ads Keep ad-like elements - --user-agent User agent to send for page and asset requests - --max-asset-bytes Per-asset inline limit - --max-inline-style-bytes - Per-style-attribute inline rewrite limit - -Default user agent: - ${DEFAULT_USER_AGENT}`); + --id Output id/file stem`); } async function main() { @@ -66,15 +54,7 @@ async function main() { const result = await archivePage(input, { archivePath: args["archive-path"], - id: args.id, - freezeStyles: Boolean(args["freeze-styles"]), - render: Boolean(args.render), - static: Boolean(args.static), - stripArchiveShell: Boolean(args["strip-archive-shell"]), - stripAds: args["strip-ads"] !== false, - userAgent: args["user-agent"] || DEFAULT_USER_AGENT, - maxAssetBytes: args["max-asset-bytes"] ? Number(args["max-asset-bytes"]) : undefined, - maxInlineStyleBytes: args["max-inline-style-bytes"] ? Number(args["max-inline-style-bytes"]) : undefined + id: args.id }); console.log(`Archived: ${result.sourceUrl}`); diff --git a/src/server.mjs b/src/server.mjs deleted file mode 100644 index 9ebb772..0000000 --- a/src/server.mjs +++ /dev/null @@ -1,86 +0,0 @@ -import http from "node:http"; -import fs from "node:fs/promises"; -import path from "node:path"; -import { archivePage, DEFAULT_USER_AGENT, defaultArchivePath } from "./archiver.mjs"; - -const archivePath = process.env.ARCHIVE_PATH || defaultArchivePath(); -const port = Number(process.env.PORT || 8787); - -const server = http.createServer(async (req, res) => { - try { - const url = new URL(req.url, `http://${req.headers.host}`); - if (req.method === "GET" && url.pathname === "/health") { - return sendJson(res, 200, { ok: true, archivePath }); - } - - if (req.method === "POST" && url.pathname === "/archive") { - const body = await readJson(req); - if (!body.url) { - return sendJson(res, 400, { error: "Missing required field: url" }); - } - const result = await archivePage(body.url, { - archivePath, - id: body.id, - freezeStyles: Boolean(body.freezeStyles), - render: Boolean(body.render), - static: Boolean(body.static), - stripArchiveShell: Boolean(body.stripArchiveShell), - stripAds: body.stripAds !== false, - userAgent: body.userAgent || DEFAULT_USER_AGENT, - maxAssetBytes: body.maxAssetBytes, - maxInlineStyleBytes: body.maxInlineStyleBytes - }); - return sendJson(res, 201, { - id: result.id, - sourceUrl: result.sourceUrl, - file: result.filePath, - externalAssets: result.externalAssets, - warnings: result.warnings, - viewUrl: `/archives/${encodeURIComponent(path.basename(result.filePath))}` - }); - } - - if (req.method === "GET" && url.pathname.startsWith("/archives/")) { - const file = decodeURIComponent(url.pathname.slice("/archives/".length)); - if (!/^[a-zA-Z0-9._-]+\.html$/.test(file)) { - return sendJson(res, 400, { error: "Invalid archive file name" }); - } - const fullPath = path.join(archivePath, file); - const html = await fs.readFile(fullPath); - res.writeHead(200, { - "content-type": "text/html; charset=utf-8", - "content-length": html.length - }); - return res.end(html); - } - - sendJson(res, 404, { error: "Not found" }); - } catch (error) { - sendJson(res, 500, { error: error.message }); - } -}); - -server.listen(port, () => { - console.log(`Archive API listening on http://127.0.0.1:${port}`); - console.log(`ARCHIVE_PATH=${archivePath}`); -}); - -async function readJson(req) { - const chunks = []; - for await (const chunk of req) { - chunks.push(chunk); - } - if (!chunks.length) { - return {}; - } - return JSON.parse(Buffer.concat(chunks).toString("utf8")); -} - -function sendJson(res, status, value) { - const body = Buffer.from(JSON.stringify(value, null, 2)); - res.writeHead(status, { - "content-type": "application/json; charset=utf-8", - "content-length": body.length - }); - res.end(body); -}