From 17a1ee963ab0a7eb661a2afb5039446d7a6b9dce Mon Sep 17 00:00:00 2001 From: James Magahern Date: Thu, 14 May 2026 08:12:13 -0700 Subject: [PATCH] initial commit --- .gitignore | 2 + README.md | 47 ++++ package-lock.json | 65 ++++++ package.json | 21 ++ src/archiver.mjs | 395 ++++++++++++++++++++++++++++++++ src/asset-inliner.mjs | 521 ++++++++++++++++++++++++++++++++++++++++++ src/cli.mjs | 96 ++++++++ src/server.mjs | 84 +++++++ 8 files changed, 1231 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 package-lock.json create mode 100644 package.json create mode 100644 src/archiver.mjs create mode 100644 src/asset-inliner.mjs create mode 100644 src/cli.mjs create mode 100644 src/server.mjs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d570088 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +node_modules/ + diff --git a/README.md b/README.md new file mode 100644 index 0000000..0b2870d --- /dev/null +++ b/README.md @@ -0,0 +1,47 @@ +# Local Page Archiver + +This project saves self-contained HTML archives for pages the operator is authorized to access. It sends a real browser user agent, renders web URLs with Playwright, strips ad/tracker-like elements, normalizes the captured DOM, and inlines page requisites as `data:` URLs. + +It intentionally does not execute paywall-bypass rules. The bundled `bypass-paywalls-clean-filters` files are treated as reference material only; paywall selectors and scripts are not applied. 
+ +## CLI + +```sh +npm install +npm run install-browsers +node src/cli.mjs archive "https://example.com/article" +``` + +For an existing HTML file: + +```sh +node src/cli.mjs archive ./page.html --static +``` + +For an `archive.ph` HTML export where you want the captured page without the archive shell: + +```sh +node src/cli.mjs archive ./bloomberg-archive.html --static --strip-archive-shell +``` + +Local `archive.ph` HTML inputs with `--strip-archive-shell` use the static extractor by default because those files already contain the rendered page. Add `--render` only when you explicitly want Chromium to load the local HTML first. + +Archives are written to `ARCHIVE_PATH`, or to a development directory under the system temp directory when `ARCHIVE_PATH` is not set. + +## API + +```sh +ARCHIVE_PATH=/tmp/local-page-archives npm run serve +``` + +Archive a page: + +```sh +curl -X POST http://127.0.0.1:8787/archive \ + -H 'content-type: application/json' \ + -d '{"url":"https://example.com/article"}' +``` + +The response includes the archived file path and a local `viewUrl`. + +Set `PORT` to choose a port other than the default `8787`. 
diff --git a/package-lock.json b/package-lock.json new file mode 100644 index 0000000..7c6e3c3 --- /dev/null +++ b/package-lock.json @@ -0,0 +1,65 @@ +{ + "name": "local-page-archiver", + "version": "0.1.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "local-page-archiver", + "version": "0.1.0", + "dependencies": { + "playwright": "^1.59.1" + }, + "bin": { + "archive-page": "src/cli.mjs" + }, + "engines": { + "node": ">=22" + } + }, + "node_modules/fsevents": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", + "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/playwright": { + "version": "1.60.0", + "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.60.0.tgz", + "integrity": "sha512-hheHdokM8cdqCb0lcE3s+zT4t4W+vvjpGxsZlDnikarzx8tSzMebh3UiFtgqwFwnTnjYQcsyMF8ei2mCO/tpeA==", + "license": "Apache-2.0", + "dependencies": { + "playwright-core": "1.60.0" + }, + "bin": { + "playwright": "cli.js" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "fsevents": "2.3.2" + } + }, + "node_modules/playwright-core": { + "version": "1.60.0", + "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.60.0.tgz", + "integrity": "sha512-9bW6zvX/m0lEbgTKJ6YppOKx8H3VOPBMOCFh2irXFOT4BbHgrx5hPjwJYLT40Lu+4qtD36qKc/Hn56StUW57IA==", + "license": "Apache-2.0", + "bin": { + "playwright-core": "cli.js" + }, + "engines": { + "node": ">=18" + } + } + } +} diff --git a/package.json b/package.json new file mode 100644 index 0000000..b1d1165 --- /dev/null +++ b/package.json @@ -0,0 +1,21 @@ +{ + "name": "local-page-archiver", + "version": "0.1.0", + "private": true, + "type": "module", + "description": "Render and save self-contained 
HTML archives for pages the operator is authorized to access.", + "bin": { + "archive-page": "./src/cli.mjs" + }, + "scripts": { + "archive": "node src/cli.mjs archive", + "serve": "node src/server.mjs", + "install-browsers": "playwright install chromium" + }, + "dependencies": { + "playwright": "^1.59.1" + }, + "engines": { + "node": ">=22" + } +} diff --git a/src/archiver.mjs b/src/archiver.mjs new file mode 100644 index 0000000..8165482 --- /dev/null +++ b/src/archiver.mjs @@ -0,0 +1,395 @@ +import fs from "node:fs/promises"; +import path from "node:path"; +import { createRequire } from "node:module"; +import { + AssetInliner, + DEFAULT_USER_AGENT, + defaultArchivePath, + findEffectiveBase, + htmlEscape, + inputToUrl, + isFileUrl, + isHttpUrl, + slugForUrl, + stripArchiveShell +} from "./asset-inliner.mjs"; + +const require = createRequire(import.meta.url); + +export { DEFAULT_USER_AGENT, defaultArchivePath }; + +const AD_SELECTORS = [ + "[data-ad-status]", + "[data-ad-type]", + "[aria-label*='advertisement' i]", + "[id^='leaderboard']", + "[class*='LeaderboardAd_']", + "[class*='FullWidthAd_']", + "[class*='BaseAd_']", + ".adWrapper", + ".dvz-v0-ad", + "amp-ad", + "iframe[src*='doubleclick']", + "iframe[src*='googletagmanager']", + "iframe[src*='googlesyndication']" +]; + +const TRACKER_HOST_PATTERNS = [ + "doubleclick.net", + "googletagmanager.com", + "googlesyndication.com", + "google-analytics.com", + "pub.doubleverify.com", + "securepubads.g.doubleclick.net", + "s10.histats.com", + "sstatic1.histats.com" +]; + +export async function archivePage(input, options = {}) { + const sourceUrl = inputToUrl(input); + const archivePath = options.archivePath || defaultArchivePath(); + const id = options.id || slugForUrl(sourceUrl); + const filePath = path.join(archivePath, `${id}.html`); + await fs.mkdir(archivePath, { recursive: true }); + + const rawHtml = await readInputHtml(sourceUrl, options); + const baseUrl = rawHtml ? 
findEffectiveBase(rawHtml, sourceUrl) : sourceUrl; + const useStatic = options.static || (isFileUrl(sourceUrl) && options.stripArchiveShell && options.render !== true); + const renderedHtml = useStatic + ? prepareStaticHtml(rawHtml, options) + : await renderPage(sourceUrl, { ...options, rawHtml, baseUrl }); + + const inliner = new AssetInliner({ + userAgent: options.userAgent || DEFAULT_USER_AGENT, + referer: isHttpUrl(sourceUrl) ? sourceUrl : undefined, + maxAssetBytes: options.maxAssetBytes + }); + const inlined = await inliner.inlineHtml(renderedHtml, baseUrl); + const finalHtml = addArchiveComment(inlined, sourceUrl, options); + await fs.writeFile(filePath, finalHtml, "utf8"); + + return { + id, + filePath, + sourceUrl, + archivePath, + warnings: inliner.warnings, + externalAssets: findExternalAssetRefs(finalHtml) + }; +} + +export async function readInputHtml(sourceUrl, options = {}) { + if (isFileUrl(sourceUrl)) { + return fs.readFile(new URL(sourceUrl), "utf8"); + } + if (!isHttpUrl(sourceUrl) || !options.static) { + return null; + } + const response = await fetch(sourceUrl, { + headers: { + "user-agent": options.userAgent || DEFAULT_USER_AGENT, + accept: "text/html,application/xhtml+xml" + }, + redirect: "follow" + }); + if (!response.ok) { + throw new Error(`Failed to fetch ${sourceUrl}: HTTP ${response.status}`); + } + return response.text(); +} + +function prepareStaticHtml(rawHtml, options = {}) { + if (!rawHtml) { + throw new Error("Static mode requires an HTML input file or fetched HTML document."); + } + return options.stripArchiveShell ? stripArchiveShell(rawHtml) : rawHtml; +} + +export async function renderPage(sourceUrl, options = {}) { + const playwright = loadPlaywright(); + const browser = await playwright.chromium.launch({ + headless: true + }); + try { + const context = await browser.newContext({ + javaScriptEnabled: options.javaScriptEnabled ?? 
!(options.rawHtml && isFileUrl(sourceUrl)), + userAgent: options.userAgent || DEFAULT_USER_AGENT, + viewport: { + width: options.viewportWidth || 1024, + height: options.viewportHeight || 768 + } + }); + const page = await context.newPage(); + + if (options.stripAds !== false) { + await page.route("**/*", (route) => { + const url = route.request().url(); + if (isTrackerUrl(url)) { + return route.abort(); + } + return route.continue(); + }); + } + + if (options.rawHtml && isFileUrl(sourceUrl)) { + const content = prepareRenderInputHtml(options.rawHtml, options); + await page.setContent(content, { waitUntil: "domcontentloaded", timeout: options.timeoutMs || 60000 }); + } else { + await page.goto(sourceUrl, { waitUntil: "domcontentloaded", timeout: options.timeoutMs || 60000 }); + } + + await settlePage(page, options); + await cleanupAndFreezePage(page, options); + return await page.content(); + } finally { + await browser.close(); + } +} + +async function settlePage(page, options) { + try { + await page.waitForLoadState("networkidle", { timeout: options.networkIdleTimeoutMs || 15000 }); + } catch { + // Dynamic pages often keep long-lived connections open; DOM capture can still proceed. 
+ } + + if (options.scroll !== false) { + await page.evaluate(async () => { + await new Promise((resolve) => { + let total = 0; + const step = Math.max(400, Math.floor(window.innerHeight * 0.8)); + const timer = setInterval(() => { + const previous = document.scrollingElement?.scrollTop || window.scrollY; + window.scrollBy(0, step); + total += step; + const current = document.scrollingElement?.scrollTop || window.scrollY; + if (current === previous || total > Math.max(document.body.scrollHeight, 20000)) { + clearInterval(timer); + window.scrollTo(0, 0); + resolve(); + } + }, 120); + }); + }); + } +} + +async function cleanupAndFreezePage(page, options) { + await page.evaluate( + ({ + adSelectors, + freezeStyles, + maxFreezeElements, + maxSanitizeElements, + stripAds, + stripArchiveShell: shouldStripArchiveShell + }) => { + function removeAll(selector) { + document.querySelectorAll(selector).forEach((node) => node.remove()); + } + + if (shouldStripArchiveShell) { + const content = document.querySelector("#CONTENT .html1") || document.querySelector("#CONTENT"); + if (content) { + document.body.innerHTML = ""; + document.body.appendChild(content.cloneNode(true)); + document.documentElement.removeAttribute("prefix"); + document.documentElement.removeAttribute("itemscope"); + document.documentElement.removeAttribute("itemtype"); + } + } + + removeAll("script"); + removeAll("noscript"); + removeAll("link[rel='preconnect'],link[rel='dns-prefetch'],link[rel='modulepreload']"); + removeAll("meta[name='next-head-count']"); + + if (stripAds) { + for (const selector of adSelectors) { + try { + removeAll(selector); + } catch { + // Ignore unsupported selectors in older browser engines. 
+ } + } + } + + document.querySelectorAll("img").forEach((img) => { + if (img.currentSrc) { + img.setAttribute("data-original-src", img.getAttribute("src") || ""); + img.setAttribute("src", img.currentSrc); + } + img.removeAttribute("srcset"); + img.removeAttribute("sizes"); + img.setAttribute("loading", "lazy"); + }); + + document.querySelectorAll("source").forEach((source) => { + source.removeAttribute("srcset"); + }); + + document.querySelectorAll("video,audio").forEach((media) => { + if (media.currentSrc) { + media.setAttribute("src", media.currentSrc); + } + }); + + document.querySelectorAll("iframe").forEach((frame) => { + const src = frame.getAttribute("src"); + if (src) { + frame.setAttribute("data-archived-src", src); + } + try { + const doc = frame.contentDocument; + if (doc?.documentElement) { + frame.setAttribute("srcdoc", "" + doc.documentElement.outerHTML); + frame.removeAttribute("src"); + } + } catch { + // Cross-origin iframe sources are handled in the Node-side inliner when possible. 
+ } + }); + + const walkedElements = []; + const walker = document.createTreeWalker(document.documentElement, NodeFilter.SHOW_ELEMENT); + let element = document.documentElement; + let visited = 0; + while (element && visited < maxSanitizeElements) { + walkedElements.push(element); + for (const attr of Array.from(element.attributes)) { + if (/^on/i.test(attr.name) || attr.name === "integrity" || attr.name === "nonce") { + element.removeAttribute(attr.name); + } + } + visited += 1; + element = walker.nextNode(); + } + + if (!freezeStyles || element || walkedElements.length > maxFreezeElements) { + return; + } + + for (const element of walkedElements) { + if (element.tagName === "SCRIPT" || element.tagName === "STYLE") { + continue; + } + const computed = window.getComputedStyle(element); + const declarations = []; + for (let i = 0; i < computed.length; i += 1) { + const property = computed[i]; + const value = computed.getPropertyValue(property); + if (value) { + declarations.push(`${property}:${value}`); + } + } + if (declarations.length) { + element.setAttribute("style", declarations.join(";")); + } + } + }, + { + adSelectors: AD_SELECTORS, + freezeStyles: options.freezeStyles !== false, + maxFreezeElements: options.maxFreezeElements || 2500, + maxSanitizeElements: options.maxSanitizeElements || 5000, + stripAds: options.stripAds !== false, + stripArchiveShell: Boolean(options.stripArchiveShell) + } + ); +} + +function prepareRenderInputHtml(rawHtml, options) { + let html = options.stripArchiveShell ? stripArchiveShell(rawHtml) : rawHtml; + html = html + .replace(//gi, "") + .replace(//gi, ""); + if (!options.baseUrl) { + return html; + } + if (/`; + if (/]*>/i.test(html)) { + return html.replace(/]*>/i, (match) => `${match}${baseTag}`); + } + return `${baseTag}${html}`; +} + +function loadPlaywright() { + try { + return require("playwright"); + } catch (error) { + throw new Error( + `Playwright is required for render mode. 
Run "npm install" and "npm run install-browsers", or use --static for HTML input files. Original error: ${error.message}` + ); + } +} + +function isTrackerUrl(rawUrl) { + let host = ""; + try { + host = new URL(rawUrl).hostname; + } catch { + return false; + } + return TRACKER_HOST_PATTERNS.some((pattern) => host === pattern || host.endsWith(`.${pattern}`)); +} + +function addArchiveComment(html, sourceUrl, options) { + const comment = ``; + if (/]*>/i, (doctype) => `${doctype}\n${comment}`); + } + return `\n${comment}\n${html}`; +} + +export function findExternalAssetRefs(html) { + const refs = new Set(); + const attrPattern = /\b(?:src|srcset|poster|data)\s*=\s*(["'])([\s\S]*?)\1/gi; + for (const match of html.matchAll(attrPattern)) { + if (isSelfContainedAssetRef(match[2])) { + continue; + } + for (const part of match[2].split(",")) { + const candidate = part.trim().split(/\s+/)[0]; + if (candidate && !isSelfContainedAssetRef(candidate)) { + refs.add(candidate); + } + } + } + const linkPattern = /]*>/gi; + for (const match of html.matchAll(linkPattern)) { + const tag = match[0]; + const rel = readAttribute(tag, "rel") || ""; + if (!/\b(?:stylesheet|icon|apple-touch-icon|image_src|preload)\b/i.test(rel)) { + continue; + } + const href = readAttribute(tag, "href"); + if (href && !isSelfContainedAssetRef(href)) { + refs.add(href); + } + } + const cssUrlPattern = /url\(\s*(["']?)([^"')]+)\1\s*\)/gi; + for (const match of html.matchAll(cssUrlPattern)) { + const candidate = match[2].trim(); + if (candidate && !isSelfContainedAssetRef(candidate)) { + refs.add(candidate); + } + } + return Array.from(refs).sort(); +} + +function isSelfContainedAssetRef(value) { + const trimmed = value.trim(); + return ( + !trimmed || + trimmed.startsWith("#") || + /^(?:data|about|javascript|mailto|tel):/i.test(trimmed) + ); +} + +function readAttribute(tag, attr) { + const match = tag.match(new RegExp(`\\b${attr}\\s*=\\s*("([^"]*)"|'([^']*)'|([^\\s>]+))`, "i")); + return match ? 
match[2] ?? match[3] ?? match[4] ?? "" : ""; +} diff --git a/src/asset-inliner.mjs b/src/asset-inliner.mjs new file mode 100644 index 0000000..cb68a5d --- /dev/null +++ b/src/asset-inliner.mjs @@ -0,0 +1,521 @@ +import fs from "node:fs/promises"; +import path from "node:path"; +import { fileURLToPath, pathToFileURL } from "node:url"; + +export const DEFAULT_USER_AGENT = + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"; + +const TEXT_TYPES = new Set([ + "application/javascript", + "application/json", + "application/ld+json", + "application/xml", + "image/svg+xml", + "text/css", + "text/html", + "text/javascript", + "text/plain", + "text/xml" +]); + +const MIME_BY_EXT = new Map([ + [".apng", "image/apng"], + [".avif", "image/avif"], + [".css", "text/css"], + [".gif", "image/gif"], + [".html", "text/html"], + [".ico", "image/x-icon"], + [".jpeg", "image/jpeg"], + [".jpg", "image/jpeg"], + [".js", "text/javascript"], + [".json", "application/json"], + [".m4a", "audio/mp4"], + [".mp3", "audio/mpeg"], + [".mp4", "video/mp4"], + [".otf", "font/otf"], + [".png", "image/png"], + [".svg", "image/svg+xml"], + [".ttf", "font/ttf"], + [".webm", "video/webm"], + [".webp", "image/webp"], + [".woff", "font/woff"], + [".woff2", "font/woff2"], + [".xml", "application/xml"] +]); + +const TRANSPARENT_IMAGE_DATA_URI = + "data:image/gif;base64,R0lGODlhAQABAAAAACwAAAAAAQABAAA="; + +export function defaultArchivePath() { + return process.env.ARCHIVE_PATH || path.join(process.env.TMPDIR || "/tmp", "local-page-archives"); +} + +export function isHttpUrl(value) { + return /^https?:\/\//i.test(value); +} + +export function isFileUrl(value) { + return /^file:\/\//i.test(value); +} + +export function inputToUrl(input) { + if (/^[a-z][a-z0-9+.-]*:/i.test(input)) { + return input; + } + return pathToFileURL(path.resolve(input)).href; +} + +export function slugForUrl(inputUrl) { + const url = new URL(inputUrl); + const stem = 
+ `${url.hostname}${url.pathname}` + .replace(/\/+$/, "") + .replace(/[^a-z0-9]+/gi, "-") + .replace(/^-+|-+$/g, "") + .slice(0, 90) || "archive"; + return `${stem}-${new Date().toISOString().replace(/[:.]/g, "-")}`; +} + +export function findEffectiveBase(html, fallbackBaseUrl) { + const match = html.match(/]*\bhref=(["']?)([^"'\s>]+)\1/i); + if (!match) { + return fallbackBaseUrl; + } + return resolveUrl(match[2], fallbackBaseUrl) || fallbackBaseUrl; +} + +export function resolveUrl(rawUrl, baseUrl) { + if (!rawUrl) { + return null; + } + const trimmed = htmlDecode(rawUrl.trim()); + if ( + !trimmed || + trimmed.startsWith("#") || + /^(?:about|blob|data|javascript|mailto|tel):/i.test(trimmed) + ) { + return trimmed; + } + try { + if (trimmed.startsWith("//") && (!baseUrl || /^file:/i.test(baseUrl))) { + return `https:${trimmed}`; + } + return new URL(trimmed, baseUrl).href; + } catch { + return null; + } +} + +export function htmlEscape(value) { + return String(value) + .replaceAll("&", "&") + .replaceAll("<", "<") + .replaceAll(">", ">") + .replaceAll('"', """); +} + +export function htmlDecode(value) { + return String(value) + .replaceAll("&", "&") + .replaceAll(""", '"') + .replaceAll("'", "'") + .replaceAll("'", "'") + .replaceAll("<", "<") + .replaceAll(">", ">"); +} + +export function stripArchiveShell(html) { + if (!html.includes('id="CONTENT"') && !html.includes("id='CONTENT'")) { + return html; + } + const contentStart = html.search(/]*\bid=(["'])CONTENT\1[^>]*>/i); + const marker = html.search( + /]*>\s*]*\bid=(["'])hashtags\1/i + ); + if (contentStart === -1 || marker === -1 || marker <= contentStart) { + return html; + } + const title = html.match(/]*>[\s\S]*?<\/title>/i)?.[0] || "Archived page"; + const fontStyle = html.match(/]*type=(["'])text\/css\1[^>]*>[\s\S]*?<\/style>/i)?.[0] || ""; + const capturedStart = html.slice(contentStart, marker).search(/]*\bclass=(["'])html1\1[^>]*>/i); + const fragmentStart = capturedStart === -1 ? 
contentStart : contentStart + capturedStart; + const fragmentEnd = findMatchingDivEnd(html, fragmentStart) || marker; + const content = html.slice(fragmentStart, Math.min(fragmentEnd, marker)); + return `${title}${fontStyle}${content}`; +} + +function findMatchingDivEnd(html, startIndex) { + const tags = /<\/?div\b[^>]*>/gi; + tags.lastIndex = startIndex; + let depth = 0; + for (const match of html.matchAll(tags)) { + const tag = match[0]; + if (match.index < startIndex) { + continue; + } + if (/^$/.test(tag)) { + depth += 1; + } else if (/^<\/div/i.test(tag)) { + depth -= 1; + if (depth === 0) { + return match.index + tag.length; + } + } + } + return null; +} + +export async function replaceAsync(input, regex, replacer) { + const parts = []; + let lastIndex = 0; + for (const match of input.matchAll(regex)) { + parts.push(input.slice(lastIndex, match.index)); + parts.push(await replacer(match)); + lastIndex = match.index + match[0].length; + } + parts.push(input.slice(lastIndex)); + return parts.join(""); +} + +export class AssetInliner { + constructor(options = {}) { + this.userAgent = options.userAgent || DEFAULT_USER_AGENT; + this.referer = options.referer; + this.maxAssetBytes = options.maxAssetBytes || 30 * 1024 * 1024; + this.cache = new Map(); + this.warnings = []; + } + + async inlineHtml(html, baseUrl, options = {}) { + const depth = options.depth || 0; + const effectiveBase = findEffectiveBase(html, baseUrl); + let output = html; + + output = output.replace(/]*>/gi, ""); + output = output.replace(//gi, ""); + output = output.replace(//gi, ""); + output = output.replace(/]*\brel=(["']?)(?:preconnect|dns-prefetch|modulepreload)\1[^>]*>/gi, ""); + + output = await replaceAsync(output, /]*)>([\s\S]*?)<\/style>/gi, async (match) => { + const attrs = match[1] || ""; + const css = await this.inlineCss(match[2] || "", effectiveBase); + return `${css}`; + }); + + output = await replaceAsync(output, /\sstyle=(["'])([\s\S]*?)\1/gi, async (match) => { + const css = 
await this.inlineCss(htmlDecode(match[2]), effectiveBase); + return ` style=${match[1]}${htmlEscape(css)}${match[1]}`; + }); + + output = await replaceAsync(output, /]*>/gi, async (match) => { + return this.rewriteLinkTag(match[0], effectiveBase); + }); + + output = await replaceAsync(output, /]*>[\s\S]*?<\/iframe>|]*\/?>/gi, async (match) => { + if (depth >= 1) { + return this.rewriteMediaAttributes(match[0], effectiveBase); + } + return this.rewriteIframeTag(match[0], effectiveBase, depth); + }); + + output = await replaceAsync( + output, + /<(?:img|source|audio|video|track|embed|object|input)\b[^>]*>/gi, + async (match) => this.rewriteMediaAttributes(match[0], effectiveBase) + ); + + output = removeExternalBookkeepingUrls(output); + output = restoreArchiveProxyLinks(output); + + output = await replaceAsync(output, /srcset=(["'])([\s\S]*?)\1/gi, async (match) => { + const rewritten = await this.inlineSrcset(match[2], effectiveBase); + return `srcset=${match[1]}${htmlEscape(rewritten)}${match[1]}`; + }); + + return output; + } + + async rewriteLinkTag(tag, baseUrl) { + const rel = getAttribute(tag, "rel") || ""; + const href = getAttribute(tag, "href"); + const asValue = getAttribute(tag, "as") || ""; + if (!href) { + return tag; + } + + if (/\bstylesheet\b/i.test(rel)) { + const absolute = resolveUrl(href, baseUrl); + if (!absolute || absolute.startsWith("data:")) { + return ""; + } + const css = await this.fetchText(absolute, baseUrl); + if (css == null) { + return ""; + } + const inlinedCss = await this.inlineCss(css, absolute); + return ``; + } + + const isInlineableLink = + /\b(?:icon|apple-touch-icon|image_src)\b/i.test(rel) || + (/\bpreload\b/i.test(rel) && /^(?:font|image|style)$/i.test(asValue)); + if (!isInlineableLink) { + return tag; + } + if (/\bpreload\b/i.test(rel) && /^style$/i.test(asValue)) { + return ""; + } + const dataUri = await this.toDataUri(href, baseUrl); + if (!dataUri) { + return ""; + } + return setAttribute(tag, "href", dataUri); + } 
+ + async rewriteMediaAttributes(tag, baseUrl) { + let output = tag; + for (const attr of ["src", "poster", "data"]) { + const value = getAttribute(output, attr); + if (!value) { + continue; + } + const dataUri = await this.toDataUri(value, baseUrl); + if (dataUri) { + output = setAttribute(output, attr, dataUri); + } else { + output = replaceMissingMediaAttribute(output, attr); + } + } + return output; + } + + async rewriteIframeTag(tag, baseUrl, depth) { + const src = getAttribute(tag, "src"); + if (!src || getAttribute(tag, "srcdoc")) { + return this.rewriteMediaAttributes(tag, baseUrl); + } + const absolute = resolveUrl(src, baseUrl); + if (!absolute || absolute.startsWith("data:")) { + return tag; + } + const text = await this.fetchText(absolute, baseUrl); + if (text != null) { + const inlined = await this.inlineHtml(text, absolute, { depth: depth + 1 }); + let rewritten = removeAttribute(tag, "src"); + rewritten = setAttribute(rewritten, "srcdoc", inlined); + rewritten = setAttribute(rewritten, "data-archived-src", absolute); + return rewritten; + } + return this.rewriteMediaAttributes(tag, baseUrl); + } + + async inlineSrcset(value, baseUrl) { + const candidates = value + .split(",") + .map((part) => part.trim()) + .filter(Boolean); + const rewritten = []; + for (const candidate of candidates) { + const [urlPart, ...descriptor] = candidate.split(/\s+/); + const dataUri = await this.toDataUri(urlPart, baseUrl); + rewritten.push([dataUri || TRANSPARENT_IMAGE_DATA_URI, ...descriptor].join(" ")); + } + return rewritten.join(", "); + } + + async inlineCss(css, baseUrl) { + let output = await replaceAsync( + css, + /@import\s+(?:url\()?["']?([^"')\s;]+)["']?\)?[^;]*;/gi, + async (match) => { + const absolute = resolveUrl(match[1], baseUrl); + if (!absolute || absolute.startsWith("data:")) { + return ""; + } + const imported = await this.fetchText(absolute, baseUrl); + if (imported == null) { + return ""; + } + return this.inlineCss(imported, absolute); + } + ); + 
+ output = await replaceAsync(output, /url\(\s*(["']?)([^"')]+)\1\s*\)/gi, async (match) => { + const raw = htmlDecode(match[2].trim()); + if (!raw || raw.startsWith("#") || /^(?:data|blob|about|javascript):/i.test(raw)) { + return match[0]; + } + const dataUri = await this.toDataUri(raw, baseUrl); + return dataUri ? `url("${dataUri}")` : "url(about:blank)"; + }); + + return output; + } + + async toDataUri(rawUrl, baseUrl) { + const absolute = resolveUrl(rawUrl, baseUrl); + if (!absolute || absolute.startsWith("data:")) { + return absolute; + } + if (this.cache.has(absolute)) { + return this.cache.get(absolute); + } + const asset = await this.fetchAsset(absolute, baseUrl); + if (!asset) { + this.cache.set(absolute, null); + return null; + } + const dataUri = `data:${asset.contentType};base64,${asset.bytes.toString("base64")}`; + this.cache.set(absolute, dataUri); + return dataUri; + } + + async fetchText(rawUrl, baseUrl) { + const asset = await this.fetchAsset(rawUrl, baseUrl); + if (!asset) { + return null; + } + const contentType = asset.contentType.split(";")[0].toLowerCase(); + if (!TEXT_TYPES.has(contentType) && !contentType.endsWith("+xml")) { + return null; + } + return asset.bytes.toString("utf8"); + } + + async fetchAsset(rawUrl, baseUrl) { + const absolute = resolveUrl(rawUrl, baseUrl); + if (!absolute || absolute.startsWith("data:")) { + return null; + } + try { + if (isFileUrl(absolute)) { + const filePath = fileURLToPath(absolute); + const bytes = await fs.readFile(filePath); + return { + bytes, + contentType: mimeFromUrl(absolute) + }; + } + if (!isHttpUrl(absolute)) { + return null; + } + for (let attempt = 1; attempt <= 2; attempt += 1) { + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), 30000); + try { + const response = await fetch(absolute, { + headers: { + "user-agent": this.userAgent, + accept: "*/*", + ...(this.referer ? 
{ referer: this.referer } : {}) + }, + redirect: "follow", + signal: controller.signal + }); + clearTimeout(timeout); + if (!response.ok) { + this.warnings.push(`Failed to fetch ${absolute}: HTTP ${response.status}`); + return null; + } + const arrayBuffer = await response.arrayBuffer(); + if (arrayBuffer.byteLength > this.maxAssetBytes) { + this.warnings.push(`Skipped ${absolute}: ${arrayBuffer.byteLength} bytes exceeds ${this.maxAssetBytes}`); + return null; + } + return { + bytes: Buffer.from(arrayBuffer), + contentType: response.headers.get("content-type")?.split(";")[0] || mimeFromUrl(absolute) + }; + } catch (error) { + clearTimeout(timeout); + if (attempt < 2) { + continue; + } + this.warnings.push(`Failed to fetch ${absolute}: ${error.message}`); + return null; + } + } + } catch (error) { + this.warnings.push(`Failed to fetch ${absolute}: ${error.message}`); + return null; + } + } +} + +function removeExternalBookkeepingUrls(html) { + return html.replace( + /\s(?:old-src|currentSourceUrl|data-original-src|data-archived-src)=(["'])(https?:\/\/[\s\S]*?)\1/gi, + "" + ); +} + +function restoreArchiveProxyLinks(html) { + return html.replace(/\s(href|action)=(["'])([\s\S]*?)\2/gi, (full, attr, quote, rawValue) => { + const restored = restoreArchiveProxyUrl(htmlDecode(rawValue)); + if (restored === rawValue) { + return full; + } + return ` ${attr}=${quote}${htmlEscape(restored)}${quote}`; + }); +} + +function restoreArchiveProxyUrl(rawValue) { + const value = rawValue.trim(); + const archiveHost = "archive\\.(?:ph|today|is|li|md|fo|vn|pm)"; + const proxied = value.match(new RegExp(`^https?://${archiveHost}/o/[^/]+/(https?://.+)$`, "i")); + if (proxied) { + return safeDecodeUrl(proxied[1]); + } + const samePage = value.match(new RegExp(`^https?://${archiveHost}/[^/#?]+(#.+)$`, "i")); + if (samePage) { + return samePage[1]; + } + return rawValue; +} + +function safeDecodeUrl(value) { + try { + return decodeURIComponent(value); + } catch { + return value; + } +} + 
+/**
+ * Guesses a MIME type from the file extension of a URL or path.
+ *
+ * @param {string} rawUrl - URL to inspect; input that does not parse as a URL is used as a path.
+ * @returns {string} The MIME_BY_EXT entry for the lowercased extension, or "application/octet-stream".
+ */
+function mimeFromUrl(rawUrl) {
+  let pathname = rawUrl;
+  try {
+    pathname = new URL(rawUrl).pathname;
+  } catch {
+    // Keep raw string.
+  }
+  return MIME_BY_EXT.get(path.extname(pathname).toLowerCase()) || "application/octet-stream";
+}
+
+/**
+ * Reads an attribute value from a single serialized tag.
+ *
+ * The name must not be preceded by a word character or "-", so asking for "src"
+ * does not pick up "data-src". `attr` is interpolated into the pattern unescaped.
+ *
+ * @param {string} tag - Tag source, e.g. `<img src="a.png">`.
+ * @param {string} attr - Attribute name (matched case-insensitively).
+ * @returns {string|null} The htmlDecode()d value, or null when the attribute is absent.
+ */
+function getAttribute(tag, attr) {
+  const match = tag.match(new RegExp(`(?<![\\w-])${attr}\\s*=\\s*("([^"]*)"|'([^']*)'|([^\\s>]+))`, "i"));
+  if (!match) {
+    return null;
+  }
+  return htmlDecode(match[2] ?? match[3] ?? match[4] ?? "");
+}
+
+/**
+ * Sets an attribute on a serialized tag, replacing the first existing
+ * occurrence or appending it before the closing ">" / "/>".
+ *
+ * @param {string} tag - Tag source.
+ * @param {string} attr - Attribute name (matched case-insensitively, not inside "data-*" names).
+ * @param {string} value - Raw value; it is passed through htmlEscape() and double-quoted.
+ * @returns {string} The updated tag source.
+ */
+function setAttribute(tag, attr, value) {
+  const assignment = `${attr}="${htmlEscape(value)}"`;
+  const attrRegex = new RegExp(`(?<![\\w-])${attr}\\s*=\\s*("([^"]*)"|'([^']*)'|([^\\s>]+))`, "i");
+  if (attrRegex.test(tag)) {
+    // Function replacer: a replacement string would expand "$&", "$1", "$$" in the value.
+    return tag.replace(attrRegex, () => assignment);
+  }
+  return tag.replace(/\/?>$/, (end) => ` ${assignment}${end}`);
+}
+
+/**
+ * Removes the first occurrence of an attribute, together with the whitespace before it.
+ *
+ * @param {string} tag - Tag source.
+ * @param {string} attr - Attribute name (matched case-insensitively).
+ * @returns {string} The tag source without that attribute.
+ */
+function removeAttribute(tag, attr) {
+  return tag.replace(new RegExp(`\\s+${attr}\\s*=\\s*("([^"]*)"|'([^']*)'|([^\\s>]+))`, "i"), "");
+}
+
+/**
+ * Handles a media attribute whose asset is unavailable: `src` on <img> and
+ * <input> is pointed at TRANSPARENT_IMAGE_DATA_URI, anything else is removed.
+ *
+ * @param {string} tag - Tag source.
+ * @param {string} attr - Attribute that referenced the missing asset.
+ * @returns {string} The updated tag source.
+ */
+function replaceMissingMediaAttribute(tag, attr) {
+  const tagName = tag.match(/^<([a-z0-9:-]+)/i)?.[1]?.toLowerCase() || "";
+  if (attr === "src" && (tagName === "img" || tagName === "input")) {
+    return setAttribute(tag, attr, TRANSPARENT_IMAGE_DATA_URI);
+  }
+  return removeAttribute(tag, attr);
+}
diff --git a/src/cli.mjs b/src/cli.mjs
new file mode 100644
index 0000000..c1f86c8
--- /dev/null
+++ b/src/cli.mjs
@@ -0,0 +1,138 @@
+#!/usr/bin/env node
+import { archivePage, DEFAULT_USER_AGENT, defaultArchivePath } from "./archiver.mjs";
+
+// Flags that never take a value, so they must not swallow the next argument
+// (e.g. `archive --static ./page.html` keeps ./page.html as the input).
+const BOOLEAN_FLAGS = new Set(["help", "render", "static", "strip-ads", "strip-archive-shell"]);
+
+/**
+ * Parses a process.argv-style vector into a flat options object.
+ *
+ * argv[2] is the command; later entries are positionals or --flags. Supported
+ * forms: --key=value, --key value, --flag (true) and --no-flag (false).
+ *
+ * @param {string[]} argv - Full argument vector, including node and the script path.
+ * @returns {object} Parsed flags keyed by name, plus `command` and `positional`.
+ */
+function parseArgs(argv) {
+  const args = {
+    command: argv[2],
+    positional: []
+  };
+  for (let i = 3; i < argv.length; i += 1) {
+    const arg = argv[i];
+    if (!arg.startsWith("--")) {
+      args.positional.push(arg);
+      continue;
+    }
+    // Split on the first "=" only: split("=", 2) silently dropped everything
+    // after a second "=" in the value.
+    const eq = arg.indexOf("=");
+    const key = eq === -1 ? arg.slice(2) : arg.slice(2, eq);
+    const inlineValue = eq === -1 ? undefined : arg.slice(eq + 1);
+    if (key.startsWith("no-")) {
+      args[key.slice(3)] = false;
+    } else if (inlineValue !== undefined) {
+      args[key] = inlineValue;
+    } else if (!BOOLEAN_FLAGS.has(key) && i + 1 < argv.length && !argv[i + 1].startsWith("--")) {
+      args[key] = argv[++i];
+    } else {
+      args[key] = true;
+    }
+  }
+  return args;
+}
+
+/** Prints CLI usage, including the default archive path and user agent. */
+function usage() {
+  console.log(`Usage:
+  node src/cli.mjs archive <url-or-file> [options]
+
+Options:
+  --archive-path           Output directory. Defaults to ARCHIVE_PATH or ${defaultArchivePath()}
+  --id                     Output id/file stem
+  --static                 Do not use a browser; transform the input HTML only
+  --render                 Force browser rendering for local archive-shell HTML
+  --strip-archive-shell    Remove an archive.ph shell from an already archived HTML file
+  --no-strip-ads           Keep ad-like elements
+  --user-agent             User agent to send for page and asset requests
+  --max-asset-bytes        Per-asset inline limit
+
+Default user agent:
+  ${DEFAULT_USER_AGENT}`);
+}
+
+/**
+ * Converts the raw --max-asset-bytes flag into a byte count.
+ *
+ * @param {string|boolean|undefined} raw - Value produced by parseArgs.
+ * @returns {number|undefined} The limit, or undefined when the flag was not given.
+ * @throws {Error} When the flag has no value or is not a non-negative number.
+ */
+function parseMaxAssetBytes(raw) {
+  if (raw === undefined || raw === false || raw === "") {
+    return undefined;
+  }
+  const parsed = typeof raw === "string" ? Number(raw) : Number.NaN;
+  if (!Number.isFinite(parsed) || parsed < 0) {
+    throw new Error(`Invalid --max-asset-bytes value: ${raw}`);
+  }
+  return parsed;
+}
+
+/**
+ * CLI entry point: parses arguments, runs archivePage, and prints a summary
+ * (output path, up to 20 remaining external references, up to 20 warnings).
+ *
+ * @returns {Promise<void>}
+ * @throws {Error} On an unknown command or an invalid option value.
+ */
+async function main() {
+  const args = parseArgs(process.argv);
+  if (!args.command || args.command === "help" || args["help"]) {
+    usage();
+    return;
+  }
+  if (args.command !== "archive") {
+    throw new Error(`Unknown command: ${args.command}`);
+  }
+  const input = args.positional[0];
+  if (!input) {
+    usage();
+    process.exitCode = 1;
+    return;
+  }
+
+  const result = await archivePage(input, {
+    archivePath: args["archive-path"],
+    id: args.id,
+    render: Boolean(args.render),
+    static: Boolean(args.static),
+    stripArchiveShell: Boolean(args["strip-archive-shell"]),
+    stripAds: args["strip-ads"] !== false,
+    userAgent: args["user-agent"] || DEFAULT_USER_AGENT,
+    maxAssetBytes: parseMaxAssetBytes(args["max-asset-bytes"])
+  });
+
+  console.log(`Archived: ${result.sourceUrl}`);
+  console.log(`Output: ${result.filePath}`);
+  if (result.externalAssets.length) {
+    console.log(`External asset references remaining: ${result.externalAssets.length}`);
+    for (const ref of result.externalAssets.slice(0, 20)) {
+      console.log(` ${ref}`);
+    }
+  } else {
+    console.log("External asset references remaining: 0");
+  }
+  if (result.warnings.length) {
+    console.log(`Warnings: ${result.warnings.length}`);
+    for (const warning of result.warnings.slice(0, 20)) {
+      console.log(` ${warning}`);
+    }
+  }
+}
+
+main().catch((error) => {
+  console.error(error.message);
+  process.exitCode = 1;
+});
diff --git a/src/server.mjs b/src/server.mjs
new file mode 100644
index 0000000..8c7c37c
--- /dev/null
+++ b/src/server.mjs
@@ -0,0 +1,158 @@
+import http from "node:http";
+import fs from "node:fs/promises";
+import path from "node:path";
+import { archivePage, DEFAULT_USER_AGENT, defaultArchivePath } from "./archiver.mjs";
+
+const archivePath = process.env.ARCHIVE_PATH || defaultArchivePath();
+const port = Number(process.env.PORT || 8787);
+// Loopback by default: the API has no authentication and hands caller-supplied
+// input to archivePage, so it is only exposed to other machines when HOST is set.
+const host = process.env.HOST || "127.0.0.1";
+// Upper bound on the JSON request body that readJson will buffer.
+const MAX_BODY_BYTES = 1024 * 1024;
+
+/** Error that carries the HTTP status the request handler should answer with. */
+class HttpError extends Error {
+  constructor(status, message) {
+    super(message);
+    this.name = "HttpError";
+    this.status = status;
+  }
+}
+
+/**
+ * Routes:
+ *   GET  /health          -> { ok, archivePath }
+ *   POST /archive         -> archives body.url and returns the file path and viewUrl
+ *   GET  /archives/<file> -> serves a previously written archive
+ * Client mistakes are answered with 4xx; unexpected failures with 500.
+ */
+const server = http.createServer(async (req, res) => {
+  try {
+    // Only the path is used, so a fixed base avoids depending on the Host header.
+    const url = new URL(req.url, "http://localhost");
+    if (req.method === "GET" && url.pathname === "/health") {
+      return sendJson(res, 200, { ok: true, archivePath });
+    }
+
+    if (req.method === "POST" && url.pathname === "/archive") {
+      const body = await readJson(req);
+      if (typeof body.url !== "string" || !body.url) {
+        return sendJson(res, 400, { error: "Missing required field: url" });
+      }
+      const result = await archivePage(body.url, {
+        archivePath,
+        id: body.id,
+        render: Boolean(body.render),
+        static: Boolean(body.static),
+        stripArchiveShell: Boolean(body.stripArchiveShell),
+        stripAds: body.stripAds !== false,
+        userAgent: body.userAgent || DEFAULT_USER_AGENT,
+        maxAssetBytes: body.maxAssetBytes
+      });
+      return sendJson(res, 201, {
+        id: result.id,
+        sourceUrl: result.sourceUrl,
+        file: result.filePath,
+        externalAssets: result.externalAssets,
+        warnings: result.warnings,
+        viewUrl: `/archives/${encodeURIComponent(path.basename(result.filePath))}`
+      });
+    }
+
+    if (req.method === "GET" && url.pathname.startsWith("/archives/")) {
+      let file;
+      try {
+        file = decodeURIComponent(url.pathname.slice("/archives/".length));
+      } catch {
+        // Malformed percent-encoding is a client error, not a server failure.
+        return sendJson(res, 400, { error: "Invalid archive file name" });
+      }
+      // The allow-list has no path separators, so the join below stays in archivePath.
+      if (!/^[a-zA-Z0-9._-]+\.html$/.test(file)) {
+        return sendJson(res, 400, { error: "Invalid archive file name" });
+      }
+      const fullPath = path.join(archivePath, file);
+      let html;
+      try {
+        html = await fs.readFile(fullPath);
+      } catch (error) {
+        if (error.code === "ENOENT") {
+          return sendJson(res, 404, { error: "Archive not found" });
+        }
+        throw error;
+      }
+      res.writeHead(200, {
+        "content-type": "text/html; charset=utf-8",
+        "content-length": html.length
+      });
+      return res.end(html);
+    }
+
+    sendJson(res, 404, { error: "Not found" });
+  } catch (error) {
+    if (res.headersSent) {
+      // A response is already in flight; a second writeHead would throw.
+      res.destroy();
+      return;
+    }
+    sendJson(res, error instanceof HttpError ? error.status : 500, { error: error.message });
+  }
+});
+
+server.listen(port, host, () => {
+  console.log(`Archive API listening on http://${host}:${port}`);
+  console.log(`ARCHIVE_PATH=${archivePath}`);
+});
+
+/**
+ * Reads the request body and parses it as a JSON object.
+ *
+ * @param {http.IncomingMessage} req - Incoming request to drain.
+ * @returns {Promise<object>} The parsed object, or {} when the body is empty.
+ * @throws {HttpError} 413 if the body exceeds MAX_BODY_BYTES; 400 if it is not a JSON object.
+ */
+async function readJson(req) {
+  const chunks = [];
+  let received = 0;
+  for await (const chunk of req) {
+    received += chunk.length;
+    // Past the limit, keep draining without buffering: leaving the loop early
+    // would destroy the request stream before the 413 could be sent.
+    if (received <= MAX_BODY_BYTES) {
+      chunks.push(chunk);
+    }
+  }
+  if (received > MAX_BODY_BYTES) {
+    throw new HttpError(413, `Request body exceeds ${MAX_BODY_BYTES} bytes`);
+  }
+  if (!chunks.length) {
+    return {};
+  }
+  let parsed;
+  try {
+    parsed = JSON.parse(Buffer.concat(chunks).toString("utf8"));
+  } catch {
+    throw new HttpError(400, "Request body is not valid JSON");
+  }
+  if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
+    throw new HttpError(400, "Request body must be a JSON object");
+  }
+  return parsed;
+}
+
+/**
+ * Sends `value` as a pretty-printed JSON response.
+ *
+ * @param {http.ServerResponse} res - Response to write to.
+ * @param {number} status - HTTP status code.
+ * @param {*} value - JSON-serializable payload.
+ */
+function sendJson(res, status, value) {
+  const body = Buffer.from(JSON.stringify(value, null, 2));
+  res.writeHead(status, {
+    "content-type": "application/json; charset=utf-8",
+    "content-length": body.length
+  });
+  res.end(body);
+}