import fs from "node:fs/promises";
import path from "node:path";
import { fileURLToPath, pathToFileURL } from "node:url";

/** User-Agent header sent with asset requests when the caller supplies none. */
export const DEFAULT_USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36";

/** Content types that `fetchText` is willing to decode as UTF-8 text. */
const TEXT_TYPES = new Set([
  "application/javascript",
  "application/json",
  "application/ld+json",
  "application/xml",
  "image/svg+xml",
  "text/css",
  "text/html",
  "text/javascript",
  "text/plain",
  "text/xml"
]);

/** Fallback MIME type lookup keyed by lower-case file extension. */
const MIME_BY_EXT = new Map([
  [".apng", "image/apng"],
  [".avif", "image/avif"],
  [".css", "text/css"],
  [".gif", "image/gif"],
  [".html", "text/html"],
  [".ico", "image/x-icon"],
  [".jpeg", "image/jpeg"],
  [".jpg", "image/jpeg"],
  [".js", "text/javascript"],
  [".json", "application/json"],
  [".m4a", "audio/mp4"],
  [".mp3", "audio/mpeg"],
  [".mp4", "video/mp4"],
  [".otf", "font/otf"],
  [".png", "image/png"],
  [".svg", "image/svg+xml"],
  [".ttf", "font/ttf"],
  [".webm", "video/webm"],
  [".webp", "image/webp"],
  [".woff", "font/woff"],
  [".woff2", "font/woff2"],
  [".xml", "application/xml"]
]);

/** Placeholder substituted for images that could not be fetched. */
const TRANSPARENT_IMAGE_DATA_URI = "data:image/gif;base64,R0lGODlhAQABAAAAACwAAAAAAQABAAA=";

/** Entities understood by `htmlDecode`, mapped to the character they stand for. */
const HTML_ENTITIES = new Map([
  ["&amp;", "&"],
  ["&quot;", '"'],
  ["&#39;", "'"],
  ["&#x27;", "'"],
  ["&apos;", "'"],
  ["&lt;", "<"],
  ["&gt;", ">"]
]);

const HTML_ENTITY_PATTERN = /&(?:amp|quot|apos|lt|gt|#39|#x27);/g;

/**
 * Directory where archives are written.
 *
 * @returns {string} `$ARCHIVE_PATH` when set, otherwise `local-page-archives`
 *   under `$TMPDIR` (or `/tmp`).
 */
export function defaultArchivePath() {
  return process.env.ARCHIVE_PATH || path.join(process.env.TMPDIR || "/tmp", "local-page-archives");
}

/**
 * @param {string} value
 * @returns {boolean} True when `value` starts with `http://` or `https://`.
 */
export function isHttpUrl(value) {
  return /^https?:\/\//i.test(value);
}

/**
 * @param {string} value
 * @returns {boolean} True when `value` starts with `file://`.
 */
export function isFileUrl(value) {
  return /^file:\/\//i.test(value);
}

/**
 * Turns a command-line style input into a URL string.
 *
 * @param {string} input - A URL (any scheme) or a filesystem path.
 * @returns {string} `input` unchanged when it carries a scheme, otherwise the
 *   `file:` URL of the resolved path.
 */
export function inputToUrl(input) {
  // Require at least two characters in the scheme so a Windows drive path
  // such as `C:\page.html` is treated as a path rather than a URL.
  if (/^[a-z][a-z0-9+.-]+:/i.test(input)) {
    return input;
  }
  return pathToFileURL(path.resolve(input)).href;
}

/**
 * Builds a filesystem-friendly, timestamped name for an archived URL.
 *
 * @param {string} inputUrl - Absolute URL; `new URL` throws if it is not parseable.
 * @returns {string} Host + path reduced to `[a-z0-9-]` (max 90 chars, `archive`
 *   when empty) followed by the current ISO timestamp with `:` and `.` replaced.
 */
export function slugForUrl(inputUrl) {
  const url = new URL(inputUrl);
  const stem =
    `${url.hostname}${url.pathname}`
      .replace(/\/+$/, "")
      .replace(/[^a-z0-9]+/gi, "-")
      .replace(/^-+|-+$/g, "")
      .slice(0, 90) || "archive";
  return `${stem}-${new Date().toISOString().replace(/[:.]/g, "-")}`;
}

/**
 * Finds the base URL relative references in `html` should resolve against.
 *
 * @param {string} html
 * @param {string} fallbackBaseUrl - Used when there is no usable `<base href>`.
 * @returns {string} The resolved `<base href>` or `fallbackBaseUrl`.
 */
export function findEffectiveBase(html, fallbackBaseUrl) {
  const match = html.match(/<base\b[^>]*\bhref=(["']?)([^"'\s>]+)\1/i);
  if (!match) {
    return fallbackBaseUrl;
  }
  return resolveUrl(match[2], fallbackBaseUrl) || fallbackBaseUrl;
}

/**
 * Resolves a possibly relative, possibly entity-encoded URL.
 *
 * @param {string|null|undefined} rawUrl
 * @param {string} [baseUrl]
 * @returns {string|null} `null` for missing or unparseable input; the trimmed
 *   value itself for empty strings, fragments and
 *   `about:`/`blob:`/`data:`/`javascript:`/`mailto:`/`tel:` URLs; otherwise the
 *   absolute URL.
 */
export function resolveUrl(rawUrl, baseUrl) {
  if (!rawUrl) {
    return null;
  }
  const trimmed = htmlDecode(rawUrl.trim());
  if (
    !trimmed ||
    trimmed.startsWith("#") ||
    /^(?:about|blob|data|javascript|mailto|tel):/i.test(trimmed)
  ) {
    return trimmed;
  }
  try {
    // A protocol-relative URL has nothing sensible to inherit from a file: base.
    if (trimmed.startsWith("//") && (!baseUrl || /^file:/i.test(baseUrl))) {
      return `https:${trimmed}`;
    }
    return new URL(trimmed, baseUrl).href;
  } catch {
    return null;
  }
}

/**
 * Escapes a value for use in HTML text or a quoted attribute value.
 *
 * @param {*} value - Converted with `String()`.
 * @returns {string} `value` with `&`, `<`, `>`, `"` and `'` replaced by entities.
 */
export function htmlEscape(value) {
  return String(value)
    .replaceAll("&", "&amp;")
    .replaceAll("<", "&lt;")
    .replaceAll(">", "&gt;")
    .replaceAll('"', "&quot;")
    .replaceAll("'", "&#39;");
}

/**
 * Decodes the entities listed in `HTML_ENTITIES`.
 *
 * @param {*} value - Converted with `String()`.
 * @returns {string} Decoded text. Decoding is a single pass, so `&amp;lt;`
 *   becomes `&lt;` rather than `<`.
 */
export function htmlDecode(value) {
  return String(value).replace(HTML_ENTITY_PATTERN, (entity) => HTML_ENTITIES.get(entity));
}

/**
 * Extracts the captured page from an archive-service wrapper document.
 *
 * The wrapper is recognised by a `<div id="CONTENT">` followed later by an
 * element with `id="hashtags"`; everything from the element preceding
 * `hashtags` onwards is discarded.
 *
 * @param {string} html
 * @returns {string} A minimal document holding the wrapper's `<title>`, its
 *   first `type="text/css"` style block and the captured fragment, or `html`
 *   unchanged when the wrapper is not recognised.
 */
export function stripArchiveShell(html) {
  if (!html.includes('id="CONTENT"') && !html.includes("id='CONTENT'")) {
    return html;
  }

  const contentStart = html.search(/<div\b[^>]*\bid=(["'])CONTENT\1[^>]*>/i);
  // NOTE(review): the tag names around `hashtags` are matched generically;
  // confirm against a real archive page.
  const marker = html.search(/<[a-z][^>]*>\s*<[a-z][^>]*\bid=(["'])hashtags\1/i);
  if (contentStart === -1 || marker === -1 || marker <= contentStart) {
    return html;
  }

  const title =
    html.match(/<title\b[^>]*>[\s\S]*?<\/title>/i)?.[0] || "<title>Archived page</title>";
  const fontStyle =
    html.match(/<style\b[^>]*type=(["'])text\/css\1[^>]*>[\s\S]*?<\/style>/i)?.[0] || "";

  // Prefer the inner `class="html1"` container when present.
  const capturedStart = html
    .slice(contentStart, marker)
    .search(/<div\b[^>]*\bclass=(["'])html1\1[^>]*>/i);
  const fragmentStart = capturedStart === -1 ? contentStart : contentStart + capturedStart;
  const fragmentEnd = findMatchingDivEnd(html, fragmentStart) || marker;
  const content = html.slice(fragmentStart, Math.min(fragmentEnd, marker));

  return `<!doctype html><html><head><meta charset="utf-8">${title}${fontStyle}</head><body>${content}</body></html>`;
}

/**
 * Finds where the `<div>` opened at `startIndex` is closed.
 *
 * @param {string} html
 * @param {number} startIndex - Index of the opening `<div`.
 * @returns {number|null} Index just past the balancing `</div>`, or `null`
 *   when the tags never balance.
 */
function findMatchingDivEnd(html, startIndex) {
  const tags = /<\/?div\b[^>]*>/gi;
  tags.lastIndex = startIndex;
  let depth = 0;
  for (let match = tags.exec(html); match !== null; match = tags.exec(html)) {
    const tag = match[0];
    if (tag.startsWith("</")) {
      depth -= 1;
      if (depth === 0) {
        return match.index + tag.length;
      }
    } else {
      depth += 1;
    }
  }
  return null;
}

/**
 * `String.prototype.replace` with an asynchronous replacer. Replacements are
 * awaited one after another, in match order.
 *
 * @param {string} input
 * @param {RegExp} regex - Must carry the `g` flag (`matchAll` requires it).
 * @param {(match: RegExpMatchArray) => Promise<string>|string} replacer
 * @returns {Promise<string>}
 */
export async function replaceAsync(input, regex, replacer) {
  const parts = [];
  let lastIndex = 0;
  for (const match of input.matchAll(regex)) {
    parts.push(input.slice(lastIndex, match.index));
    parts.push(await replacer(match));
    lastIndex = match.index + match[0].length;
  }
  parts.push(input.slice(lastIndex));
  return parts.join("");
}

/**
 * Rewrites HTML/CSS so that external assets are embedded as `data:` URIs.
 *
 * Failures never throw; they are appended to `warnings` and the asset is
 * dropped or replaced by a placeholder.
 */
export class AssetInliner {
  /**
   * @param {object} [options]
   * @param {string} [options.userAgent] - Defaults to `DEFAULT_USER_AGENT`.
   * @param {string} [options.referer] - Sent as `referer` header when set.
   * @param {number} [options.maxAssetBytes] - Larger HTTP assets are skipped
   *   (default 30 MiB).
   */
  constructor(options = {}) {
    this.userAgent = options.userAgent || DEFAULT_USER_AGENT;
    this.referer = options.referer;
    this.maxAssetBytes = options.maxAssetBytes || 30 * 1024 * 1024;
    /** Absolute URL -> data URI, or `null` for assets that failed. */
    this.cache = new Map();
    /** Human-readable notes about assets that could not be inlined. */
    this.warnings = [];
  }

  /**
   * Inlines every asset referenced by an HTML document.
   *
   * @param {string} html
   * @param {string} baseUrl - URL the document was loaded from.
   * @param {object} [options]
   * @param {number} [options.depth=0] - Iframe nesting level; iframe documents
   *   are only inlined at depth 0.
   * @returns {Promise<string>} The rewritten HTML.
   */
  async inlineHtml(html, baseUrl, options = {}) {
    const depth = options.depth || 0;
    const effectiveBase = findEffectiveBase(html, baseUrl);

    // Drop markup that is meaningless or harmful in a self-contained copy:
    // <base> (all URLs get resolved here), scripts, <noscript> blocks and
    // network hint links.
    let output = html
      .replace(/<base\b[^>]*>/gi, "")
      .replace(/<script\b[\s\S]*?<\/script\s*>/gi, "")
      .replace(/<noscript\b[\s\S]*?<\/noscript\s*>/gi, "")
      .replace(
        /<link\b[^>]*\brel=(["']?)(?:preconnect|dns-prefetch|modulepreload)\1[^>]*>/gi,
        ""
      );

    output = await replaceAsync(
      output,
      /<style\b([^>]*)>([\s\S]*?)<\/style>/gi,
      async (match) => {
        const attrs = match[1] || "";
        const css = await this.inlineCss(match[2] || "", effectiveBase);
        return `<style${attrs}>${css}</style>`;
      }
    );

    output = await replaceAsync(output, /\sstyle=(["'])([\s\S]*?)\1/gi, async (match) => {
      const css = await this.inlineCss(htmlDecode(match[2]), effectiveBase);
      return ` style=${match[1]}${htmlEscape(css)}${match[1]}`;
    });

    // Runs after the <style> pass on purpose: stylesheets inlined here have
    // already been through inlineCss with their own base URL.
    output = await replaceAsync(output, /<link\b[^>]*>/gi, async (match) =>
      this.rewriteLinkTag(match[0], effectiveBase)
    );

    output = await replaceAsync(
      output,
      /<iframe\b[^>]*>[\s\S]*?<\/iframe>|<iframe\b[^>]*\/?>/gi,
      async (match) => {
        if (depth >= 1) {
          return this.rewriteMediaAttributes(match[0], effectiveBase);
        }
        return this.rewriteIframeTag(match[0], effectiveBase, depth);
      }
    );

    output = await replaceAsync(
      output,
      /<(?:img|source|audio|video|track|embed|object|input)\b[^>]*>/gi,
      async (match) => this.rewriteMediaAttributes(match[0], effectiveBase)
    );

    output = removeExternalBookkeepingUrls(output);
    output = restoreArchiveProxyLinks(output);

    output = await replaceAsync(output, /srcset=(["'])([\s\S]*?)\1/gi, async (match) => {
      const rewritten = await this.inlineSrcset(match[2], effectiveBase);
      return `srcset=${match[1]}${htmlEscape(rewritten)}${match[1]}`;
    });

    return output;
  }

  /**
   * Rewrites one `<link>` tag.
   *
   * @param {string} tag - The complete `<link ...>` tag.
   * @param {string} baseUrl
   * @returns {Promise<string>} A `<style>` block for stylesheets, the tag with
   *   a data-URI `href` for icons and font/image preloads, `""` when the
   *   resource is unavailable or is a style preload, or `tag` unchanged for
   *   every other kind of link.
   */
  async rewriteLinkTag(tag, baseUrl) {
    const rel = getAttribute(tag, "rel") || "";
    const href = getAttribute(tag, "href");
    const asValue = getAttribute(tag, "as") || "";
    if (!href) {
      return tag;
    }

    if (/\bstylesheet\b/i.test(rel)) {
      const absolute = resolveUrl(href, baseUrl);
      if (!absolute || absolute.startsWith("data:")) {
        return "";
      }
      const css = await this.fetchText(absolute, baseUrl);
      if (css == null) {
        return "";
      }
      const inlinedCss = await this.inlineCss(css, absolute);
      // Keep the media condition so e.g. print-only sheets stay print-only.
      const media = getAttribute(tag, "media");
      const mediaAttr = media ? ` media="${htmlEscape(media)}"` : "";
      return `<style${mediaAttr}>${inlinedCss}</style>`;
    }

    const isPreload = /\bpreload\b/i.test(rel);
    const isInlineableLink =
      /\b(?:icon|apple-touch-icon|image_src)\b/i.test(rel) ||
      (isPreload && /^(?:font|image|style)$/i.test(asValue));
    if (!isInlineableLink) {
      return tag;
    }
    if (isPreload && /^style$/i.test(asValue)) {
      return "";
    }

    const dataUri = await this.toDataUri(href, baseUrl);
    if (!dataUri) {
      return "";
    }
    return setAttribute(tag, "href", dataUri);
  }

  /**
   * Inlines the `src`, `poster` and `data` attributes of a tag.
   *
   * @param {string} tag
   * @param {string} baseUrl
   * @returns {Promise<string>} The tag with each attribute replaced by a data
   *   URI, or removed / replaced by a transparent image when unavailable.
   */
  async rewriteMediaAttributes(tag, baseUrl) {
    let output = tag;
    for (const attr of ["src", "poster", "data"]) {
      const value = getAttribute(output, attr);
      if (!value) {
        continue;
      }
      const dataUri = await this.toDataUri(value, baseUrl);
      output = dataUri
        ? setAttribute(output, attr, dataUri)
        : replaceMissingMediaAttribute(output, attr);
    }
    return output;
  }

  /**
   * Converts an `<iframe src>` into an `<iframe srcdoc>` holding the inlined
   * document. Falls back to `rewriteMediaAttributes` when the frame has no
   * `src`, already has `srcdoc`, or its document cannot be fetched as text.
   *
   * @param {string} tag - The iframe opening tag or whole element.
   * @param {string} baseUrl
   * @param {number} depth - Current nesting level.
   * @returns {Promise<string>}
   */
  async rewriteIframeTag(tag, baseUrl, depth) {
    const src = getAttribute(tag, "src");
    if (!src || getAttribute(tag, "srcdoc")) {
      return this.rewriteMediaAttributes(tag, baseUrl);
    }

    const absolute = resolveUrl(src, baseUrl);
    if (!absolute || absolute.startsWith("data:")) {
      return tag;
    }

    const text = await this.fetchText(absolute, baseUrl);
    if (text == null) {
      return this.rewriteMediaAttributes(tag, baseUrl);
    }

    const inlined = await this.inlineHtml(text, absolute, { depth: depth + 1 });
    let rewritten = removeAttribute(tag, "src");
    rewritten = setAttribute(rewritten, "srcdoc", inlined);
    rewritten = setAttribute(rewritten, "data-archived-src", absolute);
    return rewritten;
  }

  /**
   * Inlines every candidate of a `srcset` value.
   *
   * @param {string} value - Raw attribute value.
   * @param {string} baseUrl
   * @returns {Promise<string>} Candidates joined by `", "`; unavailable images
   *   become the transparent placeholder, descriptors are preserved.
   */
  async inlineSrcset(value, baseUrl) {
    const rewritten = [];
    for (const candidate of splitSrcset(value)) {
      const [urlPart, ...descriptor] = candidate.split(/\s+/);
      const dataUri = await this.toDataUri(urlPart, baseUrl);
      rewritten.push([dataUri || TRANSPARENT_IMAGE_DATA_URI, ...descriptor].join(" "));
    }
    return rewritten.join(", ");
  }

  /**
   * Inlines `@import` rules and `url(...)` references in a stylesheet.
   *
   * @param {string} css
   * @param {string} baseUrl - URL the stylesheet was loaded from.
   * @param {Set<string>} [seenImports] - Stylesheet URLs on the current import
   *   chain; used to break circular `@import`s. Callers normally omit it.
   * @returns {Promise<string>} CSS where imports are replaced by their content
   *   (or removed) and unavailable `url()`s become `url(about:blank)`.
   */
  async inlineCss(css, baseUrl, seenImports = new Set()) {
    const chain = new Set(seenImports);
    if (baseUrl) {
      chain.add(baseUrl);
    }

    let output = await replaceAsync(
      css,
      /@import\s+(?:url\()?["']?([^"')\s;]+)["']?\)?[^;]*;/gi,
      async (match) => {
        const absolute = resolveUrl(match[1], baseUrl);
        // A sheet already on the chain would recurse forever.
        if (!absolute || absolute.startsWith("data:") || chain.has(absolute)) {
          return "";
        }
        const imported = await this.fetchText(absolute, baseUrl);
        if (imported == null) {
          return "";
        }
        return this.inlineCss(imported, absolute, chain);
      }
    );

    output = await replaceAsync(output, /url\(\s*(["']?)([^"')]+)\1\s*\)/gi, async (match) => {
      const raw = htmlDecode(match[2].trim());
      if (!raw || raw.startsWith("#") || /^(?:data|blob|about|javascript):/i.test(raw)) {
        return match[0];
      }
      const dataUri = await this.toDataUri(raw, baseUrl);
      return dataUri ? `url("${dataUri}")` : "url(about:blank)";
    });

    return output;
  }

  /**
   * Fetches an asset and encodes it as a base64 `data:` URI.
   *
   * @param {string} rawUrl
   * @param {string} baseUrl
   * @returns {Promise<string|null>} The data URI; `rawUrl` itself when it is
   *   already a `data:` URI; `null` (or an empty string for blank input) when
   *   nothing could be produced. Results, including failures, are cached per
   *   absolute URL.
   */
  async toDataUri(rawUrl, baseUrl) {
    const absolute = resolveUrl(rawUrl, baseUrl);
    if (!absolute || absolute.startsWith("data:")) {
      return absolute;
    }
    if (this.cache.has(absolute)) {
      return this.cache.get(absolute);
    }

    const asset = await this.fetchAsset(absolute, baseUrl);
    const dataUri = asset
      ? `data:${asset.contentType};base64,${asset.bytes.toString("base64")}`
      : null;
    this.cache.set(absolute, dataUri);
    return dataUri;
  }

  /**
   * Fetches an asset and decodes it as UTF-8 text.
   *
   * @param {string} rawUrl
   * @param {string} baseUrl
   * @returns {Promise<string|null>} The text, or `null` when the fetch failed
   *   or the content type is neither in `TEXT_TYPES` nor ends in `+xml`.
   */
  async fetchText(rawUrl, baseUrl) {
    const asset = await this.fetchAsset(rawUrl, baseUrl);
    if (!asset) {
      return null;
    }
    const contentType = asset.contentType.split(";")[0].trim().toLowerCase();
    if (!TEXT_TYPES.has(contentType) && !contentType.endsWith("+xml")) {
      return null;
    }
    return asset.bytes.toString("utf8");
  }

  /**
   * Loads the bytes of a `file:` or `http(s):` asset.
   *
   * HTTP requests time out after 30 s until the response headers arrive and
   * are retried once on a network error. Problems are recorded in `warnings`.
   *
   * @param {string} rawUrl
   * @param {string} baseUrl - URL of the referencing document.
   * @returns {Promise<{bytes: Buffer, contentType: string}|null>}
   */
  async fetchAsset(rawUrl, baseUrl) {
    const absolute = resolveUrl(rawUrl, baseUrl);
    if (!absolute || absolute.startsWith("data:")) {
      return null;
    }

    try {
      if (isFileUrl(absolute)) {
        // A page fetched over HTTP must not be able to pull local files into
        // the archive.
        if (baseUrl && isHttpUrl(baseUrl)) {
          this.warnings.push(`Skipped ${absolute}: local file referenced from remote page`);
          return null;
        }
        const bytes = await fs.readFile(fileURLToPath(absolute));
        return { bytes, contentType: mimeFromUrl(absolute) };
      }
      if (!isHttpUrl(absolute)) {
        return null;
      }

      for (let attempt = 1; attempt <= 2; attempt += 1) {
        const controller = new AbortController();
        const timeout = setTimeout(() => controller.abort(), 30000);
        try {
          const response = await fetch(absolute, {
            headers: {
              "user-agent": this.userAgent,
              accept: "*/*",
              ...(this.referer ? { referer: this.referer } : {})
            },
            redirect: "follow",
            signal: controller.signal
          });
          clearTimeout(timeout);

          if (!response.ok) {
            this.warnings.push(`Failed to fetch ${absolute}: HTTP ${response.status}`);
            return null;
          }

          // Reject early when the server already announces an oversized body.
          const declaredBytes = Number(response.headers.get("content-length"));
          if (declaredBytes > this.maxAssetBytes) {
            controller.abort();
            this.warnings.push(
              `Skipped ${absolute}: ${declaredBytes} bytes exceeds ${this.maxAssetBytes}`
            );
            return null;
          }

          const arrayBuffer = await response.arrayBuffer();
          if (arrayBuffer.byteLength > this.maxAssetBytes) {
            this.warnings.push(
              `Skipped ${absolute}: ${arrayBuffer.byteLength} bytes exceeds ${this.maxAssetBytes}`
            );
            return null;
          }

          const headerType = response.headers.get("content-type")?.split(";")[0].trim();
          return {
            bytes: Buffer.from(arrayBuffer),
            contentType: headerType || mimeFromUrl(absolute)
          };
        } catch (error) {
          clearTimeout(timeout);
          if (attempt < 2) {
            continue;
          }
          this.warnings.push(`Failed to fetch ${absolute}: ${error.message}`);
          return null;
        }
      }
      return null;
    } catch (error) {
      this.warnings.push(`Failed to fetch ${absolute}: ${error.message}`);
      return null;
    }
  }
}

/**
 * Splits a `srcset` value into trimmed, non-empty candidates.
 *
 * Splitting is on commas, except that the comma separating a `data:` URI's
 * header from its payload is kept.
 *
 * @param {string} value
 * @returns {string[]}
 */
function splitSrcset(value) {
  const parts = value.split(",");
  const candidates = [];
  for (let index = 0; index < parts.length; index += 1) {
    let part = parts[index];
    if (/^\s*data:[^\s,]*$/i.test(part) && index + 1 < parts.length) {
      index += 1;
      part = `${part},${parts[index]}`;
    }
    const trimmed = part.trim();
    if (trimmed) {
      candidates.push(trimmed);
    }
  }
  return candidates;
}

/**
 * Removes bookkeeping attributes (`old-src`, `currentSourceUrl`,
 * `data-original-src`, `data-archived-src`) whose quoted value is an
 * http(s) URL.
 *
 * @param {string} html
 * @returns {string}
 */
function removeExternalBookkeepingUrls(html) {
  return html.replace(
    /\s(?:old-src|currentSourceUrl|data-original-src|data-archived-src)=(["'])(https?:\/\/[\s\S]*?)\1/gi,
    ""
  );
}

/**
 * Rewrites quoted `href`/`action` attributes that point at the archive
 * service's proxy back to their original targets. Attributes that need no
 * change are left exactly as written.
 *
 * @param {string} html
 * @returns {string}
 */
function restoreArchiveProxyLinks(html) {
  return html.replace(/\s(href|action)=(["'])([\s\S]*?)\2/gi, (full, attr, quote, rawValue) => {
    const decoded = htmlDecode(rawValue);
    const restored = restoreArchiveProxyUrl(decoded);
    // Compare against the decoded value: comparing with the raw text would
    // re-serialise every untouched link that merely contains an entity.
    if (restored === decoded) {
      return full;
    }
    return ` ${attr}=${quote}${htmlEscape(restored)}${quote}`;
  });
}

/**
 * Maps an archive-service URL back to what it stands for.
 *
 * @param {string} rawValue - Entity-decoded attribute value.
 * @returns {string} The proxied target for `/o/<id>/<url>` links, the bare
 *   fragment for same-page `/<id>#frag` links, otherwise `rawValue`.
 */
function restoreArchiveProxyUrl(rawValue) {
  const value = rawValue.trim();
  const archiveHost = "archive\\.(?:ph|today|is|li|md|fo|vn|pm)";

  const proxied = value.match(
    new RegExp(`^https?://${archiveHost}/o/[^/]+/(https?://.+)$`, "i")
  );
  if (proxied) {
    return safeDecodeUrl(proxied[1]);
  }

  const samePage = value.match(new RegExp(`^https?://${archiveHost}/[^/#?]+(#.+)$`, "i"));
  if (samePage) {
    return samePage[1];
  }

  return rawValue;
}

/**
 * `decodeURIComponent` that returns its input instead of throwing on
 * malformed escapes.
 *
 * @param {string} value
 * @returns {string}
 */
function safeDecodeUrl(value) {
  try {
    return decodeURIComponent(value);
  } catch {
    return value;
  }
}

/**
 * Guesses a MIME type from the extension of a URL's path.
 *
 * @param {string} rawUrl
 * @returns {string} The `MIME_BY_EXT` entry, or `application/octet-stream`.
 */
function mimeFromUrl(rawUrl) {
  let pathname = rawUrl;
  try {
    pathname = new URL(rawUrl).pathname;
  } catch {
    // Not a parseable URL: fall back to the extension of the raw string.
  }
  return MIME_BY_EXT.get(path.extname(pathname).toLowerCase()) || "application/octet-stream";
}

/**
 * Builds the pattern matching `attr=value` inside a tag. The lookbehind keeps
 * `src` from matching inside longer names such as `data-src`.
 *
 * @param {string} attr - Attribute name; inserted into the pattern verbatim.
 * @returns {RegExp} Groups 2/3/4 hold a double-quoted, single-quoted or
 *   unquoted value.
 */
function attributePattern(attr) {
  return new RegExp(`(?<![\\w:.-])${attr}\\s*=\\s*("([^"]*)"|'([^']*)'|([^\\s>]+))`, "i");
}

/**
 * Reads an attribute from a tag string.
 *
 * @param {string} tag
 * @param {string} attr
 * @returns {string|null} The entity-decoded value, or `null` when absent.
 */
function getAttribute(tag, attr) {
  const match = tag.match(attributePattern(attr));
  if (!match) {
    return null;
  }
  return htmlDecode(match[2] ?? match[3] ?? match[4] ?? "");
}

/**
 * Sets an attribute on a tag string, replacing an existing value or adding
 * the attribute to the opening tag.
 *
 * @param {string} tag - A tag, or an element string starting with its opening tag.
 * @param {string} attr
 * @param {string} value - Escaped with `htmlEscape` and double-quoted.
 * @returns {string}
 */
function setAttribute(tag, attr, value) {
  const assignment = `${attr}="${htmlEscape(value)}"`;
  const pattern = attributePattern(attr);
  // Function replacers are used so `$&`, `$1`, ... in the value stay literal.
  if (pattern.test(tag)) {
    return tag.replace(pattern, () => assignment);
  }
  // The first `>` closes the opening tag; anchoring at the end of the string
  // would attach the attribute to a trailing `</iframe>`.
  return tag.replace(/\/?>/, (end) => ` ${assignment}${end}`);
}

/**
 * Removes the first occurrence of an attribute (and the whitespace before it).
 *
 * @param {string} tag
 * @param {string} attr
 * @returns {string}
 */
function removeAttribute(tag, attr) {
  return tag.replace(
    new RegExp(`\\s+${attr}\\s*=\\s*("([^"]*)"|'([^']*)'|([^\\s>]+))`, "i"),
    ""
  );
}

/**
 * Handles a media attribute whose asset could not be inlined: `src` on
 * `<img>`/`<input>` becomes a transparent image, anything else is removed.
 *
 * @param {string} tag
 * @param {string} attr
 * @returns {string}
 */
function replaceMissingMediaAttribute(tag, attr) {
  const tagName = tag.match(/^<([a-z0-9:-]+)/i)?.[1]?.toLowerCase() || "";
  if (attr === "src" && (tagName === "img" || tagName === "input")) {
    return setAttribute(tag, attr, TRANSPARENT_IMAGE_DATA_URI);
  }
  return removeAttribute(tag, attr);
}