// Listing metadata captured with this file: 522 lines, 16 KiB, JavaScript.
import fs from "node:fs/promises";
import path from "node:path";
import { fileURLToPath, pathToFileURL } from "node:url";

/** User-Agent string used for asset requests when the caller does not supply one. */
export const DEFAULT_USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36";
/** Media types whose bodies are decoded as UTF-8 text rather than kept as raw bytes. */
const TEXT_TYPES = new Set([
  "application/javascript",
  "application/json",
  "application/ld+json",
  "application/xml",
  "image/svg+xml",
  "text/css",
  "text/html",
  "text/javascript",
  "text/plain",
  "text/xml"
]);
/** Lower-case file extension (with leading dot) to media type, used when no content-type header is available. */
const MIME_BY_EXT = new Map([
  [".apng", "image/apng"],
  [".avif", "image/avif"],
  [".css", "text/css"],
  [".gif", "image/gif"],
  [".html", "text/html"],
  [".ico", "image/x-icon"],
  [".jpeg", "image/jpeg"],
  [".jpg", "image/jpeg"],
  [".js", "text/javascript"],
  [".json", "application/json"],
  [".m4a", "audio/mp4"],
  [".mp3", "audio/mpeg"],
  [".mp4", "video/mp4"],
  [".otf", "font/otf"],
  [".png", "image/png"],
  [".svg", "image/svg+xml"],
  [".ttf", "font/ttf"],
  [".webm", "video/webm"],
  [".webp", "image/webp"],
  [".woff", "font/woff"],
  [".woff2", "font/woff2"],
  [".xml", "application/xml"]
]);
/** GIF data URI substituted for images that could not be fetched. */
const TRANSPARENT_IMAGE_DATA_URI =
  "data:image/gif;base64,R0lGODlhAQABAAAAACwAAAAAAQABAAA=";
/**
 * Returns the directory used for archives when the caller gives none.
 *
 * `ARCHIVE_PATH` wins when set and non-empty; otherwise the result is
 * `local-page-archives` inside `TMPDIR`, or inside `/tmp` when `TMPDIR`
 * is unset or empty.
 *
 * @returns {string} Directory path.
 */
export function defaultArchivePath() {
  const { ARCHIVE_PATH, TMPDIR } = process.env;
  return ARCHIVE_PATH || path.join(TMPDIR || "/tmp", "local-page-archives");
}
/**
 * Tells whether a value starts with an `http://` or `https://` scheme
 * (case-insensitive). No further URL validation is performed.
 *
 * @param {string} value - Candidate URL.
 * @returns {boolean}
 */
export function isHttpUrl(value) {
  return /^https?:\/\//i.test(value);
}
/**
 * Tells whether a value starts with a `file://` scheme (case-insensitive).
 *
 * @param {string} value - Candidate URL.
 * @returns {boolean}
 */
export function isFileUrl(value) {
  return /^file:\/\//i.test(value);
}
/**
 * Normalizes a command-line style input to a URL string.
 *
 * Anything that already carries a URL scheme is returned untouched; every
 * other input is treated as a filesystem path, resolved against the current
 * working directory, and converted to a `file:` URL.
 *
 * @param {string} input - URL or filesystem path.
 * @returns {string} URL string.
 */
export function inputToUrl(input) {
  // "C:\page.html" and "C:/page.html" look like a one-letter scheme but are
  // Windows drive paths, so they must take the filesystem branch.
  const isDrivePath = /^[a-z]:[\\/]/i.test(input);
  if (!isDrivePath && /^[a-z][a-z0-9+.-]*:/i.test(input)) {
    return input;
  }
  return pathToFileURL(path.resolve(input)).href;
}
/**
 * Builds a filesystem-friendly, timestamped slug for a URL.
 *
 * The stem is the hostname plus pathname with every run of characters
 * outside `[a-z0-9]` collapsed to a single dash, limited to 90 characters;
 * `archive` is used when nothing is left. The current time in ISO-8601 form
 * (with `:` and `.` replaced by dashes) is appended.
 *
 * @param {string} inputUrl - Absolute URL.
 * @returns {string} Slug of the form `<stem>-<timestamp>`.
 * @throws {TypeError} When `inputUrl` cannot be parsed by `URL`.
 */
export function slugForUrl(inputUrl) {
  const url = new URL(inputUrl);
  const stem =
    `${url.hostname}${url.pathname}`
      .replace(/\/+$/, "")
      .replace(/[^a-z0-9]+/gi, "-")
      .replace(/^-+|-+$/g, "")
      .slice(0, 90)
      // Truncation can cut right after a dash; trim it so the slug never
      // contains "--" before the timestamp.
      .replace(/-+$/, "") || "archive";
  const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
  return `${stem}-${timestamp}`;
}
/**
 * Determines the base URL that relative references in a document resolve
 * against.
 *
 * Looks for the first `<base href>` in the markup and resolves it against
 * `fallbackBaseUrl`. The fallback is returned when there is no such tag or
 * its href cannot be resolved.
 *
 * @param {string} html - Document markup.
 * @param {string} fallbackBaseUrl - URL the document was loaded from.
 * @returns {string} Effective base URL.
 */
export function findEffectiveBase(html, fallbackBaseUrl) {
  const match = html.match(/<base\b[^>]*\bhref=(["']?)([^"'\s>]+)\1/i);
  if (!match) {
    return fallbackBaseUrl;
  }
  return resolveUrl(match[2], fallbackBaseUrl) || fallbackBaseUrl;
}
/**
 * Resolves a raw (possibly HTML-encoded) reference against a base URL.
 *
 * - Falsy input yields `null`.
 * - Empty references, fragment-only references and `about:`, `blob:`,
 *   `data:`, `javascript:`, `mailto:`, `tel:` URLs are returned as-is
 *   (trimmed and entity-decoded) because they cannot be fetched.
 * - Protocol-relative references (`//host/...`) default to `https:` when
 *   there is no base or the base is a `file:` URL.
 * - Anything `URL` cannot parse yields `null`.
 *
 * @param {string} rawUrl - Reference as found in markup or CSS.
 * @param {string} [baseUrl] - Base to resolve against.
 * @returns {string|null} Absolute URL, the untouched special reference, or `null`.
 */
export function resolveUrl(rawUrl, baseUrl) {
  if (!rawUrl) {
    return null;
  }
  const trimmed = htmlDecode(rawUrl.trim());
  const isUnfetchable =
    !trimmed ||
    trimmed.startsWith("#") ||
    /^(?:about|blob|data|javascript|mailto|tel):/i.test(trimmed);
  if (isUnfetchable) {
    return trimmed;
  }
  try {
    if (trimmed.startsWith("//") && (!baseUrl || /^file:/i.test(baseUrl))) {
      return `https:${trimmed}`;
    }
    return new URL(trimmed, baseUrl).href;
  } catch {
    return null;
  }
}
/**
 * Escapes a value for use in HTML text or inside a quoted attribute.
 *
 * `&` is replaced first so the entities produced by the later replacements
 * are not escaped again. The apostrophe is escaped as well because callers
 * in this file re-emit attributes using whichever quote character the
 * source markup used, including single quotes.
 *
 * @param {*} value - Value to escape; coerced with `String()`.
 * @returns {string} Escaped text.
 */
export function htmlEscape(value) {
  return String(value)
    .replaceAll("&", "&amp;")
    .replaceAll("<", "&lt;")
    .replaceAll(">", "&gt;")
    .replaceAll('"', "&quot;")
    .replaceAll("'", "&#39;");
}
/**
 * Decodes the HTML character references that commonly occur in attribute
 * values: `&amp;`, `&quot;`, `&apos;`, `&lt;`, `&gt;` and numeric references
 * in decimal (`&#39;`) or hexadecimal (`&#x27;`) form.
 *
 * Decoding happens in a single pass, so text that was escaped twice is only
 * unescaped once (`&amp;lt;` becomes `&lt;`, not `<`). Numeric references
 * that do not denote a valid, non-surrogate code point are left untouched.
 *
 * @param {*} value - Value to decode; coerced with `String()`.
 * @returns {string} Decoded text.
 */
export function htmlDecode(value) {
  const named = { amp: "&", quot: '"', apos: "'", lt: "<", gt: ">" };
  return String(value).replace(
    /&(?:(amp|quot|apos|lt|gt)|#(\d+)|#[xX]([0-9a-fA-F]+));/g,
    (entity, name, decimal, hex) => {
      if (name !== undefined) {
        return named[name];
      }
      const codePoint =
        decimal !== undefined ? Number.parseInt(decimal, 10) : Number.parseInt(hex, 16);
      const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
      if (codePoint === 0 || codePoint > 0x10ffff || isSurrogate) {
        return entity;
      }
      return String.fromCodePoint(codePoint);
    }
  );
}
/**
 * Removes the wrapper markup that an archive service adds around a captured
 * page, keeping only the captured fragment.
 *
 * The wrapper is recognized by a `<div id="CONTENT">` that is followed by a
 * `<!--[if !IE]><!--><div ...><table id="hashtags"` marker. When either is
 * missing, or they appear in the wrong order, the input is returned
 * unchanged. Otherwise a minimal document is produced from the original
 * `<title>`, the first `<style type="text/css">` block, and the captured
 * fragment: the `<div class="html1">` inside the content area when present,
 * else the content div itself.
 *
 * @param {string} html - Markup of the archived page.
 * @returns {string} Unwrapped document, or `html` unchanged.
 */
export function stripArchiveShell(html) {
  // Cheap substring test before running the regular expressions.
  if (!html.includes('id="CONTENT"') && !html.includes("id='CONTENT'")) {
    return html;
  }
  const contentStart = html.search(/<div\b[^>]*\bid=(["'])CONTENT\1[^>]*>/i);
  const marker = html.search(
    /<!--\[if !IE\]><!--><div\b[^>]*>\s*<table\b[^>]*\bid=(["'])hashtags\1/i
  );
  if (contentStart === -1 || marker === -1 || marker <= contentStart) {
    return html;
  }

  const title =
    html.match(/<title\b[^>]*>[\s\S]*?<\/title>/i)?.[0] || "<title>Archived page</title>";
  const fontStyle =
    html.match(/<style\b[^>]*type=(["'])text\/css\1[^>]*>[\s\S]*?<\/style>/i)?.[0] || "";

  const capturedStart = html
    .slice(contentStart, marker)
    .search(/<div\b[^>]*\bclass=(["'])html1\1[^>]*>/i);
  const fragmentStart = capturedStart === -1 ? contentStart : contentStart + capturedStart;
  // Never read past the marker, even when the div nesting is unbalanced.
  const fragmentEnd = findMatchingDivEnd(html, fragmentStart) || marker;
  const content = html.slice(fragmentStart, Math.min(fragmentEnd, marker));

  return `<!doctype html><html><head><meta charset="utf-8">${title}${fontStyle}</head><body style="margin:0;background:#fff">${content}</body></html>`;
}
/**
 * Finds where the `<div>` element opening at `startIndex` ends.
 *
 * Scans `<div ...>` and `</div>` tags from `startIndex` onward, counting
 * nesting depth; self-closing `<div/>` tags are ignored.
 *
 * @param {string} html - Markup to scan.
 * @param {number} startIndex - Index of the opening `<div`.
 * @returns {number|null} Index just past the closing tag that brings the
 *   depth back to zero, or `null` when no such tag exists.
 */
function findMatchingDivEnd(html, startIndex) {
  const tags = /<\/?div\b[^>]*>/gi;
  // matchAll starts from the lastIndex of the regex it is given.
  tags.lastIndex = startIndex;
  let depth = 0;
  for (const match of html.matchAll(tags)) {
    const tag = match[0];
    if (match.index < startIndex) {
      continue;
    }
    if (/^<div\b/i.test(tag) && !/\/>$/.test(tag)) {
      depth += 1;
    } else if (/^<\/div/i.test(tag)) {
      depth -= 1;
      if (depth === 0) {
        return match.index + tag.length;
      }
    }
  }
  return null;
}
/**
 * Asynchronous counterpart of `String.prototype.replace` for global regexes.
 *
 * Replacements are awaited one after another in match order, so a replacer
 * may rely on side effects of earlier invocations.
 *
 * @param {string} input - Text to search.
 * @param {RegExp} regex - Pattern; must carry the `g` flag (required by `matchAll`).
 * @param {(match: RegExpMatchArray) => (string|Promise<string>)} replacer -
 *   Receives the whole match array (not spread arguments) and returns the
 *   replacement text.
 * @returns {Promise<string>} Text with every match replaced.
 */
export async function replaceAsync(input, regex, replacer) {
  const parts = [];
  let lastIndex = 0;
  for (const match of input.matchAll(regex)) {
    parts.push(input.slice(lastIndex, match.index));
    parts.push(await replacer(match));
    lastIndex = match.index + match[0].length;
  }
  parts.push(input.slice(lastIndex));
  return parts.join("");
}
/**
 * Rewrites HTML and CSS so that referenced assets are embedded as `data:`
 * URIs (or inlined `<style>` / `srcdoc` content) instead of being loaded
 * from their original locations.
 *
 * Failures never throw out of the public methods; they are recorded in
 * `warnings` and the affected reference is dropped or replaced by a
 * placeholder.
 */
export class AssetInliner {
  /**
   * @param {object} [options]
   * @param {string} [options.userAgent] - User-Agent header; defaults to `DEFAULT_USER_AGENT`.
   * @param {string} [options.referer] - Referer header sent with every HTTP request, when given.
   * @param {number} [options.maxAssetBytes] - Largest HTTP response body that
   *   will be inlined; defaults to 30 MiB (a falsy value selects the default).
   */
  constructor(options = {}) {
    this.userAgent = options.userAgent || DEFAULT_USER_AGENT;
    this.referer = options.referer;
    this.maxAssetBytes = options.maxAssetBytes || 30 * 1024 * 1024;
    /** Absolute URL -> data URI, or `null` for assets that failed to load. */
    this.cache = new Map();
    /** Human-readable descriptions of everything that could not be inlined. */
    this.warnings = [];
  }

  /**
   * Produces a version of `html` with its assets inlined.
   *
   * Removes `<base>`, `<script>`, `<noscript>` and connection-hint `<link>`
   * tags, then rewrites style blocks, style attributes, links, iframes,
   * media elements and `srcset` attributes, in that order.
   *
   * @param {string} html - Document markup.
   * @param {string} baseUrl - URL the document was loaded from.
   * @param {object} [options]
   * @param {number} [options.depth=0] - Iframe nesting level; iframe documents
   *   are inlined only at depth 0.
   * @returns {Promise<string>} Rewritten markup.
   */
  async inlineHtml(html, baseUrl, options = {}) {
    const depth = options.depth || 0;
    const effectiveBase = findEffectiveBase(html, baseUrl);
    let output = html;

    output = output.replace(/<base\b[^>]*>/gi, "");
    output = output.replace(/<script\b[\s\S]*?<\/script>/gi, "");
    output = output.replace(/<noscript\b[\s\S]*?<\/noscript>/gi, "");
    output = output.replace(/<link\b[^>]*\brel=(["']?)(?:preconnect|dns-prefetch|modulepreload)\1[^>]*>/gi, "");

    output = await replaceAsync(output, /<style\b([^>]*)>([\s\S]*?)<\/style>/gi, async (match) => {
      const attrs = match[1] || "";
      const css = await this.inlineCss(match[2] || "", effectiveBase);
      return `<style${attrs}>${css}</style>`;
    });

    output = await replaceAsync(output, /\sstyle=(["'])([\s\S]*?)\1/gi, async (match) => {
      const css = await this.inlineCss(htmlDecode(match[2]), effectiveBase);
      return ` style=${match[1]}${htmlEscape(css)}${match[1]}`;
    });

    output = await replaceAsync(output, /<link\b[^>]*>/gi, async (match) => {
      return this.rewriteLinkTag(match[0], effectiveBase);
    });

    output = await replaceAsync(output, /<iframe\b[^>]*>[\s\S]*?<\/iframe>|<iframe\b[^>]*\/?>/gi, async (match) => {
      if (depth >= 1) {
        return this.rewriteMediaAttributes(match[0], effectiveBase);
      }
      return this.rewriteIframeTag(match[0], effectiveBase, depth);
    });

    output = await replaceAsync(
      output,
      /<(?:img|source|audio|video|track|embed|object|input)\b[^>]*>/gi,
      async (match) => this.rewriteMediaAttributes(match[0], effectiveBase)
    );

    output = removeExternalBookkeepingUrls(output);
    output = restoreArchiveProxyLinks(output);

    output = await replaceAsync(output, /srcset=(["'])([\s\S]*?)\1/gi, async (match) => {
      const rewritten = await this.inlineSrcset(match[2], effectiveBase);
      return `srcset=${match[1]}${htmlEscape(rewritten)}${match[1]}`;
    });

    return output;
  }

  /**
   * Rewrites one `<link>` tag.
   *
   * Stylesheets become `<style>` blocks with their own assets inlined;
   * icons and font/image preloads get a data-URI `href`; style preloads and
   * links whose target cannot be loaded are removed; everything else is
   * returned unchanged.
   *
   * @param {string} tag - The `<link ...>` tag.
   * @param {string} baseUrl - Base for resolving `href`.
   * @returns {Promise<string>} Replacement markup (possibly empty).
   */
  async rewriteLinkTag(tag, baseUrl) {
    const rel = getAttribute(tag, "rel") || "";
    const href = getAttribute(tag, "href");
    const asValue = getAttribute(tag, "as") || "";
    if (!href) {
      return tag;
    }

    if (/\bstylesheet\b/i.test(rel)) {
      const absolute = resolveUrl(href, baseUrl);
      if (!absolute || absolute.startsWith("data:")) {
        return "";
      }
      const css = await this.fetchText(absolute, baseUrl);
      if (css == null) {
        return "";
      }
      const inlinedCss = await this.inlineCss(css, absolute);
      return `<style data-archived-href="${htmlEscape(absolute)}">${inlinedCss}</style>`;
    }

    const isPreload = /\bpreload\b/i.test(rel);
    const isInlineableLink =
      /\b(?:icon|apple-touch-icon|image_src)\b/i.test(rel) ||
      (isPreload && /^(?:font|image|style)$/i.test(asValue));
    if (!isInlineableLink) {
      return tag;
    }
    if (isPreload && /^style$/i.test(asValue)) {
      return "";
    }
    const dataUri = await this.toDataUri(href, baseUrl);
    if (!dataUri) {
      return "";
    }
    return setAttribute(tag, "href", dataUri);
  }

  /**
   * Inlines the `src`, `poster` and `data` attributes of one tag. Attributes
   * whose asset cannot be loaded are handed to `replaceMissingMediaAttribute`.
   *
   * @param {string} tag - Element markup.
   * @param {string} baseUrl - Base for resolving the attribute values.
   * @returns {Promise<string>} Rewritten markup.
   */
  async rewriteMediaAttributes(tag, baseUrl) {
    let output = tag;
    for (const attr of ["src", "poster", "data"]) {
      const value = getAttribute(output, attr);
      if (!value) {
        continue;
      }
      const dataUri = await this.toDataUri(value, baseUrl);
      output = dataUri
        ? setAttribute(output, attr, dataUri)
        : replaceMissingMediaAttribute(output, attr);
    }
    return output;
  }

  /**
   * Replaces an iframe's `src` with a `srcdoc` holding the inlined framed
   * document. Falls back to `rewriteMediaAttributes` when the iframe has no
   * `src`, already has a `srcdoc`, or its document cannot be loaded as text.
   *
   * @param {string} tag - `<iframe ...>` tag, optionally followed by its
   *   content and closing tag.
   * @param {string} baseUrl - Base for resolving `src`.
   * @param {number} depth - Nesting level of the document containing the iframe.
   * @returns {Promise<string>} Rewritten markup.
   */
  async rewriteIframeTag(tag, baseUrl, depth) {
    const src = getAttribute(tag, "src");
    if (!src || getAttribute(tag, "srcdoc")) {
      return this.rewriteMediaAttributes(tag, baseUrl);
    }
    const absolute = resolveUrl(src, baseUrl);
    if (!absolute || absolute.startsWith("data:")) {
      return tag;
    }
    const text = await this.fetchText(absolute, baseUrl);
    if (text == null) {
      return this.rewriteMediaAttributes(tag, baseUrl);
    }
    const inlined = await this.inlineHtml(text, absolute, { depth: depth + 1 });
    // `tag` may be the whole element including `</iframe>`. Attributes are
    // edited on the opening tag only so new ones cannot land on the closing tag.
    const opening = tag.match(/^<iframe\b[^>]*>/i)?.[0] ?? tag;
    const remainder = tag.slice(opening.length);
    let rewritten = removeAttribute(opening, "src");
    rewritten = setAttribute(rewritten, "srcdoc", inlined);
    rewritten = setAttribute(rewritten, "data-archived-src", absolute);
    return `${rewritten}${remainder}`;
  }

  /**
   * Splits a `srcset` value into image candidates.
   *
   * A URL is a run of non-whitespace characters, so commas inside a URL
   * (as in `data:image/png;base64,...`) do not end the candidate; only a
   * comma at the end of the URL or after the descriptor does.
   *
   * @param {string} value - Raw `srcset` attribute value.
   * @returns {Array<{url: string, descriptor: string}>} Candidates in source order.
   */
  parseSrcset(value) {
    const candidates = [];
    const length = value.length;
    let position = 0;
    while (position < length) {
      while (position < length && /[\s,]/.test(value[position])) {
        position += 1;
      }
      if (position >= length) {
        break;
      }
      const urlStart = position;
      while (position < length && !/\s/.test(value[position])) {
        position += 1;
      }
      let url = value.slice(urlStart, position);
      let descriptor = "";
      if (url.endsWith(",")) {
        // "a.png, b.png 2x": the comma terminates a candidate without descriptor.
        url = url.replace(/,+$/, "");
      } else {
        const descriptorStart = position;
        while (position < length && value[position] !== ",") {
          position += 1;
        }
        descriptor = value.slice(descriptorStart, position).trim().split(/\s+/).join(" ");
      }
      if (url) {
        candidates.push({ url, descriptor });
      }
    }
    return candidates;
  }

  /**
   * Inlines every candidate of a `srcset` value. Candidates that cannot be
   * loaded are replaced by a transparent placeholder image so the
   * descriptors stay valid.
   *
   * @param {string} value - Raw `srcset` attribute value.
   * @param {string} baseUrl - Base for resolving candidate URLs.
   * @returns {Promise<string>} Rewritten `srcset` value.
   */
  async inlineSrcset(value, baseUrl) {
    const rewritten = [];
    for (const { url, descriptor } of this.parseSrcset(value)) {
      const dataUri = (await this.toDataUri(url, baseUrl)) || TRANSPARENT_IMAGE_DATA_URI;
      rewritten.push(descriptor ? `${dataUri} ${descriptor}` : dataUri);
    }
    return rewritten.join(", ");
  }

  /**
   * Inlines `@import` rules and `url(...)` references in a stylesheet.
   *
   * @param {string} css - Stylesheet text.
   * @param {string} baseUrl - URL the stylesheet was loaded from.
   * @param {Set<string>} [importChain] - URLs of the stylesheets currently
   *   being expanded; used to stop circular `@import` chains. Callers
   *   normally omit it.
   * @returns {Promise<string>} Rewritten stylesheet text.
   */
  async inlineCss(css, baseUrl, importChain = new Set([baseUrl])) {
    let output = await replaceAsync(
      css,
      /@import\s+(?:url\()?["']?([^"')\s;]+)["']?\)?[^;]*;/gi,
      async (match) => {
        const absolute = resolveUrl(match[1], baseUrl);
        if (!absolute || absolute.startsWith("data:")) {
          return "";
        }
        if (importChain.has(absolute)) {
          // a.css -> b.css -> a.css would otherwise recurse without end.
          this.warnings.push(`Skipped circular @import of ${absolute}`);
          return "";
        }
        const imported = await this.fetchText(absolute, baseUrl);
        if (imported == null) {
          return "";
        }
        return this.inlineCss(imported, absolute, new Set([...importChain, absolute]));
      }
    );

    output = await replaceAsync(output, /url\(\s*(["']?)([^"')]+)\1\s*\)/gi, async (match) => {
      const raw = htmlDecode(match[2].trim());
      if (!raw || raw.startsWith("#") || /^(?:data|blob|about|javascript):/i.test(raw)) {
        return match[0];
      }
      const dataUri = await this.toDataUri(raw, baseUrl);
      return dataUri ? `url("${dataUri}")` : "url(about:blank)";
    });

    return output;
  }

  /**
   * Loads an asset and encodes it as a base64 `data:` URI. Results,
   * including failures, are cached per absolute URL.
   *
   * @param {string} rawUrl - Reference as found in markup or CSS.
   * @param {string} baseUrl - Base for resolving `rawUrl`.
   * @returns {Promise<string|null>} The data URI; the reference itself when
   *   it is already a `data:` URI; whatever `resolveUrl` returned when that
   *   was falsy; or `null` when the asset could not be loaded.
   */
  async toDataUri(rawUrl, baseUrl) {
    const absolute = resolveUrl(rawUrl, baseUrl);
    if (!absolute || absolute.startsWith("data:")) {
      return absolute;
    }
    if (this.cache.has(absolute)) {
      return this.cache.get(absolute);
    }
    const asset = await this.fetchAsset(absolute, baseUrl);
    const dataUri = asset
      ? `data:${asset.contentType};base64,${asset.bytes.toString("base64")}`
      : null;
    this.cache.set(absolute, dataUri);
    return dataUri;
  }

  /**
   * Loads an asset and returns it as UTF-8 text, provided its media type is
   * listed in `TEXT_TYPES` or ends in `+xml`.
   *
   * @param {string} rawUrl - Reference to load.
   * @param {string} baseUrl - Base for resolving `rawUrl`.
   * @returns {Promise<string|null>} Text, or `null` when unavailable or not textual.
   */
  async fetchText(rawUrl, baseUrl) {
    const asset = await this.fetchAsset(rawUrl, baseUrl);
    if (!asset) {
      return null;
    }
    const contentType = asset.contentType.split(";")[0].trim().toLowerCase();
    if (!TEXT_TYPES.has(contentType) && !contentType.endsWith("+xml")) {
      return null;
    }
    return asset.bytes.toString("utf8");
  }

  /**
   * Loads the bytes of an asset from a `file:` or `http(s):` URL.
   *
   * HTTP requests time out after 30 seconds (covering the body download) and
   * are attempted twice when the request itself fails; HTTP error statuses
   * are not retried. `file:` URLs referenced from http(s) content are
   * refused so a remote page cannot pull local files into the archive.
   *
   * @param {string} rawUrl - Reference to load.
   * @param {string} [baseUrl] - Base for resolving `rawUrl`.
   * @returns {Promise<{bytes: Buffer, contentType: string}|null>} The asset,
   *   or `null` on any failure (a warning is recorded for most failures).
   */
  async fetchAsset(rawUrl, baseUrl) {
    const absolute = resolveUrl(rawUrl, baseUrl);
    if (!absolute || absolute.startsWith("data:")) {
      return null;
    }
    try {
      if (isFileUrl(absolute)) {
        if (baseUrl && isHttpUrl(baseUrl)) {
          this.warnings.push(`Skipped ${absolute}: local file referenced from remote content`);
          return null;
        }
        const bytes = await fs.readFile(fileURLToPath(absolute));
        return { bytes, contentType: mimeFromUrl(absolute) };
      }
      if (!isHttpUrl(absolute)) {
        return null;
      }
      for (let attempt = 1; attempt <= 2; attempt += 1) {
        const controller = new AbortController();
        const timeout = setTimeout(() => controller.abort(), 30000);
        try {
          const response = await fetch(absolute, {
            headers: {
              "user-agent": this.userAgent,
              accept: "*/*",
              ...(this.referer ? { referer: this.referer } : {})
            },
            redirect: "follow",
            signal: controller.signal
          });
          if (!response.ok) {
            this.warnings.push(`Failed to fetch ${absolute}: HTTP ${response.status}`);
            return null;
          }
          // Reject oversized responses before downloading them when the
          // server announces the size up front.
          const declaredBytes = Number(response.headers.get("content-length"));
          if (Number.isFinite(declaredBytes) && declaredBytes > this.maxAssetBytes) {
            controller.abort();
            this.warnings.push(`Skipped ${absolute}: ${declaredBytes} bytes exceeds ${this.maxAssetBytes}`);
            return null;
          }
          const arrayBuffer = await response.arrayBuffer();
          if (arrayBuffer.byteLength > this.maxAssetBytes) {
            this.warnings.push(`Skipped ${absolute}: ${arrayBuffer.byteLength} bytes exceeds ${this.maxAssetBytes}`);
            return null;
          }
          return {
            bytes: Buffer.from(arrayBuffer),
            contentType:
              response.headers.get("content-type")?.split(";")[0].trim() || mimeFromUrl(absolute)
          };
        } catch (error) {
          if (attempt < 2) {
            continue;
          }
          this.warnings.push(`Failed to fetch ${absolute}: ${error.message}`);
          return null;
        } finally {
          // Cleared only once the body has been read (or the attempt failed),
          // so the timeout also bounds a stalled download.
          clearTimeout(timeout);
        }
      }
      return null;
    } catch (error) {
      this.warnings.push(`Failed to fetch ${absolute}: ${error.message}`);
      return null;
    }
  }
}
/**
 * Drops the attributes `old-src`, `currentSourceUrl`, `data-original-src`
 * and `data-archived-src` wherever their quoted value is an http(s) URL, so
 * the output carries no leftover references to the original asset
 * locations.
 *
 * @param {string} html - Markup to clean.
 * @returns {string} Markup without those attributes.
 */
function removeExternalBookkeepingUrls(html) {
  return html.replace(
    /\s(?:old-src|currentSourceUrl|data-original-src|data-archived-src)=(["'])(https?:\/\/[\s\S]*?)\1/gi,
    ""
  );
}
/**
 * Rewrites quoted `href` and `action` attributes that point through an
 * archive proxy back to their original targets.
 *
 * Attributes whose value is not a proxy URL are left exactly as written.
 *
 * @param {string} html - Markup to rewrite.
 * @returns {string} Markup with restored link targets.
 */
function restoreArchiveProxyLinks(html) {
  return html.replace(/\s(href|action)=(["'])([\s\S]*?)\2/gi, (full, attr, quote, rawValue) => {
    const decoded = htmlDecode(rawValue);
    const restored = restoreArchiveProxyUrl(decoded);
    // Compare against the decoded value: comparing against the raw, still
    // entity-encoded text would re-serialize every attribute that merely
    // contains an entity.
    if (restored === decoded) {
      return full;
    }
    // The original quote character is reused, so apostrophes must be
    // escaped even when the escaping helper leaves them alone.
    const escaped = htmlEscape(restored).replaceAll("'", "&#39;");
    return ` ${attr}=${quote}${escaped}${quote}`;
  });
}
/**
 * Maps an archive-service URL back to what it originally pointed at.
 *
 * - `https://archive.<tld>/o/<id>/<original-url>` becomes the original URL,
 *   percent-decoded.
 * - `https://archive.<tld>/<id>#fragment` becomes `#fragment`.
 * - Anything else is returned unchanged (untrimmed).
 *
 * @param {string} rawValue - Attribute value, already entity-decoded.
 * @returns {string} Restored URL or the input.
 */
function restoreArchiveProxyUrl(rawValue) {
  const value = rawValue.trim();
  const archiveHost = "archive\\.(?:ph|today|is|li|md|fo|vn|pm)";

  const proxied = value.match(new RegExp(`^https?://${archiveHost}/o/[^/]+/(https?://.+)$`, "i"));
  if (proxied) {
    return safeDecodeUrl(proxied[1]);
  }

  const samePage = value.match(new RegExp(`^https?://${archiveHost}/[^/#?]+(#.+)$`, "i"));
  if (samePage) {
    return samePage[1];
  }

  return rawValue;
}
/**
 * Percent-decodes a string, returning it unchanged when it contains a
 * malformed escape sequence instead of throwing.
 *
 * @param {string} value - Possibly percent-encoded text.
 * @returns {string} Decoded text, or `value` when decoding fails.
 */
function safeDecodeUrl(value) {
  try {
    return decodeURIComponent(value);
  } catch {
    // decodeURIComponent throws URIError on malformed sequences such as "%E0%A4%A".
    return value;
  }
}
/**
 * Guesses a media type from the file extension of a URL's path.
 *
 * @param {string} rawUrl - Absolute URL; a string that `URL` cannot parse is
 *   treated as a plain path.
 * @returns {string} Media type from `MIME_BY_EXT`, or
 *   `application/octet-stream` for unknown extensions.
 */
function mimeFromUrl(rawUrl) {
  let pathname = rawUrl;
  try {
    // Using the parsed pathname keeps query strings and fragments out of
    // the extension lookup.
    pathname = new URL(rawUrl).pathname;
  } catch {
    // Not an absolute URL: fall back to the raw string.
  }
  const extension = path.extname(pathname).toLowerCase();
  return MIME_BY_EXT.get(extension) || "application/octet-stream";
}
/**
 * Reads an attribute value from tag markup.
 *
 * The attribute name must be preceded by whitespace so that, for example,
 * asking for `src` does not pick up `data-src`. Double-quoted,
 * single-quoted and unquoted values are supported; the value is returned
 * entity-decoded.
 *
 * @param {string} tag - Tag markup.
 * @param {string} attr - Attribute name (case-insensitive); interpolated
 *   into a regular expression, so it must not contain regex metacharacters.
 * @returns {string|null} Decoded value, or `null` when the attribute is absent.
 */
function getAttribute(tag, attr) {
  const pattern = new RegExp(`(?<=\\s)${attr}\\s*=\\s*("([^"]*)"|'([^']*)'|([^\\s>]+))`, "i");
  const match = tag.match(pattern);
  if (!match) {
    return null;
  }
  return htmlDecode(match[2] ?? match[3] ?? match[4] ?? "");
}
/**
 * Sets an attribute on the opening tag of the given markup, replacing an
 * existing value or appending the attribute when it is absent.
 *
 * `tag` may be a lone tag or a whole element (`<iframe ...>...</iframe>`);
 * only the opening tag is touched. The value is HTML-escaped and always
 * written double-quoted.
 *
 * @param {string} tag - Tag or element markup.
 * @param {string} attr - Attribute name; interpolated into a regular
 *   expression, so it must not contain regex metacharacters.
 * @param {*} value - New value.
 * @returns {string} Updated markup.
 */
function setAttribute(tag, attr, value) {
  const assignment = `${attr}="${htmlEscape(value)}"`;
  const openingEnd = indexOfOpeningTagEnd(tag);
  const opening = openingEnd === -1 ? tag : tag.slice(0, openingEnd + 1);
  const remainder = openingEnd === -1 ? "" : tag.slice(openingEnd + 1);

  // The name must follow whitespace so "src" does not match inside "data-src".
  const attrRegex = new RegExp(`(?<=\\s)${attr}\\s*=\\s*("([^"]*)"|'([^']*)'|([^\\s>]+))`, "i");
  if (attrRegex.test(opening)) {
    // A replacer function keeps "$&", "$1" and similar sequences in the
    // value from being interpreted as replacement patterns.
    return `${opening.replace(attrRegex, () => assignment)}${remainder}`;
  }
  if (openingEnd === -1) {
    return tag;
  }
  const closer = opening.endsWith("/>") ? "/>" : ">";
  const head = opening.slice(0, opening.length - closer.length);
  return `${head} ${assignment}${closer}${remainder}`;
}

/**
 * Returns the index of the `>` that ends the first tag in `markup`,
 * skipping any `>` inside quoted attribute values. Falls back to the first
 * `>` when quotes are unbalanced; returns -1 when there is none.
 */
function indexOfOpeningTagEnd(markup) {
  let quote = null;
  for (let index = 0; index < markup.length; index += 1) {
    const char = markup[index];
    if (quote) {
      if (char === quote) {
        quote = null;
      }
    } else if (char === '"' || char === "'") {
      quote = char;
    } else if (char === ">") {
      return index;
    }
  }
  return markup.indexOf(">");
}
/**
 * Removes the first occurrence of an attribute, together with the
 * whitespace in front of it, from tag markup.
 *
 * @param {string} tag - Tag markup.
 * @param {string} attr - Attribute name (case-insensitive); interpolated
 *   into a regular expression, so it must not contain regex metacharacters.
 * @returns {string} Markup without the attribute; unchanged when it is absent.
 */
function removeAttribute(tag, attr) {
  const pattern = new RegExp(`\\s+${attr}\\s*=\\s*("([^"]*)"|'([^']*)'|([^\\s>]+))`, "i");
  return tag.replace(pattern, "");
}
/**
 * Handles a media attribute whose asset could not be inlined.
 *
 * The `src` of `<img>` and `<input>` elements is pointed at a transparent
 * placeholder image; every other attribute is removed from the tag.
 *
 * @param {string} tag - Tag markup.
 * @param {string} attr - Name of the attribute that could not be inlined.
 * @returns {string} Updated markup.
 */
function replaceMissingMediaAttribute(tag, attr) {
  const tagName = tag.match(/^<([a-z0-9:-]+)/i)?.[1]?.toLowerCase() || "";
  const needsPlaceholder = attr === "src" && (tagName === "img" || tagName === "input");
  if (needsPlaceholder) {
    return setAttribute(tag, attr, TRANSPARENT_IMAGE_DATA_URI);
  }
  return removeAttribute(tag, attr);
}