396 lines
12 KiB
JavaScript
396 lines
12 KiB
JavaScript
|
|
import fs from "node:fs/promises";
|
||
|
|
import path from "node:path";
|
||
|
|
import { createRequire } from "node:module";
|
||
|
|
import {
|
||
|
|
AssetInliner,
|
||
|
|
DEFAULT_USER_AGENT,
|
||
|
|
defaultArchivePath,
|
||
|
|
findEffectiveBase,
|
||
|
|
htmlEscape,
|
||
|
|
inputToUrl,
|
||
|
|
isFileUrl,
|
||
|
|
isHttpUrl,
|
||
|
|
slugForUrl,
|
||
|
|
stripArchiveShell
|
||
|
|
} from "./asset-inliner.mjs";
|
||
|
|
|
||
|
|
// CommonJS-style `require` bound to this module's URL. ES modules have no
// built-in `require`; loadPlaywright() uses this one to load "playwright"
// at call time inside a try/catch instead of through a static import.
const require = createRequire(import.meta.url);
|
||
|
|
|
||
|
|
// Re-export two names imported from ./asset-inliner.mjs so they are also
// available from this module.
export { DEFAULT_USER_AGENT, defaultArchivePath };
|
||
|
|
|
||
|
|
/**
 * CSS selectors whose matches are removed from the rendered DOM by
 * cleanupAndFreezePage() when ad stripping is enabled.
 */
const AD_SELECTORS = [
  // Attribute-based matches.
  "[data-ad-status]",
  "[data-ad-type]",
  "[aria-label*='advertisement' i]",

  // Id / class name patterns.
  "[id^='leaderboard']",
  "[class*='LeaderboardAd_']",
  "[class*='FullWidthAd_']",
  "[class*='BaseAd_']",
  ".adWrapper",
  ".dvz-v0-ad",

  // Element and iframe-source matches.
  "amp-ad",
  "iframe[src*='doubleclick']",
  "iframe[src*='googletagmanager']",
  "iframe[src*='googlesyndication']"
];
|
||
|
|
|
||
|
|
/**
 * Hostnames whose requests are aborted while rendering (when ad stripping is
 * enabled). isTrackerUrl() treats a request as a tracker when its hostname
 * equals an entry or ends with "." + entry, so each entry also covers its
 * subdomains.
 */
const TRACKER_HOST_PATTERNS = [
  // Google ad / analytics hosts.
  "doubleclick.net",
  "googletagmanager.com",
  "googlesyndication.com",
  "google-analytics.com",

  // Other hosts.
  "pub.doubleverify.com",
  "securepubads.g.doubleclick.net",
  "s10.histats.com",
  "sstatic1.histats.com"
];
|
||
|
|
|
||
|
|
/**
 * Archives one page as a single HTML file on disk.
 *
 * Steps: resolve `input` to a URL, make sure the archive directory exists,
 * obtain the page HTML (static passthrough or a browser render), inline its
 * assets, insert an archive comment and write `<archivePath>/<id>.html`.
 *
 * @param {string} input - Page reference; converted with inputToUrl().
 * @param {object} [options] - Forwarded as a whole to readInputHtml(),
 *   prepareStaticHtml(), renderPage() and addArchiveComment(). Read here:
 *   `archivePath` (defaults to defaultArchivePath()), `id` (defaults to
 *   slugForUrl(sourceUrl)), `static`, `stripArchiveShell`, `render`,
 *   `userAgent` (defaults to DEFAULT_USER_AGENT) and `maxAssetBytes`.
 * @returns {Promise<object>} `{ id, filePath, sourceUrl, archivePath,
 *   warnings, externalAssets }` where `warnings` comes from the inliner and
 *   `externalAssets` from findExternalAssetRefs() on the written HTML.
 */
export async function archivePage(input, options = {}) {
  const sourceUrl = inputToUrl(input);
  const targetDir = options.archivePath || defaultArchivePath();
  const archiveId = options.id || slugForUrl(sourceUrl);
  const outputFile = path.join(targetDir, `${archiveId}.html`);
  await fs.mkdir(targetDir, { recursive: true });

  const rawHtml = await readInputHtml(sourceUrl, options);
  const baseUrl = rawHtml ? findEffectiveBase(rawHtml, sourceUrl) : sourceUrl;

  // Static mode is either requested explicitly, or implied for a local file
  // whose archive shell is being stripped unless a render was forced.
  let snapshotHtml;
  if (options.static || (isFileUrl(sourceUrl) && options.stripArchiveShell && options.render !== true)) {
    snapshotHtml = prepareStaticHtml(rawHtml, options);
  } else {
    snapshotHtml = await renderPage(sourceUrl, { ...options, rawHtml, baseUrl });
  }

  const inliner = new AssetInliner({
    userAgent: options.userAgent || DEFAULT_USER_AGENT,
    // A referer is only sent when the source itself is an HTTP(S) URL.
    referer: isHttpUrl(sourceUrl) ? sourceUrl : undefined,
    maxAssetBytes: options.maxAssetBytes
  });
  const inlinedHtml = await inliner.inlineHtml(snapshotHtml, baseUrl);
  const finalHtml = addArchiveComment(inlinedHtml, sourceUrl, options);
  await fs.writeFile(outputFile, finalHtml, "utf8");

  return {
    id: archiveId,
    filePath: outputFile,
    sourceUrl,
    archivePath: targetDir,
    warnings: inliner.warnings,
    externalAssets: findExternalAssetRefs(finalHtml)
  };
}
|
||
|
|
|
||
|
|
/**
 * Reads the raw HTML for a source URL without rendering it.
 *
 * - File URLs are read from disk as UTF-8.
 * - HTTP(S) URLs are fetched only when `options.static` is truthy.
 * - Every other case yields `null`.
 *
 * @param {string} sourceUrl - URL of the page.
 * @param {object} [options] - Reads `static` and `userAgent` (defaults to
 *   DEFAULT_USER_AGENT).
 * @returns {Promise<string|null>} The HTML text, or `null` when nothing was read.
 * @throws {Error} When the HTTP response status is not OK.
 */
export async function readInputHtml(sourceUrl, options = {}) {
  if (isFileUrl(sourceUrl)) {
    return fs.readFile(new URL(sourceUrl), "utf8");
  }

  const shouldFetch = isHttpUrl(sourceUrl) && options.static;
  if (!shouldFetch) {
    return null;
  }

  const requestHeaders = {
    "user-agent": options.userAgent || DEFAULT_USER_AGENT,
    accept: "text/html,application/xhtml+xml"
  };
  const response = await fetch(sourceUrl, { headers: requestHeaders, redirect: "follow" });
  if (response.ok) {
    return response.text();
  }
  throw new Error(`Failed to fetch ${sourceUrl}: HTTP ${response.status}`);
}
|
||
|
|
|
||
|
|
/**
 * Returns the HTML to archive in static (non-rendered) mode.
 *
 * @param {string|null} rawHtml - HTML read from a file or fetched over HTTP.
 * @param {object} [options] - When `stripArchiveShell` is truthy the HTML is
 *   passed through stripArchiveShell() first.
 * @returns {string} The HTML to hand to the inliner.
 * @throws {Error} When `rawHtml` is empty or missing.
 */
function prepareStaticHtml(rawHtml, options = {}) {
  if (rawHtml) {
    if (options.stripArchiveShell) {
      return stripArchiveShell(rawHtml);
    }
    return rawHtml;
  }
  throw new Error("Static mode requires an HTML input file or fetched HTML document.");
}
|
||
|
|
|
||
|
|
/**
 * Renders a page in headless Chromium (via Playwright) and returns the
 * serialized DOM after settling and cleanup.
 *
 * When `options.rawHtml` is present and the source is a file URL, the HTML is
 * loaded with `page.setContent()` (after prepareRenderInputHtml()) instead of
 * navigating, and JavaScript is disabled unless `options.javaScriptEnabled`
 * says otherwise.
 *
 * @param {string} sourceUrl - URL to render.
 * @param {object} [options] - Reads `javaScriptEnabled`, `rawHtml`,
 *   `userAgent`, `viewportWidth` (default 1024), `viewportHeight` (default
 *   768), `stripAds` (tracker requests are aborted unless it is `false`) and
 *   `timeoutMs` (default 60000). Also forwarded to prepareRenderInputHtml(),
 *   settlePage() and cleanupAndFreezePage().
 * @returns {Promise<string>} The result of `page.content()`.
 */
export async function renderPage(sourceUrl, options = {}) {
  const { chromium } = loadPlaywright();
  const browser = await chromium.launch({ headless: true });

  try {
    const rendersRawHtml = Boolean(options.rawHtml && isFileUrl(sourceUrl));
    const loadOptions = { waitUntil: "domcontentloaded", timeout: options.timeoutMs || 60000 };

    const context = await browser.newContext({
      javaScriptEnabled: options.javaScriptEnabled ?? !rendersRawHtml,
      userAgent: options.userAgent || DEFAULT_USER_AGENT,
      viewport: {
        width: options.viewportWidth || 1024,
        height: options.viewportHeight || 768
      }
    });
    const page = await context.newPage();

    if (options.stripAds !== false) {
      // Abort requests to known tracker hosts; let everything else through.
      await page.route("**/*", (route) =>
        isTrackerUrl(route.request().url()) ? route.abort() : route.continue()
      );
    }

    if (rendersRawHtml) {
      await page.setContent(prepareRenderInputHtml(options.rawHtml, options), loadOptions);
    } else {
      await page.goto(sourceUrl, loadOptions);
    }

    await settlePage(page, options);
    await cleanupAndFreezePage(page, options);
    return await page.content();
  } finally {
    // Always release the browser, including when navigation or cleanup throws.
    await browser.close();
  }
}
|
||
|
|
|
||
|
|
/**
 * Gives a loaded page a chance to finish loading content before capture.
 *
 * First waits (best effort) for the "networkidle" load state, then, unless
 * `options.scroll` is `false`, scrolls the page down in steps inside the
 * browser and returns to the top.
 *
 * @param {object} page - Playwright page.
 * @param {object} options - Reads `networkIdleTimeoutMs` (default 15000) and
 *   `scroll`.
 * @returns {Promise<void>}
 */
async function settlePage(page, options) {
  try {
    const idleTimeout = options.networkIdleTimeoutMs || 15000;
    await page.waitForLoadState("networkidle", { timeout: idleTimeout });
  } catch {
    // Pages with long-lived connections may never go idle; the DOM can still be captured.
  }

  if (options.scroll === false) {
    return;
  }

  // This callback runs inside the page, so it must not reference anything
  // from the Node-side scope.
  await page.evaluate(
    () =>
      new Promise((done) => {
        const readScrollTop = () => document.scrollingElement?.scrollTop || window.scrollY;
        // Step by 80% of the viewport height, but never less than 400px.
        const stride = Math.max(400, Math.floor(window.innerHeight * 0.8));
        let distance = 0;

        const ticker = setInterval(() => {
          const before = readScrollTop();
          window.scrollBy(0, stride);
          distance += stride;
          const after = readScrollTop();

          // Keep going while the page still moves and the scrolled distance
          // has not passed max(document height, 20000px).
          if (after !== before && !(distance > Math.max(document.body.scrollHeight, 20000))) {
            return;
          }

          clearInterval(ticker);
          window.scrollTo(0, 0);
          done();
        }, 120);
      })
  );
}
|
||
|
|
|
||
|
|
/**
 * Runs one in-page script that turns the live DOM into a static snapshot.
 *
 * In order, the script:
 *  1. optionally replaces the body with a clone of `#CONTENT .html1` (or
 *     `#CONTENT`) when archive-shell stripping is requested,
 *  2. removes scripts, noscript blocks, connection-hint links and the
 *     `next-head-count` meta tag,
 *  3. removes elements matching AD_SELECTORS when ad stripping is enabled,
 *  4. pins images and media to the source the browser actually chose,
 *  5. records iframe sources and snapshots accessible frames into `srcdoc`,
 *  6. strips `on*`, `integrity` and `nonce` attributes from up to
 *     `maxSanitizeElements` elements,
 *  7. writes every element's computed style into its `style` attribute,
 *     unless disabled, the walk was cut short, or more than
 *     `maxFreezeElements` elements were visited.
 *
 * @param {object} page - Playwright page.
 * @param {object} options - Reads `freezeStyles`, `maxFreezeElements`
 *   (default 2500), `maxSanitizeElements` (default 5000), `stripAds` and
 *   `stripArchiveShell`.
 * @returns {Promise<void>}
 */
async function cleanupAndFreezePage(page, options) {
  const settings = {
    adSelectors: AD_SELECTORS,
    freezeStyles: options.freezeStyles !== false,
    maxFreezeElements: options.maxFreezeElements || 2500,
    maxSanitizeElements: options.maxSanitizeElements || 5000,
    stripAds: options.stripAds !== false,
    stripArchiveShell: Boolean(options.stripArchiveShell)
  };

  // The callback executes in the browser: it may only use its argument and
  // page globals, never names from this module.
  await page.evaluate((config) => {
    const {
      adSelectors,
      freezeStyles,
      maxFreezeElements,
      maxSanitizeElements,
      stripAds,
      stripArchiveShell: shouldStripArchiveShell
    } = config;

    const dropMatches = (selector) => {
      for (const node of document.querySelectorAll(selector)) {
        node.remove();
      }
    };

    if (shouldStripArchiveShell) {
      const shellContent = document.querySelector("#CONTENT .html1") || document.querySelector("#CONTENT");
      if (shellContent) {
        // Keep only the wrapped page content and drop shell-level root attributes.
        document.body.innerHTML = "";
        document.body.appendChild(shellContent.cloneNode(true));
        for (const name of ["prefix", "itemscope", "itemtype"]) {
          document.documentElement.removeAttribute(name);
        }
      }
    }

    const alwaysRemoved = [
      "script",
      "noscript",
      "link[rel='preconnect'],link[rel='dns-prefetch'],link[rel='modulepreload']",
      "meta[name='next-head-count']"
    ];
    alwaysRemoved.forEach(dropMatches);

    if (stripAds) {
      adSelectors.forEach((selector) => {
        try {
          dropMatches(selector);
        } catch {
          // A selector the engine cannot parse is skipped rather than fatal.
        }
      });
    }

    for (const img of document.querySelectorAll("img")) {
      const chosenSrc = img.currentSrc;
      if (chosenSrc) {
        // Remember the authored src, then pin the image to what was actually loaded.
        img.setAttribute("data-original-src", img.getAttribute("src") || "");
        img.setAttribute("src", chosenSrc);
      }
      img.removeAttribute("srcset");
      img.removeAttribute("sizes");
      img.setAttribute("loading", "lazy");
    }

    for (const source of document.querySelectorAll("source")) {
      source.removeAttribute("srcset");
    }

    for (const media of document.querySelectorAll("video,audio")) {
      if (media.currentSrc) {
        media.setAttribute("src", media.currentSrc);
      }
    }

    for (const frame of document.querySelectorAll("iframe")) {
      const frameSrc = frame.getAttribute("src");
      if (frameSrc) {
        frame.setAttribute("data-archived-src", frameSrc);
      }
      try {
        const frameRoot = frame.contentDocument?.documentElement;
        if (frameRoot) {
          frame.setAttribute("srcdoc", "<!doctype html>" + frameRoot.outerHTML);
          frame.removeAttribute("src");
        }
      } catch {
        // Frames whose document cannot be read are left for the Node-side inliner.
      }
    }

    // Walk the document in tree order, starting at the root element itself.
    const sanitized = [];
    const walker = document.createTreeWalker(document.documentElement, NodeFilter.SHOW_ELEMENT);
    const isUnsafeAttribute = (name) => /^on/i.test(name) || name === "integrity" || name === "nonce";
    let pending = document.documentElement;
    for (; pending && sanitized.length < maxSanitizeElements; pending = walker.nextNode()) {
      sanitized.push(pending);
      const unsafe = Array.from(pending.attributes).filter((attr) => isUnsafeAttribute(attr.name));
      for (const attr of unsafe) {
        pending.removeAttribute(attr.name);
      }
    }

    // A non-null `pending` means the walk stopped at the sanitize limit, so
    // the element list is incomplete; styles are not frozen in that case.
    const walkTruncated = Boolean(pending);
    if (!freezeStyles || walkTruncated || sanitized.length > maxFreezeElements) {
      return;
    }

    for (const node of sanitized) {
      if (node.tagName === "SCRIPT" || node.tagName === "STYLE") {
        continue;
      }
      const computed = window.getComputedStyle(node);
      const declarations = [];
      for (let index = 0; index < computed.length; index += 1) {
        const property = computed[index];
        const value = computed.getPropertyValue(property);
        if (value) {
          declarations.push(`${property}:${value}`);
        }
      }
      if (declarations.length) {
        node.setAttribute("style", declarations.join(";"));
      }
    }
  }, settings);
}
|
||
|
|
|
||
|
|
/**
 * Prepares raw HTML for `page.setContent()`: optionally strips the archive
 * shell, removes `<script>` / `<noscript>` blocks and, when a base URL is
 * known and the document has no `<base>` tag, injects one so relative
 * references resolve.
 *
 * End tags are matched the way an HTML parser recognises them: `</script`
 * followed by whitespace or `/` and anything up to `>` still closes the
 * element. Matching only the exact text `</script>` would leave blocks such
 * as `<script>…</script >` in the document.
 *
 * @param {string} rawHtml - HTML to prepare.
 * @param {object} [options] - Reads `stripArchiveShell` and `baseUrl`.
 * @returns {string} The prepared HTML.
 */
function prepareRenderInputHtml(rawHtml, options = {}) {
  const withoutShell = options.stripArchiveShell ? stripArchiveShell(rawHtml) : rawHtml;
  const html = withoutShell
    .replace(/<script\b[\s\S]*?<\/script(?:[\s/][^>]*)?>/gi, "")
    .replace(/<noscript\b[\s\S]*?<\/noscript(?:[\s/][^>]*)?>/gi, "");

  // Nothing to inject without a base URL; an existing <base> is left in charge.
  if (!options.baseUrl || /<base\b/i.test(html)) {
    return html;
  }

  const baseTag = `<base href="${htmlEscape(options.baseUrl)}">`;
  const headOpenTag = /<head\b[^>]*>/i;
  if (headOpenTag.test(html)) {
    // A callback keeps "$" sequences in the base URL from being read as replacement patterns.
    return html.replace(headOpenTag, (match) => `${match}${baseTag}`);
  }
  return `${baseTag}${html}`;
}
|
||
|
|
|
||
|
|
/**
 * Loads the "playwright" package on demand.
 *
 * The package is required lazily so that code paths which never render do
 * not need it installed.
 *
 * @returns {object} The Playwright module.
 * @throws {Error} With installation guidance when the package cannot be
 *   loaded. The original failure is attached as `error.cause` so its stack
 *   and error code stay available to callers.
 */
function loadPlaywright() {
  try {
    return require("playwright");
  } catch (error) {
    // Tolerate non-Error throwables when building the message.
    const detail = error instanceof Error ? error.message : String(error);
    throw new Error(
      `Playwright is required for render mode. Run "npm install" and "npm run install-browsers", or use --static for HTML input files. Original error: ${detail}`,
      { cause: error }
    );
  }
}
|
||
|
|
|
||
|
|
/**
 * Tells whether a request URL targets a known tracker host.
 *
 * A URL matches when its hostname equals an entry of TRACKER_HOST_PATTERNS or
 * is a subdomain of one. Values that cannot be parsed as URLs never match.
 *
 * @param {string} rawUrl - Request URL.
 * @returns {boolean} `true` when the request should be treated as a tracker.
 */
function isTrackerUrl(rawUrl) {
  let hostname;
  try {
    ({ hostname } = new URL(rawUrl));
  } catch {
    return false;
  }

  for (const pattern of TRACKER_HOST_PATTERNS) {
    if (hostname === pattern || hostname.endsWith(`.${pattern}`)) {
      return true;
    }
  }
  return false;
}
|
||
|
|
|
||
|
|
/**
 * Inserts an HTML comment recording the source URL and creation time.
 *
 * The comment goes directly after the document's doctype. Only a doctype at
 * the start of the document (optionally preceded by whitespace or comments)
 * counts: a `<!doctype` that appears later, for example inside an iframe
 * `srcdoc` attribute, is not the document's doctype, and in that case a
 * doctype is prepended together with the comment.
 *
 * A comment terminator (`-->` or `--!>`) inside `sourceUrl` has its `>`
 * percent-encoded so the URL cannot end the comment early.
 *
 * @param {string} html - Document HTML.
 * @param {string} sourceUrl - URL the archive was created from.
 * @param {object} [options] - Accepted for call-site compatibility; not read.
 * @returns {string} The HTML with the archive comment inserted.
 */
function addArchiveComment(html, sourceUrl, options) {
  const safeSource = String(sourceUrl).replace(/--(!?)>/g, "--$1%3E");
  const comment = `<!-- Archived locally. Source: ${safeSource}. Created: ${new Date().toISOString()}. Paywall bypass filters were not executed. -->`;

  // The comment body is matched with a tempered pattern so each comment has
  // exactly one possible end, which keeps matching linear.
  const leadingDoctype = /^(?:\s|<!--(?:(?!-->)[\s\S])*-->)*<!doctype[^>]*>/i;
  if (leadingDoctype.test(html)) {
    return html.replace(leadingDoctype, (prefix) => `${prefix}\n${comment}`);
  }
  return `<!doctype html>\n${comment}\n${html}`;
}
|
||
|
|
|
||
|
|
/**
 * Lists asset references in archived HTML that still point outside the file.
 *
 * Three sources are scanned:
 *  - quoted `src`, `srcset`, `poster` and `data` attributes,
 *  - `href` of `<link>` tags whose `rel` is stylesheet, icon,
 *    apple-touch-icon, image_src or preload,
 *  - CSS `url(...)` values anywhere in the text.
 * References accepted by isSelfContainedAssetRef() are ignored.
 *
 * Attribute names must match as whole names: `data-original-src` and
 * `data-archived-src` are bookkeeping attributes, not loadable references,
 * and are not reported. Only `srcset` is split into candidates; other
 * attribute values are single URLs and are kept intact even when they
 * contain commas or spaces.
 *
 * @param {string} html - Archived document HTML.
 * @returns {string[]} Unique references, sorted.
 */
export function findExternalAssetRefs(html) {
  const refs = new Set();
  const addRef = (candidate) => {
    const ref = candidate.trim();
    if (ref && !isSelfContainedAssetRef(ref)) {
      refs.add(ref);
    }
  };

  // The lookbehind rejects matches that are only the tail of a longer
  // attribute name (e.g. the "src" in "data-original-src").
  const attrPattern = /(?<![\w:.-])(src|srcset|poster|data)\s*=\s*(["'])([\s\S]*?)\2/gi;
  for (const [, name, , value] of html.matchAll(attrPattern)) {
    if (name.toLowerCase() === "srcset") {
      srcsetCandidateUrls(value).forEach(addRef);
    } else {
      addRef(value);
    }
  }

  const linkPattern = /<link\b[^>]*>/gi;
  const assetRelPattern = /\b(?:stylesheet|icon|apple-touch-icon|image_src|preload)\b/i;
  for (const [tag] of html.matchAll(linkPattern)) {
    if (assetRelPattern.test(readAttribute(tag, "rel") || "")) {
      addRef(readAttribute(tag, "href") || "");
    }
  }

  const cssUrlPattern = /url\(\s*(["']?)([^"')]+)\1\s*\)/gi;
  for (const match of html.matchAll(cssUrlPattern)) {
    addRef(match[2]);
  }

  return Array.from(refs).sort();
}

/**
 * Extracts the URLs from a `srcset` value.
 *
 * Each candidate is a run of non-whitespace characters (the URL) optionally
 * followed by descriptors up to the next comma. A URL may itself contain
 * commas (data URIs do); only trailing commas are separators.
 *
 * @param {string} value - Raw `srcset` attribute value.
 * @returns {string[]} Candidate URLs in document order.
 */
function srcsetCandidateUrls(value) {
  const urls = [];
  let index = 0;
  while (index < value.length) {
    // Skip whitespace and stray commas in front of the next candidate.
    while (index < value.length && /[\s,]/.test(value[index])) {
      index += 1;
    }
    const start = index;
    while (index < value.length && !/\s/.test(value[index])) {
      index += 1;
    }
    if (index === start) {
      break;
    }

    const token = value.slice(start, index);
    const url = token.replace(/,+$/, "");
    if (url === token) {
      // No trailing comma: descriptors may follow, so skip to the comma that ends this candidate.
      while (index < value.length && value[index] !== ",") {
        index += 1;
      }
    }
    urls.push(url);
  }
  return urls;
}
|
||
|
|
|
||
|
|
/**
 * Tells whether a reference needs no external fetch: it is empty, a
 * same-document fragment (`#…`), or uses one of the `data:`, `about:`,
 * `javascript:`, `mailto:` or `tel:` schemes.
 *
 * @param {string} value - Raw reference; surrounding whitespace is ignored.
 * @returns {boolean} `true` when the reference is self-contained.
 */
function isSelfContainedAssetRef(value) {
  const ref = value.trim();
  if (ref === "") {
    return true;
  }
  if (ref.startsWith("#")) {
    return true;
  }
  return /^(?:data|about|javascript|mailto|tel):/i.test(ref);
}
|
||
|
|
|
||
|
|
/**
 * Reads one attribute value from the text of a single HTML tag.
 *
 * The tag's attributes are tokenised in order (name, then an optional
 * double-quoted, single-quoted or unquoted value) and the first attribute
 * whose whole name equals `attr`, case-insensitively, wins. Matching whole
 * names means `data-href` is never read as `href`, and text inside another
 * attribute's quoted value is never mistaken for an attribute.
 *
 * @param {string} tag - Tag source, e.g. `<link rel="icon" href="/f.ico">`.
 * @param {string} attr - Attribute name to look up.
 * @returns {string} The attribute value, or "" when absent or valueless.
 */
function readAttribute(tag, attr) {
  const wanted = String(attr).toLowerCase();
  // Drop the leading "<tagname" so the tag name is not tokenised as an attribute.
  const attributeText = tag.replace(/^<\/?[^\s/>]+/, "");
  const attributePattern = /([^\s"'<>/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+)))?/g;

  for (const [, name, doubleQuoted, singleQuoted, unquoted] of attributeText.matchAll(attributePattern)) {
    if (name.toLowerCase() === wanted) {
      return doubleQuoted ?? singleQuoted ?? unquoted ?? "";
    }
  }
  return "";
}
|