simplify
This commit is contained in:
34
README.md
34
README.md
@@ -1,8 +1,6 @@
|
|||||||
# Local Page Archiver
|
# Local Page Archiver
|
||||||
|
|
||||||
This project saves self-contained HTML archives for pages the operator is authorized to access. It sends a real browser user agent, renders web URLs with Playwright, strips ad/tracker-like elements, normalizes the captured DOM, and inlines page requisites as `data:` URLs.
|
This project saves self-contained HTML archives. It opens the input with Playwright, captures the rendered HTML, and inlines external resources as `data:` URLs.
|
||||||
|
|
||||||
It intentionally does not execute paywall-bypass rules. The bundled `bypass-paywalls-clean-filters` files are treated as reference material only; paywall selectors and scripts are not applied.
|
|
||||||
|
|
||||||
## CLI
|
## CLI
|
||||||
|
|
||||||
@@ -15,35 +13,7 @@ node src/cli.mjs archive "https://example.com/article"
|
|||||||
For an existing HTML file:
|
For an existing HTML file:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
node src/cli.mjs archive ./page.html --static
|
node src/cli.mjs archive ./page.html
|
||||||
```
|
```
|
||||||
|
|
||||||
For an `archive.ph` HTML export where you want the captured page without the archive shell:
|
|
||||||
|
|
||||||
```sh
|
|
||||||
node src/cli.mjs archive ./bloomberg-archive.html --static --strip-archive-shell
|
|
||||||
```
|
|
||||||
|
|
||||||
Local `archive.ph` HTML inputs with `--strip-archive-shell` use the static extractor by default because those files already contain the rendered page. Add `--render` only when you explicitly want Chromium to load the local HTML first.
|
|
||||||
|
|
||||||
Computed-style freezing is off by default for live web pages because it can inflate modern article pages into very large HTML files. Add `--freeze-styles` only when stylesheet inlining is not enough to preserve layout.
|
|
||||||
|
|
||||||
Archives are written to `ARCHIVE_PATH`, or to a development directory under the system temp directory when `ARCHIVE_PATH` is not set.
|
Archives are written to `ARCHIVE_PATH`, or to a development directory under the system temp directory when `ARCHIVE_PATH` is not set.
|
||||||
|
|
||||||
## API
|
|
||||||
|
|
||||||
```sh
|
|
||||||
ARCHIVE_PATH=/tmp/local-page-archives npm run serve
|
|
||||||
```
|
|
||||||
|
|
||||||
Archive a page:
|
|
||||||
|
|
||||||
```sh
|
|
||||||
curl -X POST http://127.0.0.1:8787/archive \
|
|
||||||
-H 'content-type: application/json' \
|
|
||||||
-d '{"url":"https://example.com/article"}'
|
|
||||||
```
|
|
||||||
|
|
||||||
The response includes the archived file path and a local `viewUrl`.
|
|
||||||
|
|
||||||
Set `PORT` to choose a port other than the default `8787`.
|
|
||||||
|
|||||||
@@ -3,13 +3,12 @@
|
|||||||
"version": "0.1.0",
|
"version": "0.1.0",
|
||||||
"private": true,
|
"private": true,
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"description": "Render and save self-contained HTML archives for pages the operator is authorized to access.",
|
"description": "Render and save self-contained HTML archives.",
|
||||||
"bin": {
|
"bin": {
|
||||||
"archive-page": "./src/cli.mjs"
|
"archive-page": "./src/cli.mjs"
|
||||||
},
|
},
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"archive": "node src/cli.mjs archive",
|
"archive": "node src/cli.mjs archive",
|
||||||
"serve": "node src/server.mjs",
|
|
||||||
"install-browsers": "playwright install chromium"
|
"install-browsers": "playwright install chromium"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
|
|||||||
544
src/archiver.mjs
544
src/archiver.mjs
@@ -6,115 +6,38 @@ import {
|
|||||||
DEFAULT_USER_AGENT,
|
DEFAULT_USER_AGENT,
|
||||||
defaultArchivePath,
|
defaultArchivePath,
|
||||||
findEffectiveBase,
|
findEffectiveBase,
|
||||||
htmlEscape,
|
|
||||||
inputToUrl,
|
inputToUrl,
|
||||||
isFileUrl,
|
|
||||||
isHttpUrl,
|
isHttpUrl,
|
||||||
slugForUrl,
|
slugForUrl
|
||||||
stripArchiveShell
|
|
||||||
} from "./asset-inliner.mjs";
|
} from "./asset-inliner.mjs";
|
||||||
|
|
||||||
const require = createRequire(import.meta.url);
|
const require = createRequire(import.meta.url);
|
||||||
|
const PAGE_TIMEOUT_MS = 60000;
|
||||||
|
const NETWORK_IDLE_TIMEOUT_MS = 5000;
|
||||||
|
const VIEWPORT = {
|
||||||
|
width: 1024,
|
||||||
|
height: 768
|
||||||
|
};
|
||||||
|
|
||||||
export { DEFAULT_USER_AGENT, defaultArchivePath };
|
export { DEFAULT_USER_AGENT, defaultArchivePath };
|
||||||
|
|
||||||
const AD_SELECTORS = [
|
|
||||||
"[data-ad-status]",
|
|
||||||
"[data-ad-type]",
|
|
||||||
"[aria-label*='advertisement' i]",
|
|
||||||
"[id^='leaderboard']",
|
|
||||||
"[class*='LeaderboardAd_']",
|
|
||||||
"[class*='FullWidthAd_']",
|
|
||||||
"[class*='BaseAd_']",
|
|
||||||
".adWrapper",
|
|
||||||
".dvz-v0-ad",
|
|
||||||
"amp-ad",
|
|
||||||
"iframe[src*='doubleclick']",
|
|
||||||
"iframe[src*='googletagmanager']",
|
|
||||||
"iframe[src*='googlesyndication']"
|
|
||||||
];
|
|
||||||
|
|
||||||
const TRACKER_HOST_PATTERNS = [
|
|
||||||
"doubleclick.net",
|
|
||||||
"googletagmanager.com",
|
|
||||||
"googlesyndication.com",
|
|
||||||
"google-analytics.com",
|
|
||||||
"amazon-adsystem.com",
|
|
||||||
"pub.doubleverify.com",
|
|
||||||
"securepubads.g.doubleclick.net",
|
|
||||||
"s10.histats.com",
|
|
||||||
"sstatic1.histats.com"
|
|
||||||
];
|
|
||||||
|
|
||||||
const ANTI_ADBLOCK_PROVIDER_HOST_PATTERNS = [
|
|
||||||
"getadmiral.com"
|
|
||||||
];
|
|
||||||
|
|
||||||
const BLOCKED_HOST_PATTERNS = [
|
|
||||||
...TRACKER_HOST_PATTERNS,
|
|
||||||
...ANTI_ADBLOCK_PROVIDER_HOST_PATTERNS
|
|
||||||
];
|
|
||||||
|
|
||||||
const ANTI_ADBLOCK_TEXT_PATTERNS = [
|
|
||||||
"\\bad\\s*block(?:er|ing)?\\b",
|
|
||||||
"\\bad[-\\s]?block\\b",
|
|
||||||
"\\badblock(?:er|ing)?\\b",
|
|
||||||
"\\badvertis(?:e|ing)\\s+block(?:er|ing)?\\b",
|
|
||||||
"\\bblock(?:ing)?\\s+(?:ads|advertis(?:ements?|ing))\\b",
|
|
||||||
"\\b(?:disable|turn\\s+off|pause)\\s+(?:your\\s+)?ad[-\\s]?block(?:er|ing)?\\b",
|
|
||||||
"\\b(?:allowlist|whitelist)\\s+(?:our\\s+|this\\s+)?(?:site|website|domain)\\b",
|
|
||||||
"\\ballow(?:ing)?\\s+(?:our\\s+)?(?:ads|advertis(?:ements?|ing))\\b",
|
|
||||||
"\\bads?\\s+(?:are\\s+)?blocked\\b"
|
|
||||||
];
|
|
||||||
|
|
||||||
const BLOCKED_CAPTURE_PATTERNS = [
|
|
||||||
{
|
|
||||||
reason: "DataDome CAPTCHA/bot challenge",
|
|
||||||
any: [
|
|
||||||
/DataDome CAPTCHA/i,
|
|
||||||
/captcha-delivery\.com/i
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
reason: "blocked/CAPTCHA challenge",
|
|
||||||
any: [
|
|
||||||
/<title[^>]*>\s*You have been blocked\s*<\/title>/i,
|
|
||||||
/<title[^>]*>\s*Access Denied\s*<\/title>/i,
|
|
||||||
/\bunusual traffic\b/i
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
reason: "human verification challenge",
|
|
||||||
all: [
|
|
||||||
/\bverify you are (?:a )?human\b/i,
|
|
||||||
/\b(?:captcha|challenge|g-recaptcha|hcaptcha|turnstile)\b/i
|
|
||||||
]
|
|
||||||
}
|
|
||||||
];
|
|
||||||
|
|
||||||
export async function archivePage(input, options = {}) {
|
export async function archivePage(input, options = {}) {
|
||||||
const sourceUrl = inputToUrl(input);
|
const sourceUrl = inputToUrl(input);
|
||||||
const archivePath = options.archivePath || defaultArchivePath();
|
const archivePath = options.archivePath || defaultArchivePath();
|
||||||
const id = options.id || slugForUrl(sourceUrl);
|
const id = options.id || slugForUrl(sourceUrl);
|
||||||
const filePath = path.join(archivePath, `${id}.html`);
|
const filePath = path.join(archivePath, `${id}.html`);
|
||||||
|
|
||||||
await fs.mkdir(archivePath, { recursive: true });
|
await fs.mkdir(archivePath, { recursive: true });
|
||||||
|
|
||||||
const rawHtml = await readInputHtml(sourceUrl, options);
|
const renderedHtml = await renderPage(sourceUrl);
|
||||||
const baseUrl = rawHtml ? findEffectiveBase(rawHtml, sourceUrl) : sourceUrl;
|
const baseUrl = findEffectiveBase(renderedHtml, sourceUrl);
|
||||||
const useStatic = options.static || (isFileUrl(sourceUrl) && options.stripArchiveShell && options.render !== true);
|
|
||||||
const renderedHtml = useStatic
|
|
||||||
? prepareStaticHtml(rawHtml, options)
|
|
||||||
: await renderPage(sourceUrl, { ...options, rawHtml, baseUrl });
|
|
||||||
assertNotBlockedCapture(renderedHtml, sourceUrl);
|
|
||||||
|
|
||||||
const inliner = new AssetInliner({
|
const inliner = new AssetInliner({
|
||||||
userAgent: options.userAgent || DEFAULT_USER_AGENT,
|
userAgent: DEFAULT_USER_AGENT,
|
||||||
referer: isHttpUrl(sourceUrl) ? sourceUrl : undefined,
|
referer: isHttpUrl(sourceUrl) ? sourceUrl : undefined
|
||||||
maxAssetBytes: options.maxAssetBytes,
|
|
||||||
maxInlineStyleBytes: options.maxInlineStyleBytes
|
|
||||||
});
|
});
|
||||||
|
|
||||||
const inlined = await inliner.inlineHtml(renderedHtml, baseUrl);
|
const inlined = await inliner.inlineHtml(renderedHtml, baseUrl);
|
||||||
const finalHtml = addArchiveComment(inlined, sourceUrl, options);
|
const finalHtml = addArchiveComment(inlined, sourceUrl);
|
||||||
await fs.writeFile(filePath, finalHtml, "utf8");
|
await fs.writeFile(filePath, finalHtml, "utf8");
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@@ -127,409 +50,66 @@ export async function archivePage(input, options = {}) {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function readInputHtml(sourceUrl, options = {}) {
|
export async function renderPage(sourceUrl) {
|
||||||
if (isFileUrl(sourceUrl)) {
|
|
||||||
return fs.readFile(new URL(sourceUrl), "utf8");
|
|
||||||
}
|
|
||||||
if (!isHttpUrl(sourceUrl) || !options.static) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
const response = await fetch(sourceUrl, {
|
|
||||||
headers: {
|
|
||||||
"user-agent": options.userAgent || DEFAULT_USER_AGENT,
|
|
||||||
accept: "text/html,application/xhtml+xml"
|
|
||||||
},
|
|
||||||
redirect: "follow"
|
|
||||||
});
|
|
||||||
if (!response.ok) {
|
|
||||||
throw new Error(`Failed to fetch ${sourceUrl}: HTTP ${response.status}`);
|
|
||||||
}
|
|
||||||
return response.text();
|
|
||||||
}
|
|
||||||
|
|
||||||
function prepareStaticHtml(rawHtml, options = {}) {
|
|
||||||
if (!rawHtml) {
|
|
||||||
throw new Error("Static mode requires an HTML input file or fetched HTML document.");
|
|
||||||
}
|
|
||||||
return options.stripArchiveShell ? stripArchiveShell(rawHtml) : rawHtml;
|
|
||||||
}
|
|
||||||
|
|
||||||
export async function renderPage(sourceUrl, options = {}) {
|
|
||||||
const playwright = loadPlaywright();
|
const playwright = loadPlaywright();
|
||||||
const browser = await playwright.chromium.launch({
|
const browser = await playwright.chromium.launch({ headless: true });
|
||||||
headless: true
|
|
||||||
});
|
|
||||||
try {
|
try {
|
||||||
const context = await browser.newContext({
|
const context = await browser.newContext({
|
||||||
javaScriptEnabled: options.javaScriptEnabled ?? !(options.rawHtml && isFileUrl(sourceUrl)),
|
userAgent: DEFAULT_USER_AGENT,
|
||||||
userAgent: options.userAgent || DEFAULT_USER_AGENT,
|
viewport: VIEWPORT
|
||||||
viewport: {
|
|
||||||
width: options.viewportWidth || 1024,
|
|
||||||
height: options.viewportHeight || 768
|
|
||||||
}
|
|
||||||
});
|
});
|
||||||
const page = await context.newPage();
|
const page = await context.newPage();
|
||||||
|
|
||||||
if (options.stripAds !== false) {
|
await page.goto(sourceUrl, {
|
||||||
await page.route("**/*", (route) => {
|
waitUntil: "domcontentloaded",
|
||||||
const url = route.request().url();
|
timeout: PAGE_TIMEOUT_MS
|
||||||
if (isTrackerUrl(url)) {
|
});
|
||||||
return route.abort();
|
await waitForNetworkIdle(page);
|
||||||
}
|
await snapshotLoadedResourceUrls(page);
|
||||||
return route.continue();
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
if (options.rawHtml && isFileUrl(sourceUrl)) {
|
|
||||||
const content = prepareRenderInputHtml(options.rawHtml, options);
|
|
||||||
await page.setContent(content, { waitUntil: "domcontentloaded", timeout: options.timeoutMs || 60000 });
|
|
||||||
} else {
|
|
||||||
await page.goto(sourceUrl, { waitUntil: "domcontentloaded", timeout: options.timeoutMs || 60000 });
|
|
||||||
}
|
|
||||||
|
|
||||||
await settlePage(page, options);
|
|
||||||
await cleanupAndFreezePage(page, options);
|
|
||||||
return await page.content();
|
return await page.content();
|
||||||
} finally {
|
} finally {
|
||||||
await browser.close();
|
await browser.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function assertNotBlockedCapture(html, sourceUrl) {
|
async function waitForNetworkIdle(page) {
|
||||||
const detected = detectBlockedCapture(html);
|
|
||||||
if (!detected) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
throw new Error(
|
|
||||||
`Archive capture failed for ${sourceUrl}: ${detected}. The renderer received a challenge page instead of the requested content, so no archive was written.`
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
function detectBlockedCapture(html) {
|
|
||||||
for (const { reason, any, all } of BLOCKED_CAPTURE_PATTERNS) {
|
|
||||||
const anyMatched = !any || any.some((pattern) => pattern.test(html));
|
|
||||||
const allMatched = !all || all.every((pattern) => pattern.test(html));
|
|
||||||
if (anyMatched && allMatched) {
|
|
||||||
return reason;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
async function settlePage(page, options) {
|
|
||||||
try {
|
try {
|
||||||
await page.waitForLoadState("networkidle", { timeout: options.networkIdleTimeoutMs || 15000 });
|
await page.waitForLoadState("networkidle", {
|
||||||
} catch {
|
timeout: NETWORK_IDLE_TIMEOUT_MS
|
||||||
// Dynamic pages often keep long-lived connections open; DOM capture can still proceed.
|
|
||||||
}
|
|
||||||
|
|
||||||
if (options.scroll !== false) {
|
|
||||||
await page.evaluate(async () => {
|
|
||||||
await new Promise((resolve) => {
|
|
||||||
let total = 0;
|
|
||||||
const step = Math.max(400, Math.floor(window.innerHeight * 0.8));
|
|
||||||
const timer = setInterval(() => {
|
|
||||||
const previous = document.scrollingElement?.scrollTop || window.scrollY;
|
|
||||||
window.scrollBy(0, step);
|
|
||||||
total += step;
|
|
||||||
const current = document.scrollingElement?.scrollTop || window.scrollY;
|
|
||||||
if (current === previous || total > Math.max(document.body.scrollHeight, 20000)) {
|
|
||||||
clearInterval(timer);
|
|
||||||
window.scrollTo(0, 0);
|
|
||||||
resolve();
|
|
||||||
}
|
|
||||||
}, 120);
|
|
||||||
});
|
|
||||||
});
|
});
|
||||||
|
} catch {
|
||||||
|
// Some pages keep sockets open; the DOM snapshot is still useful.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async function cleanupAndFreezePage(page, options) {
|
async function snapshotLoadedResourceUrls(page) {
|
||||||
await page.evaluate(
|
await page.evaluate(() => {
|
||||||
({
|
document.querySelectorAll("img").forEach((img) => {
|
||||||
adSelectors,
|
if (img.currentSrc) {
|
||||||
antiAdblockProviderHostPatterns,
|
img.setAttribute("src", img.currentSrc);
|
||||||
antiAdblockTextPatterns,
|
|
||||||
freezeStyles,
|
|
||||||
maxFreezeElements,
|
|
||||||
maxSanitizeElements,
|
|
||||||
stripAds,
|
|
||||||
stripArchiveShell: shouldStripArchiveShell
|
|
||||||
}) => {
|
|
||||||
function removeAll(selector) {
|
|
||||||
document.querySelectorAll(selector).forEach((node) => node.remove());
|
|
||||||
}
|
}
|
||||||
|
});
|
||||||
|
|
||||||
if (shouldStripArchiveShell) {
|
document.querySelectorAll("video,audio").forEach((media) => {
|
||||||
const content = document.querySelector("#CONTENT .html1") || document.querySelector("#CONTENT");
|
if (media.currentSrc) {
|
||||||
if (content) {
|
media.setAttribute("src", media.currentSrc);
|
||||||
document.body.innerHTML = "";
|
}
|
||||||
document.body.appendChild(content.cloneNode(true));
|
});
|
||||||
document.documentElement.removeAttribute("prefix");
|
|
||||||
document.documentElement.removeAttribute("itemscope");
|
document.querySelectorAll("iframe").forEach((frame) => {
|
||||||
document.documentElement.removeAttribute("itemtype");
|
try {
|
||||||
|
const doc = frame.contentDocument;
|
||||||
|
if (doc?.documentElement) {
|
||||||
|
frame.setAttribute("srcdoc", "<!doctype html>" + doc.documentElement.outerHTML);
|
||||||
|
frame.removeAttribute("src");
|
||||||
}
|
}
|
||||||
|
} catch {
|
||||||
|
// Cross-origin frames are handled later by the asset inliner when possible.
|
||||||
}
|
}
|
||||||
|
});
|
||||||
removeAll("script");
|
});
|
||||||
removeAll("noscript");
|
|
||||||
removeAll("link[rel='preconnect'],link[rel='dns-prefetch'],link[rel='modulepreload']");
|
|
||||||
removeAll("meta[name='next-head-count']");
|
|
||||||
|
|
||||||
if (stripAds) {
|
|
||||||
for (const selector of adSelectors) {
|
|
||||||
try {
|
|
||||||
removeAll(selector);
|
|
||||||
} catch {
|
|
||||||
// Ignore unsupported selectors in older browser engines.
|
|
||||||
}
|
|
||||||
}
|
|
||||||
removeAntiAdblockOverlays();
|
|
||||||
}
|
|
||||||
|
|
||||||
document.querySelectorAll("img").forEach((img) => {
|
|
||||||
if (img.currentSrc) {
|
|
||||||
img.setAttribute("data-original-src", img.getAttribute("src") || "");
|
|
||||||
img.setAttribute("src", img.currentSrc);
|
|
||||||
}
|
|
||||||
img.removeAttribute("srcset");
|
|
||||||
img.removeAttribute("sizes");
|
|
||||||
img.setAttribute("loading", "lazy");
|
|
||||||
});
|
|
||||||
|
|
||||||
document.querySelectorAll("source").forEach((source) => {
|
|
||||||
source.removeAttribute("srcset");
|
|
||||||
});
|
|
||||||
|
|
||||||
document.querySelectorAll("video,audio").forEach((media) => {
|
|
||||||
if (media.currentSrc) {
|
|
||||||
media.setAttribute("src", media.currentSrc);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
document.querySelectorAll("iframe").forEach((frame) => {
|
|
||||||
const src = frame.getAttribute("src");
|
|
||||||
if (src) {
|
|
||||||
frame.setAttribute("data-archived-src", src);
|
|
||||||
}
|
|
||||||
try {
|
|
||||||
const doc = frame.contentDocument;
|
|
||||||
if (doc?.documentElement) {
|
|
||||||
frame.setAttribute("srcdoc", "<!doctype html>" + doc.documentElement.outerHTML);
|
|
||||||
frame.removeAttribute("src");
|
|
||||||
}
|
|
||||||
} catch {
|
|
||||||
// Cross-origin iframe sources are handled in the Node-side inliner when possible.
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
function removeAntiAdblockOverlays() {
|
|
||||||
const textPatterns = antiAdblockTextPatterns.map((pattern) => new RegExp(pattern, "i"));
|
|
||||||
const candidates = new Set();
|
|
||||||
const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_ELEMENT);
|
|
||||||
let node = walker.currentNode;
|
|
||||||
let visited = 0;
|
|
||||||
|
|
||||||
while (node && visited < maxSanitizeElements) {
|
|
||||||
if (node !== document.body && node !== document.documentElement && hasAntiAdblockSignal(node, textPatterns)) {
|
|
||||||
candidates.add(node);
|
|
||||||
}
|
|
||||||
visited += 1;
|
|
||||||
node = walker.nextNode();
|
|
||||||
}
|
|
||||||
|
|
||||||
let removed = 0;
|
|
||||||
candidates.forEach((node) => {
|
|
||||||
const container = findRoadblockContainer(node, textPatterns);
|
|
||||||
if (container && isLikelyAntiAdblockRoadblock(container, textPatterns)) {
|
|
||||||
container.remove();
|
|
||||||
removed += 1;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
if (removed > 0) {
|
|
||||||
for (const element of [document.documentElement, document.body]) {
|
|
||||||
element.style.removeProperty("overflow");
|
|
||||||
element.style.removeProperty("position");
|
|
||||||
element.style.removeProperty("inset");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
function findRoadblockContainer(node, textPatterns) {
|
|
||||||
let current = node;
|
|
||||||
let best = null;
|
|
||||||
while (current?.parentElement && current.parentElement !== document.body) {
|
|
||||||
if (isLikelyAntiAdblockRoadblock(current, textPatterns)) {
|
|
||||||
best = current;
|
|
||||||
}
|
|
||||||
const parentTextLength = normalizeText(current.parentElement.textContent || "").length;
|
|
||||||
if (parentTextLength > 8000) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
current = current.parentElement;
|
|
||||||
}
|
|
||||||
return best || (isLikelyAntiAdblockRoadblock(node, textPatterns) ? node : null);
|
|
||||||
}
|
|
||||||
|
|
||||||
function normalizeText(text) {
|
|
||||||
return text.replace(/\s+/g, " ").trim();
|
|
||||||
}
|
|
||||||
|
|
||||||
function hasAntiAdblockSignal(node, textPatterns) {
|
|
||||||
return hasAntiAdblockText(node, textPatterns) || hasAntiAdblockProviderUrl(node) || hasAntiAdblockProviderDescendant(node);
|
|
||||||
}
|
|
||||||
|
|
||||||
function isLikelyAntiAdblockRoadblock(node, textPatterns) {
|
|
||||||
if (!node || node === document.body || node === document.documentElement) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
const hasSignal =
|
|
||||||
hasAntiAdblockText(node, textPatterns) ||
|
|
||||||
hasAntiAdblockProviderUrl(node) ||
|
|
||||||
hasAntiAdblockProviderDescendant(node);
|
|
||||||
if (!hasSignal) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
const looksBlocking =
|
|
||||||
isOverlayLike(node) ||
|
|
||||||
hasDialogSemantics(node) ||
|
|
||||||
hasBlockingClassName(node) ||
|
|
||||||
hasActionControl(node) ||
|
|
||||||
hasAntiAdblockProviderDescendant(node);
|
|
||||||
if (!looksBlocking) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
const embeddedInContent = node.closest("article, main");
|
|
||||||
return !embeddedInContent || isOverlayLike(node) || hasDialogSemantics(node) || hasBlockingClassName(node);
|
|
||||||
}
|
|
||||||
|
|
||||||
function hasAntiAdblockText(node, textPatterns) {
|
|
||||||
const text = normalizeText(node.textContent || "");
|
|
||||||
return text.length > 0 && text.length <= 2000 && textPatterns.some((pattern) => pattern.test(text));
|
|
||||||
}
|
|
||||||
|
|
||||||
function hasAntiAdblockProviderDescendant(node) {
|
|
||||||
return Array.from(node.querySelectorAll?.("a[href], iframe[src], script[src]") || []).some((descendant) =>
|
|
||||||
hasAntiAdblockProviderUrl(descendant)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
function hasAntiAdblockProviderUrl(node) {
|
|
||||||
for (const attr of ["href", "src", "data-src"]) {
|
|
||||||
const value = node.getAttribute?.(attr) || node[attr] || "";
|
|
||||||
if (value && antiAdblockProviderHostPatterns.some((host) => decodedText(value).includes(host))) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
function isOverlayLike(node) {
|
|
||||||
const style = window.getComputedStyle(node);
|
|
||||||
const rect = node.getBoundingClientRect();
|
|
||||||
const viewportArea = Math.max(1, window.innerWidth * window.innerHeight);
|
|
||||||
const area = Math.max(0, rect.width) * Math.max(0, rect.height);
|
|
||||||
const zIndex = Number.parseInt(style.zIndex, 10);
|
|
||||||
const hasHighZIndex = Number.isFinite(zIndex) && zIndex >= 10;
|
|
||||||
const positionIsBlocking = style.position === "fixed" || style.position === "sticky";
|
|
||||||
const coversMeaningfulArea = area / viewportArea >= 0.15;
|
|
||||||
const coversMostViewport = area / viewportArea >= 0.45;
|
|
||||||
return (
|
|
||||||
(positionIsBlocking && (hasHighZIndex || coversMeaningfulArea)) ||
|
|
||||||
(hasHighZIndex && coversMostViewport)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
function hasDialogSemantics(node) {
|
|
||||||
const role = node.getAttribute?.("role");
|
|
||||||
return node.tagName === "DIALOG" || role === "dialog" || role === "alertdialog" || node.getAttribute?.("aria-modal") === "true";
|
|
||||||
}
|
|
||||||
|
|
||||||
function hasBlockingClassName(node) {
|
|
||||||
return /(?:ad[-_ ]?block|adblock|allow[-_ ]?ads|overlay|modal|interstitial|backdrop|roadblock)/i.test(
|
|
||||||
`${node.id || ""} ${node.className || ""}`
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
function hasActionControl(node) {
|
|
||||||
return Boolean(node.querySelector?.('button, [role="button"], a[href], input[type="button"], input[type="submit"]'));
|
|
||||||
}
|
|
||||||
|
|
||||||
function decodedText(value) {
|
|
||||||
try {
|
|
||||||
return decodeURIComponent(value).toLowerCase();
|
|
||||||
} catch {
|
|
||||||
return value.toLowerCase();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const walkedElements = [];
|
|
||||||
const walker = document.createTreeWalker(document.documentElement, NodeFilter.SHOW_ELEMENT);
|
|
||||||
let element = document.documentElement;
|
|
||||||
let visited = 0;
|
|
||||||
while (element && visited < maxSanitizeElements) {
|
|
||||||
walkedElements.push(element);
|
|
||||||
for (const attr of Array.from(element.attributes)) {
|
|
||||||
if (/^on/i.test(attr.name) || attr.name === "integrity" || attr.name === "nonce") {
|
|
||||||
element.removeAttribute(attr.name);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
visited += 1;
|
|
||||||
element = walker.nextNode();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!freezeStyles || element || walkedElements.length > maxFreezeElements) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (const element of walkedElements) {
|
|
||||||
if (element.tagName === "SCRIPT" || element.tagName === "STYLE") {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
const computed = window.getComputedStyle(element);
|
|
||||||
const declarations = [];
|
|
||||||
for (let i = 0; i < computed.length; i += 1) {
|
|
||||||
const property = computed[i];
|
|
||||||
const value = computed.getPropertyValue(property);
|
|
||||||
if (value) {
|
|
||||||
declarations.push(`${property}:${value}`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (declarations.length) {
|
|
||||||
element.setAttribute("style", declarations.join(";"));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
adSelectors: AD_SELECTORS,
|
|
||||||
antiAdblockProviderHostPatterns: ANTI_ADBLOCK_PROVIDER_HOST_PATTERNS,
|
|
||||||
antiAdblockTextPatterns: ANTI_ADBLOCK_TEXT_PATTERNS,
|
|
||||||
freezeStyles: options.freezeStyles === true,
|
|
||||||
maxFreezeElements: options.maxFreezeElements || 2500,
|
|
||||||
maxSanitizeElements: options.maxSanitizeElements || 5000,
|
|
||||||
stripAds: options.stripAds !== false,
|
|
||||||
stripArchiveShell: Boolean(options.stripArchiveShell)
|
|
||||||
}
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
function prepareRenderInputHtml(rawHtml, options) {
|
|
||||||
let html = options.stripArchiveShell ? stripArchiveShell(rawHtml) : rawHtml;
|
|
||||||
html = html
|
|
||||||
.replace(/<script\b[\s\S]*?<\/script>/gi, "")
|
|
||||||
.replace(/<noscript\b[\s\S]*?<\/noscript>/gi, "");
|
|
||||||
if (!options.baseUrl) {
|
|
||||||
return html;
|
|
||||||
}
|
|
||||||
if (/<base\b/i.test(html)) {
|
|
||||||
return html;
|
|
||||||
}
|
|
||||||
const baseTag = `<base href="${htmlEscape(options.baseUrl)}">`;
|
|
||||||
if (/<head\b[^>]*>/i.test(html)) {
|
|
||||||
return html.replace(/<head\b[^>]*>/i, (match) => `${match}${baseTag}`);
|
|
||||||
}
|
|
||||||
return `${baseTag}${html}`;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function loadPlaywright() {
|
function loadPlaywright() {
|
||||||
@@ -537,23 +117,14 @@ function loadPlaywright() {
|
|||||||
return require("playwright");
|
return require("playwright");
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
throw new Error(
|
throw new Error(
|
||||||
`Playwright is required for render mode. Run "npm install" and "npm run install-browsers", or use --static for HTML input files. Original error: ${error.message}`
|
`Playwright is required. Run "npm install" and "npm run install-browsers". Original error: ${error.message}`
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function isTrackerUrl(rawUrl) {
|
function addArchiveComment(html, sourceUrl) {
|
||||||
let host = "";
|
const safeSource = String(sourceUrl).replaceAll("--", "- -");
|
||||||
try {
|
const comment = `<!-- Archived locally. Source: ${safeSource}. Created: ${new Date().toISOString()}. -->`;
|
||||||
host = new URL(rawUrl).hostname;
|
|
||||||
} catch {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return BLOCKED_HOST_PATTERNS.some((pattern) => host === pattern || host.endsWith(`.${pattern}`));
|
|
||||||
}
|
|
||||||
|
|
||||||
function addArchiveComment(html, sourceUrl, options) {
|
|
||||||
const comment = `<!-- Archived locally. Source: ${sourceUrl}. Created: ${new Date().toISOString()}. Paywall bypass filters were not executed. -->`;
|
|
||||||
if (/<!doctype/i.test(html)) {
|
if (/<!doctype/i.test(html)) {
|
||||||
return html.replace(/<!doctype[^>]*>/i, (doctype) => `${doctype}\n${comment}`);
|
return html.replace(/<!doctype[^>]*>/i, (doctype) => `${doctype}\n${comment}`);
|
||||||
}
|
}
|
||||||
@@ -574,6 +145,7 @@ export function findExternalAssetRefs(html) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const linkPattern = /<link\b[^>]*>/gi;
|
const linkPattern = /<link\b[^>]*>/gi;
|
||||||
for (const match of html.matchAll(linkPattern)) {
|
for (const match of html.matchAll(linkPattern)) {
|
||||||
const tag = match[0];
|
const tag = match[0];
|
||||||
@@ -586,6 +158,7 @@ export function findExternalAssetRefs(html) {
|
|||||||
refs.add(href);
|
refs.add(href);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const cssUrlPattern = /url\(\s*(["']?)([^"')]+)\1\s*\)/gi;
|
const cssUrlPattern = /url\(\s*(["']?)([^"')]+)\1\s*\)/gi;
|
||||||
for (const match of html.matchAll(cssUrlPattern)) {
|
for (const match of html.matchAll(cssUrlPattern)) {
|
||||||
const candidate = cleanCssUrl(match[2]);
|
const candidate = cleanCssUrl(match[2]);
|
||||||
@@ -593,6 +166,7 @@ export function findExternalAssetRefs(html) {
|
|||||||
refs.add(candidate);
|
refs.add(candidate);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return Array.from(refs).sort();
|
return Array.from(refs).sort();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -125,47 +125,6 @@ export function htmlDecode(value) {
|
|||||||
.replaceAll("&gt;", ">");
|
.replaceAll("&gt;", ">");
|
||||||
}
|
}
|
||||||
|
|
||||||
export function stripArchiveShell(html) {
|
|
||||||
if (!html.includes('id="CONTENT"') && !html.includes("id='CONTENT'")) {
|
|
||||||
return html;
|
|
||||||
}
|
|
||||||
const contentStart = html.search(/<div\b[^>]*\bid=(["'])CONTENT\1[^>]*>/i);
|
|
||||||
const marker = html.search(
|
|
||||||
/<!--\[if !IE\]><!--><div\b[^>]*>\s*<table\b[^>]*\bid=(["'])hashtags\1/i
|
|
||||||
);
|
|
||||||
if (contentStart === -1 || marker === -1 || marker <= contentStart) {
|
|
||||||
return html;
|
|
||||||
}
|
|
||||||
const title = html.match(/<title\b[^>]*>[\s\S]*?<\/title>/i)?.[0] || "<title>Archived page</title>";
|
|
||||||
const fontStyle = html.match(/<style\b[^>]*type=(["'])text\/css\1[^>]*>[\s\S]*?<\/style>/i)?.[0] || "";
|
|
||||||
const capturedStart = html.slice(contentStart, marker).search(/<div\b[^>]*\bclass=(["'])html1\1[^>]*>/i);
|
|
||||||
const fragmentStart = capturedStart === -1 ? contentStart : contentStart + capturedStart;
|
|
||||||
const fragmentEnd = findMatchingDivEnd(html, fragmentStart) || marker;
|
|
||||||
const content = html.slice(fragmentStart, Math.min(fragmentEnd, marker));
|
|
||||||
return `<!doctype html><html><head><meta charset="utf-8">${title}${fontStyle}</head><body style="margin:0;background:#fff">${content}</body></html>`;
|
|
||||||
}
|
|
||||||
|
|
||||||
function findMatchingDivEnd(html, startIndex) {
|
|
||||||
const tags = /<\/?div\b[^>]*>/gi;
|
|
||||||
tags.lastIndex = startIndex;
|
|
||||||
let depth = 0;
|
|
||||||
for (const match of html.matchAll(tags)) {
|
|
||||||
const tag = match[0];
|
|
||||||
if (match.index < startIndex) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (/^<div\b/i.test(tag) && !/\/>$/.test(tag)) {
|
|
||||||
depth += 1;
|
|
||||||
} else if (/^<\/div/i.test(tag)) {
|
|
||||||
depth -= 1;
|
|
||||||
if (depth === 0) {
|
|
||||||
return match.index + tag.length;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
export async function replaceAsync(input, regex, replacer) {
|
export async function replaceAsync(input, regex, replacer) {
|
||||||
const parts = [];
|
const parts = [];
|
||||||
let lastIndex = 0;
|
let lastIndex = 0;
|
||||||
@@ -238,9 +197,6 @@ export class AssetInliner {
|
|||||||
async (match) => this.rewriteMediaAttributes(match[0], effectiveBase)
|
async (match) => this.rewriteMediaAttributes(match[0], effectiveBase)
|
||||||
);
|
);
|
||||||
|
|
||||||
output = removeExternalBookkeepingUrls(output);
|
|
||||||
output = restoreArchiveProxyLinks(output);
|
|
||||||
|
|
||||||
output = await replaceAsync(output, /srcset=(["'])([\s\S]*?)\1/gi, async (match) => {
|
output = await replaceAsync(output, /srcset=(["'])([\s\S]*?)\1/gi, async (match) => {
|
||||||
const rewritten = await this.inlineSrcset(match[2], effectiveBase);
|
const rewritten = await this.inlineSrcset(match[2], effectiveBase);
|
||||||
return `srcset=${match[1]}${htmlEscape(rewritten)}${match[1]}`;
|
return `srcset=${match[1]}${htmlEscape(rewritten)}${match[1]}`;
|
||||||
@@ -267,7 +223,7 @@ export class AssetInliner {
|
|||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
const inlinedCss = await this.inlineCss(css, absolute);
|
const inlinedCss = await this.inlineCss(css, absolute);
|
||||||
return `<style data-archived-href="${htmlEscape(absolute)}">${inlinedCss}</style>`;
|
return `<style>${inlinedCss}</style>`;
|
||||||
}
|
}
|
||||||
|
|
||||||
const isInlineableLink =
|
const isInlineableLink =
|
||||||
@@ -320,7 +276,6 @@ export class AssetInliner {
|
|||||||
const inlined = await this.inlineHtml(text, absolute, { depth: depth + 1 });
|
const inlined = await this.inlineHtml(text, absolute, { depth: depth + 1 });
|
||||||
let rewritten = removeAttribute(tag, "src");
|
let rewritten = removeAttribute(tag, "src");
|
||||||
rewritten = setAttribute(rewritten, "srcdoc", inlined);
|
rewritten = setAttribute(rewritten, "srcdoc", inlined);
|
||||||
rewritten = setAttribute(rewritten, "data-archived-src", absolute);
|
|
||||||
return rewritten;
|
return rewritten;
|
||||||
}
|
}
|
||||||
return this.rewriteMediaAttributes(tag, baseUrl);
|
return this.rewriteMediaAttributes(tag, baseUrl);
|
||||||
@@ -459,45 +414,6 @@ export class AssetInliner {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Deletes quoted `old-src`, `currentSourceUrl`, `data-original-src` and
 * `data-archived-src` attributes (matched case-insensitively, together with
 * the single whitespace character in front of them) from the markup.
 *
 * @param {string} html - Markup to clean.
 * @returns {string} The markup without those attributes.
 */
function removeExternalBookkeepingUrls(html) {
  const bookkeepingAttribute =
    /\s(?:old-src|currentSourceUrl|data-original-src|data-archived-src)=(["'])([\s\S]*?)\1/gi;
  return html.replace(bookkeepingAttribute, () => "");
}
|
|
||||||
|
|
||||||
/**
 * Rewrites quoted `href` and `action` attributes whose value is an archive
 * proxy URL so that they carry the value produced by `restoreArchiveProxyUrl`.
 *
 * Attributes whose value is not changed by `restoreArchiveProxyUrl` are left
 * exactly as written. Rewritten attributes are emitted with a single leading
 * space, the original quote character and an HTML-escaped value.
 *
 * @param {string} html - Markup to process.
 * @returns {string} Markup with restored link targets.
 */
function restoreArchiveProxyLinks(html) {
  return html.replace(/\s(href|action)=(["'])([\s\S]*?)\2/gi, (full, attr, quote, rawValue) => {
    const decoded = htmlDecode(rawValue);
    const restored = restoreArchiveProxyUrl(decoded);
    // Compare with the decoded text, not the raw attribute: restoreArchiveProxyUrl
    // hands back its argument when nothing matched, so comparing against the
    // still-encoded rawValue would re-escape every ordinary link that merely
    // contains an entity.
    if (restored === decoded) {
      return full;
    }
    return ` ${attr}=${quote}${htmlEscape(restored)}${quote}`;
  });
}
|
|
||||||
|
|
||||||
/**
 * Maps an archive-hosted URL back to the target it stands for.
 *
 * Two shapes are recognised on the trimmed input, for the archive hosts listed
 * in the pattern below:
 *   - `http(s)://<archive host>/o/<token>/<http(s) URL>` yields the embedded
 *     URL, passed through `safeDecodeUrl`;
 *   - `http(s)://<archive host>/<id>#<fragment>` yields just `#<fragment>`.
 *
 * @param {string} rawValue - Candidate URL.
 * @returns {string} The restored target, or `rawValue` itself (untrimmed) when
 *   neither shape matches.
 */
function restoreArchiveProxyUrl(rawValue) {
  const archiveHost = "archive\\.(?:ph|today|is|li|md|fo|vn|pm)";
  const proxiedPattern = new RegExp(`^https?://${archiveHost}/o/[^/]+/(https?://.+)$`, "i");
  const samePagePattern = new RegExp(`^https?://${archiveHost}/[^/#?]+(#.+)$`, "i");

  const candidate = rawValue.trim();

  const proxiedMatch = proxiedPattern.exec(candidate);
  if (proxiedMatch !== null) {
    return safeDecodeUrl(proxiedMatch[1]);
  }

  const samePageMatch = samePagePattern.exec(candidate);
  return samePageMatch === null ? rawValue : samePageMatch[1];
}
|
|
||||||
|
|
||||||
/**
 * Percent-decodes `value` with `decodeURIComponent`, returning the input
 * unchanged when decoding throws.
 *
 * @param {string} value - Text that may contain percent-escapes.
 * @returns {string} The decoded text, or `value` when decoding failed.
 */
function safeDecodeUrl(value) {
  let decoded = value;
  try {
    decoded = decodeURIComponent(value);
  } catch {
    // Decoding failed: keep the caller's text as it was.
  }
  return decoded;
}
|
|
||||||
|
|
||||||
function mimeFromUrl(rawUrl) {
|
function mimeFromUrl(rawUrl) {
|
||||||
let pathname = rawUrl;
|
let pathname = rawUrl;
|
||||||
try {
|
try {
|
||||||
|
|||||||
26
src/cli.mjs
26
src/cli.mjs
@@ -1,5 +1,5 @@
|
|||||||
#!/usr/bin/env node
|
#!/usr/bin/env node
|
||||||
import { archivePage, DEFAULT_USER_AGENT, defaultArchivePath } from "./archiver.mjs";
|
import { archivePage, defaultArchivePath } from "./archiver.mjs";
|
||||||
|
|
||||||
function parseArgs(argv) {
|
function parseArgs(argv) {
|
||||||
const args = {
|
const args = {
|
||||||
@@ -33,19 +33,7 @@ function usage() {
|
|||||||
|
|
||||||
Options:
|
Options:
|
||||||
--archive-path <dir> Output directory. Defaults to ARCHIVE_PATH or ${defaultArchivePath()}
|
--archive-path <dir> Output directory. Defaults to ARCHIVE_PATH or ${defaultArchivePath()}
|
||||||
--id <id> Output id/file stem
|
--id <id> Output id/file stem`);
|
||||||
--static Do not use a browser; transform the input HTML only
|
|
||||||
--render Force browser rendering for local archive-shell HTML
|
|
||||||
--freeze-styles Snapshot computed styles into inline style attributes
|
|
||||||
--strip-archive-shell Remove an archive.ph shell from an already archived HTML file
|
|
||||||
--no-strip-ads Keep ad-like elements
|
|
||||||
--user-agent <ua> User agent to send for page and asset requests
|
|
||||||
--max-asset-bytes <bytes> Per-asset inline limit
|
|
||||||
--max-inline-style-bytes <bytes>
|
|
||||||
Per-style-attribute inline rewrite limit
|
|
||||||
|
|
||||||
Default user agent:
|
|
||||||
${DEFAULT_USER_AGENT}`);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async function main() {
|
async function main() {
|
||||||
@@ -66,15 +54,7 @@ async function main() {
|
|||||||
|
|
||||||
const result = await archivePage(input, {
|
const result = await archivePage(input, {
|
||||||
archivePath: args["archive-path"],
|
archivePath: args["archive-path"],
|
||||||
id: args.id,
|
id: args.id
|
||||||
freezeStyles: Boolean(args["freeze-styles"]),
|
|
||||||
render: Boolean(args.render),
|
|
||||||
static: Boolean(args.static),
|
|
||||||
stripArchiveShell: Boolean(args["strip-archive-shell"]),
|
|
||||||
stripAds: args["strip-ads"] !== false,
|
|
||||||
userAgent: args["user-agent"] || DEFAULT_USER_AGENT,
|
|
||||||
maxAssetBytes: args["max-asset-bytes"] ? Number(args["max-asset-bytes"]) : undefined,
|
|
||||||
maxInlineStyleBytes: args["max-inline-style-bytes"] ? Number(args["max-inline-style-bytes"]) : undefined
|
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(`Archived: ${result.sourceUrl}`);
|
console.log(`Archived: ${result.sourceUrl}`);
|
||||||
|
|||||||
@@ -1,86 +0,0 @@
|
|||||||
import http from "node:http";
|
|
||||||
import fs from "node:fs/promises";
|
|
||||||
import path from "node:path";
|
|
||||||
import { archivePage, DEFAULT_USER_AGENT, defaultArchivePath } from "./archiver.mjs";
|
|
||||||
|
|
||||||
// Directory that archives are written to and served from.
const archivePath = process.env.ARCHIVE_PATH || defaultArchivePath();
// Listening port; the PORT environment variable overrides 8787.
const port = Number(process.env.PORT || 8787);

/**
 * JSON API over the archiver.
 *
 *   GET  /health           -> 200 `{ ok, archivePath }`
 *   POST /archive          -> 201 with the archive result; 400 when the body
 *                             is not valid JSON or lacks `url`
 *   GET  /archives/<file>  -> 200 with the stored HTML; 400 for a malformed
 *                             or disallowed name; 404 when the file is absent
 *
 * Any other request gets 404, and unexpected failures get 500 with the error
 * message.
 */
const server = http.createServer(async (req, res) => {
  try {
    const url = new URL(req.url, `http://${req.headers.host}`);
    if (req.method === "GET" && url.pathname === "/health") {
      return sendJson(res, 200, { ok: true, archivePath });
    }

    if (req.method === "POST" && url.pathname === "/archive") {
      let body;
      try {
        body = await readJson(req);
      } catch (error) {
        // A body that fails to parse is the client's mistake, not a server fault.
        if (error instanceof SyntaxError) {
          return sendJson(res, 400, { error: "Request body is not valid JSON" });
        }
        throw error;
      }
      // `?.` also covers a literal JSON `null` body, which would otherwise throw.
      if (!body?.url) {
        return sendJson(res, 400, { error: "Missing required field: url" });
      }
      // NOTE(review): body.url reaches archivePage unchanged; confirm whether
      // local file paths should be accepted from network clients.
      const result = await archivePage(body.url, {
        archivePath,
        id: body.id,
        freezeStyles: Boolean(body.freezeStyles),
        render: Boolean(body.render),
        static: Boolean(body.static),
        stripArchiveShell: Boolean(body.stripArchiveShell),
        stripAds: body.stripAds !== false,
        userAgent: body.userAgent || DEFAULT_USER_AGENT,
        maxAssetBytes: body.maxAssetBytes,
        maxInlineStyleBytes: body.maxInlineStyleBytes
      });
      return sendJson(res, 201, {
        id: result.id,
        sourceUrl: result.sourceUrl,
        file: result.filePath,
        externalAssets: result.externalAssets,
        warnings: result.warnings,
        viewUrl: `/archives/${encodeURIComponent(path.basename(result.filePath))}`
      });
    }

    if (req.method === "GET" && url.pathname.startsWith("/archives/")) {
      let file;
      try {
        file = decodeURIComponent(url.pathname.slice("/archives/".length));
      } catch {
        // Malformed percent-escape in the requested name.
        return sendJson(res, 400, { error: "Invalid archive file name" });
      }
      // The allow-list has no path separators, so the name cannot leave archivePath.
      if (!/^[a-zA-Z0-9._-]+\.html$/.test(file)) {
        return sendJson(res, 400, { error: "Invalid archive file name" });
      }
      const fullPath = path.join(archivePath, file);
      let html;
      try {
        html = await fs.readFile(fullPath);
      } catch (error) {
        if (error.code === "ENOENT") {
          return sendJson(res, 404, { error: "Not found" });
        }
        throw error;
      }
      res.writeHead(200, {
        "content-type": "text/html; charset=utf-8",
        "content-length": html.length
      });
      return res.end(html);
    }

    sendJson(res, 404, { error: "Not found" });
  } catch (error) {
    if (res.headersSent) {
      // A response is already under way; writing a second header block would throw.
      res.end();
      return;
    }
    sendJson(res, 500, { error: error.message });
  }
});

// NOTE(review): listen() is called without a host, so per the Node.js docs the
// server accepts connections on the unspecified address, although the banner
// below prints 127.0.0.1. Confirm which of the two is intended.
server.listen(port, () => {
  console.log(`Archive API listening on http://127.0.0.1:${port}`);
  console.log(`ARCHIVE_PATH=${archivePath}`);
});
|
|
||||||
|
|
||||||
/**
 * Collects a request body and parses it as JSON.
 *
 * @param {AsyncIterable<Buffer|string>} req - Source of body chunks, consumed
 *   with `for await`.
 * @param {number} [maxBytes=1048576] - Upper bound on the body size in bytes;
 *   keeps a client from making the process buffer an arbitrarily large body.
 * @returns {Promise<any>} The parsed value, or `{}` when the body is empty or
 *   contains only whitespace.
 * @throws {RangeError} When more than `maxBytes` bytes are received.
 * @throws {SyntaxError} When the body is not valid JSON.
 */
async function readJson(req, maxBytes = 1024 * 1024) {
  const chunks = [];
  let received = 0;
  for await (const chunk of req) {
    // Streams with an encoding set yield strings; Buffer.concat needs Buffers.
    const piece = typeof chunk === "string" ? Buffer.from(chunk, "utf8") : chunk;
    received += piece.length;
    if (received > maxBytes) {
      throw new RangeError(`Request body exceeds ${maxBytes} bytes`);
    }
    chunks.push(piece);
  }

  const text = Buffer.concat(chunks).toString("utf8");
  if (text.trim() === "") {
    return {};
  }
  return JSON.parse(text);
}
|
|
||||||
|
|
||||||
/**
 * Writes `value` to the response as pretty-printed (2-space) JSON.
 *
 * @param {{ writeHead: Function, end: Function }} res - Response to write to.
 * @param {number} status - HTTP status code.
 * @param {*} value - Value to serialise with `JSON.stringify`.
 * @returns {void}
 */
function sendJson(res, status, value) {
  const payload = Buffer.from(JSON.stringify(value, null, 2));
  // content-length is the encoded byte count, which is why a Buffer is measured.
  const headers = {
    "content-type": "application/json; charset=utf-8",
    "content-length": payload.length,
  };
  res.writeHead(status, headers);
  res.end(payload);
}
|
|
||||||
Reference in New Issue
Block a user