initial commit
This commit is contained in:
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
node_modules/
|
||||
|
||||
47
README.md
Normal file
47
README.md
Normal file
@@ -0,0 +1,47 @@
|
||||
# Local Page Archiver
|
||||
|
||||
This project saves self-contained HTML archives for pages the operator is authorized to access. It sends a real browser user agent, renders web URLs with Playwright, strips ad/tracker-like elements, normalizes the captured DOM, and inlines page requisites as `data:` URLs.
|
||||
|
||||
It intentionally does not execute paywall-bypass rules. The bundled `bypass-paywalls-clean-filters` files are treated as reference material only; paywall selectors and scripts are not applied.
|
||||
|
||||
## CLI
|
||||
|
||||
```sh
|
||||
npm install
|
||||
npm run install-browsers
|
||||
node src/cli.mjs archive "https://example.com/article"
|
||||
```
|
||||
|
||||
For an existing HTML file:
|
||||
|
||||
```sh
|
||||
node src/cli.mjs archive ./page.html --static
|
||||
```
|
||||
|
||||
For an `archive.ph` HTML export where you want the captured page without the archive shell:
|
||||
|
||||
```sh
|
||||
node src/cli.mjs archive ./bloomberg-archive.html --static --strip-archive-shell
|
||||
```
|
||||
|
||||
Local `archive.ph` HTML inputs with `--strip-archive-shell` use the static extractor by default because those files already contain the rendered page. Add `--render` only when you explicitly want Chromium to load the local HTML first.
|
||||
|
||||
Archives are written to `ARCHIVE_PATH`, or to a development directory under the system temp directory when `ARCHIVE_PATH` is not set.
|
||||
|
||||
## API
|
||||
|
||||
```sh
|
||||
ARCHIVE_PATH=/tmp/local-page-archives npm run serve
|
||||
```
|
||||
|
||||
Archive a page:
|
||||
|
||||
```sh
|
||||
curl -X POST http://127.0.0.1:8787/archive \
|
||||
-H 'content-type: application/json' \
|
||||
-d '{"url":"https://example.com/article"}'
|
||||
```
|
||||
|
||||
The response includes the archived file path and a local `viewUrl`.
|
||||
|
||||
Set `PORT` to choose a port other than the default `8787`.
|
||||
65
package-lock.json
generated
Normal file
65
package-lock.json
generated
Normal file
@@ -0,0 +1,65 @@
|
||||
{
|
||||
"name": "local-page-archiver",
|
||||
"version": "0.1.0",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "local-page-archiver",
|
||||
"version": "0.1.0",
|
||||
"dependencies": {
|
||||
"playwright": "^1.59.1"
|
||||
},
|
||||
"bin": {
|
||||
"archive-page": "src/cli.mjs"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=22"
|
||||
}
|
||||
},
|
||||
"node_modules/fsevents": {
|
||||
"version": "2.3.2",
|
||||
"resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz",
|
||||
"integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==",
|
||||
"hasInstallScript": true,
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"os": [
|
||||
"darwin"
|
||||
],
|
||||
"engines": {
|
||||
"node": "^8.16.0 || ^10.6.0 || >=11.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/playwright": {
|
||||
"version": "1.60.0",
|
||||
"resolved": "https://registry.npmjs.org/playwright/-/playwright-1.60.0.tgz",
|
||||
"integrity": "sha512-hheHdokM8cdqCb0lcE3s+zT4t4W+vvjpGxsZlDnikarzx8tSzMebh3UiFtgqwFwnTnjYQcsyMF8ei2mCO/tpeA==",
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"playwright-core": "1.60.0"
|
||||
},
|
||||
"bin": {
|
||||
"playwright": "cli.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"fsevents": "2.3.2"
|
||||
}
|
||||
},
|
||||
"node_modules/playwright-core": {
|
||||
"version": "1.60.0",
|
||||
"resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.60.0.tgz",
|
||||
"integrity": "sha512-9bW6zvX/m0lEbgTKJ6YppOKx8H3VOPBMOCFh2irXFOT4BbHgrx5hPjwJYLT40Lu+4qtD36qKc/Hn56StUW57IA==",
|
||||
"license": "Apache-2.0",
|
||||
"bin": {
|
||||
"playwright-core": "cli.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
21
package.json
Normal file
21
package.json
Normal file
@@ -0,0 +1,21 @@
|
||||
{
|
||||
"name": "local-page-archiver",
|
||||
"version": "0.1.0",
|
||||
"private": true,
|
||||
"type": "module",
|
||||
"description": "Render and save self-contained HTML archives for pages the operator is authorized to access.",
|
||||
"bin": {
|
||||
"archive-page": "./src/cli.mjs"
|
||||
},
|
||||
"scripts": {
|
||||
"archive": "node src/cli.mjs archive",
|
||||
"serve": "node src/server.mjs",
|
||||
"install-browsers": "playwright install chromium"
|
||||
},
|
||||
"dependencies": {
|
||||
"playwright": "^1.59.1"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=22"
|
||||
}
|
||||
}
|
||||
395
src/archiver.mjs
Normal file
395
src/archiver.mjs
Normal file
@@ -0,0 +1,395 @@
|
||||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { createRequire } from "node:module";
|
||||
import {
|
||||
AssetInliner,
|
||||
DEFAULT_USER_AGENT,
|
||||
defaultArchivePath,
|
||||
findEffectiveBase,
|
||||
htmlEscape,
|
||||
inputToUrl,
|
||||
isFileUrl,
|
||||
isHttpUrl,
|
||||
slugForUrl,
|
||||
stripArchiveShell
|
||||
} from "./asset-inliner.mjs";
|
||||
|
||||
const require = createRequire(import.meta.url);
|
||||
|
||||
export { DEFAULT_USER_AGENT, defaultArchivePath };
|
||||
|
||||
/**
 * CSS selectors for ad containers. When ad stripping is enabled, every element
 * matching one of these is removed from the rendered DOM before capture.
 */
const AD_SELECTORS = [
  // Attribute markers.
  "[data-ad-status]",
  "[data-ad-type]",
  "[aria-label*='advertisement' i]",
  // Id and class-name conventions.
  "[id^='leaderboard']",
  "[class*='LeaderboardAd_']",
  "[class*='FullWidthAd_']",
  "[class*='BaseAd_']",
  ".adWrapper",
  ".dvz-v0-ad",
  // Ad elements and ad-network frames.
  "amp-ad",
  "iframe[src*='doubleclick']",
  "iframe[src*='googletagmanager']",
  "iframe[src*='googlesyndication']"
];

/**
 * Host names whose requests are aborted while rendering. A request matches when
 * its host equals an entry or is a subdomain of one (see isTrackerUrl).
 */
const TRACKER_HOST_PATTERNS = [
  "doubleclick.net",
  "googletagmanager.com",
  "googlesyndication.com",
  "google-analytics.com",
  "pub.doubleverify.com",
  "securepubads.g.doubleclick.net",
  "s10.histats.com",
  "sstatic1.histats.com"
];
|
||||
|
||||
/**
 * Archive one page (URL or local HTML path) as a single self-contained HTML file.
 *
 * The input is read or rendered, its assets are inlined by AssetInliner, an
 * archive comment is added, and the result is written to
 * `<archivePath>/<id>.html`.
 *
 * @param {string} input - URL or file-system path of the page to archive.
 * @param {object} [options] - Archiving options; also forwarded to the
 *   reading and rendering helpers.
 * @returns {Promise<object>} `{ id, filePath, sourceUrl, archivePath, warnings,
 *   externalAssets }` describing the written archive.
 */
export async function archivePage(input, options = {}) {
  const sourceUrl = inputToUrl(input);
  const archivePath = options.archivePath || defaultArchivePath();
  const id = options.id || slugForUrl(sourceUrl);
  const filePath = path.join(archivePath, `${id}.html`);
  await fs.mkdir(archivePath, { recursive: true });

  const rawHtml = await readInputHtml(sourceUrl, options);
  const baseUrl = rawHtml ? findEffectiveBase(rawHtml, sourceUrl) : sourceUrl;

  // Static extraction is used when asked for explicitly, and by default for a
  // local file whose archive shell is being stripped (unless render === true).
  let renderedHtml;
  if (options.static || (isFileUrl(sourceUrl) && options.stripArchiveShell && options.render !== true)) {
    renderedHtml = prepareStaticHtml(rawHtml, options);
  } else {
    renderedHtml = await renderPage(sourceUrl, { ...options, rawHtml, baseUrl });
  }

  const inliner = new AssetInliner({
    userAgent: options.userAgent || DEFAULT_USER_AGENT,
    referer: isHttpUrl(sourceUrl) ? sourceUrl : undefined,
    maxAssetBytes: options.maxAssetBytes
  });
  const inlinedHtml = await inliner.inlineHtml(renderedHtml, baseUrl);
  const finalHtml = addArchiveComment(inlinedHtml, sourceUrl, options);
  await fs.writeFile(filePath, finalHtml, "utf8");

  const externalAssets = findExternalAssetRefs(finalHtml);
  return { id, filePath, sourceUrl, archivePath, warnings: inliner.warnings, externalAssets };
}
|
||||
|
||||
/**
 * Load the raw HTML for a source URL when it can be obtained without a browser.
 *
 * - `file:` URLs are read from disk as UTF-8.
 * - `http(s):` URLs are fetched only when `options.static` is set.
 * - Anything else yields `null`.
 *
 * The HTTP request is bounded by `options.timeoutMs` (default 60000 ms, the
 * same default renderPage uses for navigation) so an unresponsive server
 * cannot hang the archiver.
 *
 * @param {string} sourceUrl - Absolute URL of the page.
 * @param {object} [options] - Reads `static`, `userAgent` and `timeoutMs`.
 * @returns {Promise<string|null>} The HTML text, or `null` when nothing is read.
 * @throws {Error} When the HTTP response status is not OK, the request times
 *   out, or the file cannot be read.
 */
export async function readInputHtml(sourceUrl, options = {}) {
  if (isFileUrl(sourceUrl)) {
    return fs.readFile(new URL(sourceUrl), "utf8");
  }
  if (!isHttpUrl(sourceUrl) || !options.static) {
    return null;
  }
  const response = await fetch(sourceUrl, {
    headers: {
      "user-agent": options.userAgent || DEFAULT_USER_AGENT,
      accept: "text/html,application/xhtml+xml"
    },
    redirect: "follow",
    // The signal also covers reading the response body.
    signal: AbortSignal.timeout(options.timeoutMs || 60000)
  });
  if (!response.ok) {
    throw new Error(`Failed to fetch ${sourceUrl}: HTTP ${response.status}`);
  }
  return response.text();
}
|
||||
|
||||
/**
 * Produce the HTML used in static (no-browser) mode.
 *
 * @param {string|null} rawHtml - HTML that was read or fetched for the page.
 * @param {object} [options] - Reads `stripArchiveShell`.
 * @returns {string} `rawHtml`, passed through stripArchiveShell when requested.
 * @throws {Error} When no HTML is available.
 */
function prepareStaticHtml(rawHtml, options = {}) {
  if (rawHtml) {
    return options.stripArchiveShell ? stripArchiveShell(rawHtml) : rawHtml;
  }
  throw new Error("Static mode requires an HTML input file or fetched HTML document.");
}
|
||||
|
||||
/**
 * Render a page in headless Chromium and return its cleaned-up DOM as HTML.
 *
 * When `options.rawHtml` is given for a `file:` source, that HTML is loaded
 * with `page.setContent` (JavaScript disabled by default); otherwise the URL
 * is navigated to. Requests to tracker hosts are aborted unless
 * `options.stripAds === false`. The browser is always closed.
 *
 * @param {string} sourceUrl - Absolute URL of the page.
 * @param {object} [options] - Reads `javaScriptEnabled`, `userAgent`,
 *   `viewportWidth`, `viewportHeight`, `stripAds`, `rawHtml`, `timeoutMs`;
 *   also forwarded to the settle and cleanup helpers.
 * @returns {Promise<string>} Serialized HTML of the rendered page.
 */
export async function renderPage(sourceUrl, options = {}) {
  const { chromium } = loadPlaywright();
  const browser = await chromium.launch({ headless: true });

  try {
    const usesLocalHtml = Boolean(options.rawHtml && isFileUrl(sourceUrl));
    const loadOptions = { waitUntil: "domcontentloaded", timeout: options.timeoutMs || 60000 };

    const context = await browser.newContext({
      javaScriptEnabled: options.javaScriptEnabled ?? !usesLocalHtml,
      userAgent: options.userAgent || DEFAULT_USER_AGENT,
      viewport: {
        width: options.viewportWidth || 1024,
        height: options.viewportHeight || 768
      }
    });
    const page = await context.newPage();

    if (options.stripAds !== false) {
      await page.route("**/*", (route) =>
        isTrackerUrl(route.request().url()) ? route.abort() : route.continue()
      );
    }

    if (usesLocalHtml) {
      await page.setContent(prepareRenderInputHtml(options.rawHtml, options), loadOptions);
    } else {
      await page.goto(sourceUrl, loadOptions);
    }

    await settlePage(page, options);
    await cleanupAndFreezePage(page, options);
    return await page.content();
  } finally {
    await browser.close();
  }
}
|
||||
|
||||
/**
 * Give a freshly loaded page time to finish loading.
 *
 * Waits for the "networkidle" load state (a timeout is tolerated), then,
 * unless `options.scroll === false`, scrolls down the page in steps and
 * returns to the top.
 *
 * @param {object} page - Playwright page.
 * @param {object} options - Reads `networkIdleTimeoutMs` and `scroll`.
 * @returns {Promise<void>}
 */
async function settlePage(page, options) {
  try {
    await page.waitForLoadState("networkidle", { timeout: options.networkIdleTimeoutMs || 15000 });
  } catch {
    // Dynamic pages often keep long-lived connections open; DOM capture can still proceed.
  }

  if (options.scroll === false) {
    return;
  }

  // This callback runs inside the browser, so it must not use outer variables.
  await page.evaluate(
    () =>
      new Promise((done) => {
        const stride = Math.max(400, Math.floor(window.innerHeight * 0.8));
        const readOffset = () => document.scrollingElement?.scrollTop || window.scrollY;
        let travelled = 0;
        const ticker = setInterval(() => {
          const before = readOffset();
          window.scrollBy(0, stride);
          travelled += stride;
          const after = readOffset();
          // Stop when the offset no longer changes or the distance cap is passed.
          if (after === before || travelled > Math.max(document.body.scrollHeight, 20000)) {
            clearInterval(ticker);
            window.scrollTo(0, 0);
            done();
          }
        }, 120);
      })
  );
}
|
||||
|
||||
/**
 * Clean the live DOM in the browser so the serialized page is static.
 *
 * In order, inside the page: optionally reduce the body to the archive.ph
 * `#CONTENT` capture; remove scripts, noscript blocks, connection-hint links
 * and the `next-head-count` meta; remove ad elements; pin images and media to
 * their current source; copy same-origin iframe documents into `srcdoc`;
 * strip `on*`, `integrity` and `nonce` attributes; finally write computed
 * styles into `style` attributes, but only if the attribute pass visited the
 * whole document and no more than `maxFreezeElements` elements.
 *
 * @param {object} page - Playwright page.
 * @param {object} options - Reads `freezeStyles`, `maxFreezeElements`,
 *   `maxSanitizeElements`, `stripAds`, `stripArchiveShell`.
 * @returns {Promise<void>}
 */
async function cleanupAndFreezePage(page, options) {
  const settings = {
    adSelectors: AD_SELECTORS,
    freezeStyles: options.freezeStyles !== false,
    maxFreezeElements: options.maxFreezeElements || 2500,
    maxSanitizeElements: options.maxSanitizeElements || 5000,
    stripAds: options.stripAds !== false,
    stripArchiveShell: Boolean(options.stripArchiveShell)
  };

  // This callback runs inside the browser and only sees the `config` argument.
  await page.evaluate((config) => {
    const purge = (selector) => {
      for (const node of document.querySelectorAll(selector)) {
        node.remove();
      }
    };

    if (config.stripArchiveShell) {
      const captured = document.querySelector("#CONTENT .html1") || document.querySelector("#CONTENT");
      if (captured) {
        document.body.innerHTML = "";
        document.body.appendChild(captured.cloneNode(true));
        for (const name of ["prefix", "itemscope", "itemtype"]) {
          document.documentElement.removeAttribute(name);
        }
      }
    }

    const alwaysRemoved = [
      "script",
      "noscript",
      "link[rel='preconnect'],link[rel='dns-prefetch'],link[rel='modulepreload']",
      "meta[name='next-head-count']"
    ];
    for (const selector of alwaysRemoved) {
      purge(selector);
    }

    if (config.stripAds) {
      for (const selector of config.adSelectors) {
        try {
          purge(selector);
        } catch {
          // Ignore unsupported selectors in older browser engines.
        }
      }
    }

    for (const img of document.querySelectorAll("img")) {
      const chosenSrc = img.currentSrc;
      if (chosenSrc) {
        img.setAttribute("data-original-src", img.getAttribute("src") || "");
        img.setAttribute("src", chosenSrc);
      }
      img.removeAttribute("srcset");
      img.removeAttribute("sizes");
      img.setAttribute("loading", "lazy");
    }

    for (const source of document.querySelectorAll("source")) {
      source.removeAttribute("srcset");
    }

    for (const media of document.querySelectorAll("video,audio")) {
      const chosenSrc = media.currentSrc;
      if (chosenSrc) {
        media.setAttribute("src", chosenSrc);
      }
    }

    for (const frame of document.querySelectorAll("iframe")) {
      const src = frame.getAttribute("src");
      if (src) {
        frame.setAttribute("data-archived-src", src);
      }
      try {
        const frameDocument = frame.contentDocument;
        if (frameDocument?.documentElement) {
          frame.setAttribute("srcdoc", "<!doctype html>" + frameDocument.documentElement.outerHTML);
          frame.removeAttribute("src");
        }
      } catch {
        // Cross-origin iframe sources are handled in the Node-side inliner when possible.
      }
    }

    // Attribute sanitizing pass, limited to maxSanitizeElements elements.
    const visitedElements = [];
    const walker = document.createTreeWalker(document.documentElement, NodeFilter.SHOW_ELEMENT);
    let cursor = document.documentElement;
    while (cursor && visitedElements.length < config.maxSanitizeElements) {
      visitedElements.push(cursor);
      for (const { name } of Array.from(cursor.attributes)) {
        if (/^on/i.test(name) || name === "integrity" || name === "nonce") {
          cursor.removeAttribute(name);
        }
      }
      cursor = walker.nextNode();
    }

    // A non-null cursor means the pass above stopped before the end of the document.
    const walkTruncated = Boolean(cursor);
    if (!config.freezeStyles || walkTruncated || visitedElements.length > config.maxFreezeElements) {
      return;
    }

    for (const node of visitedElements) {
      if (node.tagName === "SCRIPT" || node.tagName === "STYLE") {
        continue;
      }
      const computed = window.getComputedStyle(node);
      const declarations = [];
      for (let index = 0; index < computed.length; index += 1) {
        const property = computed[index];
        const value = computed.getPropertyValue(property);
        if (value) {
          declarations.push(`${property}:${value}`);
        }
      }
      if (declarations.length) {
        node.setAttribute("style", declarations.join(";"));
      }
    }
  }, settings);
}
|
||||
|
||||
/**
 * Prepare local HTML for loading into the browser with `page.setContent`.
 *
 * Optionally strips the archive shell, removes `<script>` and `<noscript>`
 * blocks, and, when `options.baseUrl` is set and the markup has no `<base>`
 * tag yet, inserts one (right after the opening `<head>` tag if present,
 * otherwise at the very start).
 *
 * @param {string} rawHtml - HTML read from the input file.
 * @param {object} options - Reads `stripArchiveShell` and `baseUrl`.
 * @returns {string} HTML to hand to the browser.
 */
function prepareRenderInputHtml(rawHtml, options) {
  const source = options.stripArchiveShell ? stripArchiveShell(rawHtml) : rawHtml;
  const inert = source
    .replace(/<script\b[\s\S]*?<\/script>/gi, "")
    .replace(/<noscript\b[\s\S]*?<\/noscript>/gi, "");

  if (!options.baseUrl || /<base\b/i.test(inert)) {
    return inert;
  }

  const baseTag = `<base href="${htmlEscape(options.baseUrl)}">`;
  const headOpenTag = /<head\b[^>]*>/i;
  return headOpenTag.test(inert)
    ? inert.replace(headOpenTag, (match) => `${match}${baseTag}`)
    : `${baseTag}${inert}`;
}
|
||||
|
||||
/**
 * Load the Playwright package on demand.
 *
 * @returns {object} The `playwright` module.
 * @throws {Error} With installation guidance when the package cannot be
 *   loaded; the original failure is kept as the error's `cause` so its stack
 *   and code are not lost.
 */
function loadPlaywright() {
  try {
    return require("playwright");
  } catch (error) {
    throw new Error(
      `Playwright is required for render mode. Run "npm install" and "npm run install-browsers", or use --static for HTML input files. Original error: ${error.message}`,
      { cause: error }
    );
  }
}
|
||||
|
||||
/**
 * Tell whether a URL points at a known tracker or ad host.
 *
 * @param {string} rawUrl - Absolute request URL.
 * @returns {boolean} `true` when the host equals a TRACKER_HOST_PATTERNS entry
 *   or is a subdomain of one; `false` otherwise, including for unparsable URLs.
 */
function isTrackerUrl(rawUrl) {
  let parsed;
  try {
    parsed = new URL(rawUrl);
  } catch {
    return false;
  }
  const { hostname } = parsed;
  for (const domain of TRACKER_HOST_PATTERNS) {
    if (hostname === domain || hostname.endsWith(`.${domain}`)) {
      return true;
    }
  }
  return false;
}
|
||||
|
||||
/**
 * Add an HTML comment recording the source URL and creation time.
 *
 * The comment goes right after the first doctype; when the document has no
 * doctype, a doctype and the comment are prepended.
 *
 * The source URL is caller-controlled, so `--` and `>` are percent-encoded
 * before it is embedded. Without that, a URL containing `-->` would end the
 * comment early and inject markup into the archive.
 *
 * @param {string} html - Document markup.
 * @param {string} sourceUrl - URL the archive was created from.
 * @param {object} options - Unused; kept for call compatibility.
 * @returns {string} Markup with the archive comment added.
 */
function addArchiveComment(html, sourceUrl, options) {
  const commentSafeSource = String(sourceUrl).replaceAll("--", "-%2D").replaceAll(">", "%3E");
  const comment = `<!-- Archived locally. Source: ${commentSafeSource}. Created: ${new Date().toISOString()}. Paywall bypass filters were not executed. -->`;
  if (/<!doctype/i.test(html)) {
    // A replacer function keeps `$` sequences in the URL from being read as patterns.
    return html.replace(/<!doctype[^>]*>/i, (doctype) => `${doctype}\n${comment}`);
  }
  return `<!doctype html>\n${comment}\n${html}`;
}
|
||||
|
||||
/**
 * List asset references in the markup that are not self-contained.
 *
 * Three sources are scanned: quoted `src`, `srcset`, `poster` and `data`
 * attributes; `href` of `<link>` tags whose `rel` is stylesheet, icon,
 * apple-touch-icon, image_src or preload; and CSS `url(...)` values.
 *
 * Only `srcset` is treated as a candidate list. Other attributes hold one
 * URL, which may contain commas, so their value is kept whole. Each `srcset`
 * candidate is checked on its own, so an inlined `data:` candidate does not
 * hide an external one beside it.
 *
 * @param {string} html - Final archive markup.
 * @returns {string[]} Sorted, de-duplicated references as written in the markup.
 */
export function findExternalAssetRefs(html) {
  const refs = new Set();
  const remember = (candidate) => {
    if (candidate && !isSelfContainedAssetRef(candidate)) {
      refs.add(candidate);
    }
  };

  const attrPattern = /\b(src|srcset|poster|data)\s*=\s*(["'])([\s\S]*?)\2/gi;
  for (const [, name, , value] of html.matchAll(attrPattern)) {
    const candidates = name.toLowerCase() === "srcset" ? listSrcsetUrls(value) : [value.trim()];
    candidates.forEach(remember);
  }

  const linkPattern = /<link\b[^>]*>/gi;
  for (const [tag] of html.matchAll(linkPattern)) {
    const rel = readAttribute(tag, "rel") || "";
    if (/\b(?:stylesheet|icon|apple-touch-icon|image_src|preload)\b/i.test(rel)) {
      remember(readAttribute(tag, "href"));
    }
  }

  const cssUrlPattern = /url\(\s*(["']?)([^"')]+)\1\s*\)/gi;
  for (const match of html.matchAll(cssUrlPattern)) {
    remember(match[2].trim());
  }

  return Array.from(refs).sort();
}

/**
 * Extract the URL of every candidate in a `srcset` value.
 *
 * A URL is a run of non-whitespace characters. Commas inside it belong to the
 * URL (as in `data:` URIs); only trailing commas, or a comma after the
 * descriptors, end a candidate.
 *
 * @param {string} value - Raw `srcset` attribute value.
 * @returns {string[]} Candidate URLs in document order.
 */
function listSrcsetUrls(value) {
  const urls = [];
  let position = 0;
  while (position < value.length) {
    while (position < value.length && /[\s,]/.test(value[position])) {
      position += 1;
    }
    if (position >= value.length) {
      break;
    }
    let end = position;
    while (end < value.length && !/\s/.test(value[end])) {
      end += 1;
    }
    const token = value.slice(position, end);
    if (token.endsWith(",")) {
      // Trailing commas end the candidate; it has no descriptors.
      urls.push(token.replace(/,+$/, ""));
      position = end;
    } else {
      urls.push(token);
      // Skip the descriptors, up to and including the next comma.
      const comma = value.indexOf(",", end);
      position = comma === -1 ? value.length : comma + 1;
    }
  }
  return urls;
}
|
||||
|
||||
/**
 * Tell whether a reference needs no external fetch.
 *
 * @param {string} value - Reference as written in the markup.
 * @returns {boolean} `true` for blank values, fragment-only references, and
 *   `data:`, `about:`, `javascript:`, `mailto:` or `tel:` URLs.
 */
function isSelfContainedAssetRef(value) {
  const ref = value.trim();
  if (!ref || ref.startsWith("#")) {
    return true;
  }
  return /^(?:data|about|javascript|mailto|tel):/i.test(ref);
}
|
||||
|
||||
/**
 * Read one attribute value from the text of a single tag.
 *
 * The name must not be preceded by a word character or a hyphen, so asking
 * for `href` no longer picks up `data-href` (a plain `\b` matches after a
 * hyphen). The name is regex-escaped before use. This is still a regex scan,
 * not an HTML parser: the same text inside another attribute's value can match.
 *
 * @param {string} tag - Markup of one tag, e.g. `<link rel="icon" href="/f.ico">`.
 * @param {string} attr - Attribute name (matched case-insensitively).
 * @returns {string} The raw attribute value, or "" when absent.
 */
function readAttribute(tag, attr) {
  const name = String(attr).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const pattern = new RegExp(`(?<![\\w-])${name}\\s*=\\s*("([^"]*)"|'([^']*)'|([^\\s>]+))`, "i");
  const match = tag.match(pattern);
  return match ? match[2] ?? match[3] ?? match[4] ?? "" : "";
}
|
||||
521
src/asset-inliner.mjs
Normal file
521
src/asset-inliner.mjs
Normal file
@@ -0,0 +1,521 @@
|
||||
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { fileURLToPath, pathToFileURL } from "node:url";
|
||||
|
||||
/** User-agent header used when the caller does not provide one. */
export const DEFAULT_USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36";

/** Content types that may be decoded as text (checked against the lowercased type). */
const TEXT_TYPES = new Set([
  // application/*
  "application/javascript",
  "application/json",
  "application/ld+json",
  "application/xml",
  // image/*
  "image/svg+xml",
  // text/*
  "text/css",
  "text/html",
  "text/javascript",
  "text/plain",
  "text/xml"
]);

/** File extension (with leading dot) to MIME type. */
const MIME_BY_EXT = new Map(
  Object.entries({
    ".apng": "image/apng",
    ".avif": "image/avif",
    ".css": "text/css",
    ".gif": "image/gif",
    ".html": "text/html",
    ".ico": "image/x-icon",
    ".jpeg": "image/jpeg",
    ".jpg": "image/jpeg",
    ".js": "text/javascript",
    ".json": "application/json",
    ".m4a": "audio/mp4",
    ".mp3": "audio/mpeg",
    ".mp4": "video/mp4",
    ".otf": "font/otf",
    ".png": "image/png",
    ".svg": "image/svg+xml",
    ".ttf": "font/ttf",
    ".webm": "video/webm",
    ".webp": "image/webp",
    ".woff": "font/woff",
    ".woff2": "font/woff2",
    ".xml": "application/xml"
  })
);

/** GIF data URI used in place of a srcset candidate that could not be inlined. */
const TRANSPARENT_IMAGE_DATA_URI =
  "data:image/gif;base64,R0lGODlhAQABAAAAACwAAAAAAQABAAA=";
|
||||
|
||||
/**
 * Directory archives are written to when the caller gives none.
 *
 * Uses `ARCHIVE_PATH` when set; otherwise a `local-page-archives` directory
 * under the operating system's temp directory. `os.tmpdir()` is used instead
 * of `TMPDIR || "/tmp"` so the fallback also works on platforms that do not
 * define `TMPDIR`.
 *
 * @returns {string} Directory path (not created by this function).
 */
export function defaultArchivePath() {
  return process.env.ARCHIVE_PATH || path.join(os.tmpdir(), "local-page-archives");
}
|
||||
|
||||
/**
 * Tell whether a value starts with `http://` or `https://` (case-insensitive).
 *
 * @param {string} value - Candidate URL.
 * @returns {boolean}
 */
export function isHttpUrl(value) {
  const httpPrefix = /^https?:\/\//i;
  return httpPrefix.test(value);
}
|
||||
|
||||
/**
 * Tell whether a value starts with `file://` (case-insensitive).
 *
 * @param {string} value - Candidate URL.
 * @returns {boolean}
 */
export function isFileUrl(value) {
  const filePrefix = /^file:\/\//i;
  return filePrefix.test(value);
}
|
||||
|
||||
/**
 * Turn a command-line or API input into a URL string.
 *
 * Input that starts with a URL scheme is returned unchanged; anything else is
 * treated as a file-system path and converted to a `file:` URL. A scheme must
 * be at least two characters long, so a Windows drive path such as
 * `C:\pages\a.html` is handled as a path rather than a URL with scheme `c:`.
 *
 * @param {string} input - URL or file-system path.
 * @returns {string} The input itself, or an absolute `file:` URL.
 */
export function inputToUrl(input) {
  const hasUrlScheme = /^[a-z][a-z0-9+.-]+:/i.test(input);
  if (hasUrlScheme) {
    return input;
  }
  return pathToFileURL(path.resolve(input)).href;
}
|
||||
|
||||
/**
 * Build a file-name-safe identifier for an archive of the given URL.
 *
 * The stem is host + path with trailing slashes dropped, runs of
 * non-alphanumerics collapsed to `-`, outer dashes trimmed, cut to 90
 * characters ("archive" when nothing remains). The current ISO timestamp,
 * with `:` and `.` replaced by `-`, is appended.
 *
 * @param {string} inputUrl - Absolute URL.
 * @returns {string} `<stem>-<timestamp>`.
 * @throws {TypeError} When `inputUrl` is not a valid absolute URL.
 */
export function slugForUrl(inputUrl) {
  const { hostname, pathname } = new URL(inputUrl);
  const withoutTrailingSlashes = `${hostname}${pathname}`.replace(/\/+$/, "");
  const dashed = withoutTrailingSlashes.replace(/[^a-z0-9]+/gi, "-");
  const trimmed = dashed.replace(/^-+|-+$/g, "");
  const stem = trimmed.slice(0, 90) || "archive";
  const stamp = new Date().toISOString().replace(/[:.]/g, "-");
  return `${stem}-${stamp}`;
}
|
||||
|
||||
/**
 * Determine the base URL relative references in the markup resolve against.
 *
 * @param {string} html - Document markup.
 * @param {string} fallbackBaseUrl - Base used when there is no usable `<base href>`.
 * @returns {string} The first `<base href>` resolved against the fallback, or
 *   the fallback when there is no such tag or its value cannot be resolved.
 */
export function findEffectiveBase(html, fallbackBaseUrl) {
  const [, , baseHref] = html.match(/<base\b[^>]*\bhref=(["']?)([^"'\s>]+)\1/i) ?? [];
  if (baseHref === undefined) {
    return fallbackBaseUrl;
  }
  return resolveUrl(baseHref, fallbackBaseUrl) || fallbackBaseUrl;
}
|
||||
|
||||
/**
 * Resolve a reference found in markup or CSS to an absolute URL.
 *
 * The reference is trimmed and passed through htmlDecode first. Blank values,
 * fragment-only references and `about:`, `blob:`, `data:`, `javascript:`,
 * `mailto:` and `tel:` URLs are returned as they are. A protocol-relative
 * reference (`//host/...`) becomes `https:` when there is no base or the base
 * is a `file:` URL.
 *
 * @param {string} rawUrl - Reference as written in the source.
 * @param {string} [baseUrl] - Base for relative references.
 * @returns {string|null} The resolved URL, the unresolved value for the cases
 *   above, or `null` when the input is empty or cannot be parsed.
 */
export function resolveUrl(rawUrl, baseUrl) {
  if (!rawUrl) {
    return null;
  }

  const candidate = htmlDecode(rawUrl.trim());
  const needsNoResolving =
    !candidate ||
    candidate.startsWith("#") ||
    /^(?:about|blob|data|javascript|mailto|tel):/i.test(candidate);
  if (needsNoResolving) {
    return candidate;
  }

  try {
    const isProtocolRelative = candidate.startsWith("//");
    if (isProtocolRelative && (!baseUrl || /^file:/i.test(baseUrl))) {
      return `https:${candidate}`;
    }
    return new URL(candidate, baseUrl).href;
  } catch {
    return null;
  }
}
|
||||
|
||||
/**
 * Escape a value for use in HTML text or a double-quoted attribute.
 *
 * `&` is replaced first so the ampersands introduced by the other
 * replacements are not escaped again.
 *
 * @param {*} value - Value to escape; converted with `String()`.
 * @returns {string} The value with `&`, `<`, `>` and `"` replaced by entities.
 */
export function htmlEscape(value) {
  return String(value)
    .replaceAll("&", "&amp;")
    .replaceAll("<", "&lt;")
    .replaceAll(">", "&gt;")
    .replaceAll('"', "&quot;");
}
|
||||
|
||||
/**
 * Decode the small set of HTML entities this module deals with.
 *
 * Handles `&amp;`, `&quot;`, `&#39;`, `&#x27;`, `&apos;`, `&lt;` and `&gt;`.
 * All entities are replaced in one pass, so the result of decoding `&amp;` is
 * never decoded a second time (`&amp;lt;` becomes `&lt;`, not `<`). Other
 * entities are left untouched.
 *
 * @param {*} value - Value to decode; converted with `String()`.
 * @returns {string} The decoded text.
 */
export function htmlDecode(value) {
  const decoded = {
    "&amp;": "&",
    "&quot;": '"',
    "&#39;": "'",
    "&#x27;": "'",
    "&apos;": "'",
    "&lt;": "<",
    "&gt;": ">"
  };
  return String(value).replace(/&(?:amp|quot|apos|lt|gt|#39|#x27);/g, (entity) => decoded[entity]);
}
|
||||
|
||||
/**
 * Reduce an `archive.ph` HTML export to the captured page.
 *
 * The capture lives in `<div id="CONTENT">` (preferably its `html1` child)
 * and ends before the conditional-comment block that holds the `hashtags`
 * table. When those landmarks are missing or out of order, the input is
 * returned unchanged. Otherwise a minimal document is built from the original
 * `<title>`, the first `<style type="text/css">` block and the captured fragment.
 *
 * @param {string} html - Markup of the export.
 * @returns {string} The reduced document, or `html` itself.
 */
export function stripArchiveShell(html) {
  const mentionsContent = html.includes('id="CONTENT"') || html.includes("id='CONTENT'");
  if (!mentionsContent) {
    return html;
  }

  const contentStart = html.search(/<div\b[^>]*\bid=(["'])CONTENT\1[^>]*>/i);
  const marker = html.search(
    /<!--\[if !IE\]><!--><div\b[^>]*>\s*<table\b[^>]*\bid=(["'])hashtags\1/i
  );
  const landmarksUsable = contentStart !== -1 && marker !== -1 && marker > contentStart;
  if (!landmarksUsable) {
    return html;
  }

  const [title = "<title>Archived page</title>"] = html.match(/<title\b[^>]*>[\s\S]*?<\/title>/i) ?? [];
  const [fontStyle = ""] = html.match(/<style\b[^>]*type=(["'])text\/css\1[^>]*>[\s\S]*?<\/style>/i) ?? [];

  // Prefer the inner `html1` wrapper; fall back to the whole CONTENT div.
  const innerOffset = html.slice(contentStart, marker).search(/<div\b[^>]*\bclass=(["'])html1\1[^>]*>/i);
  const fragmentStart = innerOffset === -1 ? contentStart : contentStart + innerOffset;
  // Never read past the marker, even when the matching </div> lies beyond it.
  const fragmentEnd = Math.min(findMatchingDivEnd(html, fragmentStart) || marker, marker);
  const content = html.slice(fragmentStart, fragmentEnd);

  return `<!doctype html><html><head><meta charset="utf-8">${title}${fontStyle}</head><body style="margin:0;background:#fff">${content}</body></html>`;
}
|
||||
|
||||
/**
 * Find where the `<div>` opening at `startIndex` is closed.
 *
 * Scans `<div>` and `</div>` tags from `startIndex`, counting nesting depth;
 * self-closing `<div ... />` tags are ignored.
 *
 * @param {string} html - Markup to scan.
 * @param {number} startIndex - Index of the opening `<div` tag.
 * @returns {number|null} Index just past the closing tag that brings the
 *   depth back to zero, or `null` when there is none.
 */
function findMatchingDivEnd(html, startIndex) {
  const divTag = /<\/?div\b[^>]*>/gi;
  divTag.lastIndex = startIndex;
  let openCount = 0;
  let found = divTag.exec(html);
  while (found !== null) {
    const [tag] = found;
    if (tag.startsWith("</")) {
      openCount -= 1;
      if (openCount === 0) {
        return found.index + tag.length;
      }
    } else if (!tag.endsWith("/>")) {
      openCount += 1;
    }
    found = divTag.exec(html);
  }
  return null;
}
|
||||
|
||||
/**
 * `String.prototype.replace` with an asynchronous replacer.
 *
 * Matches are processed one after another, in order; each replacement is
 * awaited before the next match is handled.
 *
 * @param {string} input - Text to search.
 * @param {RegExp} regex - Global regular expression (required by `matchAll`).
 * @param {(match: RegExpMatchArray) => (string|Promise<string>)} replacer -
 *   Receives the match array and returns the replacement text.
 * @returns {Promise<string>} The text with every match replaced.
 */
export async function replaceAsync(input, regex, replacer) {
  const pieces = [];
  let cursor = 0;
  for (const match of input.matchAll(regex)) {
    const [matchedText] = match;
    pieces.push(input.slice(cursor, match.index), await replacer(match));
    cursor = match.index + matchedText.length;
  }
  pieces.push(input.slice(cursor));
  return pieces.join("");
}
|
||||
|
||||
export class AssetInliner {
|
||||
constructor(options = {}) {
|
||||
this.userAgent = options.userAgent || DEFAULT_USER_AGENT;
|
||||
this.referer = options.referer;
|
||||
this.maxAssetBytes = options.maxAssetBytes || 30 * 1024 * 1024;
|
||||
this.cache = new Map();
|
||||
this.warnings = [];
|
||||
}
|
||||
|
||||
async inlineHtml(html, baseUrl, options = {}) {
|
||||
const depth = options.depth || 0;
|
||||
const effectiveBase = findEffectiveBase(html, baseUrl);
|
||||
let output = html;
|
||||
|
||||
output = output.replace(/<base\b[^>]*>/gi, "");
|
||||
output = output.replace(/<script\b[\s\S]*?<\/script>/gi, "");
|
||||
output = output.replace(/<noscript\b[\s\S]*?<\/noscript>/gi, "");
|
||||
output = output.replace(/<link\b[^>]*\brel=(["']?)(?:preconnect|dns-prefetch|modulepreload)\1[^>]*>/gi, "");
|
||||
|
||||
output = await replaceAsync(output, /<style\b([^>]*)>([\s\S]*?)<\/style>/gi, async (match) => {
|
||||
const attrs = match[1] || "";
|
||||
const css = await this.inlineCss(match[2] || "", effectiveBase);
|
||||
return `<style${attrs}>${css}</style>`;
|
||||
});
|
||||
|
||||
output = await replaceAsync(output, /\sstyle=(["'])([\s\S]*?)\1/gi, async (match) => {
|
||||
const css = await this.inlineCss(htmlDecode(match[2]), effectiveBase);
|
||||
return ` style=${match[1]}${htmlEscape(css)}${match[1]}`;
|
||||
});
|
||||
|
||||
output = await replaceAsync(output, /<link\b[^>]*>/gi, async (match) => {
|
||||
return this.rewriteLinkTag(match[0], effectiveBase);
|
||||
});
|
||||
|
||||
output = await replaceAsync(output, /<iframe\b[^>]*>[\s\S]*?<\/iframe>|<iframe\b[^>]*\/?>/gi, async (match) => {
|
||||
if (depth >= 1) {
|
||||
return this.rewriteMediaAttributes(match[0], effectiveBase);
|
||||
}
|
||||
return this.rewriteIframeTag(match[0], effectiveBase, depth);
|
||||
});
|
||||
|
||||
output = await replaceAsync(
|
||||
output,
|
||||
/<(?:img|source|audio|video|track|embed|object|input)\b[^>]*>/gi,
|
||||
async (match) => this.rewriteMediaAttributes(match[0], effectiveBase)
|
||||
);
|
||||
|
||||
output = removeExternalBookkeepingUrls(output);
|
||||
output = restoreArchiveProxyLinks(output);
|
||||
|
||||
output = await replaceAsync(output, /srcset=(["'])([\s\S]*?)\1/gi, async (match) => {
|
||||
const rewritten = await this.inlineSrcset(match[2], effectiveBase);
|
||||
return `srcset=${match[1]}${htmlEscape(rewritten)}${match[1]}`;
|
||||
});
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
async rewriteLinkTag(tag, baseUrl) {
|
||||
const rel = getAttribute(tag, "rel") || "";
|
||||
const href = getAttribute(tag, "href");
|
||||
const asValue = getAttribute(tag, "as") || "";
|
||||
if (!href) {
|
||||
return tag;
|
||||
}
|
||||
|
||||
if (/\bstylesheet\b/i.test(rel)) {
|
||||
const absolute = resolveUrl(href, baseUrl);
|
||||
if (!absolute || absolute.startsWith("data:")) {
|
||||
return "";
|
||||
}
|
||||
const css = await this.fetchText(absolute, baseUrl);
|
||||
if (css == null) {
|
||||
return "";
|
||||
}
|
||||
const inlinedCss = await this.inlineCss(css, absolute);
|
||||
return `<style data-archived-href="${htmlEscape(absolute)}">${inlinedCss}</style>`;
|
||||
}
|
||||
|
||||
const isInlineableLink =
|
||||
/\b(?:icon|apple-touch-icon|image_src)\b/i.test(rel) ||
|
||||
(/\bpreload\b/i.test(rel) && /^(?:font|image|style)$/i.test(asValue));
|
||||
if (!isInlineableLink) {
|
||||
return tag;
|
||||
}
|
||||
if (/\bpreload\b/i.test(rel) && /^style$/i.test(asValue)) {
|
||||
return "";
|
||||
}
|
||||
const dataUri = await this.toDataUri(href, baseUrl);
|
||||
if (!dataUri) {
|
||||
return "";
|
||||
}
|
||||
return setAttribute(tag, "href", dataUri);
|
||||
}
|
||||
|
||||
async rewriteMediaAttributes(tag, baseUrl) {
|
||||
let output = tag;
|
||||
for (const attr of ["src", "poster", "data"]) {
|
||||
const value = getAttribute(output, attr);
|
||||
if (!value) {
|
||||
continue;
|
||||
}
|
||||
const dataUri = await this.toDataUri(value, baseUrl);
|
||||
if (dataUri) {
|
||||
output = setAttribute(output, attr, dataUri);
|
||||
} else {
|
||||
output = replaceMissingMediaAttribute(output, attr);
|
||||
}
|
||||
}
|
||||
return output;
|
||||
}
|
||||
|
||||
async rewriteIframeTag(tag, baseUrl, depth) {
|
||||
const src = getAttribute(tag, "src");
|
||||
if (!src || getAttribute(tag, "srcdoc")) {
|
||||
return this.rewriteMediaAttributes(tag, baseUrl);
|
||||
}
|
||||
const absolute = resolveUrl(src, baseUrl);
|
||||
if (!absolute || absolute.startsWith("data:")) {
|
||||
return tag;
|
||||
}
|
||||
const text = await this.fetchText(absolute, baseUrl);
|
||||
if (text != null) {
|
||||
const inlined = await this.inlineHtml(text, absolute, { depth: depth + 1 });
|
||||
let rewritten = removeAttribute(tag, "src");
|
||||
rewritten = setAttribute(rewritten, "srcdoc", inlined);
|
||||
rewritten = setAttribute(rewritten, "data-archived-src", absolute);
|
||||
return rewritten;
|
||||
}
|
||||
return this.rewriteMediaAttributes(tag, baseUrl);
|
||||
}
|
||||
|
||||
async inlineSrcset(value, baseUrl) {
|
||||
const candidates = value
|
||||
.split(",")
|
||||
.map((part) => part.trim())
|
||||
.filter(Boolean);
|
||||
const rewritten = [];
|
||||
for (const candidate of candidates) {
|
||||
const [urlPart, ...descriptor] = candidate.split(/\s+/);
|
||||
const dataUri = await this.toDataUri(urlPart, baseUrl);
|
||||
rewritten.push([dataUri || TRANSPARENT_IMAGE_DATA_URI, ...descriptor].join(" "));
|
||||
}
|
||||
return rewritten.join(", ");
|
||||
}
|
||||
|
||||
async inlineCss(css, baseUrl) {
|
||||
let output = await replaceAsync(
|
||||
css,
|
||||
/@import\s+(?:url\()?["']?([^"')\s;]+)["']?\)?[^;]*;/gi,
|
||||
async (match) => {
|
||||
const absolute = resolveUrl(match[1], baseUrl);
|
||||
if (!absolute || absolute.startsWith("data:")) {
|
||||
return "";
|
||||
}
|
||||
const imported = await this.fetchText(absolute, baseUrl);
|
||||
if (imported == null) {
|
||||
return "";
|
||||
}
|
||||
return this.inlineCss(imported, absolute);
|
||||
}
|
||||
);
|
||||
|
||||
output = await replaceAsync(output, /url\(\s*(["']?)([^"')]+)\1\s*\)/gi, async (match) => {
|
||||
const raw = htmlDecode(match[2].trim());
|
||||
if (!raw || raw.startsWith("#") || /^(?:data|blob|about|javascript):/i.test(raw)) {
|
||||
return match[0];
|
||||
}
|
||||
const dataUri = await this.toDataUri(raw, baseUrl);
|
||||
return dataUri ? `url("${dataUri}")` : "url(about:blank)";
|
||||
});
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
async toDataUri(rawUrl, baseUrl) {
|
||||
const absolute = resolveUrl(rawUrl, baseUrl);
|
||||
if (!absolute || absolute.startsWith("data:")) {
|
||||
return absolute;
|
||||
}
|
||||
if (this.cache.has(absolute)) {
|
||||
return this.cache.get(absolute);
|
||||
}
|
||||
const asset = await this.fetchAsset(absolute, baseUrl);
|
||||
if (!asset) {
|
||||
this.cache.set(absolute, null);
|
||||
return null;
|
||||
}
|
||||
const dataUri = `data:${asset.contentType};base64,${asset.bytes.toString("base64")}`;
|
||||
this.cache.set(absolute, dataUri);
|
||||
return dataUri;
|
||||
}
|
||||
|
||||
async fetchText(rawUrl, baseUrl) {
|
||||
const asset = await this.fetchAsset(rawUrl, baseUrl);
|
||||
if (!asset) {
|
||||
return null;
|
||||
}
|
||||
const contentType = asset.contentType.split(";")[0].toLowerCase();
|
||||
if (!TEXT_TYPES.has(contentType) && !contentType.endsWith("+xml")) {
|
||||
return null;
|
||||
}
|
||||
return asset.bytes.toString("utf8");
|
||||
}
|
||||
|
||||
async fetchAsset(rawUrl, baseUrl) {
|
||||
const absolute = resolveUrl(rawUrl, baseUrl);
|
||||
if (!absolute || absolute.startsWith("data:")) {
|
||||
return null;
|
||||
}
|
||||
try {
|
||||
if (isFileUrl(absolute)) {
|
||||
const filePath = fileURLToPath(absolute);
|
||||
const bytes = await fs.readFile(filePath);
|
||||
return {
|
||||
bytes,
|
||||
contentType: mimeFromUrl(absolute)
|
||||
};
|
||||
}
|
||||
if (!isHttpUrl(absolute)) {
|
||||
return null;
|
||||
}
|
||||
for (let attempt = 1; attempt <= 2; attempt += 1) {
|
||||
const controller = new AbortController();
|
||||
const timeout = setTimeout(() => controller.abort(), 30000);
|
||||
try {
|
||||
const response = await fetch(absolute, {
|
||||
headers: {
|
||||
"user-agent": this.userAgent,
|
||||
accept: "*/*",
|
||||
...(this.referer ? { referer: this.referer } : {})
|
||||
},
|
||||
redirect: "follow",
|
||||
signal: controller.signal
|
||||
});
|
||||
clearTimeout(timeout);
|
||||
if (!response.ok) {
|
||||
this.warnings.push(`Failed to fetch ${absolute}: HTTP ${response.status}`);
|
||||
return null;
|
||||
}
|
||||
const arrayBuffer = await response.arrayBuffer();
|
||||
if (arrayBuffer.byteLength > this.maxAssetBytes) {
|
||||
this.warnings.push(`Skipped ${absolute}: ${arrayBuffer.byteLength} bytes exceeds ${this.maxAssetBytes}`);
|
||||
return null;
|
||||
}
|
||||
return {
|
||||
bytes: Buffer.from(arrayBuffer),
|
||||
contentType: response.headers.get("content-type")?.split(";")[0] || mimeFromUrl(absolute)
|
||||
};
|
||||
} catch (error) {
|
||||
clearTimeout(timeout);
|
||||
if (attempt < 2) {
|
||||
continue;
|
||||
}
|
||||
this.warnings.push(`Failed to fetch ${absolute}: ${error.message}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
this.warnings.push(`Failed to fetch ${absolute}: ${error.message}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Drops bookkeeping attributes (`old-src`, `currentSourceUrl`,
 * `data-original-src`, `data-archived-src`) whose quoted value is an absolute
 * http(s) URL.
 *
 * @param {string} html - Markup to clean.
 * @returns {string} Markup with the matching attributes removed.
 */
function removeExternalBookkeepingUrls(html) {
  const bookkeepingAttribute =
    /\s(?:old-src|currentSourceUrl|data-original-src|data-archived-src)=(["'])(https?:\/\/[\s\S]*?)\1/gi;
  return html.replace(bookkeepingAttribute, () => "");
}
|
||||
|
||||
/**
 * Rewrites quoted `href` and `action` attributes that point through an
 * archive proxy so they reference the original target again.
 *
 * Attributes whose decoded value is not changed by `restoreArchiveProxyUrl`
 * are left exactly as they appear in the input.
 *
 * @param {string} html - Markup to rewrite.
 * @returns {string} Markup with proxied links restored.
 */
function restoreArchiveProxyLinks(html) {
  return html.replace(/\s(href|action)=(["'])([\s\S]*?)\2/gi, (full, attr, quote, rawValue) => {
    const decoded = htmlDecode(rawValue);
    const restored = restoreArchiveProxyUrl(decoded);
    // Compare against the decoded value: comparing with the still-encoded
    // attribute text would flag every link containing an entity as changed.
    if (restored === decoded) {
      return full;
    }
    return ` ${attr}=${quote}${htmlEscape(restored)}${quote}`;
  });
}
|
||||
|
||||
/**
 * Maps an archive-site URL back to what it stood for on the original page.
 *
 * - `http(s)://archive.<tld>/o/<id>/<http(s) url>` yields the embedded URL,
 *   passed through `safeDecodeUrl`.
 * - `http(s)://archive.<tld>/<id>#fragment` yields just `#fragment`.
 * - Anything else is returned exactly as given, surrounding whitespace kept.
 *
 * @param {string} rawValue - Attribute value to inspect.
 * @returns {string} The restored URL, or `rawValue` when nothing matched.
 */
function restoreArchiveProxyUrl(rawValue) {
  const archiveHost = "archive\\.(?:ph|today|is|li|md|fo|vn|pm)";
  const proxiedPattern = new RegExp(`^https?://${archiveHost}/o/[^/]+/(https?://.+)$`, "i");
  const samePagePattern = new RegExp(`^https?://${archiveHost}/[^/#?]+(#.+)$`, "i");
  const candidate = rawValue.trim();

  const proxiedTarget = proxiedPattern.exec(candidate)?.[1];
  if (proxiedTarget !== undefined) {
    return safeDecodeUrl(proxiedTarget);
  }

  const fragment = samePagePattern.exec(candidate)?.[1];
  return fragment ?? rawValue;
}
|
||||
|
||||
/**
 * Percent-decodes a string, returning the input unchanged when it contains a
 * malformed escape sequence.
 *
 * @param {string} value - Possibly percent-encoded text.
 * @returns {string} Decoded text, or `value` itself when decoding fails.
 */
function safeDecodeUrl(value) {
  let decoded = value;
  try {
    decoded = decodeURIComponent(value);
  } catch {
    // decodeURIComponent throws on malformed escapes; keep the input as-is.
  }
  return decoded;
}
|
||||
|
||||
/**
 * Guesses a MIME type from the file extension of a URL or path.
 *
 * @param {string} rawUrl - Absolute URL, or any string ending in a file name.
 * @returns {string} The type registered in `MIME_BY_EXT` for the lower-cased
 *   extension, or `application/octet-stream` when there is none.
 */
function mimeFromUrl(rawUrl) {
  const pathnameOf = (candidate) => {
    try {
      return new URL(candidate).pathname;
    } catch {
      // Not an absolute URL: use the raw string as the path.
      return candidate;
    }
  };

  const extension = path.extname(pathnameOf(rawUrl)).toLowerCase();
  return MIME_BY_EXT.get(extension) || "application/octet-stream";
}
|
||||
|
||||
/**
 * Reads one attribute from the text of a single HTML tag.
 *
 * @param {string} tag - Tag source, e.g. `<img src="a.png">`.
 * @param {string} attr - Attribute name, matched case-insensitively. It is
 *   interpolated into a regular expression without escaping.
 * @returns {string|null} The HTML-decoded value (quoted or unquoted), or
 *   `null` when the attribute is absent.
 */
function getAttribute(tag, attr) {
  // `\b` counts "-" as a boundary, so looking up `src` used to match inside
  // `data-src`. The lookbehind rejects a name that merely ends in `-attr`.
  const pattern = new RegExp(`(?<![\\w-])${attr}\\s*=\\s*("([^"]*)"|'([^']*)'|([^\\s>]+))`, "i");
  const match = tag.match(pattern);
  if (!match) {
    return null;
  }
  return htmlDecode(match[2] ?? match[3] ?? match[4] ?? "");
}
|
||||
|
||||
/**
 * Sets an attribute in the text of a single HTML tag, replacing the existing
 * value or appending the attribute before the closing `>` / `/>`.
 *
 * @param {string} tag - Tag source, e.g. `<img src="a.png">`.
 * @param {string} attr - Attribute name, matched case-insensitively. It is
 *   interpolated into a regular expression without escaping.
 * @param {string} value - New value; HTML-escaped and written double-quoted.
 * @returns {string} The updated tag source.
 */
function setAttribute(tag, attr, value) {
  const escaped = htmlEscape(value);
  // `\b` counts "-" as a boundary, so setting `src` used to overwrite the
  // value of `data-src`. The lookbehind rejects names ending in `-attr`.
  const attrRegex = new RegExp(`(?<![\\w-])${attr}\\s*=\\s*("([^"]*)"|'([^']*)'|([^\\s>]+))`, "i");
  if (attrRegex.test(tag)) {
    // Function replacer: a string replacement would expand `$&`, `$1`, `$$`
    // sequences that happen to occur in the value (URLs may contain `$`).
    return tag.replace(attrRegex, () => `${attr}="${escaped}"`);
  }
  return tag.replace(/\/?>$/, (end) => ` ${attr}="${escaped}"${end}`);
}
|
||||
|
||||
/**
 * Removes the first occurrence of an attribute, together with the whitespace
 * in front of it, from the text of a single HTML tag.
 *
 * @param {string} tag - Tag source.
 * @param {string} attr - Attribute name, matched case-insensitively.
 * @returns {string} The tag source without that attribute.
 */
function removeAttribute(tag, attr) {
  const valuePattern = `("([^"]*)"|'([^']*)'|([^\\s>]+))`;
  const attributePattern = new RegExp(`\\s+${attr}\\s*=\\s*${valuePattern}`, "i");
  return tag.replace(attributePattern, "");
}
|
||||
|
||||
/**
 * Neutralises a media attribute whose asset could not be inlined.
 *
 * For `src` on `<img>` and `<input>` the value becomes
 * `TRANSPARENT_IMAGE_DATA_URI`; in every other case the attribute is removed.
 *
 * @param {string} tag - Tag source.
 * @param {string} attr - Name of the attribute to neutralise.
 * @returns {string} The updated tag source.
 */
function replaceMissingMediaAttribute(tag, attr) {
  const tagName = /^<([a-z0-9:-]+)/i.exec(tag)?.[1]?.toLowerCase() || "";
  const needsPlaceholder = attr === "src" && ["img", "input"].includes(tagName);
  return needsPlaceholder
    ? setAttribute(tag, attr, TRANSPARENT_IMAGE_DATA_URI)
    : removeAttribute(tag, attr);
}
|
||||
96
src/cli.mjs
Normal file
96
src/cli.mjs
Normal file
@@ -0,0 +1,96 @@
|
||||
#!/usr/bin/env node
|
||||
import { archivePage, DEFAULT_USER_AGENT, defaultArchivePath } from "./archiver.mjs";
|
||||
|
||||
/**
 * Parses a `process.argv`-style array into a command, positionals and flags.
 *
 * `argv[2]` is the command. From `argv[3]` on:
 *  - tokens not starting with `--` are collected in `positional`;
 *  - `--no-<name>` sets `<name>` to `false`;
 *  - `--name=value` sets `name` to everything after the first `=`;
 *  - `--name value` sets `name` to the next token when that token does not
 *    start with `--`;
 *  - a bare `--name` sets `name` to `true`.
 *
 * @param {string[]} argv - Argument vector, normally `process.argv`.
 * @returns {{command: (string|undefined), positional: string[]}} The parsed
 *   arguments; each flag is added as an own property keyed by its name.
 */
function parseArgs(argv) {
  const args = {
    command: argv[2],
    positional: []
  };

  for (let i = 3; i < argv.length; i += 1) {
    const arg = argv[i];
    if (!arg.startsWith("--")) {
      args.positional.push(arg);
      continue;
    }

    // Split on the FIRST "=" only. `arg.split("=", 2)` discards everything
    // after a second "=", truncating values such as `--user-agent=a (k=v)`.
    const separator = arg.indexOf("=");
    const flag = separator === -1 ? arg : arg.slice(0, separator);
    const inlineValue = separator === -1 ? undefined : arg.slice(separator + 1);
    const key = flag.slice(2);

    if (key.startsWith("no-")) {
      args[key.slice(3)] = false;
    } else if (inlineValue !== undefined) {
      args[key] = inlineValue;
    } else if (i + 1 < argv.length && !argv[i + 1].startsWith("--")) {
      args[key] = argv[++i];
    } else {
      args[key] = true;
    }
  }

  return args;
}
|
||||
|
||||
/**
 * Prints the CLI help text with `console.log`: the invocation syntax, the
 * supported options, and the default user agent. The default output
 * directory (`defaultArchivePath()`) and `DEFAULT_USER_AGENT` are
 * interpolated into the message.
 */
function usage() {
|
||||
console.log(`Usage:
|
||||
node src/cli.mjs archive <url-or-html-file> [options]
|
||||
|
||||
Options:
|
||||
--archive-path <dir> Output directory. Defaults to ARCHIVE_PATH or ${defaultArchivePath()}
|
||||
--id <id> Output id/file stem
|
||||
--static Do not use a browser; transform the input HTML only
|
||||
--render Force browser rendering for local archive-shell HTML
|
||||
--strip-archive-shell Remove an archive.ph shell from an already archived HTML file
|
||||
--no-strip-ads Keep ad-like elements
|
||||
--user-agent <ua> User agent to send for page and asset requests
|
||||
--max-asset-bytes <bytes> Per-asset inline limit
|
||||
|
||||
Default user agent:
|
||||
${DEFAULT_USER_AGENT}`);
|
||||
}
|
||||
|
||||
/**
 * CLI entry point: parses `process.argv`, runs the `archive` command and
 * prints a summary (source URL, output path, up to 20 remaining external
 * asset references and up to 20 warnings).
 *
 * Prints usage and returns when no command, `help`, or `--help` is given.
 * Prints usage and sets exit code 1 when the input argument is missing.
 *
 * @returns {Promise<void>}
 * @throws {Error} On an unknown command, on an invalid `--max-asset-bytes`
 *   value, or when `archivePage` rejects.
 */
async function main() {
  const args = parseArgs(process.argv);
  if (!args.command || args.command === "help" || args["help"]) {
    usage();
    return;
  }
  if (args.command !== "archive") {
    throw new Error(`Unknown command: ${args.command}`);
  }

  const input = args.positional[0];
  if (!input) {
    usage();
    process.exitCode = 1;
    return;
  }

  // Validate the limit up front. A non-numeric value used to become NaN,
  // which silently disables the size check, and a bare `--max-asset-bytes`
  // (parsed as `true`) used to become a 1-byte limit.
  const rawMaxAssetBytes = args["max-asset-bytes"];
  let maxAssetBytes;
  if (rawMaxAssetBytes) {
    maxAssetBytes = typeof rawMaxAssetBytes === "string" ? Number(rawMaxAssetBytes) : Number.NaN;
    if (!Number.isFinite(maxAssetBytes) || maxAssetBytes < 0) {
      throw new Error(`Invalid --max-asset-bytes value: ${rawMaxAssetBytes}`);
    }
  }

  const result = await archivePage(input, {
    archivePath: args["archive-path"],
    id: args.id,
    render: Boolean(args.render),
    static: Boolean(args.static),
    stripArchiveShell: Boolean(args["strip-archive-shell"]),
    stripAds: args["strip-ads"] !== false,
    userAgent: args["user-agent"] || DEFAULT_USER_AGENT,
    maxAssetBytes
  });

  console.log(`Archived: ${result.sourceUrl}`);
  console.log(`Output: ${result.filePath}`);

  if (result.externalAssets.length) {
    console.log(`External asset references remaining: ${result.externalAssets.length}`);
    for (const ref of result.externalAssets.slice(0, 20)) {
      console.log(` ${ref}`);
    }
  } else {
    console.log("External asset references remaining: 0");
  }

  if (result.warnings.length) {
    console.log(`Warnings: ${result.warnings.length}`);
    for (const warning of result.warnings.slice(0, 20)) {
      console.log(` ${warning}`);
    }
  }
}
|
||||
|
||||
// Run the CLI. Any rejection from main() is reported on stderr by message
// only (no stack trace), and the process exit code is set to 1.
const reportFatalError = (error) => {
  console.error(error.message);
  process.exitCode = 1;
};

main().catch(reportFatalError);
|
||||
84
src/server.mjs
Normal file
84
src/server.mjs
Normal file
@@ -0,0 +1,84 @@
|
||||
import http from "node:http";
|
||||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { archivePage, DEFAULT_USER_AGENT, defaultArchivePath } from "./archiver.mjs";
|
||||
|
||||
// Runtime configuration, read once at startup. ARCHIVE_PATH falls back to
// defaultArchivePath() and PORT falls back to 8787 when unset or empty.
const { ARCHIVE_PATH: configuredArchivePath, PORT: configuredPort } = process.env;
const archivePath = configuredArchivePath || defaultArchivePath();
const port = Number(configuredPort || 8787);
|
||||
|
||||
/**
 * HTTP API for the archiver.
 *
 * Routes:
 *  - `GET /health`          -> 200 `{ ok, archivePath }`
 *  - `POST /archive`        -> archives `body.url` and answers 201 with the
 *                              result summary and a `viewUrl`; 400 when the
 *                              body is not valid JSON, is not an object, or
 *                              has no `url`.
 *  - `GET /archives/<file>` -> serves a stored archive as HTML; 400 for a
 *                              malformed or disallowed file name, 404 when
 *                              the file does not exist.
 *  - anything else          -> 404.
 *
 * Any other failure is answered with 500 and the error message.
 */
const server = http.createServer(async (req, res) => {
  try {
    const url = new URL(req.url, `http://${req.headers.host}`);

    if (req.method === "GET" && url.pathname === "/health") {
      return sendJson(res, 200, { ok: true, archivePath });
    }

    if (req.method === "POST" && url.pathname === "/archive") {
      let body;
      try {
        body = await readJson(req);
      } catch (error) {
        // A body that is not JSON is the client's mistake, not a server fault.
        if (error instanceof SyntaxError) {
          return sendJson(res, 400, { error: `Invalid JSON body: ${error.message}` });
        }
        throw error;
      }
      // `null`, numbers and strings are valid JSON but carry no `url`.
      if (body === null || typeof body !== "object" || !body.url) {
        return sendJson(res, 400, { error: "Missing required field: url" });
      }

      const result = await archivePage(body.url, {
        archivePath,
        id: body.id,
        render: Boolean(body.render),
        static: Boolean(body.static),
        stripArchiveShell: Boolean(body.stripArchiveShell),
        stripAds: body.stripAds !== false,
        userAgent: body.userAgent || DEFAULT_USER_AGENT,
        maxAssetBytes: body.maxAssetBytes
      });

      return sendJson(res, 201, {
        id: result.id,
        sourceUrl: result.sourceUrl,
        file: result.filePath,
        externalAssets: result.externalAssets,
        warnings: result.warnings,
        viewUrl: `/archives/${encodeURIComponent(path.basename(result.filePath))}`
      });
    }

    if (req.method === "GET" && url.pathname.startsWith("/archives/")) {
      let file;
      try {
        file = decodeURIComponent(url.pathname.slice("/archives/".length));
      } catch {
        // Malformed percent-escape in the requested name.
        return sendJson(res, 400, { error: "Invalid archive file name" });
      }
      // The allow-list excludes path separators, so the name cannot escape
      // the archive directory.
      if (!/^[a-zA-Z0-9._-]+\.html$/.test(file)) {
        return sendJson(res, 400, { error: "Invalid archive file name" });
      }

      const fullPath = path.join(archivePath, file);
      let html;
      try {
        html = await fs.readFile(fullPath);
      } catch (error) {
        if (error.code === "ENOENT") {
          return sendJson(res, 404, { error: "Archive not found" });
        }
        throw error;
      }

      res.writeHead(200, {
        "content-type": "text/html; charset=utf-8",
        "content-length": html.length
      });
      return res.end(html);
    }

    sendJson(res, 404, { error: "Not found" });
  } catch (error) {
    sendJson(res, 500, { error: error.message });
  }
});
|
||||
|
||||
// Bind to loopback by default. With no host argument Node listens on every
// interface, which contradicts the address logged below and would expose an
// API that can read local files to the whole network. Set HOST (for example
// HOST=0.0.0.0) to opt in to a wider bind.
const host = process.env.HOST || "127.0.0.1";

server.listen(port, host, () => {
  console.log(`Archive API listening on http://${host}:${port}`);
  console.log(`ARCHIVE_PATH=${archivePath}`);
});
|
||||
|
||||
/**
 * Reads a request body to the end and parses it as UTF-8 JSON.
 *
 * @param {AsyncIterable<Buffer>} req - Request stream (any async iterable of
 *   buffers works).
 * @param {number} [maxBytes=1048576] - Largest body accepted, in bytes.
 *   Bounds memory use, because the body is buffered before parsing.
 * @returns {Promise<*>} The parsed value, or `{}` when the body is empty.
 * @throws {SyntaxError} When the body is not valid JSON.
 * @throws {Error} When the body is larger than `maxBytes`.
 */
async function readJson(req, maxBytes = 1024 * 1024) {
  const chunks = [];
  let received = 0;

  for await (const chunk of req) {
    received += chunk.length;
    if (received > maxBytes) {
      throw new Error(`Request body exceeds ${maxBytes} bytes`);
    }
    chunks.push(chunk);
  }

  if (!chunks.length) {
    return {};
  }
  return JSON.parse(Buffer.concat(chunks).toString("utf8"));
}
|
||||
|
||||
/**
 * Sends a value as a pretty-printed (2-space) JSON response.
 *
 * @param {{writeHead: Function, end: Function}} res - Response to write to.
 * @param {number} status - HTTP status code.
 * @param {*} value - Value to serialise with `JSON.stringify`.
 * @returns {void}
 */
function sendJson(res, status, value) {
  const serialized = JSON.stringify(value, null, 2);
  const payload = Buffer.from(serialized);
  const headers = {
    "content-type": "application/json; charset=utf-8",
    // Byte length of the encoded payload, not the character count.
    "content-length": payload.length
  };

  res.writeHead(status, headers);
  res.end(payload);
}
|
||||
Reference in New Issue
Block a user