import { env } from "../env.js";
/** Delay, in milliseconds, after which an in-flight SearXNG fetch is aborted. */
const SEARXNG_TIMEOUT_MS = 12_000;
/** Value sent as the `categories` query parameter on every search request. */
const DEFAULT_SEARXNG_CATEGORIES = "general";
/** Caller-supplied options for a SearXNG search. */
export type SearxngSearchOptions = {
  /** Upper bound on the number of results returned after domain filtering. */
  numResults: number;
  /**
   * When non-empty, only results whose host equals one of these domains (or
   * is a subdomain of one) are kept. Entries may be bare hosts or full URLs.
   */
  includeDomains?: string[];
  /**
   * Results whose host equals one of these domains (or is a subdomain of one)
   * are dropped. Entries may be bare hosts or full URLs.
   */
  excludeDomains?: string[];
};
/** One normalized search hit. Every string field is `null` when the source value is missing or blank. */
export type SearxngSearchResult = {
  /** Result title with whitespace collapsed. */
  title: string | null;
  /** Result URL as reported by SearXNG. */
  url: string | null;
  /** Publication date string, passed through without parsing. */
  publishedDate: string | null;
  /** Result snippet, clipped to 1,400 characters (plus a trailing `...` when clipped). */
  summary: string | null;
  /** The same snippet as `summary`, clipped to 700 characters instead. */
  text: string | null;
  /** Names of the engines that produced this result; empty when none were reported. */
  engines: string[];
};
/** Envelope returned by `searchSearxng`. */
export type SearxngSearchResponse = {
  /** The query string exactly as the caller supplied it (without any `site:` clauses). */
  query: string;
  /** Always `null`; this client does not produce a request identifier. */
  requestId: null;
  /** Normalized, domain-filtered results, truncated to the requested count. */
  results: SearxngSearchResult[];
};
/**
 * Truncates `input` to at most `maxCharacters` UTF-16 code units and appends
 * `...` when anything was cut off.
 *
 * The ellipsis is added on top of the limit, so a clipped result can be up to
 * `maxCharacters + 3` code units long.
 *
 * @param input - Text to clip.
 * @param maxCharacters - Maximum number of code units kept from `input`.
 *   Negative or NaN values are treated as 0.
 * @returns `input` unchanged when it fits, otherwise the clipped prefix plus `...`.
 */
function clipText(input: string, maxCharacters: number): string {
  // `|| 0` turns NaN into 0; a negative limit would otherwise make `slice`
  // count from the end of the string.
  const limit = Math.max(0, Math.floor(maxCharacters)) || 0;
  if (input.length <= limit) return input;

  let end = limit;
  if (end > 0) {
    const last = input.charCodeAt(end - 1);
    const next = input.charCodeAt(end);
    const cutsSurrogatePair = last >= 0xd800 && last <= 0xdbff && next >= 0xdc00 && next <= 0xdfff;
    // Never leave a lone high surrogate at the end of the clipped text.
    if (cutsSurrogatePair) end -= 1;
  }

  return `${input.slice(0, end)}...`;
}
/**
 * Collapses every run of whitespace (spaces, tabs, CR, LF, ...) into a single
 * space and trims both ends.
 *
 * A bare carriage return between two words is treated like any other
 * whitespace, so `"a\rb"` becomes `"a b"` rather than `"ab"`.
 *
 * @param input - Text to normalize.
 * @returns The single-line, single-spaced text.
 */
function compactWhitespace(input: string): string {
  // One pass suffices: any earlier newline-specific cleanup would be erased by
  // this collapse anyway.
  return input.replace(/\s+/g, " ").trim();
}
/**
 * Returns the configured SearXNG base URL, guaranteed to end with `/`.
 *
 * @returns `env.SEARXNG_BASE_URL` with a trailing slash appended if missing.
 * @throws Error when `env.SEARXNG_BASE_URL` is unset or empty.
 */
function requireSearxngBaseUrl(): string {
  const baseUrl = env.SEARXNG_BASE_URL;
  if (!baseUrl) {
    throw new Error("SEARXNG_BASE_URL not set");
  }
  // With a trailing slash, resolving a relative path against this base appends
  // to the base path instead of replacing its last segment.
  return baseUrl.endsWith("/") ? baseUrl : `${baseUrl}/`;
}
/**
 * Reduces a user-supplied domain or URL to a bare, lower-case host name.
 *
 * Scheme, credentials, port, path, query and fragment are discarded, as are a
 * leading `www.` and a trailing FQDN dot.
 *
 * @param input - A host (`Example.com`), host with path, or full URL.
 * @returns The normalized host, or `null` when no usable host can be extracted.
 */
function normalizeDomain(input: string): string | null {
  const trimmed = input.trim().toLowerCase();
  if (!trimmed) return null;

  let host: string;
  try {
    // Bare hosts get a scheme so the URL parser can do the splitting.
    host = new URL(trimmed.includes("://") ? trimmed : `https://${trimmed}`).hostname;
  } catch {
    // The URL parser rejected the input (bad port, illegal host character, ...).
    // Salvage the authority by hand: drop the scheme first so it is never
    // mistaken for the host, then cut at the first path/query/fragment delimiter.
    const authority = trimmed.replace(/^[a-z][a-z0-9+.-]*:\/\//, "").split(/[/?#]/, 1)[0] ?? "";
    host = authority.replace(/:\d*$/, "");
    // Whitespace or a leftover colon means this is not a host name; returning
    // it would inject stray terms into the search query.
    if (/[\s:]/.test(host)) return null;
  }

  return host.replace(/^www\./, "").replace(/\.$/, "") || null;
}
/**
 * Normalizes a list of domains, dropping unusable entries and duplicates.
 *
 * @param input - Raw domain or URL strings; `undefined` is treated as an empty list.
 * @returns Unique normalized hosts in order of first appearance.
 */
function normalizeDomains(input: string[] | undefined): string[] {
  const unique = new Set<string>();
  for (const raw of input ?? []) {
    const domain = normalizeDomain(raw);
    if (domain) unique.add(domain);
  }
  return Array.from(unique);
}
/**
 * Tells whether the host of `urlRaw` is `domain` itself or one of its subdomains.
 *
 * Comparison is case-insensitive and ignores a leading `www.` and a trailing
 * FQDN dot on both sides.
 *
 * @param urlRaw - Absolute URL of a search result, or `null`.
 * @param domain - Host name to match against.
 * @returns `true` on an exact or subdomain match; `false` for a missing or
 *   unparseable URL, or an empty domain.
 */
function hostnameMatchesDomain(urlRaw: string | null, domain: string): boolean {
  if (!urlRaw) return false;

  const canonical = (host: string) => host.toLowerCase().replace(/\.$/, "").replace(/^www\./, "");

  const wanted = canonical(domain.trim());
  // An empty domain would otherwise match any host ending in ".".
  if (!wanted) return false;

  let hostname: string;
  try {
    hostname = canonical(new URL(urlRaw).hostname);
  } catch {
    return false;
  }

  // The leading dot keeps "notexample.com" from matching "example.com".
  return hostname === wanted || hostname.endsWith(`.${wanted}`);
}
/**
 * Applies the include/exclude domain lists from `options` to `results`.
 *
 * A result is kept when it matches at least one include domain (if any are
 * given) and matches no exclude domain. Results without a URL never match, so
 * they are dropped whenever an include list is present.
 *
 * @param results - Results to filter; the array is not modified.
 * @param options - Search options carrying the domain lists.
 * @returns A new array with the surviving results in their original order.
 */
function filterResultsByDomains(results: SearxngSearchResult[], options: SearxngSearchOptions): SearxngSearchResult[] {
  const includeDomains = normalizeDomains(options.includeDomains);
  const excludeDomains = normalizeDomains(options.excludeDomains);
  const matchesAny = (url: string | null, domains: string[]) =>
    domains.some((domain) => hostnameMatchesDomain(url, domain));

  return results.filter((result) => {
    if (includeDomains.length > 0 && !matchesAny(result.url, includeDomains)) return false;
    return !matchesAny(result.url, excludeDomains);
  });
}
/**
 * Appends `site:` / `-site:` operators for the requested domains to `query`.
 *
 * One include domain yields `site:a`; several yield `(site:a OR site:b)`.
 * Each exclude domain yields its own `-site:x` term.
 *
 * @param query - The user's search terms.
 * @param options - Search options carrying the domain lists.
 * @returns The query followed by any non-empty clauses, joined by single spaces.
 */
function buildSearxngQuery(query: string, options: SearxngSearchOptions): string {
  const includeTerms = normalizeDomains(options.includeDomains).map((domain) => `site:${domain}`);
  const excludeTerms = normalizeDomains(options.excludeDomains).map((domain) => `-site:${domain}`);

  // Only an OR-group of two or more terms needs parentheses.
  const includeClause = includeTerms.length > 1 ? `(${includeTerms.join(" OR ")})` : includeTerms.join("");
  const excludeClause = excludeTerms.join(" ");

  return [query, includeClause, excludeClause].filter(Boolean).join(" ");
}
/**
 * Builds the SearXNG `search` endpoint URL for a JSON query.
 *
 * @param query - The user's search terms.
 * @param options - Search options; the domain lists are folded into `q`.
 * @returns The endpoint URL with `q`, `categories`, `language`, `safesearch`
 *   and `format` set.
 * @throws Error when the SearXNG base URL is not configured.
 */
function buildSearchUrl(query: string, options: SearxngSearchOptions): URL {
  const url = new URL("search", requireSearxngBaseUrl());
  const params: Array<[name: string, value: string]> = [
    ["q", buildSearxngQuery(query, options)],
    ["categories", DEFAULT_SEARXNG_CATEGORIES],
    ["language", "auto"],
    ["safesearch", "1"],
    ["format", "json"],
  ];
  for (const [name, value] of params) {
    url.searchParams.set(name, value);
  }
  return url;
}
/**
 * Issues a GET request to SearXNG with a hard deadline.
 *
 * The deadline covers the whole exchange, including the caller's later read
 * of the response body. A timer that is cleared as soon as the headers arrive
 * would let a stalled body hang forever.
 *
 * @param url - Fully built request URL.
 * @param accept - Value for the `Accept` request header.
 * @returns The response once its headers have been received.
 * @throws DOMException named `TimeoutError` when the deadline elapses; any
 *   network error raised by `fetch` is propagated unchanged.
 */
async function fetchSearxng(url: URL, accept: string): Promise<Response> {
  return fetch(url, {
    redirect: "follow",
    // The signal manages its own timer, so there is nothing to clean up here
    // and it stays armed while the body is being consumed.
    signal: AbortSignal.timeout(SEARXNG_TIMEOUT_MS),
    headers: {
      "User-Agent": "SybilBot/1.0 (+https://sybil.local)",
      Accept: accept,
    },
  });
}
/**
 * Coerces an untrusted value to a whitespace-compacted string.
 *
 * @param value - Any value taken from the parsed JSON.
 * @returns The compacted string, or `null` when `value` is not a string or is
 *   blank after compaction.
 */
function stringOrNull(value: unknown): string | null {
  if (typeof value !== "string") return null;
  // `||` is deliberate: an empty string is reported as `null`.
  return compactWhitespace(value) || null;
}
/**
 * Extracts the non-blank strings from an untrusted value.
 *
 * @param value - Any value taken from the parsed JSON.
 * @returns The whitespace-compacted string elements of `value`, in order;
 *   an empty array when `value` is not an array.
 */
function stringArray(value: unknown): string[] {
  if (!Array.isArray(value)) return [];

  const strings: string[] = [];
  for (const item of value) {
    if (typeof item !== "string") continue;
    const compacted = compactWhitespace(item);
    if (compacted) strings.push(compacted);
  }
  return strings;
}
/**
 * Converts one raw entry of the SearXNG `results` array into a
 * `SearxngSearchResult`.
 *
 * The entry is untrusted JSON: every field is type-checked, and anything
 * missing or of the wrong type becomes `null` (or an empty engine list).
 *
 * @param result - A raw result entry; non-object values yield an all-empty result.
 * @returns The normalized result.
 */
function mapJsonResult(result: unknown): SearxngSearchResult {
  // Narrow once so the field reads below are type-checked instead of `any`.
  const raw: Record<string, unknown> =
    typeof result === "object" && result !== null ? (result as Record<string, unknown>) : {};

  // The snippet is read from `content`, falling back to `snippet`.
  const summary = stringOrNull(raw.content) ?? stringOrNull(raw.snippet);
  // `engines` is used when present; otherwise a lone `engine` string is wrapped in a list.
  const engines = raw.engines ?? (typeof raw.engine === "string" ? [raw.engine] : []);

  return {
    title: stringOrNull(raw.title),
    url: stringOrNull(raw.url),
    publishedDate: stringOrNull(raw.publishedDate) ?? stringOrNull(raw.published_date),
    // `summary` and `text` share one source; `text` is the shorter clip.
    summary: summary ? clipText(summary, 1_400) : null,
    text: summary ? clipText(summary, 700) : null,
    engines: stringArray(engines),
  };
}
/**
 * Runs a web search against the configured SearXNG instance and returns
 * normalized, domain-filtered results.
 *
 * @param query - The user's search terms.
 * @param options - Result limit and optional include/exclude domain lists.
 * @returns The original query, a `null` request id, and at most
 *   `options.numResults` results. A negative or NaN limit yields no results.
 * @throws Error when the base URL is not configured, the server answers with
 *   a non-2xx status, or the response is not JSON. Network and timeout errors
 *   from the fetch are propagated unchanged.
 */
export async function searchSearxng(query: string, options: SearxngSearchOptions): Promise<SearxngSearchResponse> {
  const response = await fetchSearxng(buildSearchUrl(query, options), "application/json");

  if (!response.ok) {
    await discardSearxngBody(response);
    throw new Error(`SearXNG JSON search failed with status ${response.status}. Verify search.formats includes json.`);
  }

  const contentType = response.headers.get("content-type")?.toLowerCase() ?? "";
  if (!contentType.includes("application/json")) {
    await discardSearxngBody(response);
    throw new Error(`SearXNG JSON search returned ${contentType || "unknown content type"}.`);
  }

  const data: unknown = await response.json();
  const rawResults = typeof data === "object" && data !== null ? (data as { results?: unknown }).results : undefined;
  const results = Array.isArray(rawResults) ? rawResults.map((item) => mapJsonResult(item)) : [];

  // Clamp the limit: a negative value passed to `slice` would drop results
  // from the end instead of returning none.
  const limit = Math.max(0, Math.trunc(options.numResults)) || 0;
  return { query, requestId: null, results: filterResultsByDomains(results, options).slice(0, limit) };
}

/**
 * Reads and discards a response body so the connection can be released.
 * Read failures are ignored so they cannot replace the more descriptive error
 * the caller is about to throw.
 */
async function discardSearxngBody(response: Response): Promise<void> {
  try {
    await response.arrayBuffer();
  } catch {
    // Best effort only; the caller reports the real failure.
  }
}