import { env } from "../env.js";

/** Upper bound, in milliseconds, for one SearXNG request including reading its body. */
const SEARXNG_TIMEOUT_MS = 12_000;

/** Value sent as the `categories` query parameter on every search. */
const DEFAULT_SEARXNG_CATEGORIES = "general";

/** Caller-supplied knobs for a single search. */
export type SearxngSearchOptions = {
  /** Maximum number of results to return after domain filtering. */
  numResults: number;
  /** When non-empty, only results hosted on one of these domains (or a subdomain) are kept. */
  includeDomains?: string[];
  /** Results hosted on one of these domains (or a subdomain) are dropped. */
  excludeDomains?: string[];
};

/** One normalized search hit. Every string field is whitespace-compacted; empty values become `null`. */
export type SearxngSearchResult = {
  title: string | null;
  url: string | null;
  publishedDate: string | null;
  /** Snippet clipped to 1,400 characters (plus a trailing `...` when clipped). */
  summary: string | null;
  /** Same snippet clipped to 700 characters (plus a trailing `...` when clipped). */
  text: string | null;
  /** Names of the engines reported for this hit; empty when none were reported. */
  engines: string[];
};

/** Result envelope returned by {@link searchSearxng}. */
export type SearxngSearchResponse = {
  /** The query exactly as passed by the caller (without the generated `site:` clauses). */
  query: string;
  /** Always `null` in this implementation. */
  requestId: null;
  results: SearxngSearchResult[];
};

/**
 * Truncates `input` to `maxCharacters` and appends `...` when it was longer.
 * The returned string can therefore be up to three characters longer than `maxCharacters`.
 */
function clipText(input: string, maxCharacters: number): string {
  return input.length <= maxCharacters ? input : `${input.slice(0, maxCharacters)}...`;
}

/**
 * Removes carriage returns, then collapses every whitespace run (newlines included)
 * into a single space and trims both ends.
 */
function compactWhitespace(input: string): string {
  return input
    .replace(/\r/g, "")
    .replace(/[ \t]+\n/g, "\n")
    .replace(/\n{3,}/g, "\n\n")
    // The final collapse also flattens the newlines normalized by the two steps above.
    .replace(/\s+/g, " ")
    .trim();
}

/**
 * Returns the configured SearXNG base URL, guaranteed to end with `/`.
 *
 * @throws Error when `env.SEARXNG_BASE_URL` is not set.
 */
function requireSearxngBaseUrl(): string {
  const baseUrl = env.SEARXNG_BASE_URL;
  if (!baseUrl) {
    throw new Error("SEARXNG_BASE_URL not set");
  }
  // A trailing slash makes `new URL("search", base)` append to the path instead of
  // replacing its last segment.
  return baseUrl.endsWith("/") ? baseUrl : `${baseUrl}/`;
}

/**
 * Reduces a user-supplied domain or URL to a bare lowercase hostname without a leading `www.`.
 *
 * @returns The hostname, or `null` when the input is blank or nothing usable remains.
 */
function normalizeDomain(input: string): string | null {
  const trimmed = input.trim().toLowerCase();
  if (!trimmed) return null;
  try {
    // Inputs without a scheme are parsed as if they were https URLs.
    const parsed = new URL(trimmed.includes("://") ? trimmed : `https://${trimmed}`);
    return parsed.hostname.replace(/^www\./, "");
  } catch {
    // Not parseable as a URL: keep whatever precedes the first path/query/fragment delimiter.
    return trimmed.split(/[/?#]/, 1)[0]?.replace(/^www\./, "") || null;
  }
}

/** Normalizes every entry, drops unusable ones, and de-duplicates while keeping first-seen order. */
function normalizeDomains(input: string[] | undefined): string[] {
  const normalized = (input ?? [])
    .map(normalizeDomain)
    .filter((domain): domain is string => Boolean(domain));
  return Array.from(new Set(normalized));
}

/**
 * Tells whether `urlRaw` is hosted on `domain` or one of its subdomains.
 * A missing or unparseable URL never matches.
 */
function hostnameMatchesDomain(urlRaw: string | null, domain: string): boolean {
  if (!urlRaw) return false;
  try {
    const hostname = new URL(urlRaw).hostname.toLowerCase().replace(/^www\./, "");
    return hostname === domain || hostname.endsWith(`.${domain}`);
  } catch {
    return false;
  }
}

/**
 * Applies the include/exclude domain lists to already-fetched results.
 * This runs in addition to the `site:` clauses placed in the query by `buildSearxngQuery`.
 */
function filterResultsByDomains(
  results: SearxngSearchResult[],
  options: SearxngSearchOptions,
): SearxngSearchResult[] {
  const includeDomains = normalizeDomains(options.includeDomains);
  const excludeDomains = normalizeDomains(options.excludeDomains);
  return results.filter((result) => {
    if (
      includeDomains.length &&
      !includeDomains.some((domain) => hostnameMatchesDomain(result.url, domain))
    ) {
      return false;
    }
    if (excludeDomains.some((domain) => hostnameMatchesDomain(result.url, domain))) {
      return false;
    }
    return true;
  });
}

/**
 * Appends `site:` / `-site:` clauses for the include and exclude lists to `query`.
 * Several include domains are OR-ed inside parentheses; a single one is emitted bare.
 */
function buildSearxngQuery(query: string, options: SearxngSearchOptions): string {
  const includeDomains = normalizeDomains(options.includeDomains);
  const excludeDomains = normalizeDomains(options.excludeDomains);

  let includeClause = "";
  if (includeDomains.length === 1) {
    includeClause = `site:${includeDomains[0]}`;
  } else if (includeDomains.length > 1) {
    includeClause = `(${includeDomains.map((domain) => `site:${domain}`).join(" OR ")})`;
  }
  const excludeClause = excludeDomains.map((domain) => `-site:${domain}`).join(" ");

  // Empty parts are dropped so the joined query never contains doubled spaces.
  return [query, includeClause, excludeClause].filter(Boolean).join(" ");
}

/**
 * Builds the `search` endpoint URL under the configured base URL, requesting JSON output.
 *
 * @throws Error when the base URL is not configured.
 */
function buildSearchUrl(query: string, options: SearxngSearchOptions): URL {
  const url = new URL("search", requireSearxngBaseUrl());
  url.searchParams.set("q", buildSearxngQuery(query, options));
  url.searchParams.set("categories", DEFAULT_SEARXNG_CATEGORIES);
  url.searchParams.set("language", "auto");
  url.searchParams.set("safesearch", "1");
  url.searchParams.set("format", "json");
  return url;
}

/**
 * Performs a GET against `url` and hands the response to `handleResponse`.
 *
 * The abort timer stays armed until `handleResponse` settles, so reading the body is
 * bounded by `SEARXNG_TIMEOUT_MS` as well, not only the wait for response headers.
 *
 * @param url Fully built request URL.
 * @param accept Value for the `Accept` request header.
 * @param handleResponse Consumes the response; its result becomes this function's result.
 * @returns Whatever `handleResponse` resolves to.
 * @throws Whatever `fetch` or `handleResponse` rejects with, including the abort error
 *   raised when the timeout fires.
 */
async function fetchSearxng<T>(
  url: URL,
  accept: string,
  handleResponse: (response: Response) => Promise<T>,
): Promise<T> {
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), SEARXNG_TIMEOUT_MS);
  try {
    const response = await fetch(url, {
      redirect: "follow",
      signal: controller.signal,
      headers: {
        "User-Agent": "SybilBot/1.0 (+https://sybil.local)",
        Accept: accept,
      },
    });
    // Awaited inside the try block so the timer is cleared only after the body was consumed.
    return await handleResponse(response);
  } finally {
    clearTimeout(timeout);
  }
}

/**
 * Reads and discards the response body on a best-effort basis.
 * Failures are ignored so they cannot replace the more descriptive error the caller throws next.
 */
async function discardBody(response: Response): Promise<void> {
  try {
    await response.arrayBuffer();
  } catch {
    // Intentionally ignored: the caller is already on an error path.
  }
}

/** Views an arbitrary JSON value as a property bag; anything that is not an object becomes `{}`. */
function asRecord(value: unknown): Record<string, unknown> {
  return typeof value === "object" && value !== null ? (value as Record<string, unknown>) : {};
}

/** Returns the whitespace-compacted string, or `null` for non-strings and blank strings. */
function stringOrNull(value: unknown): string | null {
  if (typeof value !== "string") return null;
  const normalized = compactWhitespace(value);
  return normalized || null;
}

/** Keeps only the string items of an array, whitespace-compacted, dropping blanks; non-arrays give `[]`. */
function stringArray(value: unknown): string[] {
  if (!Array.isArray(value)) return [];
  return value
    .filter((item): item is string => typeof item === "string")
    .map(compactWhitespace)
    .filter(Boolean);
}

/**
 * Converts one raw entry of the response's `results` array into a {@link SearxngSearchResult}.
 * Tolerates missing fields and non-object entries (every field then falls back to `null` / `[]`).
 */
function mapJsonResult(result: unknown): SearxngSearchResult {
  const fields = asRecord(result);
  const summary = stringOrNull(fields.content) ?? stringOrNull(fields.snippet);
  return {
    title: stringOrNull(fields.title),
    url: stringOrNull(fields.url),
    // Both the camelCase and snake_case spellings are accepted.
    publishedDate: stringOrNull(fields.publishedDate) ?? stringOrNull(fields.published_date),
    summary: summary ? clipText(summary, 1_400) : null,
    text: summary ? clipText(summary, 700) : null,
    // Prefer the `engines` list; fall back to a single `engine` string when that is all there is.
    engines: stringArray(fields.engines ?? (typeof fields.engine === "string" ? [fields.engine] : [])),
  };
}

/**
 * Runs `query` against the configured SearXNG instance and returns normalized results.
 *
 * Domain restrictions are applied twice: as `site:` clauses in the query and as a
 * filter over the returned URLs. The result list is then cut to `options.numResults`.
 *
 * @param query Free-text search query.
 * @param options Result limit and optional include/exclude domain lists.
 * @returns The original query, a `null` request id, and the filtered results.
 * @throws Error when the base URL is not configured, when the HTTP status is not OK, or
 *   when the response is not `application/json`. Network, timeout (abort) and JSON
 *   parsing failures propagate unchanged.
 */
export async function searchSearxng(
  query: string,
  options: SearxngSearchOptions,
): Promise<SearxngSearchResponse> {
  const url = buildSearchUrl(query, options);

  const data = await fetchSearxng(url, "application/json", async (response): Promise<unknown> => {
    if (!response.ok) {
      await discardBody(response);
      throw new Error(
        `SearXNG JSON search failed with status ${response.status}. Verify search.formats includes json.`,
      );
    }

    const contentType = response.headers.get("content-type")?.toLowerCase() ?? "";
    if (!contentType.includes("application/json")) {
      await discardBody(response);
      throw new Error(`SearXNG JSON search returned ${contentType || "unknown content type"}.`);
    }

    return response.json();
  });

  const rawResults = asRecord(data).results;
  const results = Array.isArray(rawResults) ? rawResults.map(mapJsonResult) : [];

  // A negative end index would make `slice` drop items from the end; treat it as "no results".
  const limit = options.numResults < 0 ? 0 : options.numResults;

  return {
    query,
    requestId: null,
    results: filterResultsByDomains(results, options).slice(0, limit),
  };
}