Fix most web_fetch requests getting blocked by sending a real browser User-Agent
This commit is contained in:
26
server/src/browser-fetch-headers.ts
Normal file
26
server/src/browser-fetch-headers.ts
Normal file
@@ -0,0 +1,26 @@
|
||||
/** `User-Agent` string identifying desktop Chrome 130 on macOS. */
export const CHROMIUM_USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36";

/** `Accept-Language` value: US English first, then any English at q=0.9. */
export const BROWSER_ACCEPT_LANGUAGE = "en-US,en;q=0.9";

/**
 * Default `Accept` value for page fetches: HTML and XHTML first, XML and PDF
 * at q=0.9, and anything else at q=0.8.
 */
export const FETCH_URL_ACCEPT =
  "text/html,application/xhtml+xml,application/xml;q=0.9,application/pdf;q=0.9,*/*;q=0.8";
|
||||
|
||||
/**
 * Builds the base header set shared by all browser-like outbound requests.
 *
 * @param accept - Value to send as the `Accept` header, passed through unchanged.
 * @returns A new object with `User-Agent` (from `CHROMIUM_USER_AGENT`),
 *   `Accept`, and `Accept-Language` (from `BROWSER_ACCEPT_LANGUAGE`).
 */
export function buildBrowserLikeRequestHeaders(accept: string): Record<string, string> {
  return {
    "User-Agent": CHROMIUM_USER_AGENT,
    Accept: accept,
    "Accept-Language": BROWSER_ACCEPT_LANGUAGE,
  };
}
|
||||
|
||||
/**
 * Builds headers for a page fetch: the base browser-like headers plus
 * `Upgrade-Insecure-Requests` and the `Sec-Fetch-*` fetch-metadata headers,
 * set to the values for a user-initiated, top-level document navigation.
 *
 * @param accept - Value for the `Accept` header; defaults to `FETCH_URL_ACCEPT`.
 * @returns A new object containing the base headers from
 *   `buildBrowserLikeRequestHeaders(accept)` followed by the navigation headers.
 */
export function buildBrowserLikeNavigationHeaders(accept = FETCH_URL_ACCEPT): Record<string, string> {
  return {
    ...buildBrowserLikeRequestHeaders(accept),
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
  };
}
|
||||
@@ -6,6 +6,7 @@ import { promisify } from "node:util";
|
||||
import { convert as htmlToText } from "html-to-text";
|
||||
import type OpenAI from "openai";
|
||||
import { z } from "zod";
|
||||
import { buildBrowserLikeNavigationHeaders } from "../browser-fetch-headers.js";
|
||||
import { env } from "../env.js";
|
||||
import { exaClient } from "../search/exa.js";
|
||||
import { searchSearxng } from "../search/searxng.js";
|
||||
@@ -570,10 +571,7 @@ async function runFetchUrlTool(input: unknown): Promise<ToolRunOutcome> {
|
||||
response = await fetch(parsed.toString(), {
|
||||
redirect: "follow",
|
||||
signal: controller.signal,
|
||||
headers: {
|
||||
"User-Agent": "SybilBot/1.0 (+https://sybil.local)",
|
||||
Accept: "text/html, text/plain, application/json;q=0.9, */*;q=0.5",
|
||||
},
|
||||
headers: buildBrowserLikeNavigationHeaders(),
|
||||
});
|
||||
} finally {
|
||||
clearTimeout(timeout);
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import { buildBrowserLikeRequestHeaders } from "../browser-fetch-headers.js";
|
||||
import { env } from "../env.js";
|
||||
|
||||
const SEARXNG_TIMEOUT_MS = 12_000;
|
||||
@@ -106,10 +107,7 @@ async function fetchSearxng(url: URL, accept: string) {
|
||||
return await fetch(url, {
|
||||
redirect: "follow",
|
||||
signal: controller.signal,
|
||||
headers: {
|
||||
"User-Agent": "SybilBot/1.0 (+https://sybil.local)",
|
||||
Accept: accept,
|
||||
},
|
||||
headers: buildBrowserLikeRequestHeaders(accept),
|
||||
});
|
||||
} finally {
|
||||
clearTimeout(timeout);
|
||||
|
||||
@@ -2,6 +2,7 @@ import assert from "node:assert/strict";
|
||||
import test from "node:test";
|
||||
import {
|
||||
runPlainChatCompletionsStream,
|
||||
runToolAwareChatCompletions,
|
||||
runToolAwareChatCompletionsStream,
|
||||
runToolAwareOpenAIChatStream,
|
||||
type ToolAwareStreamingEvent,
|
||||
@@ -141,6 +142,79 @@ test("plain Chat Completions stream does not send Sybil-managed tools", async ()
|
||||
assert.equal(events.at(-1)?.type === "done" ? events.at(-1)?.result.text : null, "Hi");
|
||||
});
|
||||
|
||||
// Verifies that the fetch_url tool issues its HTTP request with the full
// browser-like navigation header set.
test("fetch_url sends browser-like navigation headers", async () => {
  // Replace the global fetch with a recorder so no network request is made and
  // the exact arguments of each call can be inspected afterwards.
  const originalFetch = globalThis.fetch;
  const fetchCalls: Array<{ input: RequestInfo | URL; init?: RequestInit }> = [];
  globalThis.fetch = (async (input: RequestInfo | URL, init?: RequestInit) => {
    fetchCalls.push({ input, init });
    return new Response("<!doctype html><title>CPI</title><main>Consumer price index</main>", {
      status: 200,
      headers: { "content-type": "text/html; charset=utf-8" },
    });
  }) as typeof fetch;

  try {
    let requestCount = 0;
    // Minimal stand-in for the chat-completions client: the first call asks
    // for a fetch_url tool call, every later call returns the final text.
    const client = {
      chat: {
        completions: {
          create: async () => {
            requestCount += 1;
            if (requestCount === 1) {
              return {
                choices: [
                  {
                    message: {
                      tool_calls: [
                        {
                          id: "call_1",
                          type: "function",
                          function: {
                            name: "fetch_url",
                            arguments: JSON.stringify({ url: "https://www.bls.gov/news.release/pdf/cpi.pdf" }),
                          },
                        },
                      ],
                    },
                  },
                ],
              };
            }

            return {
              choices: [{ message: { content: "Fetched" } }],
            };
          },
        },
      },
    };

    const result = await runToolAwareChatCompletions({
      // The stub implements only the one method this test drives.
      client: client as any,
      model: "grok-test",
      messages: [{ role: "user", content: "Fetch CPI PDF" }],
    });

    assert.equal(result.text, "Fetched");
    // Exactly one outbound request, to the URL named in the tool call.
    assert.equal(fetchCalls.length, 1);
    assert.equal(String(fetchCalls[0]?.input), "https://www.bls.gov/news.release/pdf/cpi.pdf");
    // Literal values are spelled out here instead of using the shared
    // constants, so the comparison is against fixed strings.
    assert.deepEqual(fetchCalls[0]?.init?.headers, {
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
      Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,application/pdf;q=0.9,*/*;q=0.8",
      "Accept-Language": "en-US,en;q=0.9",
      "Upgrade-Insecure-Requests": "1",
      "Sec-Fetch-Dest": "document",
      "Sec-Fetch-Mode": "navigate",
      "Sec-Fetch-Site": "none",
      "Sec-Fetch-User": "?1",
    });
    assert.equal(result.toolEvents[0]?.status, "completed");
  } finally {
    // Always restore the real fetch so later tests are unaffected.
    globalThis.fetch = originalFetch;
  }
});
|
||||
|
||||
test("OpenAI-compatible Chat Completions stream emits initiated and terminal tool call updates", async () => {
|
||||
let requestCount = 0;
|
||||
const client = {
|
||||
|
||||
Reference in New Issue
Block a user