oai responses api, tool call retries
@@ -1,7 +1,7 @@
 # Sybil Server

 Backend API for:

-- LLM multiplexer (OpenAI / Anthropic / xAI (Grok))
+- LLM multiplexer (OpenAI Responses / Anthropic / xAI Chat Completions-compatible Grok)
 - Personal chat database (chats/messages + LLM call log)

 ## Stack
@@ -46,7 +46,7 @@ If `ADMIN_TOKEN` is not set, the server runs in open mode (dev).
 - `EXA_API_KEY`
 - `CHAT_WEB_SEARCH_ENGINE` (`exa` by default, or `searxng` for chat tool calls only)
 - `SEARXNG_BASE_URL` (required when `CHAT_WEB_SEARCH_ENGINE=searxng`; instance must allow `format=json`)
-- `CHAT_MAX_TOOL_ROUNDS` (`8` by default; maximum model/tool result cycles per chat completion)
+- `CHAT_MAX_TOOL_ROUNDS` (`100` by default; maximum model/tool result cycles per chat completion)
 - `CHAT_CODEX_TOOL_ENABLED` (`false` by default; enables the `codex_exec` chat tool for OpenAI/xAI)
 - `CHAT_CODEX_REMOTE_HOST` (required when Codex tool is enabled; SSH host/IP or `user@host`)
 - `CHAT_CODEX_REMOTE_USER` (optional SSH user when host does not include one)
@@ -64,7 +64,7 @@ const EnvSchema = z.object({
   // Chat-mode web_search tool configuration. Search mode remains Exa-only for now.
   CHAT_WEB_SEARCH_ENGINE: ChatWebSearchEngineSchema,
   SEARXNG_BASE_URL: OptionalUrlSchema,
-  CHAT_MAX_TOOL_ROUNDS: defaultedPositiveInt(8),
+  CHAT_MAX_TOOL_ROUNDS: defaultedPositiveInt(100),

   // Optional chat-mode Codex tool. When enabled, the server SSHes into a remote
   // devbox and runs `codex exec` in a persistent scratch directory there.
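`defaultedPositiveInt` is referenced above but defined outside this diff; a minimal sketch of what such a zod helper could look like (an assumption, not code from the commit):

import { z } from "zod";

// Hypothetical helper: coerce an env string to a positive integer,
// falling back to the given default when the variable is unset.
function defaultedPositiveInt(defaultValue: number) {
  return z.coerce.number().int().positive().default(defaultValue);
}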
@@ -9,7 +9,7 @@ import { z } from "zod";
 import { env } from "../env.js";
 import { exaClient } from "../search/exa.js";
 import { searchSearxng } from "../search/searxng.js";
-import { buildOpenAIConversationMessage } from "./message-content.js";
+import { buildOpenAIConversationMessage, buildOpenAIResponsesInputMessage } from "./message-content.js";
 import type { ChatMessage } from "./types.js";

 const MAX_TOOL_ROUNDS = env.CHAT_MAX_TOOL_ROUNDS;
@@ -188,6 +188,17 @@ const CHAT_TOOLS: any[] = [
   ...(env.CHAT_SHELL_TOOL_ENABLED ? [SHELL_EXEC_TOOL] : []),
 ];

+const RESPONSES_CHAT_TOOLS: any[] = CHAT_TOOLS.map((tool) => {
+  if (tool?.type !== "function") return tool;
+  return {
+    type: "function",
+    name: tool.function.name,
+    description: tool.function.description,
+    parameters: tool.function.parameters,
+    strict: false,
+  };
+});
+
 export const CHAT_TOOL_SYSTEM_PROMPT =
   "You can use tools to gather up-to-date web information when needed. " +
   "Use web_search for discovery and recent facts, and fetch_url to read the full content of a specific page. " +
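This map flattens Chat Completions-style tool definitions into the shape the Responses API expects. As an illustration (the `web_search` name and schema here are assumed, not taken from the diff):

// Chat Completions shape, as stored in CHAT_TOOLS:
const chatTool = {
  type: "function",
  function: {
    name: "web_search",
    description: "Search the web for recent information.",
    parameters: { type: "object", properties: { query: { type: "string" } }, required: ["query"] },
  },
};

// Responses shape produced by the map above: the `function` wrapper is flattened away.
const responsesTool = {
  type: "function",
  name: "web_search",
  description: "Search the web for recent information.",
  parameters: { type: "object", properties: { query: { type: "string" } }, required: ["query"] },
  strict: false, // strict schema enforcement left off
};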
@@ -374,6 +385,12 @@ function normalizeIncomingMessages(messages: ChatMessage[]) {
   return [{ role: "system", content: CHAT_TOOL_SYSTEM_PROMPT }, ...normalized];
 }

+function normalizeIncomingResponsesInput(messages: ChatMessage[]) {
+  const normalized = messages.map((message) => buildOpenAIResponsesInputMessage(message));
+
+  return [{ role: "system", content: CHAT_TOOL_SYSTEM_PROMPT }, ...normalized];
+}
+
 async function runExaWebSearchTool(args: WebSearchArgs): Promise<ToolRunOutcome> {
   const exa = exaClient();
   const response = await exa.search(args.query, {
@@ -806,6 +823,52 @@ function mergeUsage(acc: Required<ToolAwareUsage>, usage: any) {
   return true;
 }

+function mergeResponsesUsage(acc: Required<ToolAwareUsage>, usage: any) {
+  if (!usage) return false;
+  acc.inputTokens += usage.input_tokens ?? 0;
+  acc.outputTokens += usage.output_tokens ?? 0;
+  acc.totalTokens += usage.total_tokens ?? 0;
+  return true;
+}
+
+function getResponseOutputItems(response: any) {
+  return Array.isArray(response?.output) ? response.output : [];
+}
+
+function extractResponsesText(response: any, fallback = "") {
+  if (typeof response?.output_text === "string") return response.output_text;
+
+  const parts: string[] = [];
+  for (const item of getResponseOutputItems(response)) {
+    if (item?.type !== "message" || !Array.isArray(item.content)) continue;
+    for (const content of item.content) {
+      if (content?.type === "output_text" && typeof content.text === "string") {
+        parts.push(content.text);
+      } else if (content?.type === "refusal" && typeof content.refusal === "string") {
+        parts.push(content.refusal);
+      }
+    }
+  }
+  return parts.join("") || fallback;
+}
+
+function getResponseFailureMessage(response: any) {
+  if (response?.status !== "failed" && response?.status !== "incomplete") return null;
+  const errorMessage = typeof response?.error?.message === "string" ? response.error.message : null;
+  const incompleteReason = typeof response?.incomplete_details?.reason === "string" ? response.incomplete_details.reason : null;
+  return errorMessage ?? (incompleteReason ? `Response incomplete: ${incompleteReason}` : `Response ${response.status}.`);
+}
+
+function normalizeResponsesToolCalls(outputItems: any[], round: number): NormalizedToolCall[] {
+  return outputItems
+    .filter((item) => item?.type === "function_call")
+    .map((call: any, index: number) => ({
+      id: call.call_id ?? call.id ?? `tool_call_${round}_${index}`,
+      name: call.name ?? "unknown_tool",
+      arguments: call.arguments ?? "{}",
+    }));
+}
+
 type NormalizedToolCall = {
   id: string;
   name: string;
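How these helpers behave on a typical payload, sketched against a hypothetical Responses object (field values invented for illustration):

const response = {
  status: "completed",
  output: [
    { type: "message", content: [{ type: "output_text", text: "Searching now." }] },
    { type: "function_call", call_id: "call_abc", name: "web_search", arguments: '{"query":"node 22 changes"}' },
  ],
  usage: { input_tokens: 120, output_tokens: 18, total_tokens: 138 },
};

extractResponsesText(response);                  // "Searching now." (no output_text field, so it walks output items)
normalizeResponsesToolCalls(response.output, 0); // [{ id: "call_abc", name: "web_search", arguments: '{"query":"node 22 changes"}' }]
getResponseFailureMessage(response);             // null (status is "completed")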
@@ -869,6 +932,75 @@ async function executeToolCallAndBuildEvent(
 }

-export async function runToolAwareOpenAIChat(params: ToolAwareCompletionParams): Promise<ToolAwareCompletionResult> {
+export async function runToolAwareOpenAIChat(params: ToolAwareCompletionParams): Promise<ToolAwareCompletionResult> {
+  const input: any[] = normalizeIncomingResponsesInput(params.messages);
+  const rawResponses: unknown[] = [];
+  const toolEvents: ToolExecutionEvent[] = [];
+  const usageAcc: Required<ToolAwareUsage> = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
+  let sawUsage = false;
+  let totalToolCalls = 0;
+  let danglingToolIntentRetries = 0;
+
+  for (let round = 0; round < MAX_TOOL_ROUNDS; round += 1) {
+    const response = await params.client.responses.create({
+      model: params.model,
+      input,
+      temperature: params.temperature,
+      max_output_tokens: params.maxTokens,
+      tools: RESPONSES_CHAT_TOOLS,
+      tool_choice: "auto",
+      parallel_tool_calls: true,
+      // Tool loops pass response output items back as input; reasoning items need persistence.
+      store: true,
+    } as any);
+    rawResponses.push(response);
+    sawUsage = mergeResponsesUsage(usageAcc, response?.usage) || sawUsage;
+
+    const failureMessage = getResponseFailureMessage(response);
+    if (failureMessage) {
+      throw new Error(failureMessage);
+    }
+
+    const outputItems = getResponseOutputItems(response);
+    const normalizedToolCalls = normalizeResponsesToolCalls(outputItems, round);
+    if (!normalizedToolCalls.length) {
+      const text = extractResponsesText(response);
+      if (danglingToolIntentRetries < MAX_DANGLING_TOOL_INTENT_RETRIES && looksLikeDanglingToolIntent(text)) {
+        danglingToolIntentRetries += 1;
+        appendDanglingToolIntentCorrection(input, text);
+        continue;
+      }
+      return {
+        text,
+        usage: sawUsage ? usageAcc : undefined,
+        raw: { responses: rawResponses, toolCallsUsed: totalToolCalls, api: "responses" },
+        toolEvents,
+      };
+    }
+
+    totalToolCalls += normalizedToolCalls.length;
+    input.push(...outputItems);
+
+    for (const call of normalizedToolCalls) {
+      const { event, toolResult } = await executeToolCallAndBuildEvent(call, params);
+      toolEvents.push(event);
+
+      input.push({
+        type: "function_call_output",
+        call_id: call.id,
+        output: JSON.stringify(toolResult),
+      });
+    }
+  }
+
+  return {
+    text: "I reached the tool-call limit while gathering information. Please narrow the request and try again.",
+    usage: sawUsage ? usageAcc : undefined,
+    raw: { responses: rawResponses, toolCallsUsed: totalToolCalls, toolCallLimitReached: true, api: "responses" },
+    toolEvents,
+  };
+}
+
+export async function runToolAwareChatCompletions(params: ToolAwareCompletionParams): Promise<ToolAwareCompletionResult> {
   const conversation: any[] = normalizeIncomingMessages(params.messages);
   const rawResponses: unknown[] = [];
   const toolEvents: ToolExecutionEvent[] = [];
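A sketch of how a caller might invoke the non-streaming entry point. The parameter shape follows the call sites later in this diff; the client construction and the concrete values are illustrative assumptions:

import OpenAI from "openai";

// Assumes OPENAI_API_KEY is set in the environment.
const result = await runToolAwareOpenAIChat({
  client: new OpenAI(),
  model: "gpt-4.1-mini",
  messages: [{ role: "user", content: "Summarize today's TypeScript news." }] as any, // ChatMessage shape assumed
  temperature: 0.2,
  maxTokens: 1024,
  logContext: { provider: "openai", model: "gpt-4.1-mini", chatId: "chat_123" },
});
console.log(result.text, result.usage);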
@@ -956,6 +1088,109 @@ export async function runToolAwareOpenAIChat(params: ToolAwareCompletionParams):
 }

-export async function* runToolAwareOpenAIChatStream(
+export async function* runToolAwareOpenAIChatStream(
+  params: ToolAwareCompletionParams
+): AsyncGenerator<ToolAwareStreamingEvent> {
+  const input: any[] = normalizeIncomingResponsesInput(params.messages);
+  const rawResponses: unknown[] = [];
+  const toolEvents: ToolExecutionEvent[] = [];
+  const usageAcc: Required<ToolAwareUsage> = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
+  let sawUsage = false;
+  let totalToolCalls = 0;
+  let danglingToolIntentRetries = 0;
+
+  for (let round = 0; round < MAX_TOOL_ROUNDS; round += 1) {
+    const stream = await params.client.responses.create({
+      model: params.model,
+      input,
+      temperature: params.temperature,
+      max_output_tokens: params.maxTokens,
+      tools: RESPONSES_CHAT_TOOLS,
+      tool_choice: "auto",
+      parallel_tool_calls: true,
+      // Tool loops pass response output items back as input; reasoning items need persistence.
+      store: true,
+      stream: true,
+    } as any);
+
+    let roundText = "";
+    let completedResponse: any | null = null;
+    const completedOutputItems: any[] = [];
+
+    for await (const event of stream as any as AsyncIterable<any>) {
+      rawResponses.push(event);
+
+      if (event?.type === "response.output_text.delta" && typeof event.delta === "string") {
+        roundText += event.delta;
+      } else if (event?.type === "response.output_item.done" && event.item) {
+        completedOutputItems[event.output_index ?? completedOutputItems.length] = event.item;
+      } else if (event?.type === "response.completed") {
+        completedResponse = event.response;
+        sawUsage = mergeResponsesUsage(usageAcc, event.response?.usage) || sawUsage;
+      } else if (event?.type === "response.failed" || event?.type === "response.incomplete") {
+        completedResponse = event.response;
+        sawUsage = mergeResponsesUsage(usageAcc, event.response?.usage) || sawUsage;
+      } else if (event?.type === "error") {
+        throw new Error(event.message ?? "OpenAI Responses stream failed.");
+      }
+    }
+
+    const failureMessage = getResponseFailureMessage(completedResponse);
+    if (failureMessage) {
+      throw new Error(failureMessage);
+    }
+
+    const outputItems = getResponseOutputItems(completedResponse);
+    const responseOutputItems = outputItems.length ? outputItems : completedOutputItems.filter(Boolean);
+    const normalizedToolCalls = normalizeResponsesToolCalls(responseOutputItems, round);
+    if (!normalizedToolCalls.length) {
+      const text = extractResponsesText(completedResponse, roundText);
+      if (danglingToolIntentRetries < MAX_DANGLING_TOOL_INTENT_RETRIES && looksLikeDanglingToolIntent(text)) {
+        danglingToolIntentRetries += 1;
+        appendDanglingToolIntentCorrection(input, text);
+        continue;
+      }
+      if (text) {
+        yield { type: "delta", text };
+      }
+      yield {
+        type: "done",
+        result: {
+          text,
+          usage: sawUsage ? usageAcc : undefined,
+          raw: { streamed: true, responses: rawResponses, toolCallsUsed: totalToolCalls, api: "responses" },
+          toolEvents,
+        },
+      };
+      return;
+    }
+
+    totalToolCalls += normalizedToolCalls.length;
+    input.push(...responseOutputItems);
+
+    for (const call of normalizedToolCalls) {
+      const { event, toolResult } = await executeToolCallAndBuildEvent(call, params);
+      toolEvents.push(event);
+      yield { type: "tool_call", event };
+      input.push({
+        type: "function_call_output",
+        call_id: call.id,
+        output: JSON.stringify(toolResult),
+      });
+    }
+  }
+
+  yield {
+    type: "done",
+    result: {
+      text: "I reached the tool-call limit while gathering information. Please narrow the request and try again.",
+      usage: sawUsage ? usageAcc : undefined,
+      raw: { streamed: true, responses: rawResponses, toolCallsUsed: totalToolCalls, toolCallLimitReached: true, api: "responses" },
+      toolEvents,
+    },
+  };
+}
+
+export async function* runToolAwareChatCompletionsStream(
   params: ToolAwareCompletionParams
 ): AsyncGenerator<ToolAwareStreamingEvent> {
   const conversation: any[] = normalizeIncomingMessages(params.messages);
   const rawResponses: unknown[] = [];
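The streaming variant yields tagged events rather than raw SDK chunks. A minimal consumer sketch; the event shapes are taken from the yields above, while the surrounding setup (`params`) is assumed:

for await (const ev of runToolAwareOpenAIChatStream(params)) {
  if (ev.type === "delta") {
    process.stdout.write(ev.text);              // incremental assistant text
  } else if (ev.type === "tool_call") {
    console.log("\n[tool]", ev.event);          // one executed tool call
  } else if (ev.type === "done") {
    console.log("\n[usage]", ev.result.usage);  // aggregated token usage across rounds
  }
}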
@@ -67,6 +67,43 @@ function toOpenAIContent(message: ChatMessage) {
   return parts;
 }

+function toOpenAIResponsesContent(message: ChatMessage) {
+  const imageAttachments = getImageAttachments(message);
+  const textAttachments = getTextAttachments(message);
+  if (!imageAttachments.length && !textAttachments.length) {
+    return message.content;
+  }
+
+  const parts: Array<Record<string, unknown>> = [];
+
+  for (const attachment of imageAttachments) {
+    parts.push({
+      type: "input_image",
+      image_url: attachment.dataUrl,
+      detail: "auto",
+    });
+  }
+
+  const imageSummary = buildImageSummaryText(imageAttachments);
+  if (imageSummary) {
+    parts.push({ type: "input_text", text: imageSummary });
+  }
+
+  for (const attachment of textAttachments) {
+    parts.push({ type: "input_text", text: buildTextAttachmentPrompt(attachment) });
+  }
+
+  if (message.content.trim()) {
+    parts.push({ type: "input_text", text: message.content });
+  }
+
+  if (parts.length === 1 && parts[0]?.type === "input_text" && typeof parts[0].text === "string") {
+    return parts[0].text;
+  }
+
+  return parts;
+}
+
 function parseImageDataUrl(attachment: ChatImageAttachment) {
   const match = attachment.dataUrl.match(/^data:(image\/(?:png|jpeg));base64,([a-z0-9+/=\s]+)$/i);
   if (!match) {
@@ -146,6 +183,21 @@ export function buildOpenAIConversationMessage(message: ChatMessage) {
   return out;
 }

+export function buildOpenAIResponsesInputMessage(message: ChatMessage) {
+  if (message.role === "tool") {
+    const name = message.name?.trim() || "tool";
+    return {
+      role: "user",
+      content: `Tool output (${name}):\n${message.content}`,
+    };
+  }
+
+  return {
+    role: message.role,
+    content: toOpenAIResponsesContent(message),
+  };
+}
+
 const ANTHROPIC_NO_SERVER_TOOLS_PROMPT =
   "This Anthropic backend path does not have server-managed tool calls. Do not claim to run shell commands, Codex tasks, web searches, or fetch URLs. If the user asks for tool execution, explain that they should switch to OpenAI or xAI in this app for tool-enabled chat.";
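Grounded on the tool-role branch above, a quick before/after showing how a stored tool message re-enters a Responses conversation:

// The Responses input format has no "tool" role here, so a logged tool
// message is folded into a plain user turn (the content string is illustrative):
buildOpenAIResponsesInputMessage({ role: "tool", name: "web_search", content: "3 results found" } as any);
// => { role: "user", content: "Tool output (web_search):\n3 results found" }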
@@ -23,13 +23,12 @@ function uniqSorted(models: string[]) {
   return [...new Set(models.map((value) => value.trim()).filter(Boolean))].sort((a, b) => a.localeCompare(b));
 }

-function isLikelyOpenAIChatCompletionsModel(model: string) {
+function isLikelyOpenAIResponsesModel(model: string) {
   const id = model.toLowerCase();
   if (id.includes("embedding") || id.includes("moderation")) return false;
   if (id.includes("audio") || id.includes("realtime") || id.includes("transcribe") || id.includes("tts")) return false;
   if (id.includes("image") || id.includes("dall-e") || id.includes("sora")) return false;
   if (id.includes("search") || id.includes("computer-use")) return false;
-  if (/^gpt-[\d.]+-pro(?:-|$)/.test(id)) return false;
   return /^(gpt-|o\d|chatgpt-)/.test(id);
 }

@@ -52,7 +51,7 @@ async function withTimeout<T>(promise: Promise<T>, timeoutMs: number, label: str
 async function fetchProviderModels(provider: Provider) {
   if (provider === "openai") {
     const page = await openaiClient().models.list();
-    return uniqSorted(page.data.map((model) => model.id).filter(isLikelyOpenAIChatCompletionsModel));
+    return uniqSorted(page.data.map((model) => model.id).filter(isLikelyOpenAIResponsesModel));
   }

   if (provider === "anthropic") {
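Illustrative calls following the rules above. Note the dropped `-pro` exclusion: those models are served via the Responses API, which is presumably why the filter now admits them:

isLikelyOpenAIResponsesModel("gpt-4.1-mini");            // true
isLikelyOpenAIResponsesModel("o3");                      // true  (matches /^o\d/)
isLikelyOpenAIResponsesModel("gpt-5-pro");               // true  (no longer excluded by the -pro rule)
isLikelyOpenAIResponsesModel("text-embedding-3-small");  // false (contains "embedding")
isLikelyOpenAIResponsesModel("gpt-4o-realtime-preview"); // false (contains "realtime")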
@@ -1,7 +1,7 @@
 import { performance } from "node:perf_hooks";
 import { prisma } from "../db.js";
 import { anthropicClient, openaiClient, xaiClient } from "./providers.js";
-import { buildToolLogMessageData, runToolAwareOpenAIChat } from "./chat-tools.js";
+import { buildToolLogMessageData, runToolAwareChatCompletions, runToolAwareOpenAIChat } from "./chat-tools.js";
 import { buildAnthropicConversationMessage, getAnthropicSystemPrompt } from "./message-content.js";
 import type { MultiplexRequest, MultiplexResponse, Provider } from "./types.js";
@@ -48,8 +48,8 @@ export async function runMultiplex(req: MultiplexRequest): Promise<MultiplexResp
   let raw: unknown;
   let toolMessages: ReturnType<typeof buildToolLogMessageData>[] = [];

-  if (req.provider === "openai" || req.provider === "xai") {
-    const client = req.provider === "openai" ? openaiClient() : xaiClient();
+  if (req.provider === "openai") {
+    const client = openaiClient();
     const r = await runToolAwareOpenAIChat({
       client,
       model: req.model,
@@ -66,6 +66,24 @@ export async function runMultiplex(req: MultiplexRequest): Promise<MultiplexResp
     outText = r.text;
     usage = r.usage;
     toolMessages = r.toolEvents.map((event) => buildToolLogMessageData(call.chatId, event));
+  } else if (req.provider === "xai") {
+    const client = xaiClient();
+    const r = await runToolAwareChatCompletions({
+      client,
+      model: req.model,
+      messages: req.messages,
+      temperature: req.temperature,
+      maxTokens: req.maxTokens,
+      logContext: {
+        provider: req.provider,
+        model: req.model,
+        chatId,
+      },
+    });
+    raw = r.raw;
+    outText = r.text;
+    usage = r.usage;
+    toolMessages = r.toolEvents.map((event) => buildToolLogMessageData(call.chatId, event));
   } else if (req.provider === "anthropic") {
     const client = anthropicClient();
@@ -1,7 +1,12 @@
 import { performance } from "node:perf_hooks";
 import { prisma } from "../db.js";
 import { anthropicClient, openaiClient, xaiClient } from "./providers.js";
-import { buildToolLogMessageData, runToolAwareOpenAIChatStream, type ToolExecutionEvent } from "./chat-tools.js";
+import {
+  buildToolLogMessageData,
+  runToolAwareChatCompletionsStream,
+  runToolAwareOpenAIChatStream,
+  type ToolExecutionEvent,
+} from "./chat-tools.js";
 import { buildAnthropicConversationMessage, getAnthropicSystemPrompt } from "./message-content.js";
 import type { MultiplexRequest, Provider } from "./types.js";
@@ -58,18 +63,33 @@ export async function* runMultiplexStream(req: MultiplexRequest): AsyncGenerator
   try {
     if (req.provider === "openai" || req.provider === "xai") {
       const client = req.provider === "openai" ? openaiClient() : xaiClient();
-      for await (const ev of runToolAwareOpenAIChatStream({
-        client,
-        model: req.model,
-        messages: req.messages,
-        temperature: req.temperature,
-        maxTokens: req.maxTokens,
-        logContext: {
-          provider: req.provider,
-          model: req.model,
-          chatId,
-        },
-      })) {
+      const streamEvents =
+        req.provider === "openai"
+          ? runToolAwareOpenAIChatStream({
+              client,
+              model: req.model,
+              messages: req.messages,
+              temperature: req.temperature,
+              maxTokens: req.maxTokens,
+              logContext: {
+                provider: req.provider,
+                model: req.model,
+                chatId,
+              },
+            })
+          : runToolAwareChatCompletionsStream({
+              client,
+              model: req.model,
+              messages: req.messages,
+              temperature: req.temperature,
+              maxTokens: req.maxTokens,
+              logContext: {
+                provider: req.provider,
+                model: req.model,
+                chatId,
+              },
+            });
+      for await (const ev of streamEvents) {
         if (ev.type === "delta") {
           text += ev.text;
           yield { type: "delta", text: ev.text };
@@ -203,16 +203,15 @@ async function generateChatTitle(content: string) {
   const systemPrompt =
     "You create short chat titles. Return exactly one line, maximum 4 words, no quotes, no trailing punctuation.";
   const userPrompt = `User request:\n${content}\n\nTitle:`;
-  const response = await openaiClient().chat.completions.create({
+  const response = await openaiClient().responses.create({
     model: "gpt-4.1-mini",
     temperature: 0,
-    max_completion_tokens: 20,
-    messages: [
-      { role: "system", content: systemPrompt },
-      { role: "user", content: userPrompt },
-    ],
+    max_output_tokens: 20,
+    instructions: systemPrompt,
+    input: userPrompt,
+    store: false,
   });
-  return response.choices?.[0]?.message?.content ?? "";
+  return response.output_text ?? "";
 }

 function normalizeUrlForMatch(input: string | null | undefined)
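The title call moves from Chat Completions to Responses with an essentially one-to-one parameter mapping; summarized here for reference, with both shapes visible in the hunk above:

// Chat Completions                              ->  Responses
// messages: [{ role: "system" }, { role: "user" }] ->  instructions + input
// max_completion_tokens: 20                     ->  max_output_tokens: 20
// response.choices?.[0]?.message?.content       ->  response.output_text
// (new) store: false                            ->  skip persisting this one-off call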