oai responses api, tool call retries

2026-05-02 21:44:32 -07:00
parent 8d6c069a33
commit 015253c0af
11 changed files with 369 additions and 40 deletions

View File

@@ -15,7 +15,7 @@ services:
       EXA_API_KEY: ${EXA_API_KEY:-}
       CHAT_WEB_SEARCH_ENGINE: ${CHAT_WEB_SEARCH_ENGINE:-exa}
       SEARXNG_BASE_URL: ${SEARXNG_BASE_URL:-}
-      CHAT_MAX_TOOL_ROUNDS: ${CHAT_MAX_TOOL_ROUNDS:-8}
+      CHAT_MAX_TOOL_ROUNDS: ${CHAT_MAX_TOOL_ROUNDS:-100}
       CHAT_CODEX_TOOL_ENABLED: ${CHAT_CODEX_TOOL_ENABLED:-false}
       CHAT_CODEX_REMOTE_HOST: ${CHAT_CODEX_REMOTE_HOST:-}
       CHAT_CODEX_REMOTE_USER: ${CHAT_CODEX_REMOTE_USER:-}

View File

@@ -37,7 +37,7 @@ Chat upload limits:
   }
 }
 ```
-- OpenAI model lists are filtered to models that are expected to work with the backend's current Chat Completions implementation.
+- OpenAI model lists are filtered to models that are expected to work with the backend's Responses API implementation.
 
 ## Chats
@@ -168,8 +168,11 @@ Behavior notes:
 - Attachments are optional and currently apply to `user` messages. Persisted chat history stores them under `message.metadata.attachments`.
 - Images are forwarded inline to providers as multimodal image parts. Use PNG or JPEG for cross-provider compatibility.
 - Text files are forwarded as explicit text blocks rather than provider-managed file references. Large text attachments should already be truncated client-side before submission.
-- For `openai` and `xai`, backend enables tool use during chat completion with an internal system instruction.
-- For `openai` and `xai`, image attachments are sent as chat-completions content parts alongside text.
+- For `openai`, backend calls OpenAI's Responses API and enables internal tool use with an internal system instruction.
+- For `xai`, backend calls xAI's OpenAI-compatible Chat Completions API and enables internal tool use with the same internal system instruction.
+- For `openai`, image attachments are sent as Responses `input_image` items and text attachments are sent as `input_text` items.
+- For `xai`, image attachments are sent as Chat Completions content parts alongside text.
+- For `openai`, Responses calls that can enter the server-managed tool loop use `store: true` so reasoning and function-call items can be passed between tool rounds.
 - For `anthropic`, image attachments are sent as Messages API `image` blocks using base64 source data; text attachments are added as `text` blocks.
 - Available tool calls for chat: `web_search` and `fetch_url`. When `CHAT_CODEX_TOOL_ENABLED=true`, `codex_exec` is also available. When `CHAT_SHELL_TOOL_ENABLED=true`, `shell_exec` is also available.
 - `web_search` returns ranked results with per-result summaries/snippets. Its backend engine is selected by `CHAT_WEB_SEARCH_ENGINE` (`exa` default, or `searxng` with `SEARXNG_BASE_URL` set). SearXNG mode requires the instance to allow `format=json`.
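The `openai` attachment bullets above map onto a Responses API request like this minimal sketch (the model name, data URL, and prompt text are illustrative, not from this commit):

```ts
import OpenAI from "openai";

const client = new OpenAI();

// Hypothetical request mirroring the documented behavior: images become
// `input_image` items and text attachments become `input_text` items.
const response = await client.responses.create({
  model: "gpt-4.1-mini", // illustrative
  input: [
    {
      role: "user",
      content: [
        { type: "input_image", image_url: "data:image/png;base64,iVBORw0...", detail: "auto" },
        { type: "input_text", text: "notes.txt:\nrelease checklist" },
        { type: "input_text", text: "Summarize the screenshot and the notes." },
      ],
    },
  ],
  store: true, // per the tool-loop bullet above
});
console.log(response.output_text);
```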
@@ -177,7 +180,7 @@ Behavior notes:
 - `codex_exec` delegates coding, shell, repository inspection, and other complex software tasks to a persistent remote Codex CLI workspace over SSH. The server runs `codex exec --skip-git-repo-check <non-interactive wrapped prompt>` on the configured devbox inside `CHAT_CODEX_REMOTE_WORKDIR`, with SSH stdin closed.
 - `shell_exec` runs arbitrary non-interactive shell commands on the same configured devbox, starting in `CHAT_CODEX_REMOTE_WORKDIR`. It uses `bash -lc` when bash exists, otherwise `sh -lc`, closes SSH stdin, and does not run inside the Sybil server container.
 - Devbox tool configuration:
-  - `CHAT_MAX_TOOL_ROUNDS=8` (optional; maximum model/tool result cycles before the backend returns a limit message)
+  - `CHAT_MAX_TOOL_ROUNDS=100` (optional; maximum model/tool result cycles before the backend returns a limit message)
   - `CHAT_CODEX_TOOL_ENABLED=true`
   - `CHAT_SHELL_TOOL_ENABLED=true`
   - `CHAT_CODEX_REMOTE_HOST=<host-or-ip>` (required when enabled)

View File

@@ -127,19 +127,22 @@ Event order:
 ## Provider Streaming Behavior
-- `openai`/`xai`: backend may execute internal tool calls (`web_search`, `fetch_url`, optional `codex_exec`, and optional `shell_exec`) before producing final text.
-- `openai`: image attachments are sent as chat-completions content parts; text attachments are inlined as text parts.
-- `xai`: same attachment behavior as OpenAI.
+- `openai`: backend uses OpenAI's Responses API and may execute internal function tool calls (`web_search`, `fetch_url`, optional `codex_exec`, and optional `shell_exec`) before producing final text.
+- `xai`: backend uses xAI's OpenAI-compatible Chat Completions API and may execute the same internal tool calls before producing final text.
+- `openai`: image attachments are sent as Responses `input_image` items; text attachments are sent as `input_text` items.
+- `xai`: image attachments are sent as Chat Completions content parts; text attachments are inlined as text parts.
+- `openai`: Responses calls that can enter the server-managed tool loop use `store: true` so reasoning and function-call items can be passed between tool rounds.
 - `anthropic`: streamed via event stream; emits `delta` from `content_block_delta` with `text_delta`. Image attachments are sent as base64 `image` blocks and text attachments are appended as `text` blocks.
 - `web_search` uses `CHAT_WEB_SEARCH_ENGINE` (`exa` default, or `searxng` with `SEARXNG_BASE_URL` set). SearXNG mode requires the instance to allow `format=json`. This only affects chat-mode tool calls, not search-mode endpoints.
 - `codex_exec` is available only when `CHAT_CODEX_TOOL_ENABLED=true`. It SSHes to `CHAT_CODEX_REMOTE_HOST`, creates/uses `CHAT_CODEX_REMOTE_WORKDIR`, and runs `codex exec --skip-git-repo-check <non-interactive wrapped prompt>` there with SSH stdin closed. Prefer `CHAT_CODEX_SSH_KEY_PATH` with a read-only mounted private key; `CHAT_CODEX_SSH_PRIVATE_KEY_B64` is also supported.
 - `shell_exec` is available only when `CHAT_SHELL_TOOL_ENABLED=true`. It uses the same devbox SSH configuration, starts in `CHAT_CODEX_REMOTE_WORKDIR`, and runs non-interactive shell commands there with SSH stdin closed, not inside the Sybil server container.
-- `CHAT_MAX_TOOL_ROUNDS` controls how many model/tool result cycles may occur before the backend returns a tool-call limit message; default is 8.
+- `CHAT_MAX_TOOL_ROUNDS` controls how many model/tool result cycles may occur before the backend returns a tool-call limit message; default is 100.
 
 Tool-enabled streaming notes (`openai`/`xai`):
 - Stream still emits standard `meta`, `delta`, `done|error` events.
 - Stream may emit `tool_call` events while tool calls are executed.
-- `delta` events stream incrementally as text is generated.
+- `delta` events carry assistant text. The backend may buffer model-native text briefly while determining whether a provider round contains tool calls.
+- OpenAI Responses stream events are normalized by the backend into this SSE contract; clients do not consume OpenAI's raw Responses stream event names.
 
 ## Persistence + Consistency Model
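A rough sketch of the normalization described in the hunk above, assuming only the backend's own SSE event names (`delta`, `tool_call`, `done|error`); the real implementation lives in `chat-tools.ts` and also buffers text per round, as noted:

```ts
// Maps a raw OpenAI Responses stream event onto the backend's SSE contract.
// Returns null for events that are handled internally and never reach clients.
type SseEvent = { type: "delta"; text: string } | { type: "error"; message: string } | null;

function normalizeResponsesEvent(event: any): SseEvent {
  switch (event?.type) {
    case "response.output_text.delta":
      return { type: "delta", text: event.delta };
    case "error":
      return { type: "error", message: event.message ?? "stream failed" };
    default:
      // response.output_item.done, response.completed, response.failed, ...
      return null;
  }
}
```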

View File

@@ -1,7 +1,7 @@
# Sybil Server # Sybil Server
Backend API for: Backend API for:
- LLM multiplexer (OpenAI / Anthropic / xAI (Grok)) - LLM multiplexer (OpenAI Responses / Anthropic / xAI Chat Completions-compatible Grok)
- Personal chat database (chats/messages + LLM call log) - Personal chat database (chats/messages + LLM call log)
## Stack ## Stack
@@ -46,7 +46,7 @@ If `ADMIN_TOKEN` is not set, the server runs in open mode (dev).
 - `EXA_API_KEY`
 - `CHAT_WEB_SEARCH_ENGINE` (`exa` by default, or `searxng` for chat tool calls only)
 - `SEARXNG_BASE_URL` (required when `CHAT_WEB_SEARCH_ENGINE=searxng`; instance must allow `format=json`)
-- `CHAT_MAX_TOOL_ROUNDS` (`8` by default; maximum model/tool result cycles per chat completion)
+- `CHAT_MAX_TOOL_ROUNDS` (`100` by default; maximum model/tool result cycles per chat completion)
 - `CHAT_CODEX_TOOL_ENABLED` (`false` by default; enables the `codex_exec` chat tool for OpenAI/xAI)
 - `CHAT_CODEX_REMOTE_HOST` (required when Codex tool is enabled; SSH host/IP or `user@host`)
 - `CHAT_CODEX_REMOTE_USER` (optional SSH user when host does not include one)

View File

@@ -64,7 +64,7 @@ const EnvSchema = z.object({
   // Chat-mode web_search tool configuration. Search mode remains Exa-only for now.
   CHAT_WEB_SEARCH_ENGINE: ChatWebSearchEngineSchema,
   SEARXNG_BASE_URL: OptionalUrlSchema,
-  CHAT_MAX_TOOL_ROUNDS: defaultedPositiveInt(8),
+  CHAT_MAX_TOOL_ROUNDS: defaultedPositiveInt(100),
   // Optional chat-mode Codex tool. When enabled, the server SSHes into a remote
   // devbox and runs `codex exec` in a persistent scratch directory there.
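`defaultedPositiveInt` is defined elsewhere in this file; a plausible zod-based sketch consistent with how it is used here (the exact implementation is an assumption):

```ts
import { z } from "zod";

// Assumed helper: coerce an env string to an integer, require it to be
// positive, and fall back to the given default when the variable is unset.
function defaultedPositiveInt(defaultValue: number) {
  return z.coerce.number().int().positive().default(defaultValue);
}

defaultedPositiveInt(100).parse(undefined); // 100 (default applies)
defaultedPositiveInt(100).parse("8");       // 8   (string coerced to int)
```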

View File

@@ -9,7 +9,7 @@ import { z } from "zod";
 import { env } from "../env.js";
 import { exaClient } from "../search/exa.js";
 import { searchSearxng } from "../search/searxng.js";
-import { buildOpenAIConversationMessage } from "./message-content.js";
+import { buildOpenAIConversationMessage, buildOpenAIResponsesInputMessage } from "./message-content.js";
 import type { ChatMessage } from "./types.js";
 
 const MAX_TOOL_ROUNDS = env.CHAT_MAX_TOOL_ROUNDS;
@@ -188,6 +188,17 @@ const CHAT_TOOLS: any[] = [
   ...(env.CHAT_SHELL_TOOL_ENABLED ? [SHELL_EXEC_TOOL] : []),
 ];
 
+const RESPONSES_CHAT_TOOLS: any[] = CHAT_TOOLS.map((tool) => {
+  if (tool?.type !== "function") return tool;
+  return {
+    type: "function",
+    name: tool.function.name,
+    description: tool.function.description,
+    parameters: tool.function.parameters,
+    strict: false,
+  };
+});
+
 export const CHAT_TOOL_SYSTEM_PROMPT =
   "You can use tools to gather up-to-date web information when needed. " +
   "Use web_search for discovery and recent facts, and fetch_url to read the full content of a specific page. " +
@@ -374,6 +385,12 @@ function normalizeIncomingMessages(messages: ChatMessage[]) {
   return [{ role: "system", content: CHAT_TOOL_SYSTEM_PROMPT }, ...normalized];
 }
 
+function normalizeIncomingResponsesInput(messages: ChatMessage[]) {
+  const normalized = messages.map((message) => buildOpenAIResponsesInputMessage(message));
+  return [{ role: "system", content: CHAT_TOOL_SYSTEM_PROMPT }, ...normalized];
+}
+
 async function runExaWebSearchTool(args: WebSearchArgs): Promise<ToolRunOutcome> {
   const exa = exaClient();
   const response = await exa.search(args.query, {
@@ -806,6 +823,52 @@ function mergeUsage(acc: Required<ToolAwareUsage>, usage: any) {
   return true;
 }
 
+function mergeResponsesUsage(acc: Required<ToolAwareUsage>, usage: any) {
+  if (!usage) return false;
+  acc.inputTokens += usage.input_tokens ?? 0;
+  acc.outputTokens += usage.output_tokens ?? 0;
+  acc.totalTokens += usage.total_tokens ?? 0;
+  return true;
+}
+
+function getResponseOutputItems(response: any) {
+  return Array.isArray(response?.output) ? response.output : [];
+}
+
+function extractResponsesText(response: any, fallback = "") {
+  if (typeof response?.output_text === "string") return response.output_text;
+  const parts: string[] = [];
+  for (const item of getResponseOutputItems(response)) {
+    if (item?.type !== "message" || !Array.isArray(item.content)) continue;
+    for (const content of item.content) {
+      if (content?.type === "output_text" && typeof content.text === "string") {
+        parts.push(content.text);
+      } else if (content?.type === "refusal" && typeof content.refusal === "string") {
+        parts.push(content.refusal);
+      }
+    }
+  }
+  return parts.join("") || fallback;
+}
+
+function getResponseFailureMessage(response: any) {
+  if (response?.status !== "failed" && response?.status !== "incomplete") return null;
+  const errorMessage = typeof response?.error?.message === "string" ? response.error.message : null;
+  const incompleteReason = typeof response?.incomplete_details?.reason === "string" ? response.incomplete_details.reason : null;
+  return errorMessage ?? (incompleteReason ? `Response incomplete: ${incompleteReason}` : `Response ${response.status}.`);
+}
+
+function normalizeResponsesToolCalls(outputItems: any[], round: number): NormalizedToolCall[] {
+  return outputItems
+    .filter((item) => item?.type === "function_call")
+    .map((call: any, index: number) => ({
+      id: call.call_id ?? call.id ?? `tool_call_${round}_${index}`,
+      name: call.name ?? "unknown_tool",
+      arguments: call.arguments ?? "{}",
+    }));
+}
+
 type NormalizedToolCall = {
   id: string;
   name: string;
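A worked example for `extractResponsesText`, using an assumed response shape without the `output_text` convenience field:

```ts
const sample = {
  status: "completed",
  output: [
    { type: "reasoning", id: "rs_1", summary: [] },
    {
      type: "message",
      content: [
        { type: "output_text", text: "Hello" },
        { type: "output_text", text: " world" },
      ],
    },
  ],
};

extractResponsesText(sample); // "Hello world" — walks message items, concatenating output_text/refusal parts
```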
@@ -869,6 +932,75 @@ async function executeToolCallAndBuildEvent(
 }
 
 export async function runToolAwareOpenAIChat(params: ToolAwareCompletionParams): Promise<ToolAwareCompletionResult> {
+  const input: any[] = normalizeIncomingResponsesInput(params.messages);
+  const rawResponses: unknown[] = [];
+  const toolEvents: ToolExecutionEvent[] = [];
+  const usageAcc: Required<ToolAwareUsage> = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
+  let sawUsage = false;
+  let totalToolCalls = 0;
+  let danglingToolIntentRetries = 0;
+
+  for (let round = 0; round < MAX_TOOL_ROUNDS; round += 1) {
+    const response = await params.client.responses.create({
+      model: params.model,
+      input,
+      temperature: params.temperature,
+      max_output_tokens: params.maxTokens,
+      tools: RESPONSES_CHAT_TOOLS,
+      tool_choice: "auto",
+      parallel_tool_calls: true,
+      // Tool loops pass response output items back as input; reasoning items need persistence.
+      store: true,
+    } as any);
+    rawResponses.push(response);
+    sawUsage = mergeResponsesUsage(usageAcc, response?.usage) || sawUsage;
+
+    const failureMessage = getResponseFailureMessage(response);
+    if (failureMessage) {
+      throw new Error(failureMessage);
+    }
+
+    const outputItems = getResponseOutputItems(response);
+    const normalizedToolCalls = normalizeResponsesToolCalls(outputItems, round);
+    if (!normalizedToolCalls.length) {
+      const text = extractResponsesText(response);
+      if (danglingToolIntentRetries < MAX_DANGLING_TOOL_INTENT_RETRIES && looksLikeDanglingToolIntent(text)) {
+        danglingToolIntentRetries += 1;
+        appendDanglingToolIntentCorrection(input, text);
+        continue;
+      }
+      return {
+        text,
+        usage: sawUsage ? usageAcc : undefined,
+        raw: { responses: rawResponses, toolCallsUsed: totalToolCalls, api: "responses" },
+        toolEvents,
+      };
+    }
+
+    totalToolCalls += normalizedToolCalls.length;
+    input.push(...outputItems);
+    for (const call of normalizedToolCalls) {
+      const { event, toolResult } = await executeToolCallAndBuildEvent(call, params);
+      toolEvents.push(event);
+      input.push({
+        type: "function_call_output",
+        call_id: call.id,
+        output: JSON.stringify(toolResult),
+      });
+    }
+  }
+
+  return {
+    text: "I reached the tool-call limit while gathering information. Please narrow the request and try again.",
+    usage: sawUsage ? usageAcc : undefined,
+    raw: { responses: rawResponses, toolCallsUsed: totalToolCalls, toolCallLimitReached: true, api: "responses" },
+    toolEvents,
+  };
+}
+
+export async function runToolAwareChatCompletions(params: ToolAwareCompletionParams): Promise<ToolAwareCompletionResult> {
   const conversation: any[] = normalizeIncomingMessages(params.messages);
   const rawResponses: unknown[] = [];
   const toolEvents: ToolExecutionEvent[] = [];
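To make the loop concrete, an assumed snapshot of `input` after one tool round (IDs, arguments, and the reasoning item are invented; echoing reasoning items back is why the call sets `store: true`):

```ts
const inputAfterRoundOne = [
  { role: "system", content: CHAT_TOOL_SYSTEM_PROMPT },
  { role: "user", content: "What shipped in the latest Node.js release?" },
  // Round-1 output items pushed back verbatim:
  { type: "reasoning", id: "rs_1", summary: [] },
  { type: "function_call", call_id: "call_1", name: "web_search", arguments: '{"query":"latest Node.js release"}' },
  // Appended by the loop after executing the tool:
  { type: "function_call_output", call_id: "call_1", output: '{"results":["..."]}' },
];
```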
@@ -956,6 +1088,109 @@ export async function runToolAwareOpenAIChat(params: ToolAwareCompletionParams):
 
 export async function* runToolAwareOpenAIChatStream(
   params: ToolAwareCompletionParams
+): AsyncGenerator<ToolAwareStreamingEvent> {
+  const input: any[] = normalizeIncomingResponsesInput(params.messages);
+  const rawResponses: unknown[] = [];
+  const toolEvents: ToolExecutionEvent[] = [];
+  const usageAcc: Required<ToolAwareUsage> = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
+  let sawUsage = false;
+  let totalToolCalls = 0;
+  let danglingToolIntentRetries = 0;
+
+  for (let round = 0; round < MAX_TOOL_ROUNDS; round += 1) {
+    const stream = await params.client.responses.create({
+      model: params.model,
+      input,
+      temperature: params.temperature,
+      max_output_tokens: params.maxTokens,
+      tools: RESPONSES_CHAT_TOOLS,
+      tool_choice: "auto",
+      parallel_tool_calls: true,
+      // Tool loops pass response output items back as input; reasoning items need persistence.
+      store: true,
+      stream: true,
+    } as any);
+
+    let roundText = "";
+    let completedResponse: any | null = null;
+    const completedOutputItems: any[] = [];
+    for await (const event of stream as any as AsyncIterable<any>) {
+      rawResponses.push(event);
+      if (event?.type === "response.output_text.delta" && typeof event.delta === "string") {
+        roundText += event.delta;
+      } else if (event?.type === "response.output_item.done" && event.item) {
+        completedOutputItems[event.output_index ?? completedOutputItems.length] = event.item;
+      } else if (event?.type === "response.completed") {
+        completedResponse = event.response;
+        sawUsage = mergeResponsesUsage(usageAcc, event.response?.usage) || sawUsage;
+      } else if (event?.type === "response.failed" || event?.type === "response.incomplete") {
+        completedResponse = event.response;
+        sawUsage = mergeResponsesUsage(usageAcc, event.response?.usage) || sawUsage;
+      } else if (event?.type === "error") {
+        throw new Error(event.message ?? "OpenAI Responses stream failed.");
+      }
+    }
+
+    const failureMessage = getResponseFailureMessage(completedResponse);
+    if (failureMessage) {
+      throw new Error(failureMessage);
+    }
+
+    const outputItems = getResponseOutputItems(completedResponse);
+    const responseOutputItems = outputItems.length ? outputItems : completedOutputItems.filter(Boolean);
+    const normalizedToolCalls = normalizeResponsesToolCalls(responseOutputItems, round);
+    if (!normalizedToolCalls.length) {
+      const text = extractResponsesText(completedResponse, roundText);
+      if (danglingToolIntentRetries < MAX_DANGLING_TOOL_INTENT_RETRIES && looksLikeDanglingToolIntent(text)) {
+        danglingToolIntentRetries += 1;
+        appendDanglingToolIntentCorrection(input, text);
+        continue;
+      }
+      if (text) {
+        yield { type: "delta", text };
+      }
+      yield {
+        type: "done",
+        result: {
+          text,
+          usage: sawUsage ? usageAcc : undefined,
+          raw: { streamed: true, responses: rawResponses, toolCallsUsed: totalToolCalls, api: "responses" },
+          toolEvents,
+        },
+      };
+      return;
+    }
+
+    totalToolCalls += normalizedToolCalls.length;
+    input.push(...responseOutputItems);
+    for (const call of normalizedToolCalls) {
+      const { event, toolResult } = await executeToolCallAndBuildEvent(call, params);
+      toolEvents.push(event);
+      yield { type: "tool_call", event };
+      input.push({
+        type: "function_call_output",
+        call_id: call.id,
+        output: JSON.stringify(toolResult),
+      });
+    }
+  }
+
+  yield {
+    type: "done",
+    result: {
+      text: "I reached the tool-call limit while gathering information. Please narrow the request and try again.",
+      usage: sawUsage ? usageAcc : undefined,
+      raw: { streamed: true, responses: rawResponses, toolCallsUsed: totalToolCalls, toolCallLimitReached: true, api: "responses" },
+      toolEvents,
+    },
+  };
+}
+
+export async function* runToolAwareChatCompletionsStream(
+  params: ToolAwareCompletionParams
 ): AsyncGenerator<ToolAwareStreamingEvent> {
   const conversation: any[] = normalizeIncomingMessages(params.messages);
   const rawResponses: unknown[] = [];
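A hypothetical consumer of the streaming generator above; `params` is assumed to be a valid `ToolAwareCompletionParams`:

```ts
for await (const ev of runToolAwareOpenAIChatStream(params)) {
  if (ev.type === "delta") {
    process.stdout.write(ev.text); // assistant text, possibly buffered per round
  } else if (ev.type === "tool_call") {
    console.log("\n[tool_call]", ev.event); // ToolExecutionEvent, defined elsewhere in this file
  } else if (ev.type === "done") {
    console.log("\n[usage]", ev.result.usage);
  }
}
```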

View File

@@ -67,6 +67,43 @@ function toOpenAIContent(message: ChatMessage) {
   return parts;
 }
 
+function toOpenAIResponsesContent(message: ChatMessage) {
+  const imageAttachments = getImageAttachments(message);
+  const textAttachments = getTextAttachments(message);
+  if (!imageAttachments.length && !textAttachments.length) {
+    return message.content;
+  }
+
+  const parts: Array<Record<string, unknown>> = [];
+  for (const attachment of imageAttachments) {
+    parts.push({
+      type: "input_image",
+      image_url: attachment.dataUrl,
+      detail: "auto",
+    });
+  }
+
+  const imageSummary = buildImageSummaryText(imageAttachments);
+  if (imageSummary) {
+    parts.push({ type: "input_text", text: imageSummary });
+  }
+
+  for (const attachment of textAttachments) {
+    parts.push({ type: "input_text", text: buildTextAttachmentPrompt(attachment) });
+  }
+
+  if (message.content.trim()) {
+    parts.push({ type: "input_text", text: message.content });
+  }
+
+  if (parts.length === 1 && parts[0]?.type === "input_text" && typeof parts[0].text === "string") {
+    return parts[0].text;
+  }
+
+  return parts;
+}
+
 function parseImageDataUrl(attachment: ChatImageAttachment) {
   const match = attachment.dataUrl.match(/^data:(image\/(?:png|jpeg));base64,([a-z0-9+/=\s]+)$/i);
   if (!match) {
@@ -146,6 +183,21 @@ export function buildOpenAIConversationMessage(message: ChatMessage) {
   return out;
 }
 
+export function buildOpenAIResponsesInputMessage(message: ChatMessage) {
+  if (message.role === "tool") {
+    const name = message.name?.trim() || "tool";
+    return {
+      role: "user",
+      content: `Tool output (${name}):\n${message.content}`,
+    };
+  }
+  return {
+    role: message.role,
+    content: toOpenAIResponsesContent(message),
+  };
+}
+
 const ANTHROPIC_NO_SERVER_TOOLS_PROMPT =
   "This Anthropic backend path does not have server-managed tool calls. Do not claim to run shell commands, Codex tasks, web searches, or fetch URLs. If the user asks for tool execution, explain that they should switch to OpenAI or xAI in this app for tool-enabled chat.";

View File

@@ -23,13 +23,12 @@ function uniqSorted(models: string[]) {
   return [...new Set(models.map((value) => value.trim()).filter(Boolean))].sort((a, b) => a.localeCompare(b));
 }
 
-function isLikelyOpenAIChatCompletionsModel(model: string) {
+function isLikelyOpenAIResponsesModel(model: string) {
   const id = model.toLowerCase();
   if (id.includes("embedding") || id.includes("moderation")) return false;
   if (id.includes("audio") || id.includes("realtime") || id.includes("transcribe") || id.includes("tts")) return false;
   if (id.includes("image") || id.includes("dall-e") || id.includes("sora")) return false;
   if (id.includes("search") || id.includes("computer-use")) return false;
-  if (/^gpt-[\d.]+-pro(?:-|$)/.test(id)) return false;
   return /^(gpt-|o\d|chatgpt-)/.test(id);
 }
@@ -52,7 +51,7 @@ async function withTimeout<T>(promise: Promise<T>, timeoutMs: number, label: str
 async function fetchProviderModels(provider: Provider) {
   if (provider === "openai") {
     const page = await openaiClient().models.list();
-    return uniqSorted(page.data.map((model) => model.id).filter(isLikelyOpenAIChatCompletionsModel));
+    return uniqSorted(page.data.map((model) => model.id).filter(isLikelyOpenAIResponsesModel));
   }
   if (provider === "anthropic") {
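Spot checks for the renamed filter (model IDs are examples):

```ts
isLikelyOpenAIResponsesModel("gpt-4.1-mini");            // true
isLikelyOpenAIResponsesModel("o3");                      // true — matches /^o\d/
isLikelyOpenAIResponsesModel("gpt-5-pro");               // now true — the pro-model exclusion was removed in this hunk
isLikelyOpenAIResponsesModel("text-embedding-3-small");  // false — "embedding"
isLikelyOpenAIResponsesModel("gpt-4o-realtime-preview"); // false — "realtime"
```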

View File

@@ -1,7 +1,7 @@
import { performance } from "node:perf_hooks"; import { performance } from "node:perf_hooks";
import { prisma } from "../db.js"; import { prisma } from "../db.js";
import { anthropicClient, openaiClient, xaiClient } from "./providers.js"; import { anthropicClient, openaiClient, xaiClient } from "./providers.js";
import { buildToolLogMessageData, runToolAwareOpenAIChat } from "./chat-tools.js"; import { buildToolLogMessageData, runToolAwareChatCompletions, runToolAwareOpenAIChat } from "./chat-tools.js";
import { buildAnthropicConversationMessage, getAnthropicSystemPrompt } from "./message-content.js"; import { buildAnthropicConversationMessage, getAnthropicSystemPrompt } from "./message-content.js";
import type { MultiplexRequest, MultiplexResponse, Provider } from "./types.js"; import type { MultiplexRequest, MultiplexResponse, Provider } from "./types.js";
@@ -48,8 +48,8 @@ export async function runMultiplex(req: MultiplexRequest): Promise<MultiplexResp
   let raw: unknown;
   let toolMessages: ReturnType<typeof buildToolLogMessageData>[] = [];
 
-  if (req.provider === "openai" || req.provider === "xai") {
-    const client = req.provider === "openai" ? openaiClient() : xaiClient();
+  if (req.provider === "openai") {
+    const client = openaiClient();
     const r = await runToolAwareOpenAIChat({
       client,
       model: req.model,
@@ -66,6 +66,24 @@ export async function runMultiplex(req: MultiplexRequest): Promise<MultiplexResp
     outText = r.text;
     usage = r.usage;
     toolMessages = r.toolEvents.map((event) => buildToolLogMessageData(call.chatId, event));
+  } else if (req.provider === "xai") {
+    const client = xaiClient();
+    const r = await runToolAwareChatCompletions({
+      client,
+      model: req.model,
+      messages: req.messages,
+      temperature: req.temperature,
+      maxTokens: req.maxTokens,
+      logContext: {
+        provider: req.provider,
+        model: req.model,
+        chatId,
+      },
+    });
+    raw = r.raw;
+    outText = r.text;
+    usage = r.usage;
+    toolMessages = r.toolEvents.map((event) => buildToolLogMessageData(call.chatId, event));
   } else if (req.provider === "anthropic") {
     const client = anthropicClient();

View File

@@ -1,7 +1,12 @@
import { performance } from "node:perf_hooks"; import { performance } from "node:perf_hooks";
import { prisma } from "../db.js"; import { prisma } from "../db.js";
import { anthropicClient, openaiClient, xaiClient } from "./providers.js"; import { anthropicClient, openaiClient, xaiClient } from "./providers.js";
import { buildToolLogMessageData, runToolAwareOpenAIChatStream, type ToolExecutionEvent } from "./chat-tools.js"; import {
buildToolLogMessageData,
runToolAwareChatCompletionsStream,
runToolAwareOpenAIChatStream,
type ToolExecutionEvent,
} from "./chat-tools.js";
import { buildAnthropicConversationMessage, getAnthropicSystemPrompt } from "./message-content.js"; import { buildAnthropicConversationMessage, getAnthropicSystemPrompt } from "./message-content.js";
import type { MultiplexRequest, Provider } from "./types.js"; import type { MultiplexRequest, Provider } from "./types.js";
@@ -58,18 +63,33 @@ export async function* runMultiplexStream(req: MultiplexRequest): AsyncGenerator
   try {
     if (req.provider === "openai" || req.provider === "xai") {
       const client = req.provider === "openai" ? openaiClient() : xaiClient();
-      for await (const ev of runToolAwareOpenAIChatStream({
-        client,
-        model: req.model,
-        messages: req.messages,
-        temperature: req.temperature,
-        maxTokens: req.maxTokens,
-        logContext: {
-          provider: req.provider,
-          model: req.model,
-          chatId,
-        },
-      })) {
+      const streamEvents =
+        req.provider === "openai"
+          ? runToolAwareOpenAIChatStream({
+              client,
+              model: req.model,
+              messages: req.messages,
+              temperature: req.temperature,
+              maxTokens: req.maxTokens,
+              logContext: {
+                provider: req.provider,
+                model: req.model,
+                chatId,
+              },
+            })
+          : runToolAwareChatCompletionsStream({
+              client,
+              model: req.model,
+              messages: req.messages,
+              temperature: req.temperature,
+              maxTokens: req.maxTokens,
+              logContext: {
+                provider: req.provider,
+                model: req.model,
+                chatId,
+              },
+            });
+      for await (const ev of streamEvents) {
         if (ev.type === "delta") {
           text += ev.text;
           yield { type: "delta", text: ev.text };
View File

@@ -203,16 +203,15 @@ async function generateChatTitle(content: string) {
   const systemPrompt =
     "You create short chat titles. Return exactly one line, maximum 4 words, no quotes, no trailing punctuation.";
   const userPrompt = `User request:\n${content}\n\nTitle:`;
-  const response = await openaiClient().chat.completions.create({
+  const response = await openaiClient().responses.create({
     model: "gpt-4.1-mini",
     temperature: 0,
-    max_completion_tokens: 20,
-    messages: [
-      { role: "system", content: systemPrompt },
-      { role: "user", content: userPrompt },
-    ],
+    max_output_tokens: 20,
+    instructions: systemPrompt,
+    input: userPrompt,
+    store: false,
   });
-  return response.choices?.[0]?.message?.content ?? "";
+  return response.output_text ?? "";
 }
 
 function normalizeUrlForMatch(input: string | null | undefined) {