From 015253c0afbe894b45735f2f64963d9a046c2fed Mon Sep 17 00:00:00 2001 From: James Magahern Date: Sat, 2 May 2026 21:44:32 -0700 Subject: [PATCH] oai responses api, tool call retries --- docker-compose.example.yml | 2 +- docs/api/rest.md | 11 +- docs/api/streaming-chat.md | 13 +- server/README.md | 4 +- server/src/env.ts | 2 +- server/src/llm/chat-tools.ts | 237 +++++++++++++++++++++++++++++- server/src/llm/message-content.ts | 52 +++++++ server/src/llm/model-catalog.ts | 5 +- server/src/llm/multiplexer.ts | 24 ++- server/src/llm/streaming.ts | 46 ++++-- server/src/routes.ts | 13 +- 11 files changed, 369 insertions(+), 40 deletions(-) diff --git a/docker-compose.example.yml b/docker-compose.example.yml index 0a86d54..a346a00 100644 --- a/docker-compose.example.yml +++ b/docker-compose.example.yml @@ -15,7 +15,7 @@ services: EXA_API_KEY: ${EXA_API_KEY:-} CHAT_WEB_SEARCH_ENGINE: ${CHAT_WEB_SEARCH_ENGINE:-exa} SEARXNG_BASE_URL: ${SEARXNG_BASE_URL:-} - CHAT_MAX_TOOL_ROUNDS: ${CHAT_MAX_TOOL_ROUNDS:-8} + CHAT_MAX_TOOL_ROUNDS: ${CHAT_MAX_TOOL_ROUNDS:-100} CHAT_CODEX_TOOL_ENABLED: ${CHAT_CODEX_TOOL_ENABLED:-false} CHAT_CODEX_REMOTE_HOST: ${CHAT_CODEX_REMOTE_HOST:-} CHAT_CODEX_REMOTE_USER: ${CHAT_CODEX_REMOTE_USER:-} diff --git a/docs/api/rest.md b/docs/api/rest.md index 31895d8..8a99bdc 100644 --- a/docs/api/rest.md +++ b/docs/api/rest.md @@ -37,7 +37,7 @@ Chat upload limits: } } ``` -- OpenAI model lists are filtered to models that are expected to work with the backend's current Chat Completions implementation. +- OpenAI model lists are filtered to models that are expected to work with the backend's Responses API implementation. ## Chats @@ -168,8 +168,11 @@ Behavior notes: - Attachments are optional and currently apply to `user` messages. Persisted chat history stores them under `message.metadata.attachments`. - Images are forwarded inline to providers as multimodal image parts. Use PNG or JPEG for cross-provider compatibility. - Text files are forwarded as explicit text blocks rather than provider-managed file references. Large text attachments should already be truncated client-side before submission. -- For `openai` and `xai`, backend enables tool use during chat completion with an internal system instruction. -- For `openai` and `xai`, image attachments are sent as chat-completions content parts alongside text. +- For `openai`, backend calls OpenAI's Responses API and enables internal tool use with an internal system instruction. +- For `xai`, backend calls xAI's OpenAI-compatible Chat Completions API and enables internal tool use with the same internal system instruction. +- For `openai`, image attachments are sent as Responses `input_image` items and text attachments are sent as `input_text` items. +- For `xai`, image attachments are sent as Chat Completions content parts alongside text. +- For `openai`, Responses calls that can enter the server-managed tool loop use `store: true` so reasoning and function-call items can be passed between tool rounds. - For `anthropic`, image attachments are sent as Messages API `image` blocks using base64 source data; text attachments are added as `text` blocks. - Available tool calls for chat: `web_search` and `fetch_url`. When `CHAT_CODEX_TOOL_ENABLED=true`, `codex_exec` is also available. When `CHAT_SHELL_TOOL_ENABLED=true`, `shell_exec` is also available. - `web_search` returns ranked results with per-result summaries/snippets. Its backend engine is selected by `CHAT_WEB_SEARCH_ENGINE` (`exa` default, or `searxng` with `SEARXNG_BASE_URL` set). 
SearXNG mode requires the instance to allow `format=json`. @@ -177,7 +180,7 @@ Behavior notes: - `codex_exec` delegates coding, shell, repository inspection, and other complex software tasks to a persistent remote Codex CLI workspace over SSH. The server runs `codex exec --skip-git-repo-check <prompt>` on the configured devbox inside `CHAT_CODEX_REMOTE_WORKDIR`, with SSH stdin closed. - `shell_exec` runs arbitrary non-interactive shell commands on the same configured devbox, starting in `CHAT_CODEX_REMOTE_WORKDIR`. It uses `bash -lc` when bash exists, otherwise `sh -lc`, closes SSH stdin, and does not run inside the Sybil server container. - Devbox tool configuration: - - `CHAT_MAX_TOOL_ROUNDS=8` (optional; maximum model/tool result cycles before the backend returns a limit message) + - `CHAT_MAX_TOOL_ROUNDS=100` (optional; maximum model/tool result cycles before the backend returns a limit message) - `CHAT_CODEX_TOOL_ENABLED=true` - `CHAT_SHELL_TOOL_ENABLED=true` - `CHAT_CODEX_REMOTE_HOST=<host>` (required when enabled) diff --git a/docs/api/streaming-chat.md b/docs/api/streaming-chat.md index c0dfb8e..10a92d0 100644 --- a/docs/api/streaming-chat.md +++ b/docs/api/streaming-chat.md @@ -127,19 +127,22 @@ Event order: ## Provider Streaming Behavior -- `openai`/`xai`: backend may execute internal tool calls (`web_search`, `fetch_url`, optional `codex_exec`, and optional `shell_exec`) before producing final text. -- `openai`: image attachments are sent as chat-completions content parts; text attachments are inlined as text parts. -- `xai`: same attachment behavior as OpenAI. +- `openai`: backend uses OpenAI's Responses API and may execute internal function tool calls (`web_search`, `fetch_url`, optional `codex_exec`, and optional `shell_exec`) before producing final text. +- `xai`: backend uses xAI's OpenAI-compatible Chat Completions API and may execute the same internal tool calls before producing final text. +- `openai`: image attachments are sent as Responses `input_image` items; text attachments are sent as `input_text` items. +- `xai`: image attachments are sent as Chat Completions content parts; text attachments are inlined as text parts. +- `openai`: Responses calls that can enter the server-managed tool loop use `store: true` so reasoning and function-call items can be passed between tool rounds. - `anthropic`: streamed via event stream; emits `delta` from `content_block_delta` with `text_delta`. Image attachments are sent as base64 `image` blocks and text attachments are appended as `text` blocks. - `web_search` uses `CHAT_WEB_SEARCH_ENGINE` (`exa` default, or `searxng` with `SEARXNG_BASE_URL` set). SearXNG mode requires the instance to allow `format=json`. This only affects chat-mode tool calls, not search-mode endpoints. - `codex_exec` is available only when `CHAT_CODEX_TOOL_ENABLED=true`. It SSHes to `CHAT_CODEX_REMOTE_HOST`, creates/uses `CHAT_CODEX_REMOTE_WORKDIR`, and runs `codex exec --skip-git-repo-check <prompt>` there with SSH stdin closed. Prefer `CHAT_CODEX_SSH_KEY_PATH` with a read-only mounted private key; `CHAT_CODEX_SSH_PRIVATE_KEY_B64` is also supported. - `shell_exec` is available only when `CHAT_SHELL_TOOL_ENABLED=true`. It uses the same devbox SSH configuration, starts in `CHAT_CODEX_REMOTE_WORKDIR`, and runs non-interactive shell commands there with SSH stdin closed, not inside the Sybil server container. -- `CHAT_MAX_TOOL_ROUNDS` controls how many model/tool result cycles may occur before the backend returns a tool-call limit message; default is 8.
+- `CHAT_MAX_TOOL_ROUNDS` controls how many model/tool result cycles may occur before the backend returns a tool-call limit message; default is 100. Tool-enabled streaming notes (`openai`/`xai`): - Stream still emits standard `meta`, `delta`, `done|error` events. - Stream may emit `tool_call` events while tool calls are executed. -- `delta` events stream incrementally as text is generated. +- `delta` events carry assistant text. The backend may buffer model-native text briefly while determining whether a provider round contains tool calls. +- OpenAI Responses stream events are normalized by the backend into this SSE contract; clients do not consume OpenAI's raw Responses stream event names. ## Persistence + Consistency Model diff --git a/server/README.md b/server/README.md index 6d6418d..bd1dd75 100644 --- a/server/README.md +++ b/server/README.md @@ -1,7 +1,7 @@ # Sybil Server Backend API for: -- LLM multiplexer (OpenAI / Anthropic / xAI (Grok)) +- LLM multiplexer (OpenAI Responses / Anthropic / xAI Grok via its OpenAI-compatible Chat Completions API) - Personal chat database (chats/messages + LLM call log) ## Stack @@ -46,7 +46,7 @@ If `ADMIN_TOKEN` is not set, the server runs in open mode (dev). - `EXA_API_KEY` - `CHAT_WEB_SEARCH_ENGINE` (`exa` by default, or `searxng` for chat tool calls only) - `SEARXNG_BASE_URL` (required when `CHAT_WEB_SEARCH_ENGINE=searxng`; instance must allow `format=json`) -- `CHAT_MAX_TOOL_ROUNDS` (`8` by default; maximum model/tool result cycles per chat completion) +- `CHAT_MAX_TOOL_ROUNDS` (`100` by default; maximum model/tool result cycles per chat completion) - `CHAT_CODEX_TOOL_ENABLED` (`false` by default; enables the `codex_exec` chat tool for OpenAI/xAI) - `CHAT_CODEX_REMOTE_HOST` (required when Codex tool is enabled; SSH host/IP or `user@host`) - `CHAT_CODEX_REMOTE_USER` (optional SSH user when host does not include one) diff --git a/server/src/env.ts b/server/src/env.ts index fea6ab8..dffb9ab 100644 --- a/server/src/env.ts +++ b/server/src/env.ts @@ -64,7 +64,7 @@ const EnvSchema = z.object({ // Chat-mode web_search tool configuration. Search mode remains Exa-only for now. CHAT_WEB_SEARCH_ENGINE: ChatWebSearchEngineSchema, SEARXNG_BASE_URL: OptionalUrlSchema, - CHAT_MAX_TOOL_ROUNDS: defaultedPositiveInt(8), + CHAT_MAX_TOOL_ROUNDS: defaultedPositiveInt(100), // Optional chat-mode Codex tool. When enabled, the server SSHes into a remote // devbox and runs `codex exec` in a persistent scratch directory there. diff --git a/server/src/llm/chat-tools.ts b/server/src/llm/chat-tools.ts index 197e8fb..bf153c4 100644 --- a/server/src/llm/chat-tools.ts +++ b/server/src/llm/chat-tools.ts @@ -9,7 +9,7 @@ import { z } from "zod"; import { env } from "../env.js"; import { exaClient } from "../search/exa.js"; import { searchSearxng } from "../search/searxng.js"; -import { buildOpenAIConversationMessage } from "./message-content.js"; +import { buildOpenAIConversationMessage, buildOpenAIResponsesInputMessage } from "./message-content.js"; import type { ChatMessage } from "./types.js"; const MAX_TOOL_ROUNDS = env.CHAT_MAX_TOOL_ROUNDS; @@ -188,6 +188,17 @@ const CHAT_TOOLS: any[] = [ ...(env.CHAT_SHELL_TOOL_ENABLED ?
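// shell_exec is offered to the model only when CHAT_SHELL_TOOL_ENABLED is set.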
[SHELL_EXEC_TOOL] : []), ]; +const RESPONSES_CHAT_TOOLS: any[] = CHAT_TOOLS.map((tool) => { + if (tool?.type !== "function") return tool; + return { + type: "function", + name: tool.function.name, + description: tool.function.description, + parameters: tool.function.parameters, + strict: false, + }; +}); + export const CHAT_TOOL_SYSTEM_PROMPT = "You can use tools to gather up-to-date web information when needed. " + "Use web_search for discovery and recent facts, and fetch_url to read the full content of a specific page. " + @@ -374,6 +385,12 @@ function normalizeIncomingMessages(messages: ChatMessage[]) { return [{ role: "system", content: CHAT_TOOL_SYSTEM_PROMPT }, ...normalized]; } +function normalizeIncomingResponsesInput(messages: ChatMessage[]) { + const normalized = messages.map((message) => buildOpenAIResponsesInputMessage(message)); + + return [{ role: "system", content: CHAT_TOOL_SYSTEM_PROMPT }, ...normalized]; +} + async function runExaWebSearchTool(args: WebSearchArgs): Promise { const exa = exaClient(); const response = await exa.search(args.query, { @@ -806,6 +823,52 @@ function mergeUsage(acc: Required, usage: any) { return true; } +function mergeResponsesUsage(acc: Required, usage: any) { + if (!usage) return false; + acc.inputTokens += usage.input_tokens ?? 0; + acc.outputTokens += usage.output_tokens ?? 0; + acc.totalTokens += usage.total_tokens ?? 0; + return true; +} + +function getResponseOutputItems(response: any) { + return Array.isArray(response?.output) ? response.output : []; +} + +function extractResponsesText(response: any, fallback = "") { + if (typeof response?.output_text === "string") return response.output_text; + + const parts: string[] = []; + for (const item of getResponseOutputItems(response)) { + if (item?.type !== "message" || !Array.isArray(item.content)) continue; + for (const content of item.content) { + if (content?.type === "output_text" && typeof content.text === "string") { + parts.push(content.text); + } else if (content?.type === "refusal" && typeof content.refusal === "string") { + parts.push(content.refusal); + } + } + } + return parts.join("") || fallback; +} + +function getResponseFailureMessage(response: any) { + if (response?.status !== "failed" && response?.status !== "incomplete") return null; + const errorMessage = typeof response?.error?.message === "string" ? response.error.message : null; + const incompleteReason = typeof response?.incomplete_details?.reason === "string" ? response.incomplete_details.reason : null; + return errorMessage ?? (incompleteReason ? `Response incomplete: ${incompleteReason}` : `Response ${response.status}.`); +} + +function normalizeResponsesToolCalls(outputItems: any[], round: number): NormalizedToolCall[] { + return outputItems + .filter((item) => item?.type === "function_call") + .map((call: any, index: number) => ({ + id: call.call_id ?? call.id ?? `tool_call_${round}_${index}`, + name: call.name ?? "unknown_tool", + arguments: call.arguments ?? 
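// Responses function_call items carry `arguments` as a JSON string; default to an empty JSON object when the model omits it.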
"{}", + })); +} + type NormalizedToolCall = { id: string; name: string; @@ -869,6 +932,75 @@ async function executeToolCallAndBuildEvent( } export async function runToolAwareOpenAIChat(params: ToolAwareCompletionParams): Promise { + const input: any[] = normalizeIncomingResponsesInput(params.messages); + const rawResponses: unknown[] = []; + const toolEvents: ToolExecutionEvent[] = []; + const usageAcc: Required = { inputTokens: 0, outputTokens: 0, totalTokens: 0 }; + let sawUsage = false; + let totalToolCalls = 0; + let danglingToolIntentRetries = 0; + + for (let round = 0; round < MAX_TOOL_ROUNDS; round += 1) { + const response = await params.client.responses.create({ + model: params.model, + input, + temperature: params.temperature, + max_output_tokens: params.maxTokens, + tools: RESPONSES_CHAT_TOOLS, + tool_choice: "auto", + parallel_tool_calls: true, + // Tool loops pass response output items back as input; reasoning items need persistence. + store: true, + } as any); + rawResponses.push(response); + sawUsage = mergeResponsesUsage(usageAcc, response?.usage) || sawUsage; + + const failureMessage = getResponseFailureMessage(response); + if (failureMessage) { + throw new Error(failureMessage); + } + + const outputItems = getResponseOutputItems(response); + const normalizedToolCalls = normalizeResponsesToolCalls(outputItems, round); + if (!normalizedToolCalls.length) { + const text = extractResponsesText(response); + if (danglingToolIntentRetries < MAX_DANGLING_TOOL_INTENT_RETRIES && looksLikeDanglingToolIntent(text)) { + danglingToolIntentRetries += 1; + appendDanglingToolIntentCorrection(input, text); + continue; + } + return { + text, + usage: sawUsage ? usageAcc : undefined, + raw: { responses: rawResponses, toolCallsUsed: totalToolCalls, api: "responses" }, + toolEvents, + }; + } + + totalToolCalls += normalizedToolCalls.length; + input.push(...outputItems); + + for (const call of normalizedToolCalls) { + const { event, toolResult } = await executeToolCallAndBuildEvent(call, params); + toolEvents.push(event); + + input.push({ + type: "function_call_output", + call_id: call.id, + output: JSON.stringify(toolResult), + }); + } + } + + return { + text: "I reached the tool-call limit while gathering information. Please narrow the request and try again.", + usage: sawUsage ? 
usageAcc : undefined, + raw: { responses: rawResponses, toolCallsUsed: totalToolCalls, toolCallLimitReached: true, api: "responses" }, + toolEvents, + }; +} + +export async function runToolAwareChatCompletions(params: ToolAwareCompletionParams): Promise { const conversation: any[] = normalizeIncomingMessages(params.messages); const rawResponses: unknown[] = []; const toolEvents: ToolExecutionEvent[] = []; @@ -956,6 +1088,109 @@ export async function runToolAwareOpenAIChat(params: ToolAwareCompletionParams): export async function* runToolAwareOpenAIChatStream( params: ToolAwareCompletionParams +): AsyncGenerator { + const input: any[] = normalizeIncomingResponsesInput(params.messages); + const rawResponses: unknown[] = []; + const toolEvents: ToolExecutionEvent[] = []; + const usageAcc: Required = { inputTokens: 0, outputTokens: 0, totalTokens: 0 }; + let sawUsage = false; + let totalToolCalls = 0; + let danglingToolIntentRetries = 0; + + for (let round = 0; round < MAX_TOOL_ROUNDS; round += 1) { + const stream = await params.client.responses.create({ + model: params.model, + input, + temperature: params.temperature, + max_output_tokens: params.maxTokens, + tools: RESPONSES_CHAT_TOOLS, + tool_choice: "auto", + parallel_tool_calls: true, + // Tool loops pass response output items back as input; reasoning items need persistence. + store: true, + stream: true, + } as any); + + let roundText = ""; + let completedResponse: any | null = null; + const completedOutputItems: any[] = []; + + for await (const event of stream as any as AsyncIterable) { + rawResponses.push(event); + + if (event?.type === "response.output_text.delta" && typeof event.delta === "string") { + roundText += event.delta; + } else if (event?.type === "response.output_item.done" && event.item) { + completedOutputItems[event.output_index ?? completedOutputItems.length] = event.item; + } else if (event?.type === "response.completed") { + completedResponse = event.response; + sawUsage = mergeResponsesUsage(usageAcc, event.response?.usage) || sawUsage; + } else if (event?.type === "response.failed" || event?.type === "response.incomplete") { + completedResponse = event.response; + sawUsage = mergeResponsesUsage(usageAcc, event.response?.usage) || sawUsage; + } else if (event?.type === "error") { + throw new Error(event.message ?? "OpenAI Responses stream failed."); + } + } + + const failureMessage = getResponseFailureMessage(completedResponse); + if (failureMessage) { + throw new Error(failureMessage); + } + + const outputItems = getResponseOutputItems(completedResponse); + const responseOutputItems = outputItems.length ? outputItems : completedOutputItems.filter(Boolean); + const normalizedToolCalls = normalizeResponsesToolCalls(responseOutputItems, round); + if (!normalizedToolCalls.length) { + const text = extractResponsesText(completedResponse, roundText); + if (danglingToolIntentRetries < MAX_DANGLING_TOOL_INTENT_RETRIES && looksLikeDanglingToolIntent(text)) { + danglingToolIntentRetries += 1; + appendDanglingToolIntentCorrection(input, text); + continue; + } + if (text) { + yield { type: "delta", text }; + } + yield { + type: "done", + result: { + text, + usage: sawUsage ? 
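// usageAcc has been accumulated across every provider round of this tool loop.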
usageAcc : undefined, raw: { streamed: true, responses: rawResponses, toolCallsUsed: totalToolCalls, api: "responses" }, toolEvents, }, }; return; } totalToolCalls += normalizedToolCalls.length; input.push(...responseOutputItems); for (const call of normalizedToolCalls) { const { event, toolResult } = await executeToolCallAndBuildEvent(call, params); toolEvents.push(event); yield { type: "tool_call", event }; input.push({ type: "function_call_output", call_id: call.id, output: JSON.stringify(toolResult), }); } } yield { type: "done", result: { text: "I reached the tool-call limit while gathering information. Please narrow the request and try again.", usage: sawUsage ? usageAcc : undefined, raw: { streamed: true, responses: rawResponses, toolCallsUsed: totalToolCalls, toolCallLimitReached: true, api: "responses" }, toolEvents, }, }; } +export async function* runToolAwareChatCompletionsStream( + params: ToolAwareCompletionParams ): AsyncGenerator { const conversation: any[] = normalizeIncomingMessages(params.messages); const rawResponses: unknown[] = []; const toolEvents: ToolExecutionEvent[] = []; diff --git a/server/src/llm/message-content.ts b/server/src/llm/message-content.ts index a2e77c3..8b3434c 100644 --- a/server/src/llm/message-content.ts +++ b/server/src/llm/message-content.ts @@ -67,6 +67,43 @@ function toOpenAIContent(message: ChatMessage) { return parts; } +function toOpenAIResponsesContent(message: ChatMessage) { + const imageAttachments = getImageAttachments(message); + const textAttachments = getTextAttachments(message); + if (!imageAttachments.length && !textAttachments.length) { + return message.content; + } + + const parts: Array<Record<string, unknown>> = []; + + for (const attachment of imageAttachments) { + parts.push({ + type: "input_image", + image_url: attachment.dataUrl, + detail: "auto", + }); + } + + const imageSummary = buildImageSummaryText(imageAttachments); + if (imageSummary) { + parts.push({ type: "input_text", text: imageSummary }); + } + + for (const attachment of textAttachments) { + parts.push({ type: "input_text", text: buildTextAttachmentPrompt(attachment) }); + } + + if (message.content.trim()) { + parts.push({ type: "input_text", text: message.content }); + } + + if (parts.length === 1 && parts[0]?.type === "input_text" && typeof parts[0].text === "string") { + return parts[0].text; + } + + return parts; +} + function parseImageDataUrl(attachment: ChatImageAttachment) { const match = attachment.dataUrl.match(/^data:(image\/(?:png|jpeg));base64,([a-z0-9+/=\s]+)$/i); if (!match) { @@ -146,6 +183,21 @@ export function buildOpenAIConversationMessage(message: ChatMessage) { return out; } +export function buildOpenAIResponsesInputMessage(message: ChatMessage) { + if (message.role === "tool") { + const name = message.name?.trim() || "tool"; + return { + role: "user", + content: `Tool output (${name}):\n${message.content}`, + }; + } + + return { + role: message.role, + content: toOpenAIResponsesContent(message), + }; +} + const ANTHROPIC_NO_SERVER_TOOLS_PROMPT = "This Anthropic backend path does not have server-managed tool calls. Do not claim to run shell commands, Codex tasks, web searches, or fetch URLs.
If the user asks for tool execution, explain that they should switch to OpenAI or xAI in this app for tool-enabled chat."; diff --git a/server/src/llm/model-catalog.ts b/server/src/llm/model-catalog.ts index 4366925..b54978f 100644 --- a/server/src/llm/model-catalog.ts +++ b/server/src/llm/model-catalog.ts @@ -23,13 +23,12 @@ function uniqSorted(models: string[]) { return [...new Set(models.map((value) => value.trim()).filter(Boolean))].sort((a, b) => a.localeCompare(b)); } -function isLikelyOpenAIChatCompletionsModel(model: string) { +function isLikelyOpenAIResponsesModel(model: string) { const id = model.toLowerCase(); if (id.includes("embedding") || id.includes("moderation")) return false; if (id.includes("audio") || id.includes("realtime") || id.includes("transcribe") || id.includes("tts")) return false; if (id.includes("image") || id.includes("dall-e") || id.includes("sora")) return false; if (id.includes("search") || id.includes("computer-use")) return false; - if (/^gpt-[\d.]+-pro(?:-|$)/.test(id)) return false; return /^(gpt-|o\d|chatgpt-)/.test(id); } @@ -52,7 +51,7 @@ async function withTimeout(promise: Promise, timeoutMs: number, label: str async function fetchProviderModels(provider: Provider) { if (provider === "openai") { const page = await openaiClient().models.list(); - return uniqSorted(page.data.map((model) => model.id).filter(isLikelyOpenAIChatCompletionsModel)); + return uniqSorted(page.data.map((model) => model.id).filter(isLikelyOpenAIResponsesModel)); } if (provider === "anthropic") { diff --git a/server/src/llm/multiplexer.ts b/server/src/llm/multiplexer.ts index 9f150d5..c87e3d8 100644 --- a/server/src/llm/multiplexer.ts +++ b/server/src/llm/multiplexer.ts @@ -1,7 +1,7 @@ import { performance } from "node:perf_hooks"; import { prisma } from "../db.js"; import { anthropicClient, openaiClient, xaiClient } from "./providers.js"; -import { buildToolLogMessageData, runToolAwareOpenAIChat } from "./chat-tools.js"; +import { buildToolLogMessageData, runToolAwareChatCompletions, runToolAwareOpenAIChat } from "./chat-tools.js"; import { buildAnthropicConversationMessage, getAnthropicSystemPrompt } from "./message-content.js"; import type { MultiplexRequest, MultiplexResponse, Provider } from "./types.js"; @@ -48,8 +48,8 @@ export async function runMultiplex(req: MultiplexRequest): Promise[] = []; - if (req.provider === "openai" || req.provider === "xai") { - const client = req.provider === "openai" ? 
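// A single OpenAI-compatible client served both providers on this pre-split path.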
openaiClient() : xaiClient(); + if (req.provider === "openai") { + const client = openaiClient(); const r = await runToolAwareOpenAIChat({ client, model: req.model, @@ -66,6 +66,24 @@ export async function runMultiplex(req: MultiplexRequest): Promise buildToolLogMessageData(call.chatId, event)); + } else if (req.provider === "xai") { + const client = xaiClient(); + const r = await runToolAwareChatCompletions({ + client, + model: req.model, + messages: req.messages, + temperature: req.temperature, + maxTokens: req.maxTokens, + logContext: { + provider: req.provider, + model: req.model, + chatId, + }, + }); + raw = r.raw; + outText = r.text; + usage = r.usage; + toolMessages = r.toolEvents.map((event) => buildToolLogMessageData(call.chatId, event)); } else if (req.provider === "anthropic") { const client = anthropicClient(); diff --git a/server/src/llm/streaming.ts b/server/src/llm/streaming.ts index 434a2ed..e94035d 100644 --- a/server/src/llm/streaming.ts +++ b/server/src/llm/streaming.ts @@ -1,7 +1,12 @@ import { performance } from "node:perf_hooks"; import { prisma } from "../db.js"; import { anthropicClient, openaiClient, xaiClient } from "./providers.js"; -import { buildToolLogMessageData, runToolAwareOpenAIChatStream, type ToolExecutionEvent } from "./chat-tools.js"; +import { + buildToolLogMessageData, + runToolAwareChatCompletionsStream, + runToolAwareOpenAIChatStream, + type ToolExecutionEvent, +} from "./chat-tools.js"; import { buildAnthropicConversationMessage, getAnthropicSystemPrompt } from "./message-content.js"; import type { MultiplexRequest, Provider } from "./types.js"; @@ -58,18 +63,33 @@ export async function* runMultiplexStream(req: MultiplexRequest): AsyncGenerator try { if (req.provider === "openai" || req.provider === "xai") { const client = req.provider === "openai" ? openaiClient() : xaiClient(); - for await (const ev of runToolAwareOpenAIChatStream({ - client, - model: req.model, - messages: req.messages, - temperature: req.temperature, - maxTokens: req.maxTokens, - logContext: { - provider: req.provider, - model: req.model, - chatId, - }, - })) { + const streamEvents = + req.provider === "openai" + ? runToolAwareOpenAIChatStream({ + client, + model: req.model, + messages: req.messages, + temperature: req.temperature, + maxTokens: req.maxTokens, + logContext: { + provider: req.provider, + model: req.model, + chatId, + }, + }) + : runToolAwareChatCompletionsStream({ + client, + model: req.model, + messages: req.messages, + temperature: req.temperature, + maxTokens: req.maxTokens, + logContext: { + provider: req.provider, + model: req.model, + chatId, + }, + }); + for await (const ev of streamEvents) { if (ev.type === "delta") { text += ev.text; yield { type: "delta", text: ev.text }; diff --git a/server/src/routes.ts b/server/src/routes.ts index 1eb49cc..fbb2395 100644 --- a/server/src/routes.ts +++ b/server/src/routes.ts @@ -203,16 +203,15 @@ async function generateChatTitle(content: string) { const systemPrompt = "You create short chat titles. 
Return exactly one line, maximum 4 words, no quotes, no trailing punctuation."; const userPrompt = `User request:\n${content}\n\nTitle:`; - const response = await openaiClient().chat.completions.create({ + const response = await openaiClient().responses.create({ model: "gpt-4.1-mini", temperature: 0, - max_completion_tokens: 20, - messages: [ - { role: "system", content: systemPrompt }, - { role: "user", content: userPrompt }, - ], + max_output_tokens: 20, + instructions: systemPrompt, + input: userPrompt, + store: false, }); - return response.choices?.[0]?.message?.content ?? ""; + return response.output_text ?? ""; } function normalizeUrlForMatch(input: string | null | undefined) {
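As a usage-level sketch of the normalized SSE contract documented above (`meta`, `delta`, `tool_call`, `done|error`), the snippet below shows one way a client could consume the stream. The endpoint URL, request body, and event payload field names (`text`, `message`) are assumptions for illustration; this patch defines only the server side.

```ts
// Minimal sketch of a client for the backend's tool-aware SSE chat stream.
// Assumed: a POST endpoint that responds with SSE frames, and JSON payloads
// carrying `text` on delta events — neither is specified by this patch.
async function consumeChatStream(url: string, body: unknown): Promise<string> {
  const res = await fetch(url, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify(body),
  });
  if (!res.ok || !res.body) throw new Error(`stream failed: ${res.status}`);

  const reader = res.body.getReader();
  const decoder = new TextDecoder();
  let buffer = "";
  let text = "";

  for (;;) {
    const { done, value } = await reader.read();
    if (done) break;
    buffer += decoder.decode(value, { stream: true });

    const frames = buffer.split("\n\n"); // SSE frames are blank-line separated
    buffer = frames.pop() ?? "";         // keep any partial frame for the next read
    for (const frame of frames) {
      const lines = frame.split("\n");
      const event = lines.find((l) => l.startsWith("event: "))?.slice(7).trim();
      const dataLine = lines.find((l) => l.startsWith("data: "));
      if (!event || !dataLine) continue;
      const data = JSON.parse(dataLine.slice(6));

      if (event === "delta") text += data.text;           // incremental assistant text
      else if (event === "tool_call") console.log(data);  // internal tool execution event
      else if (event === "error") throw new Error(data.message ?? "stream error");
      // `meta` and `done` carry request metadata and the final result envelope.
    }
  }
  return text;
}
```

The property this illustrates is the one the docs above commit to: clients only ever see the normalized `event:` names; OpenAI's raw Responses stream event names never reach the client.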