Various fixes for tool calling

This commit is contained in:
2026-05-02 21:19:52 -07:00
parent d579b5bf75
commit 8d6c069a33
8 changed files with 97 additions and 17 deletions

View File

@@ -15,6 +15,7 @@ services:
EXA_API_KEY: ${EXA_API_KEY:-} EXA_API_KEY: ${EXA_API_KEY:-}
CHAT_WEB_SEARCH_ENGINE: ${CHAT_WEB_SEARCH_ENGINE:-exa} CHAT_WEB_SEARCH_ENGINE: ${CHAT_WEB_SEARCH_ENGINE:-exa}
SEARXNG_BASE_URL: ${SEARXNG_BASE_URL:-} SEARXNG_BASE_URL: ${SEARXNG_BASE_URL:-}
CHAT_MAX_TOOL_ROUNDS: ${CHAT_MAX_TOOL_ROUNDS:-8}
CHAT_CODEX_TOOL_ENABLED: ${CHAT_CODEX_TOOL_ENABLED:-false} CHAT_CODEX_TOOL_ENABLED: ${CHAT_CODEX_TOOL_ENABLED:-false}
CHAT_CODEX_REMOTE_HOST: ${CHAT_CODEX_REMOTE_HOST:-} CHAT_CODEX_REMOTE_HOST: ${CHAT_CODEX_REMOTE_HOST:-}
CHAT_CODEX_REMOTE_USER: ${CHAT_CODEX_REMOTE_USER:-} CHAT_CODEX_REMOTE_USER: ${CHAT_CODEX_REMOTE_USER:-}

View File

@@ -37,6 +37,7 @@ Chat upload limits:
} }
} }
``` ```
- OpenAI model lists are filtered to models that are expected to work with the backend's current Chat Completions implementation.
## Chats ## Chats
@@ -173,9 +174,10 @@ Behavior notes:
- Available tool calls for chat: `web_search` and `fetch_url`. When `CHAT_CODEX_TOOL_ENABLED=true`, `codex_exec` is also available. When `CHAT_SHELL_TOOL_ENABLED=true`, `shell_exec` is also available. - Available tool calls for chat: `web_search` and `fetch_url`. When `CHAT_CODEX_TOOL_ENABLED=true`, `codex_exec` is also available. When `CHAT_SHELL_TOOL_ENABLED=true`, `shell_exec` is also available.
- `web_search` returns ranked results with per-result summaries/snippets. Its backend engine is selected by `CHAT_WEB_SEARCH_ENGINE` (`exa` default, or `searxng` with `SEARXNG_BASE_URL` set). SearXNG mode requires the instance to allow `format=json`. - `web_search` returns ranked results with per-result summaries/snippets. Its backend engine is selected by `CHAT_WEB_SEARCH_ENGINE` (`exa` default, or `searxng` with `SEARXNG_BASE_URL` set). SearXNG mode requires the instance to allow `format=json`.
- `fetch_url` fetches a URL and returns plaintext page content (HTML converted to text server-side). - `fetch_url` fetches a URL and returns plaintext page content (HTML converted to text server-side).
- `codex_exec` delegates coding, shell, repository inspection, and other complex software tasks to a persistent remote Codex CLI workspace over SSH. The server runs `codex exec <prompt>` on the configured devbox inside `CHAT_CODEX_REMOTE_WORKDIR`. - `codex_exec` delegates coding, shell, repository inspection, and other complex software tasks to a persistent remote Codex CLI workspace over SSH. The server runs `codex exec --skip-git-repo-check <non-interactive wrapped prompt>` on the configured devbox inside `CHAT_CODEX_REMOTE_WORKDIR`, with SSH stdin closed.
- `shell_exec` runs arbitrary non-interactive shell commands on the same configured devbox, starting in `CHAT_CODEX_REMOTE_WORKDIR`. It uses `bash -lc` when bash exists, otherwise `sh -lc`, and does not run inside the Sybil server container. - `shell_exec` runs arbitrary non-interactive shell commands on the same configured devbox, starting in `CHAT_CODEX_REMOTE_WORKDIR`. It uses `bash -lc` when bash exists, otherwise `sh -lc`, closes SSH stdin, and does not run inside the Sybil server container.
- Devbox tool configuration: - Devbox tool configuration:
- `CHAT_MAX_TOOL_ROUNDS=8` (optional; maximum model/tool result cycles before the backend returns a limit message)
- `CHAT_CODEX_TOOL_ENABLED=true` - `CHAT_CODEX_TOOL_ENABLED=true`
- `CHAT_SHELL_TOOL_ENABLED=true` - `CHAT_SHELL_TOOL_ENABLED=true`
- `CHAT_CODEX_REMOTE_HOST=<host-or-ip>` (required when enabled) - `CHAT_CODEX_REMOTE_HOST=<host-or-ip>` (required when enabled)

View File

@@ -132,8 +132,9 @@ Event order:
- `xai`: same attachment behavior as OpenAI. - `xai`: same attachment behavior as OpenAI.
- `anthropic`: streamed via event stream; emits `delta` from `content_block_delta` with `text_delta`. Image attachments are sent as base64 `image` blocks and text attachments are appended as `text` blocks. - `anthropic`: streamed via event stream; emits `delta` from `content_block_delta` with `text_delta`. Image attachments are sent as base64 `image` blocks and text attachments are appended as `text` blocks.
- `web_search` uses `CHAT_WEB_SEARCH_ENGINE` (`exa` default, or `searxng` with `SEARXNG_BASE_URL` set). SearXNG mode requires the instance to allow `format=json`. This only affects chat-mode tool calls, not search-mode endpoints. - `web_search` uses `CHAT_WEB_SEARCH_ENGINE` (`exa` default, or `searxng` with `SEARXNG_BASE_URL` set). SearXNG mode requires the instance to allow `format=json`. This only affects chat-mode tool calls, not search-mode endpoints.
- `codex_exec` is available only when `CHAT_CODEX_TOOL_ENABLED=true`. It SSHes to `CHAT_CODEX_REMOTE_HOST`, creates/uses `CHAT_CODEX_REMOTE_WORKDIR`, and runs `codex exec <prompt>` there. Prefer `CHAT_CODEX_SSH_KEY_PATH` with a read-only mounted private key; `CHAT_CODEX_SSH_PRIVATE_KEY_B64` is also supported. - `codex_exec` is available only when `CHAT_CODEX_TOOL_ENABLED=true`. It SSHes to `CHAT_CODEX_REMOTE_HOST`, creates/uses `CHAT_CODEX_REMOTE_WORKDIR`, and runs `codex exec --skip-git-repo-check <non-interactive wrapped prompt>` there with SSH stdin closed. Prefer `CHAT_CODEX_SSH_KEY_PATH` with a read-only mounted private key; `CHAT_CODEX_SSH_PRIVATE_KEY_B64` is also supported.
- `shell_exec` is available only when `CHAT_SHELL_TOOL_ENABLED=true`. It uses the same devbox SSH configuration, starts in `CHAT_CODEX_REMOTE_WORKDIR`, and runs non-interactive shell commands there, not inside the Sybil server container. - `shell_exec` is available only when `CHAT_SHELL_TOOL_ENABLED=true`. It uses the same devbox SSH configuration, starts in `CHAT_CODEX_REMOTE_WORKDIR`, and runs non-interactive shell commands there with SSH stdin closed, not inside the Sybil server container.
- `CHAT_MAX_TOOL_ROUNDS` controls how many model/tool result cycles may occur before the backend returns a tool-call limit message; default is 8.
Tool-enabled streaming notes (`openai`/`xai`): Tool-enabled streaming notes (`openai`/`xai`):
- Stream still emits standard `meta`, `delta`, `done|error` events. - Stream still emits standard `meta`, `delta`, `done|error` events.

View File

@@ -46,6 +46,7 @@ If `ADMIN_TOKEN` is not set, the server runs in open mode (dev).
- `EXA_API_KEY` - `EXA_API_KEY`
- `CHAT_WEB_SEARCH_ENGINE` (`exa` by default, or `searxng` for chat tool calls only) - `CHAT_WEB_SEARCH_ENGINE` (`exa` by default, or `searxng` for chat tool calls only)
- `SEARXNG_BASE_URL` (required when `CHAT_WEB_SEARCH_ENGINE=searxng`; instance must allow `format=json`) - `SEARXNG_BASE_URL` (required when `CHAT_WEB_SEARCH_ENGINE=searxng`; instance must allow `format=json`)
- `CHAT_MAX_TOOL_ROUNDS` (`8` by default; maximum model/tool result cycles per chat completion)
- `CHAT_CODEX_TOOL_ENABLED` (`false` by default; enables the `codex_exec` chat tool for OpenAI/xAI) - `CHAT_CODEX_TOOL_ENABLED` (`false` by default; enables the `codex_exec` chat tool for OpenAI/xAI)
- `CHAT_CODEX_REMOTE_HOST` (required when Codex tool is enabled; SSH host/IP or `user@host`) - `CHAT_CODEX_REMOTE_HOST` (required when Codex tool is enabled; SSH host/IP or `user@host`)
- `CHAT_CODEX_REMOTE_USER` (optional SSH user when host does not include one) - `CHAT_CODEX_REMOTE_USER` (optional SSH user when host does not include one)

View File

@@ -64,6 +64,7 @@ const EnvSchema = z.object({
// Chat-mode web_search tool configuration. Search mode remains Exa-only for now. // Chat-mode web_search tool configuration. Search mode remains Exa-only for now.
CHAT_WEB_SEARCH_ENGINE: ChatWebSearchEngineSchema, CHAT_WEB_SEARCH_ENGINE: ChatWebSearchEngineSchema,
SEARXNG_BASE_URL: OptionalUrlSchema, SEARXNG_BASE_URL: OptionalUrlSchema,
CHAT_MAX_TOOL_ROUNDS: defaultedPositiveInt(8),
// Optional chat-mode Codex tool. When enabled, the server SSHes into a remote // Optional chat-mode Codex tool. When enabled, the server SSHes into a remote
// devbox and runs `codex exec` in a persistent scratch directory there. // devbox and runs `codex exec` in a persistent scratch directory there.

View File

@@ -12,7 +12,7 @@ import { searchSearxng } from "../search/searxng.js";
import { buildOpenAIConversationMessage } from "./message-content.js"; import { buildOpenAIConversationMessage } from "./message-content.js";
import type { ChatMessage } from "./types.js"; import type { ChatMessage } from "./types.js";
const MAX_TOOL_ROUNDS = 4; const MAX_TOOL_ROUNDS = env.CHAT_MAX_TOOL_ROUNDS;
const DEFAULT_WEB_RESULTS = 5; const DEFAULT_WEB_RESULTS = 5;
const MAX_WEB_RESULTS = 10; const MAX_WEB_RESULTS = 10;
const DEFAULT_FETCH_MAX_CHARACTERS = 12_000; const DEFAULT_FETCH_MAX_CHARACTERS = 12_000;
@@ -25,6 +25,7 @@ const MAX_SHELL_COMMAND_CHARACTERS = 20_000;
const DEFAULT_SHELL_MAX_OUTPUT_CHARACTERS = 24_000; const DEFAULT_SHELL_MAX_OUTPUT_CHARACTERS = 24_000;
const MAX_SHELL_MAX_OUTPUT_CHARACTERS = 80_000; const MAX_SHELL_MAX_OUTPUT_CHARACTERS = 80_000;
const REMOTE_EXEC_MAX_BUFFER_BYTES = 1_000_000; const REMOTE_EXEC_MAX_BUFFER_BYTES = 1_000_000;
const MAX_DANGLING_TOOL_INTENT_RETRIES = 1;
const execFileAsync = promisify(execFile); const execFileAsync = promisify(execFile);
@@ -70,7 +71,7 @@ const CODEX_EXEC_TOOL = {
function: { function: {
name: "codex_exec", name: "codex_exec",
description: description:
"Delegate a coding, terminal, or multi-step software task to a persistent remote Codex CLI workspace. Use for complex code changes, repository inspection, running programs/tests, debugging build failures, or other tasks that need a real shell. Return the remote Codex summary and relevant stdout/stderr.", "Delegate a coding, terminal, or multi-step software task to a persistent remote Codex CLI workspace. Use for complex code changes, repository inspection, running programs/tests, debugging build failures, or other tasks that need a real shell. The task runs non-interactively; the remote Codex instance must make reasonable assumptions, complete the task, and return a final summary with relevant stdout/stderr.",
parameters: { parameters: {
type: "object", type: "object",
properties: { properties: {
@@ -191,11 +192,12 @@ export const CHAT_TOOL_SYSTEM_PROMPT =
"You can use tools to gather up-to-date web information when needed. " + "You can use tools to gather up-to-date web information when needed. " +
"Use web_search for discovery and recent facts, and fetch_url to read the full content of a specific page. " + "Use web_search for discovery and recent facts, and fetch_url to read the full content of a specific page. " +
"Prefer tools when the user asks for current events, verification, sources, or details you do not already have. " + "Prefer tools when the user asks for current events, verification, sources, or details you do not already have. " +
"When you decide tool use is needed, call the tool immediately in the same response; do not say you are running a tool unless you actually call it. " +
(env.CHAT_CODEX_TOOL_ENABLED (env.CHAT_CODEX_TOOL_ENABLED
? "Use codex_exec when a request needs substantial coding work, repository inspection, shell commands, tests, debugging, or another complex task suited to a persistent Codex workspace. Provide codex_exec a complete prompt with the goal, constraints, and expected report-back format. " ? "Use codex_exec when a request needs substantial coding work, repository inspection, shell commands, tests, debugging, or another complex task suited to a persistent Codex workspace. Provide codex_exec a complete prompt with the goal, constraints, assumptions, and expected report-back format. Never ask codex_exec to wait for user input or run interactive commands. "
: "") + : "") +
(env.CHAT_SHELL_TOOL_ENABLED (env.CHAT_SHELL_TOOL_ENABLED
? "Use shell_exec for direct command-line work on the remote devbox, including quick Python programs, calculations, file inspection, running tests, and small scripts. " ? "Use shell_exec for direct non-interactive command-line work on the remote devbox, including quick Python programs, calculations, file inspection, running tests, and small scripts. "
: "") + : "") +
"Do not fabricate tool outputs; reason only from provided tool results."; "Do not fabricate tool outputs; reason only from provided tool results.";
@@ -535,7 +537,20 @@ function buildDevboxSshTarget() {
function buildRemoteCodexCommand(prompt: string) { function buildRemoteCodexCommand(prompt: string) {
const workdir = env.CHAT_CODEX_REMOTE_WORKDIR.trim(); const workdir = env.CHAT_CODEX_REMOTE_WORKDIR.trim();
const codexCommand = `codex exec ${shellQuote(prompt)}`; const wrappedPrompt = [
"You are running in a non-interactive batch environment.",
"",
"Rules:",
"- Do not ask questions or wait for user input.",
"- Do not use interactive commands, editors, pagers, or prompts.",
"- If details are ambiguous, make a reasonable assumption and continue.",
"- Complete the task in one run, including any requested file edits, commands, and verification.",
"- End with a concise final report that includes changed files, commands run, and outcomes.",
"",
"Task:",
prompt,
].join("\n");
const codexCommand = `codex exec --skip-git-repo-check ${shellQuote(wrappedPrompt)} < /dev/null`;
return `mkdir -p ${shellQuote(workdir)} && cd ${shellQuote(workdir)} && ${codexCommand}`; return `mkdir -p ${shellQuote(workdir)} && cd ${shellQuote(workdir)} && ${codexCommand}`;
} }
@@ -595,6 +610,7 @@ async function runCodexExecTool(input: unknown): Promise<ToolRunOutcome> {
const run = async (keyPath?: string) => { const run = async (keyPath?: string) => {
const sshArgs = [ const sshArgs = [
"-n",
"-o", "-o",
"BatchMode=yes", "BatchMode=yes",
"-o", "-o",
@@ -662,6 +678,7 @@ async function runShellExecTool(input: unknown): Promise<ToolRunOutcome> {
const run = async (keyPath?: string) => { const run = async (keyPath?: string) => {
const sshArgs = [ const sshArgs = [
"-n",
"-o", "-o",
"BatchMode=yes", "BatchMode=yes",
"-o", "-o",
@@ -756,6 +773,31 @@ function buildEventArgs(name: string, args: Record<string, unknown>) {
return args; return args;
} }
/**
 * Heuristically detect an assistant message that *claims* it is about to run
 * a tool ("let me run...", "calling shell_exec...") without actually emitting
 * a tool call. Callers use this to trigger one corrective retry round.
 *
 * Returns false for empty text, for long messages (likely a real answer, not
 * a dangling promise), and for messages that hand control back to the user
 * ("let me know if you want...").
 */
function looksLikeDanglingToolIntent(text: string) {
  // Strip common markdown punctuation, then collapse whitespace. Note this
  // also turns "shell_exec" into "shell exec" (underscore is in the stripped
  // set), which the patterns below must account for.
  const normalized = text
    .toLowerCase()
    .replace(/[`*_>#-]/g, " ")
    .replace(/\s+/g, " ")
    .trim();
  if (!normalized) return false;
  if (normalized.length > 800) return false;
  // Messages that defer to the user are a final answer, not a dangling intent.
  if (/\blet me know\b/.test(normalized) || /\bif you (want|would like)\b/.test(normalized)) return false;
  return (
    // Fix: underscores are normalized to spaces above, so the previous literal
    // "shell_exec"/"codex_exec" alternatives could never match; accept the
    // space- or underscore-separated form instead.
    /\b(calling|running|executing|trying|checking|testing)\b.{0,80}\b(now|it|tool|command|(shell|codex)[ _]?exec)\b/.test(normalized) ||
    /\b(let me|i'?ll|i will)\b.{0,120}\b(run|execute|call|try|check|test)\b/.test(normalized) ||
    /\b(stand by|hang on|one moment)\b/.test(normalized)
  );
}
/**
 * Record the assistant's tool-less "I'll run a tool" message in the
 * conversation, followed by an internal system nudge telling the model to
 * either make the tool call for real or answer directly.
 */
function appendDanglingToolIntentCorrection(conversation: any[], text: string) {
  const assistantMessage = { role: "assistant", content: text };
  const correctionMessage = {
    role: "system",
    content:
      "Internal correction: the previous assistant message claimed it would run a tool, but no tool call was made. If the task needs an available tool, call it now. Otherwise provide the final answer directly without saying you will run a tool.",
  };
  conversation.push(assistantMessage, correctionMessage);
}
function mergeUsage(acc: Required<ToolAwareUsage>, usage: any) { function mergeUsage(acc: Required<ToolAwareUsage>, usage: any) {
if (!usage) return false; if (!usage) return false;
acc.inputTokens += usage.prompt_tokens ?? 0; acc.inputTokens += usage.prompt_tokens ?? 0;
@@ -833,6 +875,7 @@ export async function runToolAwareOpenAIChat(params: ToolAwareCompletionParams):
const usageAcc: Required<ToolAwareUsage> = { inputTokens: 0, outputTokens: 0, totalTokens: 0 }; const usageAcc: Required<ToolAwareUsage> = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
let sawUsage = false; let sawUsage = false;
let totalToolCalls = 0; let totalToolCalls = 0;
let danglingToolIntentRetries = 0;
for (let round = 0; round < MAX_TOOL_ROUNDS; round += 1) { for (let round = 0; round < MAX_TOOL_ROUNDS; round += 1) {
const completion = await params.client.chat.completions.create({ const completion = await params.client.chat.completions.create({
@@ -858,8 +901,14 @@ export async function runToolAwareOpenAIChat(params: ToolAwareCompletionParams):
const toolCalls = Array.isArray(message.tool_calls) ? message.tool_calls : []; const toolCalls = Array.isArray(message.tool_calls) ? message.tool_calls : [];
if (!toolCalls.length) { if (!toolCalls.length) {
const text = typeof message.content === "string" ? message.content : "";
if (danglingToolIntentRetries < MAX_DANGLING_TOOL_INTENT_RETRIES && looksLikeDanglingToolIntent(text)) {
danglingToolIntentRetries += 1;
appendDanglingToolIntentCorrection(conversation, text);
continue;
}
return { return {
text: typeof message.content === "string" ? message.content : "", text,
usage: sawUsage ? usageAcc : undefined, usage: sawUsage ? usageAcc : undefined,
raw: { responses: rawResponses, toolCallsUsed: totalToolCalls }, raw: { responses: rawResponses, toolCallsUsed: totalToolCalls },
toolEvents, toolEvents,
@@ -914,6 +963,7 @@ export async function* runToolAwareOpenAIChatStream(
const usageAcc: Required<ToolAwareUsage> = { inputTokens: 0, outputTokens: 0, totalTokens: 0 }; const usageAcc: Required<ToolAwareUsage> = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
let sawUsage = false; let sawUsage = false;
let totalToolCalls = 0; let totalToolCalls = 0;
let danglingToolIntentRetries = 0;
for (let round = 0; round < MAX_TOOL_ROUNDS; round += 1) { for (let round = 0; round < MAX_TOOL_ROUNDS; round += 1) {
const stream = await params.client.chat.completions.create({ const stream = await params.client.chat.completions.create({
@@ -938,9 +988,6 @@ export async function* runToolAwareOpenAIChatStream(
const deltaText = choice?.delta?.content ?? ""; const deltaText = choice?.delta?.content ?? "";
if (typeof deltaText === "string" && deltaText.length) { if (typeof deltaText === "string" && deltaText.length) {
roundText += deltaText; roundText += deltaText;
if (roundToolCalls.size === 0) {
yield { type: "delta", text: deltaText };
}
} }
const deltaToolCalls = Array.isArray(choice?.delta?.tool_calls) ? choice.delta.tool_calls : []; const deltaToolCalls = Array.isArray(choice?.delta?.tool_calls) ? choice.delta.tool_calls : [];
@@ -969,6 +1016,14 @@ export async function* runToolAwareOpenAIChatStream(
})); }));
if (!normalizedToolCalls.length) { if (!normalizedToolCalls.length) {
if (danglingToolIntentRetries < MAX_DANGLING_TOOL_INTENT_RETRIES && looksLikeDanglingToolIntent(roundText)) {
danglingToolIntentRetries += 1;
appendDanglingToolIntentCorrection(conversation, roundText);
continue;
}
if (roundText) {
yield { type: "delta", text: roundText };
}
yield { yield {
type: "done", type: "done",
result: { result: {
@@ -982,7 +1037,7 @@ export async function* runToolAwareOpenAIChatStream(
} }
totalToolCalls += normalizedToolCalls.length; totalToolCalls += normalizedToolCalls.length;
conversation.push({ const assistantToolCallMessage: any = {
role: "assistant", role: "assistant",
tool_calls: normalizedToolCalls.map((call) => ({ tool_calls: normalizedToolCalls.map((call) => ({
id: call.id, id: call.id,
@@ -992,7 +1047,11 @@ export async function* runToolAwareOpenAIChatStream(
arguments: call.arguments, arguments: call.arguments,
}, },
})), })),
}); };
if (roundText) {
assistantToolCallMessage.content = roundText;
}
conversation.push(assistantToolCallMessage);
for (const call of normalizedToolCalls) { for (const call of normalizedToolCalls) {
const { event, toolResult } = await executeToolCallAndBuildEvent(call, params); const { event, toolResult } = await executeToolCallAndBuildEvent(call, params);

View File

@@ -146,8 +146,13 @@ export function buildOpenAIConversationMessage(message: ChatMessage) {
return out; return out;
} }
const ANTHROPIC_NO_SERVER_TOOLS_PROMPT =
"This Anthropic backend path does not have server-managed tool calls. Do not claim to run shell commands, Codex tasks, web searches, or fetch URLs. If the user asks for tool execution, explain that they should switch to OpenAI or xAI in this app for tool-enabled chat.";
export function getAnthropicSystemPrompt(messages: ChatMessage[]) { export function getAnthropicSystemPrompt(messages: ChatMessage[]) {
return messages.find((message) => message.role === "system")?.content; return [ANTHROPIC_NO_SERVER_TOOLS_PROMPT, messages.find((message) => message.role === "system")?.content]
.filter(Boolean)
.join("\n\n");
} }
export function buildAnthropicConversationMessage(message: ChatMessage) { export function buildAnthropicConversationMessage(message: ChatMessage) {

View File

@@ -23,6 +23,16 @@ function uniqSorted(models: string[]) {
return [...new Set(models.map((value) => value.trim()).filter(Boolean))].sort((a, b) => a.localeCompare(b)); return [...new Set(models.map((value) => value.trim()).filter(Boolean))].sort((a, b) => a.localeCompare(b));
} }
/**
 * Whether an OpenAI model id is expected to work with the Chat Completions
 * API: rejects embedding/moderation/audio/image/search/computer-use variants
 * and "gpt-N-pro" tiers, then requires a chat-style id prefix.
 */
function isLikelyOpenAIChatCompletionsModel(model: string) {
  const id = model.toLowerCase();
  const excludedFragments = [
    "embedding",
    "moderation",
    "audio",
    "realtime",
    "transcribe",
    "tts",
    "image",
    "dall-e",
    "sora",
    "search",
    "computer-use",
  ];
  if (excludedFragments.some((fragment) => id.includes(fragment))) return false;
  // "gpt-<version>-pro" models are not served through Chat Completions.
  if (/^gpt-[\d.]+-pro(?:-|$)/.test(id)) return false;
  return /^(gpt-|o\d|chatgpt-)/.test(id);
}
async function withTimeout<T>(promise: Promise<T>, timeoutMs: number, label: string) { async function withTimeout<T>(promise: Promise<T>, timeoutMs: number, label: string) {
let timeoutId: NodeJS.Timeout | null = null; let timeoutId: NodeJS.Timeout | null = null;
try { try {
@@ -42,7 +52,7 @@ async function withTimeout<T>(promise: Promise<T>, timeoutMs: number, label: str
async function fetchProviderModels(provider: Provider) { async function fetchProviderModels(provider: Provider) {
if (provider === "openai") { if (provider === "openai") {
const page = await openaiClient().models.list(); const page = await openaiClient().models.list();
return uniqSorted(page.data.map((model) => model.id)); return uniqSorted(page.data.map((model) => model.id).filter(isLikelyOpenAIChatCompletionsModel));
} }
if (provider === "anthropic") { if (provider === "anthropic") {