Various fixes for tool calling

2026-05-02 21:19:52 -07:00
parent d579b5bf75
commit 8d6c069a33
8 changed files with 97 additions and 17 deletions
--- a/server/src/llm/chat-tools.ts
+++ b/server/src/llm/chat-tools.ts
@@ -12,7 +12,7 @@ import { searchSearxng } from "../search/searxng.js";
 import { buildOpenAIConversationMessage } from "./message-content.js";
 import type { ChatMessage } from "./types.js";

-const MAX_TOOL_ROUNDS = 4;
+const MAX_TOOL_ROUNDS = env.CHAT_MAX_TOOL_ROUNDS;
 const DEFAULT_WEB_RESULTS = 5;
 const MAX_WEB_RESULTS = 10;
 const DEFAULT_FETCH_MAX_CHARACTERS = 12_000;
@@ -25,6 +25,7 @@ const MAX_SHELL_COMMAND_CHARACTERS = 20_000;
 const DEFAULT_SHELL_MAX_OUTPUT_CHARACTERS = 24_000;
 const MAX_SHELL_MAX_OUTPUT_CHARACTERS = 80_000;
 const REMOTE_EXEC_MAX_BUFFER_BYTES = 1_000_000;
+const MAX_DANGLING_TOOL_INTENT_RETRIES = 1;

 const execFileAsync = promisify(execFile);

@@ -70,7 +71,7 @@ const CODEX_EXEC_TOOL = {
  function: {
    name: "codex_exec",
    description:
-      "Delegate a coding, terminal, or multi-step software task to a persistent remote Codex CLI workspace. Use for complex code changes, repository inspection, running programs/tests, debugging build failures, or other tasks that need a real shell. Return the remote Codex summary and relevant stdout/stderr.",
+      "Delegate a coding, terminal, or multi-step software task to a persistent remote Codex CLI workspace. Use for complex code changes, repository inspection, running programs/tests, debugging build failures, or other tasks that need a real shell. The task runs non-interactively; the remote Codex instance must make reasonable assumptions, complete the task, and return a final summary with relevant stdout/stderr.",
    parameters: {
      type: "object",
      properties: {
@@ -191,11 +192,12 @@ export const CHAT_TOOL_SYSTEM_PROMPT =
  "You can use tools to gather up-to-date web information when needed. " +
  "Use web_search for discovery and recent facts, and fetch_url to read the full content of a specific page. " +
  "Prefer tools when the user asks for current events, verification, sources, or details you do not already have. " +
+  "When you decide tool use is needed, call the tool immediately in the same response; do not say you are running a tool unless you actually call it. " +
  (env.CHAT_CODEX_TOOL_ENABLED
-    ? "Use codex_exec when a request needs substantial coding work, repository inspection, shell commands, tests, debugging, or another complex task suited to a persistent Codex workspace. Provide codex_exec a complete prompt with the goal, constraints, and expected report-back format. "
+    ? "Use codex_exec when a request needs substantial coding work, repository inspection, shell commands, tests, debugging, or another complex task suited to a persistent Codex workspace. Provide codex_exec a complete prompt with the goal, constraints, assumptions, and expected report-back format. Never ask codex_exec to wait for user input or run interactive commands. "
    : "") +
  (env.CHAT_SHELL_TOOL_ENABLED
-    ? "Use shell_exec for direct command-line work on the remote devbox, including quick Python programs, calculations, file inspection, running tests, and small scripts. "
+    ? "Use shell_exec for direct non-interactive command-line work on the remote devbox, including quick Python programs, calculations, file inspection, running tests, and small scripts. "
    : "") +
  "Do not fabricate tool outputs; reason only from provided tool results.";

@@ -535,7 +537,20 @@ function buildDevboxSshTarget() {

 function buildRemoteCodexCommand(prompt: string) {
  const workdir = env.CHAT_CODEX_REMOTE_WORKDIR.trim();
-  const codexCommand = `codex exec ${shellQuote(prompt)}`;
+  const wrappedPrompt = [ // Fixed batch-mode preamble placed ahead of every delegated task; the entries are joined with newlines below.
+    "You are running in a non-interactive batch environment.",
+    "",
+    "Rules:",
+    "- Do not ask questions or wait for user input.",
+    "- Do not use interactive commands, editors, pagers, or prompts.",
+    "- If details are ambiguous, make a reasonable assumption and continue.",
+    "- Complete the task in one run, including any requested file edits, commands, and verification.",
+    "- End with a concise final report that includes changed files, commands run, and outcomes.",
+    "",
+    "Task:",
+    prompt, // The caller's task text goes last, unmodified, under the Task heading.
+  ].join("\n");
+  const codexCommand = `codex exec --skip-git-repo-check ${shellQuote(wrappedPrompt)} < /dev/null`; // The wrapped prompt is passed as a single shell-quoted argument and stdin is redirected from /dev/null. NOTE(review): --skip-git-repo-check presumably allows running outside a git repository; confirm against the Codex CLI docs.
  return `mkdir -p ${shellQuote(workdir)} && cd ${shellQuote(workdir)} && ${codexCommand}`;
 }

@@ -595,6 +610,7 @@ async function runCodexExecTool(input: unknown): Promise<ToolRunOutcome> {

  const run = async (keyPath?: string) => {
    const sshArgs = [
+      "-n",
      "-o",
      "BatchMode=yes",
      "-o",
@@ -662,6 +678,7 @@ async function runShellExecTool(input: unknown): Promise<ToolRunOutcome> {

  const run = async (keyPath?: string) => {
    const sshArgs = [
+      "-n",
      "-o",
      "BatchMode=yes",
      "-o",
@@ -756,6 +773,31 @@ function buildEventArgs(name: string, args: Record<string, unknown>) {
  return args;
 }

+function looksLikeDanglingToolIntent(text: string) { // Heuristic: returns true when a short message reads like an announcement that a tool is about to be run; the callers decide what to do with that.
+  const normalized = text // Normalise before matching: lowercase, blank out markdown punctuation, collapse whitespace, trim.
+    .toLowerCase()
+    .replace(/[`*_>#-]/g, " ") // NOTE(review): underscores become spaces here, so the shell_exec / codex_exec alternatives in the first pattern below can never match the normalised text; confirm whether that is intended.
+    .replace(/\s+/g, " ")
+    .trim();
+  if (!normalized) return false; // Empty or whitespace-only text is never flagged.
+  if (normalized.length > 800) return false; // Messages longer than 800 normalised characters are never flagged.
+  if (/\blet me know\b/.test(normalized) || /\bif you (want|would like)\b/.test(normalized)) return false; // Text containing these offer-style phrases is never flagged.
+  return (
+    /\b(calling|running|executing|trying|checking|testing)\b.{0,80}\b(now|it|tool|command|shell_exec|codex_exec)\b/.test(normalized) || // An -ing action verb followed within 80 characters by one of the listed object words.
+    /\b(let me|i'?ll|i will)\b.{0,120}\b(run|execute|call|try|check|test)\b/.test(normalized) || // A first-person lead-in followed within 120 characters by an action verb.
+    /\b(stand by|hang on|one moment)\b/.test(normalized) // Explicit please-wait phrasing.
+  );
+}
+
+function appendDanglingToolIntentCorrection(conversation: any[], text: string) { // Mutates the conversation array in place: appends the given text as an assistant message, then a corrective system message; returns nothing.
+  conversation.push({ role: "assistant", content: text }); // The text is recorded first so the correction that follows refers to a message present in the history.
+  conversation.push({
+    role: "system", // NOTE(review): this system-role message is appended after earlier turns; confirm the target chat API accepts a system message at this position.
+    content:
+      "Internal correction: the previous assistant message claimed it would run a tool, but no tool call was made. If the task needs an available tool, call it now. Otherwise provide the final answer directly without saying you will run a tool.",
+  });
+}
+
 function mergeUsage(acc: Required<ToolAwareUsage>, usage: any) {
  if (!usage) return false;
  acc.inputTokens += usage.prompt_tokens ?? 0;
@@ -833,6 +875,7 @@ export async function runToolAwareOpenAIChat(params: ToolAwareCompletionParams):
  const usageAcc: Required<ToolAwareUsage> = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
  let sawUsage = false;
  let totalToolCalls = 0;
+  let danglingToolIntentRetries = 0;

  for (let round = 0; round < MAX_TOOL_ROUNDS; round += 1) {
    const completion = await params.client.chat.completions.create({
@@ -858,8 +901,14 @@ export async function runToolAwareOpenAIChat(params: ToolAwareCompletionParams):

    const toolCalls = Array.isArray(message.tool_calls) ? message.tool_calls : [];
    if (!toolCalls.length) {
+      const text = typeof message.content === "string" ? message.content : "";
+      if (danglingToolIntentRetries < MAX_DANGLING_TOOL_INTENT_RETRIES && looksLikeDanglingToolIntent(text)) {
+        danglingToolIntentRetries += 1;
+        appendDanglingToolIntentCorrection(conversation, text);
+        continue;
+      }
      return {
-        text: typeof message.content === "string" ? message.content : "",
+        text,
        usage: sawUsage ? usageAcc : undefined,
        raw: { responses: rawResponses, toolCallsUsed: totalToolCalls },
        toolEvents,
@@ -914,6 +963,7 @@ export async function* runToolAwareOpenAIChatStream(
  const usageAcc: Required<ToolAwareUsage> = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
  let sawUsage = false;
  let totalToolCalls = 0;
+  let danglingToolIntentRetries = 0;

  for (let round = 0; round < MAX_TOOL_ROUNDS; round += 1) {
    const stream = await params.client.chat.completions.create({
@@ -938,9 +988,6 @@ export async function* runToolAwareOpenAIChatStream(
      const deltaText = choice?.delta?.content ?? "";
      if (typeof deltaText === "string" && deltaText.length) {
        roundText += deltaText;
-        if (roundToolCalls.size === 0) {
-          yield { type: "delta", text: deltaText };
-        }
      }

      const deltaToolCalls = Array.isArray(choice?.delta?.tool_calls) ? choice.delta.tool_calls : [];
@@ -969,6 +1016,14 @@ export async function* runToolAwareOpenAIChatStream(
      }));

    if (!normalizedToolCalls.length) {
+      if (danglingToolIntentRetries < MAX_DANGLING_TOOL_INTENT_RETRIES && looksLikeDanglingToolIntent(roundText)) {
+        danglingToolIntentRetries += 1;
+        appendDanglingToolIntentCorrection(conversation, roundText);
+        continue;
+      }
+      if (roundText) {
+        yield { type: "delta", text: roundText };
+      }
      yield {
        type: "done",
        result: {
@@ -982,7 +1037,7 @@ export async function* runToolAwareOpenAIChatStream(
    }

    totalToolCalls += normalizedToolCalls.length;
-    conversation.push({
+    const assistantToolCallMessage: any = {
      role: "assistant",
      tool_calls: normalizedToolCalls.map((call) => ({
        id: call.id,
@@ -992,7 +1047,11 @@ export async function* runToolAwareOpenAIChatStream(
          arguments: call.arguments,
        },
      })),
-    });
+    };
+    if (roundText) {
+      assistantToolCallMessage.content = roundText;
+    }
+    conversation.push(assistantToolCallMessage);

    for (const call of normalizedToolCalls) {
      const { event, toolResult } = await executeToolCallAndBuildEvent(call, params);