Various fixes for tool calling

This commit is contained in:
2026-05-02 21:19:52 -07:00
parent d579b5bf75
commit 8d6c069a33
8 changed files with 97 additions and 17 deletions

View File

@@ -12,7 +12,7 @@ import { searchSearxng } from "../search/searxng.js";
import { buildOpenAIConversationMessage } from "./message-content.js";
import type { ChatMessage } from "./types.js";
const MAX_TOOL_ROUNDS = 4;
const MAX_TOOL_ROUNDS = env.CHAT_MAX_TOOL_ROUNDS;
const DEFAULT_WEB_RESULTS = 5;
const MAX_WEB_RESULTS = 10;
const DEFAULT_FETCH_MAX_CHARACTERS = 12_000;
@@ -25,6 +25,7 @@ const MAX_SHELL_COMMAND_CHARACTERS = 20_000;
const DEFAULT_SHELL_MAX_OUTPUT_CHARACTERS = 24_000;
const MAX_SHELL_MAX_OUTPUT_CHARACTERS = 80_000;
const REMOTE_EXEC_MAX_BUFFER_BYTES = 1_000_000;
const MAX_DANGLING_TOOL_INTENT_RETRIES = 1;
const execFileAsync = promisify(execFile);
@@ -70,7 +71,7 @@ const CODEX_EXEC_TOOL = {
function: {
name: "codex_exec",
description:
"Delegate a coding, terminal, or multi-step software task to a persistent remote Codex CLI workspace. Use for complex code changes, repository inspection, running programs/tests, debugging build failures, or other tasks that need a real shell. Return the remote Codex summary and relevant stdout/stderr.",
"Delegate a coding, terminal, or multi-step software task to a persistent remote Codex CLI workspace. Use for complex code changes, repository inspection, running programs/tests, debugging build failures, or other tasks that need a real shell. The task runs non-interactively; the remote Codex instance must make reasonable assumptions, complete the task, and return a final summary with relevant stdout/stderr.",
parameters: {
type: "object",
properties: {
@@ -191,11 +192,12 @@ export const CHAT_TOOL_SYSTEM_PROMPT =
"You can use tools to gather up-to-date web information when needed. " +
"Use web_search for discovery and recent facts, and fetch_url to read the full content of a specific page. " +
"Prefer tools when the user asks for current events, verification, sources, or details you do not already have. " +
"When you decide tool use is needed, call the tool immediately in the same response; do not say you are running a tool unless you actually call it. " +
(env.CHAT_CODEX_TOOL_ENABLED
? "Use codex_exec when a request needs substantial coding work, repository inspection, shell commands, tests, debugging, or another complex task suited to a persistent Codex workspace. Provide codex_exec a complete prompt with the goal, constraints, and expected report-back format. "
? "Use codex_exec when a request needs substantial coding work, repository inspection, shell commands, tests, debugging, or another complex task suited to a persistent Codex workspace. Provide codex_exec a complete prompt with the goal, constraints, assumptions, and expected report-back format. Never ask codex_exec to wait for user input or run interactive commands. "
: "") +
(env.CHAT_SHELL_TOOL_ENABLED
? "Use shell_exec for direct command-line work on the remote devbox, including quick Python programs, calculations, file inspection, running tests, and small scripts. "
? "Use shell_exec for direct non-interactive command-line work on the remote devbox, including quick Python programs, calculations, file inspection, running tests, and small scripts. "
: "") +
"Do not fabricate tool outputs; reason only from provided tool results.";
@@ -535,7 +537,20 @@ function buildDevboxSshTarget() {
function buildRemoteCodexCommand(prompt: string) {
const workdir = env.CHAT_CODEX_REMOTE_WORKDIR.trim();
const codexCommand = `codex exec ${shellQuote(prompt)}`;
const wrappedPrompt = [
"You are running in a non-interactive batch environment.",
"",
"Rules:",
"- Do not ask questions or wait for user input.",
"- Do not use interactive commands, editors, pagers, or prompts.",
"- If details are ambiguous, make a reasonable assumption and continue.",
"- Complete the task in one run, including any requested file edits, commands, and verification.",
"- End with a concise final report that includes changed files, commands run, and outcomes.",
"",
"Task:",
prompt,
].join("\n");
const codexCommand = `codex exec --skip-git-repo-check ${shellQuote(wrappedPrompt)} < /dev/null`;
return `mkdir -p ${shellQuote(workdir)} && cd ${shellQuote(workdir)} && ${codexCommand}`;
}
@@ -595,6 +610,7 @@ async function runCodexExecTool(input: unknown): Promise<ToolRunOutcome> {
const run = async (keyPath?: string) => {
const sshArgs = [
"-n",
"-o",
"BatchMode=yes",
"-o",
@@ -662,6 +678,7 @@ async function runShellExecTool(input: unknown): Promise<ToolRunOutcome> {
const run = async (keyPath?: string) => {
const sshArgs = [
"-n",
"-o",
"BatchMode=yes",
"-o",
@@ -756,6 +773,31 @@ function buildEventArgs(name: string, args: Record<string, unknown>) {
return args;
}
function looksLikeDanglingToolIntent(text: string) {
const normalized = text
.toLowerCase()
.replace(/[`*_>#-]/g, " ")
.replace(/\s+/g, " ")
.trim();
if (!normalized) return false;
if (normalized.length > 800) return false;
if (/\blet me know\b/.test(normalized) || /\bif you (want|would like)\b/.test(normalized)) return false;
return (
/\b(calling|running|executing|trying|checking|testing)\b.{0,80}\b(now|it|tool|command|shell_exec|codex_exec)\b/.test(normalized) ||
/\b(let me|i'?ll|i will)\b.{0,120}\b(run|execute|call|try|check|test)\b/.test(normalized) ||
/\b(stand by|hang on|one moment)\b/.test(normalized)
);
}
function appendDanglingToolIntentCorrection(conversation: any[], text: string) {
conversation.push({ role: "assistant", content: text });
conversation.push({
role: "system",
content:
"Internal correction: the previous assistant message claimed it would run a tool, but no tool call was made. If the task needs an available tool, call it now. Otherwise provide the final answer directly without saying you will run a tool.",
});
}
function mergeUsage(acc: Required<ToolAwareUsage>, usage: any) {
if (!usage) return false;
acc.inputTokens += usage.prompt_tokens ?? 0;
@@ -833,6 +875,7 @@ export async function runToolAwareOpenAIChat(params: ToolAwareCompletionParams):
const usageAcc: Required<ToolAwareUsage> = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
let sawUsage = false;
let totalToolCalls = 0;
let danglingToolIntentRetries = 0;
for (let round = 0; round < MAX_TOOL_ROUNDS; round += 1) {
const completion = await params.client.chat.completions.create({
@@ -858,8 +901,14 @@ export async function runToolAwareOpenAIChat(params: ToolAwareCompletionParams):
const toolCalls = Array.isArray(message.tool_calls) ? message.tool_calls : [];
if (!toolCalls.length) {
const text = typeof message.content === "string" ? message.content : "";
if (danglingToolIntentRetries < MAX_DANGLING_TOOL_INTENT_RETRIES && looksLikeDanglingToolIntent(text)) {
danglingToolIntentRetries += 1;
appendDanglingToolIntentCorrection(conversation, text);
continue;
}
return {
text: typeof message.content === "string" ? message.content : "",
text,
usage: sawUsage ? usageAcc : undefined,
raw: { responses: rawResponses, toolCallsUsed: totalToolCalls },
toolEvents,
@@ -914,6 +963,7 @@ export async function* runToolAwareOpenAIChatStream(
const usageAcc: Required<ToolAwareUsage> = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
let sawUsage = false;
let totalToolCalls = 0;
let danglingToolIntentRetries = 0;
for (let round = 0; round < MAX_TOOL_ROUNDS; round += 1) {
const stream = await params.client.chat.completions.create({
@@ -938,9 +988,6 @@ export async function* runToolAwareOpenAIChatStream(
const deltaText = choice?.delta?.content ?? "";
if (typeof deltaText === "string" && deltaText.length) {
roundText += deltaText;
if (roundToolCalls.size === 0) {
yield { type: "delta", text: deltaText };
}
}
const deltaToolCalls = Array.isArray(choice?.delta?.tool_calls) ? choice.delta.tool_calls : [];
@@ -969,6 +1016,14 @@ export async function* runToolAwareOpenAIChatStream(
}));
if (!normalizedToolCalls.length) {
if (danglingToolIntentRetries < MAX_DANGLING_TOOL_INTENT_RETRIES && looksLikeDanglingToolIntent(roundText)) {
danglingToolIntentRetries += 1;
appendDanglingToolIntentCorrection(conversation, roundText);
continue;
}
if (roundText) {
yield { type: "delta", text: roundText };
}
yield {
type: "done",
result: {
@@ -982,7 +1037,7 @@ export async function* runToolAwareOpenAIChatStream(
}
totalToolCalls += normalizedToolCalls.length;
conversation.push({
const assistantToolCallMessage: any = {
role: "assistant",
tool_calls: normalizedToolCalls.map((call) => ({
id: call.id,
@@ -992,7 +1047,11 @@ export async function* runToolAwareOpenAIChatStream(
arguments: call.arguments,
},
})),
});
};
if (roundText) {
assistantToolCallMessage.content = roundText;
}
conversation.push(assistantToolCallMessage);
for (const call of normalizedToolCalls) {
const { event, toolResult } = await executeToolCallAndBuildEvent(call, params);