// Sybil-2/server/src/llm/chat-tools.ts

import { execFile } from "node:child_process";
import { mkdtemp, rm, writeFile } from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { promisify } from "node:util";
import { convert as htmlToText } from "html-to-text";
import type OpenAI from "openai";
import { z } from "zod";
import { env } from "../env.js";
import { exaClient } from "../search/exa.js";
import { searchSearxng } from "../search/searxng.js";
import { buildOpenAIConversationMessage, buildOpenAIResponsesInputMessage } from "./message-content.js";
import type { ChatMessage } from "./types.js";

/** Upper bound on model/tool round-trips in the tool loop; read from server configuration. */
const MAX_TOOL_ROUNDS = env.CHAT_MAX_TOOL_ROUNDS;
// web_search: default and maximum number of results.
const DEFAULT_WEB_RESULTS = 5;
const MAX_WEB_RESULTS = 10;
// fetch_url: default and maximum returned text length, plus the abort timer duration.
const DEFAULT_FETCH_MAX_CHARACTERS = 12_000;
const MAX_FETCH_MAX_CHARACTERS = 50_000;
const FETCH_TIMEOUT_MS = 12_000;
// codex_exec: maximum prompt length and default/maximum returned output length.
const MAX_CODEX_PROMPT_CHARACTERS = 60_000;
const DEFAULT_CODEX_MAX_OUTPUT_CHARACTERS = 24_000;
const MAX_CODEX_MAX_OUTPUT_CHARACTERS = 80_000;
// shell_exec: maximum command length and default/maximum returned output length.
const MAX_SHELL_COMMAND_CHARACTERS = 20_000;
const DEFAULT_SHELL_MAX_OUTPUT_CHARACTERS = 24_000;
const MAX_SHELL_MAX_OUTPUT_CHARACTERS = 80_000;
/** `maxBuffer` handed to execFile for the ssh child process used by the remote tools. */
const REMOTE_EXEC_MAX_BUFFER_BYTES = 1_000_000;
/** How many times a reply that only announces a tool run (without calling one) is retried. */
const MAX_DANGLING_TOOL_INTENT_RETRIES = 1;

/** Promise-returning wrapper around `child_process.execFile`. */
const execFileAsync = promisify(execFile);

/**
 * Runtime validation for `web_search` arguments.
 * `.strict()` rejects unknown keys; `numResults` is coerced to a number before range checks.
 */
const WebSearchArgsSchema = z
  .object({
    query: z.string().trim().min(1),
    numResults: z.coerce.number().int().min(1).max(MAX_WEB_RESULTS).optional(),
    type: z.enum(["auto", "fast", "instant"]).optional(),
    includeDomains: z.array(z.string().trim().min(1)).max(25).optional(),
    excludeDomains: z.array(z.string().trim().min(1)).max(25).optional(),
  })
  .strict();

/** Validated `web_search` arguments. */
type WebSearchArgs = z.infer<typeof WebSearchArgsSchema>;

/** Runtime validation for `fetch_url` arguments: a syntactically valid URL and an optional text cap. */
const FetchUrlArgsSchema = z
  .object({
    url: z.string().trim().url(),
    maxCharacters: z.coerce.number().int().min(500).max(MAX_FETCH_MAX_CHARACTERS).optional(),
  })
  .strict();

/** Runtime validation for `codex_exec` arguments: a bounded, non-empty prompt and an optional output cap. */
const CodexExecArgsSchema = z
  .object({
    prompt: z.string().trim().min(1).max(MAX_CODEX_PROMPT_CHARACTERS),
    maxCharacters: z.coerce.number().int().min(1_000).max(MAX_CODEX_MAX_OUTPUT_CHARACTERS).optional(),
  })
  .strict();

/** Validated `codex_exec` arguments. */
type CodexExecArgs = z.infer<typeof CodexExecArgsSchema>;

/** Runtime validation for `shell_exec` arguments: a bounded, non-empty command and an optional output cap. */
const ShellExecArgsSchema = z
  .object({
    command: z.string().trim().min(1).max(MAX_SHELL_COMMAND_CHARACTERS),
    maxCharacters: z.coerce.number().int().min(1_000).max(MAX_SHELL_MAX_OUTPUT_CHARACTERS).optional(),
  })
  .strict();

/** Validated `shell_exec` arguments. */
type ShellExecArgs = z.infer<typeof ShellExecArgsSchema>;

/**
 * Function-tool definition advertised to the model for `codex_exec`.
 * The JSON schema here describes the same fields that CodexExecArgsSchema validates at call time.
 */
const CODEX_EXEC_TOOL = {
  type: "function",
  function: {
    name: "codex_exec",
    description:
      "Delegate a coding, terminal, or multi-step software task to a persistent remote Codex CLI workspace. Use for complex code changes, repository inspection, running programs/tests, debugging build failures, or other tasks that need a real shell. The task runs non-interactively; the remote Codex instance must make reasonable assumptions, complete the task, and return a final summary with relevant stdout/stderr.",
    parameters: {
      type: "object",
      properties: {
        prompt: {
          type: "string",
          description:
            "A complete, self-contained instruction for the remote Codex instance. Include the goal, relevant context, constraints, and what result to report back.",
        },
        maxCharacters: {
          type: "integer",
          minimum: 1_000,
          maximum: MAX_CODEX_MAX_OUTPUT_CHARACTERS,
          description: "Maximum stdout/stderr characters returned to the model (default 24000).",
        },
      },
      required: ["prompt"],
      additionalProperties: false,
    },
  },
};

/**
 * Function-tool definition advertised to the model for `shell_exec`.
 * The JSON schema here describes the same fields that ShellExecArgsSchema validates at call time.
 */
const SHELL_EXEC_TOOL = {
  type: "function",
  function: {
    name: "shell_exec",
    description:
      "Run an arbitrary non-interactive shell command on the configured remote devbox, starting in the persistent scratch workspace. Use for quick Python scripts, calculations, file inspection, package/tool checks, tests, and command-line work that needs a real shell. This does not run inside the Sybil server container.",
    parameters: {
      type: "object",
      properties: {
        command: {
          type: "string",
          description:
            "Shell command to run on the devbox. The command is executed with bash -lc when bash exists, otherwise sh -lc, starting in the persistent scratch workspace.",
        },
        maxCharacters: {
          type: "integer",
          minimum: 1_000,
          maximum: MAX_SHELL_MAX_OUTPUT_CHARACTERS,
          description: "Maximum stdout/stderr characters returned to the model (default 24000).",
        },
      },
      required: ["command"],
      additionalProperties: false,
    },
  },
};

/**
 * Tool definitions that are always offered to the model: `web_search` and `fetch_url`.
 * Arguments are re-validated at call time by WebSearchArgsSchema / FetchUrlArgsSchema.
 * NOTE(review): the zod schema caps includeDomains/excludeDomains at 25 entries, which is not
 * stated in the JSON schema below.
 */
const BASE_CHAT_TOOLS: any[] = [
  {
    type: "function",
    function: {
      name: "web_search",
      description:
        "Search the public web for recent or factual information. Returns ranked results with per-result summaries and snippets.",
      parameters: {
        type: "object",
        properties: {
          query: { type: "string", description: "Search query." },
          numResults: {
            type: "integer",
            minimum: 1,
            maximum: MAX_WEB_RESULTS,
            description: "Number of results to return (default 5).",
          },
          type: {
            type: "string",
            enum: ["auto", "fast", "instant"],
            description: "Search mode.",
          },
          includeDomains: {
            type: "array",
            items: { type: "string" },
            description: "Only include these domains.",
          },
          excludeDomains: {
            type: "array",
            items: { type: "string" },
            description: "Exclude these domains.",
          },
        },
        required: ["query"],
        additionalProperties: false,
      },
    },
  },
  {
    type: "function",
    function: {
      name: "fetch_url",
      description:
        "Fetch a webpage by URL and return readable plaintext content extracted from the page for deeper inspection.",
      parameters: {
        type: "object",
        properties: {
          url: { type: "string", description: "Absolute URL to fetch, including http/https." },
          maxCharacters: {
            type: "integer",
            minimum: 500,
            maximum: MAX_FETCH_MAX_CHARACTERS,
            description: "Maximum response text characters returned (default 12000).",
          },
        },
        required: ["url"],
        additionalProperties: false,
      },
    },
  },
];

/**
 * Full tool list: the base tools, plus codex_exec and shell_exec when their
 * respective feature flags are enabled in the environment.
 */
const CHAT_TOOLS: any[] = [
  ...BASE_CHAT_TOOLS,
  ...(env.CHAT_CODEX_TOOL_ENABLED ? [CODEX_EXEC_TOOL] : []),
  ...(env.CHAT_SHELL_TOOL_ENABLED ? [SHELL_EXEC_TOOL] : []),
];

/**
 * CHAT_TOOLS re-shaped for the Responses API: the nested `function` object is
 * flattened onto the tool itself and `strict` is set to false. Entries that are
 * not function tools pass through untouched.
 */
const RESPONSES_CHAT_TOOLS: any[] = CHAT_TOOLS.map((tool) => {
  const isFunctionTool = tool?.type === "function";
  if (!isFunctionTool) {
    return tool;
  }
  const definition = tool.function;
  return {
    type: "function",
    name: definition.name,
    description: definition.description,
    parameters: definition.parameters,
    strict: false,
  };
});

/**
 * System prompt prepended to every tool-aware conversation. The codex_exec and
 * shell_exec guidance is only included when the corresponding feature flag is on,
 * so the prompt never mentions a tool that is not offered.
 */
export const CHAT_TOOL_SYSTEM_PROMPT =
  "You can use tools to gather up-to-date web information when needed. " +
  "Use web_search for discovery and recent facts, and fetch_url to read the full content of a specific page. " +
  "Prefer tools when the user asks for current events, verification, sources, or details you do not already have. " +
  "When you decide tool use is needed, call the tool immediately in the same response; do not say you are running a tool unless you actually call it. " +
  (env.CHAT_CODEX_TOOL_ENABLED
    ? "Use codex_exec when a request needs substantial coding work, repository inspection, shell commands, tests, debugging, or another complex task suited to a persistent Codex workspace. Provide codex_exec a complete prompt with the goal, constraints, assumptions, and expected report-back format. Never ask codex_exec to wait for user input or run interactive commands. "
    : "") +
  (env.CHAT_SHELL_TOOL_ENABLED
    ? "Use shell_exec for direct non-interactive command-line work on the remote devbox, including quick Python programs, calculations, file inspection, running tests, and small scripts. "
    : "") +
  "Do not fabricate tool outputs; reason only from provided tool results.";

/** Result of running one tool. `ok` decides completed vs failed; all other fields are tool-specific. */
type ToolRunOutcome = {
  ok: boolean;
  [key: string]: unknown;
};

/** Token counts accumulated across the model calls of one completion. */
type ToolAwareUsage = {
  inputTokens?: number;
  outputTokens?: number;
  totalTokens?: number;
};

/** Final result of a tool-aware completion: answer text, usage, raw payload and the tool calls made. */
type ToolAwareCompletionResult = {
  text: string;
  usage?: ToolAwareUsage;
  raw: unknown;
  toolEvents: ToolExecutionEvent[];
};

/** Events emitted by the streaming variant: text deltas, tool-call reports, then a final `done`. */
export type ToolAwareStreamingEvent =
  | { type: "delta"; text: string }
  | { type: "tool_call"; event: ToolExecutionEvent }
  | { type: "done"; result: ToolAwareCompletionResult };

/** Inputs for a tool-aware completion. */
type ToolAwareCompletionParams = {
  client: OpenAI;
  model: string;
  messages: ChatMessage[];
  temperature?: number;
  maxTokens?: number;
  /** Invoked (and awaited) after each tool call has finished and been logged. */
  onToolEvent?: (event: ToolExecutionEvent) => void | Promise<void>;
  /** Extra fields merged into each tool-call log line. */
  logContext?: {
    provider: string;
    model: string;
    chatId?: string;
  };
};

/** Record of a single executed tool call, used for logging, persistence and client updates. */
export type ToolExecutionEvent = {
  toolCallId: string;
  name: string;
  status: "completed" | "failed";
  /** One-sentence human-readable description of what happened. */
  summary: string;
  args: Record<string, unknown>;
  /** ISO-8601 timestamps taken immediately before and after execution. */
  startedAt: string;
  completedAt: string;
  durationMs: number;
  /** Set only when `status` is "failed". */
  error?: string;
  /** Clipped JSON serialization of the tool result. */
  resultPreview?: string;
};

/**
 * Normalizes extracted text: drops carriage returns, strips spaces/tabs that
 * trail a line, collapses runs of three or more newlines to a single blank line,
 * and trims both ends.
 */
function compactWhitespace(input: string) {
  const withoutCarriageReturns = input.replace(/\r/g, "");
  const withoutTrailingBlanks = withoutCarriageReturns.replace(/[ \t]+\n/g, "\n");
  const withCollapsedGaps = withoutTrailingBlanks.replace(/\n{3,}/g, "\n\n");
  return withCollapsedGaps.trim();
}

/**
 * Returns `input` unchanged when it fits in `maxCharacters`; otherwise returns
 * the first `maxCharacters` characters followed by "...".
 */
function clipText(input: string, maxCharacters: number) {
  if (input.length <= maxCharacters) {
    return input;
  }
  const head = input.slice(0, maxCharacters);
  return `${head}...`;
}

/**
 * Coerces an arbitrary value into a plain record. Non-null, non-array objects are
 * shallow-copied; everything else becomes an empty object.
 */
function toRecord(value: unknown): Record<string, unknown> {
  const isKeyedObject = typeof value === "object" && value !== null && !Array.isArray(value);
  if (isKeyedObject) {
    return { ...(value as Record<string, unknown>) };
  }
  return {};
}

/**
 * Flattens text to one line (line breaks and whitespace runs become single
 * spaces, ends trimmed) and clips it to `maxLength` characters.
 */
function toSingleLine(value: string, maxLength = 220) {
  const withoutLineBreaks = value.replace(/\r?\n+/g, " ");
  const collapsed = withoutLineBreaks.replace(/\s+/g, " ").trim();
  return clipText(collapsed, maxLength);
}

/**
 * Builds the one-sentence, human-readable summary of a tool call.
 *
 * Known tools mention their main argument (query, url, prompt or command) when it
 * is a non-empty string; unknown tools are reported by name. On failure, the
 * single-line error text is appended when one is provided.
 */
function buildToolSummary(name: string, args: Record<string, unknown>, status: "completed" | "failed", error?: string) {
  const completed = status === "completed";
  const errSuffix = status === "failed" && error ? ` Error: ${toSingleLine(error, 140)}` : "";

  // Reads a string argument as trimmed single-line text; "" when missing, blank or not a string.
  const readArg = (key: string, maxLength: number): string => {
    const raw = args[key];
    const trimmed = typeof raw === "string" ? raw.trim() : "";
    return trimmed ? toSingleLine(trimmed, maxLength) : "";
  };

  switch (name) {
    case "web_search": {
      const query = readArg("query", 100);
      if (completed) {
        return query ? `Performed web search for '${query}'.` : "Performed web search.";
      }
      return query ? `Web search for '${query}' failed.${errSuffix}` : `Web search failed.${errSuffix}`;
    }
    case "fetch_url": {
      const url = readArg("url", 140);
      if (completed) {
        return url ? `Fetched URL ${url}.` : "Fetched URL.";
      }
      return url ? `Fetching URL ${url} failed.${errSuffix}` : `Fetching URL failed.${errSuffix}`;
    }
    case "codex_exec": {
      const prompt = readArg("prompt", 120);
      if (completed) {
        return prompt ? `Ran Codex task: '${prompt}'.` : "Ran Codex task.";
      }
      return prompt ? `Codex task '${prompt}' failed.${errSuffix}` : `Codex task failed.${errSuffix}`;
    }
    case "shell_exec": {
      const command = readArg("command", 120);
      if (completed) {
        return command ? `Ran devbox shell command: '${command}'.` : "Ran devbox shell command.";
      }
      return command
        ? `Devbox shell command '${command}' failed.${errSuffix}`
        : `Devbox shell command failed.${errSuffix}`;
    }
    default:
      return completed ? `Ran tool '${name}'.` : `Tool '${name}' failed.${errSuffix}`;
  }
}

/**
 * Writes one structured `[tool_call]` log line for a tool execution.
 * Failed calls go to console.error, everything else to console.info. Event
 * fields take precedence over same-named context fields.
 */
function logToolEvent(event: ToolExecutionEvent, context?: ToolAwareCompletionParams["logContext"]) {
  const serialized = JSON.stringify({ kind: "tool_call", ...context, ...event });
  const line = `[tool_call] ${serialized}`;
  if (event.status !== "failed") {
    console.info(line);
    return;
  }
  console.error(line);
}

/**
 * Serializes a tool result to JSON and clips it to 400 characters for event
 * previews. Returns undefined when serialization yields nothing.
 */
function buildResultPreview(toolResult: ToolRunOutcome) {
  const json = JSON.stringify(toolResult);
  if (!json) {
    return undefined;
  }
  return clipText(json, 400);
}

/**
 * Maps a tool execution event to the data for a `tool`-role chat message.
 * The summary becomes the message content; the full event is kept in `metadata`,
 * with absent `error` / `resultPreview` stored as null.
 */
export function buildToolLogMessageData(chatId: string, event: ToolExecutionEvent) {
  const {
    toolCallId,
    name,
    status,
    summary,
    args,
    startedAt,
    completedAt,
    durationMs,
    error,
    resultPreview,
  } = event;

  const metadata = {
    kind: "tool_call",
    toolCallId,
    toolName: name,
    status,
    summary,
    args,
    startedAt,
    completedAt,
    durationMs,
    error: error ?? null,
    resultPreview: resultPreview ?? null,
  };

  return {
    chatId,
    role: "tool" as const,
    content: summary,
    name,
    metadata,
  };
}

/**
 * Extracts the text of the first `<title>` element from an HTML document.
 *
 * A small set of character references (`&nbsp;`, `&amp;`, `&lt;`, `&gt;`,
 * `&quot;`, `&#39;`) is decoded and whitespace is normalized.
 *
 * @param html Raw HTML source.
 * @returns The decoded title, or null when there is no title or it is empty.
 */
function extractHtmlTitle(html: string) {
  const match = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
  if (!match?.[1]) return null;

  const replacements: Record<string, string> = {
    nbsp: " ",
    amp: "&",
    lt: "<",
    gt: ">",
    quot: '"',
    "#39": "'",
  };

  // Decode in a single pass so each reference is decoded exactly once. Chaining
  // replaces with `&amp;` first would turn the literal text `&amp;lt;` into `<`
  // instead of `&lt;`.
  const decoded = match[1].replace(
    /&(nbsp|amp|lt|gt|quot|#39);/gi,
    (whole: string, ref: string) => replacements[ref.toLowerCase()] ?? whole
  );

  return compactWhitespace(decoded);
}

/**
 * Converts chat messages to Chat Completions conversation messages and puts the
 * tool system prompt in front of them.
 */
function normalizeIncomingMessages(messages: ChatMessage[]) {
  const converted = messages.map((message) => buildOpenAIConversationMessage(message));
  const systemMessage = { role: "system", content: CHAT_TOOL_SYSTEM_PROMPT };
  return [systemMessage, ...converted];
}

/**
 * Converts chat messages to Responses API input messages and puts the tool
 * system prompt in front of them.
 */
function normalizeIncomingResponsesInput(messages: ChatMessage[]) {
  const converted = messages.map((message) => buildOpenAIResponsesInputMessage(message));
  const systemMessage = { role: "system", content: CHAT_TOOL_SYSTEM_PROMPT };
  return [systemMessage, ...converted];
}

/**
 * Runs `web_search` through Exa and maps each hit to the tool's result shape.
 *
 * Summaries, page text and highlights are requested per result, then clipped
 * (summary 1400, text 700, up to three highlights of 280 characters each).
 * Fields that are not strings in the response are reported as null.
 */
async function runExaWebSearchTool(args: WebSearchArgs): Promise<ToolRunOutcome> {
  const client = exaClient();
  const { query } = args;

  // Cast because this option bag is not checked against the SDK's typings here.
  const searchOptions = {
    type: args.type ?? "auto",
    numResults: args.numResults ?? DEFAULT_WEB_RESULTS,
    includeDomains: args.includeDomains,
    excludeDomains: args.excludeDomains,
    moderation: true,
    userLocation: "US",
    contents: {
      summary: { query },
      highlights: {
        query,
        maxCharacters: 320,
        numSentences: 2,
        highlightsPerUrl: 2,
      },
      text: { maxCharacters: 1_000 },
    },
  } as any;
  const response = await client.search(query, searchOptions);

  const stringOrNull = (value: unknown): string | null => (typeof value === "string" ? value : null);
  const clippedOrNull = (value: unknown, limit: number): string | null =>
    typeof value === "string" ? clipText(value, limit) : null;

  const hits: any[] = Array.isArray(response?.results) ? response.results : [];
  const results = hits.map((hit: any, position: number) => {
    let highlights: string[] = [];
    if (Array.isArray(hit?.highlights)) {
      const textual: string[] = hit.highlights.filter((entry: unknown) => typeof entry === "string");
      highlights = textual.slice(0, 3).map((entry: string) => clipText(entry, 280));
    }
    return {
      rank: position + 1,
      title: stringOrNull(hit?.title),
      url: stringOrNull(hit?.url),
      publishedDate: stringOrNull(hit?.publishedDate),
      author: stringOrNull(hit?.author),
      summary: clippedOrNull(hit?.summary, 1_400),
      text: clippedOrNull(hit?.text, 700),
      highlights,
    };
  });

  return {
    ok: true,
    searchEngine: "exa",
    query,
    requestId: response?.requestId ?? null,
    results,
  };
}

/**
 * Runs `web_search` through SearXNG and maps each hit to the tool's result shape.
 * The author is always null here, and the (clipped) summary doubles as the only
 * highlight when present.
 */
async function runSearxngWebSearchTool(args: WebSearchArgs): Promise<ToolRunOutcome> {
  const { query } = args;
  const response = await searchSearxng(query, {
    numResults: args.numResults ?? DEFAULT_WEB_RESULTS,
    includeDomains: args.includeDomains,
    excludeDomains: args.excludeDomains,
  });

  const results = response.results.map((hit, position) => {
    const { summary } = hit;
    return {
      rank: position + 1,
      title: hit.title,
      url: hit.url,
      publishedDate: hit.publishedDate,
      author: null,
      summary,
      text: hit.text,
      highlights: summary ? [clipText(summary, 280)] : [],
      engines: hit.engines,
    };
  });

  return {
    ok: true,
    searchEngine: "searxng",
    query,
    requestId: response.requestId,
    results,
  };
}

/**
 * Validates `web_search` arguments and dispatches to the configured engine:
 * SearXNG when CHAT_WEB_SEARCH_ENGINE is "searxng", Exa otherwise.
 * Throws the schema's validation error when the arguments are invalid.
 */
async function runWebSearchTool(input: unknown): Promise<ToolRunOutcome> {
  const args = WebSearchArgsSchema.parse(input);
  const useSearxng = env.CHAT_WEB_SEARCH_ENGINE === "searxng";
  return useSearxng ? runSearxngWebSearchTool(args) : runExaWebSearchTool(args);
}

/**
 * Parses a URL string and rejects anything that is not http or https.
 *
 * NOTE(review): only the scheme is checked. Loopback, private-network and
 * link-local hosts are not rejected here; confirm whether that is intended for
 * model-supplied URLs.
 *
 * @throws TypeError from the URL constructor for unparseable input, or Error for
 *   an unsupported scheme.
 */
function assertSafeFetchUrl(urlRaw: string) {
  const candidate = new URL(urlRaw);
  const isWebScheme = candidate.protocol === "http:" || candidate.protocol === "https:";
  if (!isWebScheme) {
    throw new Error("Only http:// and https:// URLs are supported.");
  }
  return candidate;
}

/**
 * Fetches a URL and returns readable plain text for the model.
 *
 * HTML responses (detected by content type or by sniffing the body) are converted
 * to text with images, scripts, styles and link targets dropped; other bodies are
 * returned as-is. The text is whitespace-normalized and cut to `maxCharacters`.
 *
 * @param input Raw tool arguments; validated against FetchUrlArgsSchema.
 * @returns The final URL, status, content type, title (HTML only), text and a
 *   truncation flag.
 * @throws When the arguments are invalid, the scheme is unsupported, the request
 *   or body download exceeds FETCH_TIMEOUT_MS, or the response status is not 2xx.
 */
async function runFetchUrlTool(input: unknown): Promise<ToolRunOutcome> {
  const args = FetchUrlArgsSchema.parse(input);
  const parsed = assertSafeFetchUrl(args.url);
  const maxCharacters = args.maxCharacters ?? DEFAULT_FETCH_MAX_CHARACTERS;

  const controller = new AbortController();
  let timedOut = false;
  const timeout = setTimeout(() => {
    timedOut = true;
    controller.abort();
  }, FETCH_TIMEOUT_MS);

  // The deadline must cover the body download as well as the headers: fetch()
  // resolves as soon as headers arrive, so clearing the timer at that point would
  // let a slow or never-ending body stall the tool call indefinitely.
  const download = async () => {
    try {
      const res = await fetch(parsed.toString(), {
        redirect: "follow",
        signal: controller.signal,
        headers: {
          "User-Agent": "SybilBot/1.0 (+https://sybil.local)",
          Accept: "text/html, text/plain, application/json;q=0.9, */*;q=0.5",
        },
      });
      if (!res.ok) {
        throw new Error(`Fetch failed with status ${res.status}.`);
      }
      return { response: res, body: await res.text() };
    } catch (err: unknown) {
      // Report our own deadline explicitly instead of a generic abort error.
      if (timedOut) {
        throw new Error(`Fetch timed out after ${FETCH_TIMEOUT_MS}ms.`);
      }
      throw err;
    } finally {
      clearTimeout(timeout);
    }
  };

  const { response, body } = await download();

  const contentType = (response.headers.get("content-type") ?? "").toLowerCase();
  // Some servers mislabel HTML, so also sniff the body for an HTML prologue.
  const isHtml = contentType.includes("text/html") || /<!doctype html|<html[\s>]/i.test(body);

  let extracted = body;
  if (isHtml) {
    extracted = htmlToText(body, {
      wordwrap: false,
      preserveNewlines: true,
      selectors: [
        { selector: "img", format: "skip" },
        { selector: "script", format: "skip" },
        { selector: "style", format: "skip" },
        { selector: "noscript", format: "skip" },
        { selector: "a", options: { ignoreHref: true } },
      ],
    });
  }

  const normalized = compactWhitespace(extracted);
  const truncated = normalized.length > maxCharacters;
  const text = truncated
    ? `${normalized.slice(0, maxCharacters)}\n\n[truncated ${normalized.length - maxCharacters} characters]`
    : normalized;

  return {
    ok: true,
    // response.url reflects redirects; fall back to the requested URL when it is empty.
    url: response.url || parsed.toString(),
    status: response.status,
    contentType: contentType || null,
    title: isHtml ? extractHtmlTitle(body) : null,
    truncated,
    text,
  };
}

/**
 * Wraps a value in single quotes for a POSIX shell. Each embedded single quote is
 * emitted as `'\''` (close quote, escaped quote, reopen quote).
 */
function shellQuote(value: string) {
  const escaped = value.split("'").join(`'\\''`);
  return "'" + escaped + "'";
}

/**
 * Builds the ssh destination for the devbox from configuration.
 * The configured user is prefixed unless none is set or the host already
 * contains an "@".
 *
 * @throws Error when CHAT_CODEX_REMOTE_HOST is not configured.
 */
function buildDevboxSshTarget() {
  const host = env.CHAT_CODEX_REMOTE_HOST;
  if (!host) {
    throw new Error("CHAT_CODEX_REMOTE_HOST not set");
  }
  const user = env.CHAT_CODEX_REMOTE_USER;
  const shouldPrefixUser = Boolean(user) && !host.includes("@");
  return shouldPrefixUser ? `${user}@${host}` : host;
}

/**
 * Builds the remote shell command line for a Codex task: create and enter the
 * configured work directory, then run `codex exec` with the task wrapped in
 * batch-mode instructions. The prompt is shell-quoted and stdin is redirected
 * from /dev/null.
 */
function buildRemoteCodexCommand(prompt: string) {
  const quotedWorkdir = shellQuote(env.CHAT_CODEX_REMOTE_WORKDIR.trim());

  const batchInstructions = [
    "You are running in a non-interactive batch environment.",
    "",
    "Rules:",
    "- Do not ask questions or wait for user input.",
    "- Do not use interactive commands, editors, pagers, or prompts.",
    "- If details are ambiguous, make a reasonable assumption and continue.",
    "- Complete the task in one run, including any requested file edits, commands, and verification.",
    "- End with a concise final report that includes changed files, commands run, and outcomes.",
    "",
    "Task:",
  ];
  const fullPrompt = `${batchInstructions.join("\n")}\n${prompt}`;

  const steps = [
    `mkdir -p ${quotedWorkdir}`,
    `cd ${quotedWorkdir}`,
    `codex exec --dangerously-bypass-approvals-and-sandbox --skip-git-repo-check ${shellQuote(fullPrompt)} < /dev/null`,
  ];
  return steps.join(" && ");
}

/**
 * Builds the remote shell command line for `shell_exec`: create and enter the
 * configured work directory, then run the command with `bash -lc` when bash is
 * available and `sh -lc` otherwise. The command is passed as one quoted argument.
 */
function buildRemoteShellCommand(command: string) {
  const quotedWorkdir = shellQuote(env.CHAT_CODEX_REMOTE_WORKDIR.trim());
  const quotedCommand = shellQuote(command);
  const runWithBestShell = `if command -v bash >/dev/null 2>&1; then bash -lc ${quotedCommand}; else sh -lc ${quotedCommand}; fi`;
  return [`mkdir -p ${quotedWorkdir}`, `cd ${quotedWorkdir}`, runWithBestShell].join(" && ");
}

/**
 * Runs `fn` with the path of the ssh private key to use, if any.
 *
 * Resolution order: an explicitly configured key path; otherwise a base64 key
 * from configuration written to a fresh temporary directory (file mode 0600) that
 * is removed once `fn` settles; otherwise no key (`fn` receives undefined).
 */
async function withDevboxSshKeyPath<T>(fn: (keyPath?: string) => Promise<T>) {
  const configuredPath = env.CHAT_CODEX_SSH_KEY_PATH;
  if (configuredPath) {
    return fn(configuredPath);
  }

  const encodedKey = env.CHAT_CODEX_SSH_PRIVATE_KEY_B64;
  if (!encodedKey) {
    return fn(undefined);
  }

  const scratchDir = await mkdtemp(path.join(os.tmpdir(), "sybil-codex-ssh-"));
  const keyFile = path.join(scratchDir, "id");
  try {
    const keyBytes = Buffer.from(encodedKey, "base64");
    await writeFile(keyFile, keyBytes, { mode: 0o600 });
    return await fn(keyFile);
  } finally {
    // Always remove the key material, whether fn succeeded or threw.
    await rm(scratchDir, { recursive: true, force: true });
  }
}

/**
 * Limits remote command output to `maxCharacters`. When text is cut, a trailer
 * stating how many characters were dropped is appended and `truncated` is true.
 */
function clipRemoteOutput(value: string, maxCharacters: number) {
  if (value.length <= maxCharacters) {
    return { text: value, truncated: false };
  }
  const kept = value.slice(0, maxCharacters);
  const droppedCount = value.length - maxCharacters;
  return {
    text: `${kept}\n\n[truncated ${droppedCount} characters]`,
    truncated: true,
  };
}

/**
 * Converts process output to a string: strings pass through, Buffers are decoded
 * as UTF-8, anything else becomes "".
 */
function bufferOrStringToString(value: unknown) {
  if (Buffer.isBuffer(value)) {
    return value.toString("utf8");
  }
  return typeof value === "string" ? value : "";
}

/** Per-tool settings for the shared ssh execution path. */
type RemoteSshRunOptions = {
  /** Word used in error messages, e.g. "Codex" or "shell". */
  label: string;
  /** Complete command line to run on the remote host. */
  remoteCommand: string;
  /** Hard limit for the ssh child process, in milliseconds. */
  timeoutMs: number;
  /** Cap for returned stdout; stderr is capped at min(maxCharacters, 12000). */
  maxCharacters: number;
  /** Extra fields placed after `workdir` in the outcome (e.g. the command). */
  extraFields?: Record<string, unknown>;
};

/**
 * Runs one command on the devbox over ssh and converts the result into a tool
 * outcome. Shared by codex_exec and shell_exec so both behave identically.
 *
 * Never throws for a failed remote command: non-zero exits, timeouts and spawn
 * errors are all reported as `{ ok: false, ... }` with whatever output was
 * captured. Throws only when the ssh target cannot be built from configuration.
 */
async function runRemoteSshCommand(options: RemoteSshRunOptions): Promise<ToolRunOutcome> {
  const { label, remoteCommand, timeoutMs, maxCharacters, extraFields = {} } = options;
  const sshTarget = buildDevboxSshTarget();
  const stderrLimit = Math.min(maxCharacters, 12_000);

  // Fields shared by success and failure outcomes, in a stable key order.
  const describeOutput = (stdoutRaw: unknown, stderrRaw: unknown) => {
    const stdout = clipRemoteOutput(bufferOrStringToString(stdoutRaw), maxCharacters);
    const stderr = clipRemoteOutput(bufferOrStringToString(stderrRaw), stderrLimit);
    return {
      host: env.CHAT_CODEX_REMOTE_HOST,
      workdir: env.CHAT_CODEX_REMOTE_WORKDIR,
      ...extraFields,
      stdout: stdout.text,
      stderr: stderr.text,
      stdoutTruncated: stdout.truncated,
      stderrTruncated: stderr.truncated,
    };
  };

  return withDevboxSshKeyPath(async (keyPath?: string) => {
    const sshArgs = [
      // -n and BatchMode keep ssh from ever waiting on stdin or a password prompt.
      "-n",
      "-o",
      "BatchMode=yes",
      "-o",
      "StrictHostKeyChecking=accept-new",
      "-o",
      "UserKnownHostsFile=/tmp/sybil-codex-known-hosts",
      "-p",
      String(env.CHAT_CODEX_REMOTE_PORT),
    ];

    if (keyPath) {
      sshArgs.push("-i", keyPath);
    }

    sshArgs.push(sshTarget, remoteCommand);

    try {
      const result = await execFileAsync("ssh", sshArgs, {
        timeout: timeoutMs,
        maxBuffer: REMOTE_EXEC_MAX_BUFFER_BYTES,
      });
      return { ok: true, ...describeOutput(result.stdout, result.stderr) };
    } catch (err: any) {
      const exitCode = typeof err?.code === "number" ? err.code : null;
      const signal = typeof err?.signal === "string" ? err.signal : null;

      // execFile's own message is "Command failed: <full ssh command line>\n<stderr>".
      // That would copy the entire prompt/command and unclipped stderr into `error`,
      // bypassing maxCharacters, so a concise message is built instead; stderr is
      // already returned (clipped) as its own field.
      let error: string;
      if (err?.killed) {
        error = `Remote ${label} command timed out after ${timeoutMs}ms.`;
      } else if (exitCode !== null) {
        error = `Remote ${label} command exited with code ${exitCode}.`;
      } else {
        // Spawn failures, maxBuffer overflow, etc. have short messages; clip defensively.
        error = clipText(err?.message ?? String(err), 2_000);
      }

      return {
        ok: false,
        error,
        exitCode,
        signal,
        ...describeOutput(err?.stdout, err?.stderr),
      };
    }
  });
}

/**
 * Runs the `codex_exec` tool: delegates a prompt to the Codex CLI on the devbox.
 *
 * @param input Raw tool arguments; validated against CodexExecArgsSchema.
 * @returns `{ ok: false }` when the tool is disabled or the remote run fails;
 *   otherwise the clipped stdout/stderr of the run.
 * @throws When the arguments are invalid or the ssh target is not configured.
 */
async function runCodexExecTool(input: unknown): Promise<ToolRunOutcome> {
  if (!env.CHAT_CODEX_TOOL_ENABLED) {
    return { ok: false, error: "codex_exec is disabled." };
  }

  const args: CodexExecArgs = CodexExecArgsSchema.parse(input);
  return runRemoteSshCommand({
    label: "Codex",
    remoteCommand: buildRemoteCodexCommand(args.prompt),
    timeoutMs: env.CHAT_CODEX_EXEC_TIMEOUT_MS,
    maxCharacters: args.maxCharacters ?? DEFAULT_CODEX_MAX_OUTPUT_CHARACTERS,
  });
}

/**
 * Runs the `shell_exec` tool: executes a shell command on the devbox.
 *
 * @param input Raw tool arguments; validated against ShellExecArgsSchema.
 * @returns `{ ok: false }` when the tool is disabled or the remote run fails;
 *   otherwise the command and its clipped stdout/stderr.
 * @throws When the arguments are invalid or the ssh target is not configured.
 */
async function runShellExecTool(input: unknown): Promise<ToolRunOutcome> {
  if (!env.CHAT_SHELL_TOOL_ENABLED) {
    return { ok: false, error: "shell_exec is disabled." };
  }

  const args: ShellExecArgs = ShellExecArgsSchema.parse(input);
  return runRemoteSshCommand({
    label: "shell",
    remoteCommand: buildRemoteShellCommand(args.command),
    timeoutMs: env.CHAT_SHELL_EXEC_TIMEOUT_MS,
    maxCharacters: args.maxCharacters ?? DEFAULT_SHELL_MAX_OUTPUT_CHARACTERS,
    extraFields: { command: args.command },
  });
}

/**
 * Dispatches a tool call to its runner by name. Unknown names produce a failed
 * outcome rather than an exception.
 */
async function executeTool(name: string, args: unknown): Promise<ToolRunOutcome> {
  switch (name) {
    case "web_search":
      return runWebSearchTool(args);
    case "fetch_url":
      return runFetchUrlTool(args);
    case "codex_exec":
      return runCodexExecTool(args);
    case "shell_exec":
      return runShellExecTool(args);
    default:
      return { ok: false, error: `Unknown tool: ${name}` };
  }
}

/**
 * Parses the JSON argument string of a tool call.
 * Non-string or blank input yields an empty object.
 *
 * @throws Error prefixed with "Invalid JSON arguments: " when parsing fails.
 */
function parseToolArgs(raw: unknown) {
  const text = typeof raw === "string" ? raw.trim() : "";
  if (text === "") {
    return {};
  }
  try {
    return JSON.parse(text);
  } catch (err: any) {
    const reason = err?.message ?? String(err);
    throw new Error(`Invalid JSON arguments: ${reason}`);
  }
}

/**
 * Returns the arguments to record on a tool event. The potentially long
 * `prompt` (codex_exec) or `command` (shell_exec) is clipped to 1000 characters
 * in a copy; all other calls get the original object back.
 */
function buildEventArgs(name: string, args: Record<string, unknown>) {
  let longField: "prompt" | "command" | null = null;
  if (name === "codex_exec") {
    longField = "prompt";
  } else if (name === "shell_exec") {
    longField = "command";
  }
  if (longField === null) {
    return args;
  }

  const value = args[longField];
  if (typeof value !== "string") {
    return args;
  }
  return { ...args, [longField]: clipText(value, 1_000) };
}

/**
 * Heuristically detects a short reply that announces a tool run ("let me run…",
 * "calling the tool now", "one moment") without actually making a tool call.
 *
 * Markdown punctuation is stripped and whitespace collapsed before matching.
 * Long replies (over 800 normalized characters) and replies that hand control
 * back to the user ("let me know", "if you want") are never flagged.
 *
 * @param text Assistant reply that contained no tool calls.
 * @returns true when the reply looks like an unfulfilled promise to run a tool.
 */
function looksLikeDanglingToolIntent(text: string) {
  const normalized = text
    .toLowerCase()
    .replace(/[`*_>#-]/g, " ")
    .replace(/\s+/g, " ")
    .trim();
  if (!normalized) return false;
  if (normalized.length > 800) return false;
  if (/\blet me know\b/.test(normalized) || /\bif you (want|would like)\b/.test(normalized)) return false;
  return (
    // The normalization above turns "_" into a space, so the tool names must be
    // matched as "shell exec" / "codex exec"; the underscore forms can never occur.
    /\b(calling|running|executing|trying|checking|testing)\b.{0,80}\b(now|it|tool|command|shell exec|codex exec)\b/.test(normalized) ||
    /\b(let me|i'?ll|i will)\b.{0,120}\b(run|execute|call|try|check|test)\b/.test(normalized) ||
    /\b(stand by|hang on|one moment)\b/.test(normalized)
  );
}

/**
 * Appends (in place) the assistant's tool-less reply followed by a system message
 * telling the model to either call the tool now or answer directly.
 */
function appendDanglingToolIntentCorrection(conversation: any[], text: string) {
  const assistantTurn = { role: "assistant", content: text };
  const correction = {
    role: "system",
    content:
      "Internal correction: the previous assistant message claimed it would run a tool, but no tool call was made. If the task needs an available tool, call it now. Otherwise provide the final answer directly without saying you will run a tool.",
  };
  conversation.push(assistantTurn, correction);
}

/**
 * Adds a Chat Completions usage object (prompt/completion/total tokens) into the
 * running totals. Missing counters count as zero.
 *
 * @returns true when a usage object was present, false otherwise.
 */
function mergeUsage(acc: Required<ToolAwareUsage>, usage: any) {
  const reported = Boolean(usage);
  if (reported) {
    acc.inputTokens = acc.inputTokens + (usage.prompt_tokens ?? 0);
    acc.outputTokens = acc.outputTokens + (usage.completion_tokens ?? 0);
    acc.totalTokens = acc.totalTokens + (usage.total_tokens ?? 0);
  }
  return reported;
}

/**
 * Adds a Responses API usage object (input/output/total tokens) into the running
 * totals. Missing counters count as zero.
 *
 * @returns true when a usage object was present, false otherwise.
 */
function mergeResponsesUsage(acc: Required<ToolAwareUsage>, usage: any) {
  const reported = Boolean(usage);
  if (reported) {
    acc.inputTokens = acc.inputTokens + (usage.input_tokens ?? 0);
    acc.outputTokens = acc.outputTokens + (usage.output_tokens ?? 0);
    acc.totalTokens = acc.totalTokens + (usage.total_tokens ?? 0);
  }
  return reported;
}

/** Returns the `output` array of a Responses API response, or an empty array when absent. */
function getResponseOutputItems(response: any) {
  return Array.isArray(response?.output) ? response.output : [];
}

/**
 * Extracts the assistant's text from a Responses API response.
 *
 * Preference order: a non-empty `output_text`; otherwise the concatenated
 * `output_text` and `refusal` parts of all message items; otherwise `fallback`.
 *
 * @param response Response object (may be null/undefined or partially populated).
 * @param fallback Text to return when the response carries no text at all.
 */
function extractResponsesText(response: any, fallback = "") {
  // An empty `output_text` must not short-circuit: a refusal arrives as a
  // `refusal` content part, and returning "" here would drop both it and the
  // caller's fallback.
  const direct: unknown = response?.output_text;
  if (typeof direct === "string" && direct.length > 0) return direct;

  const parts: string[] = [];
  for (const item of getResponseOutputItems(response)) {
    if (item?.type !== "message" || !Array.isArray(item.content)) continue;
    for (const content of item.content) {
      if (content?.type === "output_text" && typeof content.text === "string") {
        parts.push(content.text);
      } else if (content?.type === "refusal" && typeof content.refusal === "string") {
        parts.push(content.refusal);
      }
    }
  }
  return parts.join("") || fallback;
}

/**
 * Returns the part of `finalText` that has not been streamed yet: the whole text
 * when nothing was streamed, the remainder when the final text extends the
 * streamed text, and "" when the two do not line up or the final text is empty.
 */
function getUnstreamedText(finalText: string, streamedText: string) {
  if (!finalText) {
    return "";
  }
  if (!streamedText) {
    return finalText;
  }
  if (!finalText.startsWith(streamedText)) {
    return "";
  }
  return finalText.slice(streamedText.length);
}

/**
 * Returns an error message for a response whose status is "failed" or
 * "incomplete", or null for any other status.
 *
 * The response's own error message wins; otherwise the incomplete reason is
 * reported; otherwise a generic "Response <status>." message is used.
 */
function getResponseFailureMessage(response: any) {
  const status = response?.status;
  const unsuccessful = status === "failed" || status === "incomplete";
  if (!unsuccessful) {
    return null;
  }

  const message = response?.error?.message;
  if (typeof message === "string") {
    return message;
  }

  const reason = response?.incomplete_details?.reason;
  if (typeof reason === "string" && reason) {
    return `Response incomplete: ${reason}`;
  }
  return `Response ${response.status}.`;
}

/** API-agnostic shape of a tool call: its id, the tool name and the raw JSON argument string. */
type NormalizedToolCall = {
  id: string;
  name: string;
  arguments: string;
};

/**
 * Picks the `function_call` items out of a Responses API output list and
 * normalizes them. Missing ids fall back to `tool_call_<round>_<index>` (index
 * among the function calls only), missing names to "unknown_tool", and missing
 * arguments to "{}".
 */
function normalizeResponsesToolCalls(outputItems: any[], round: number): NormalizedToolCall[] {
  const functionCalls = outputItems.filter((item) => item?.type === "function_call");
  return functionCalls.map((call: any, position: number) => {
    const fallbackId = `tool_call_${round}_${position}`;
    return {
      id: call.call_id ?? call.id ?? fallbackId,
      name: call.name ?? "unknown_tool",
      arguments: call.arguments ?? "{}",
    };
  });
}

/**
 * Normalizes Chat Completions `tool_calls` entries. Missing ids fall back to
 * `tool_call_<round>_<index>`, missing names to "unknown_tool", and missing
 * arguments to "{}".
 */
function normalizeModelToolCalls(toolCalls: any[], round: number): NormalizedToolCall[] {
  return toolCalls.map((call: any, position: number) => {
    const fn = call?.function;
    const fallbackId = `tool_call_${round}_${position}`;
    return {
      id: call?.id ?? fallbackId,
      name: fn?.name ?? "unknown_tool",
      arguments: fn?.arguments ?? "{}",
    };
  });
}

/**
 * Executes one normalized tool call and produces both the raw tool result and
 * the event describing the call.
 *
 * Argument-parsing errors and exceptions thrown by the tool are converted into a
 * failed outcome rather than propagated. The event is logged and then handed to
 * `params.onToolEvent` (awaited) when that callback is provided.
 */
async function executeToolCallAndBuildEvent(
  call: NormalizedToolCall,
  params: ToolAwareCompletionParams
): Promise<{ event: ToolExecutionEvent; toolResult: ToolRunOutcome }> {
  const startedAtMs = Date.now();
  const startedAt = new Date(startedAtMs).toISOString();

  // Stays empty when the argument JSON cannot be parsed.
  let parsedArgs: Record<string, unknown> = {};
  let toolResult: ToolRunOutcome;
  try {
    parsedArgs = toRecord(parseToolArgs(call.arguments));
    toolResult = await executeTool(call.name, parsedArgs);
  } catch (err: any) {
    toolResult = { ok: false, error: err?.message ?? String(err) };
  }

  let status: "completed" | "failed" = "completed";
  let error: string | undefined;
  if (!toolResult.ok) {
    status = "failed";
    error = typeof toolResult.error === "string" ? toolResult.error : "Tool execution failed.";
  }

  const completedAtMs = Date.now();
  const eventArgs = buildEventArgs(call.name, parsedArgs);
  const summary = buildToolSummary(call.name, eventArgs, status, error);
  const event: ToolExecutionEvent = {
    toolCallId: call.id,
    name: call.name,
    status,
    summary,
    args: eventArgs,
    startedAt,
    completedAt: new Date(completedAtMs).toISOString(),
    durationMs: completedAtMs - startedAtMs,
    error,
    resultPreview: buildResultPreview(toolResult),
  };

  logToolEvent(event, params.logContext);
  const notify = params.onToolEvent;
  if (notify) {
    await notify(event);
  }

  return { event, toolResult };
}

/**
 * Runs a non-streaming tool loop against the OpenAI Responses API.
 *
 * Each round sends the accumulated input, then either returns the model's text (no tool calls)
 * or executes every requested tool in order and feeds the results back as
 * `function_call_output` items. The loop stops after `MAX_TOOL_ROUNDS` rounds with a fixed
 * "tool-call limit" message.
 *
 * @param params Client, model, messages and sampling settings for the completion.
 * @returns Final text, accumulated usage (when any was reported), raw responses and tool events.
 * @throws Error when `getResponseFailureMessage` reports a failure for a response.
 */
export async function runToolAwareOpenAIChat(params: ToolAwareCompletionParams): Promise<ToolAwareCompletionResult> {
  const pendingInput: any[] = normalizeIncomingResponsesInput(params.messages);
  const collectedResponses: unknown[] = [];
  const toolEvents: ToolExecutionEvent[] = [];
  const usageTotals: Required<ToolAwareUsage> = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
  let usageReported = false;
  let toolCallsUsed = 0;
  let intentRetries = 0;

  // Usage is only surfaced once at least one response actually carried usage data.
  const reportedUsage = () => (usageReported ? usageTotals : undefined);

  for (let round = 0; round < MAX_TOOL_ROUNDS; round += 1) {
    const request: any = {
      model: params.model,
      input: pendingInput,
      temperature: params.temperature,
      max_output_tokens: params.maxTokens,
      tools: RESPONSES_CHAT_TOOLS,
      tool_choice: "auto",
      parallel_tool_calls: true,
      // Output items are replayed as input on the next round, so reasoning items must be persisted.
      store: true,
    };
    const response = await params.client.responses.create(request);
    collectedResponses.push(response);
    if (mergeResponsesUsage(usageTotals, response?.usage)) {
      usageReported = true;
    }

    const failure = getResponseFailureMessage(response);
    if (failure) {
      throw new Error(failure);
    }

    const outputItems = getResponseOutputItems(response);
    const calls = normalizeResponsesToolCalls(outputItems, round);

    if (calls.length === 0) {
      const text = extractResponsesText(response);
      const retryAvailable = intentRetries < MAX_DANGLING_TOOL_INTENT_RETRIES;
      if (retryAvailable && looksLikeDanglingToolIntent(text)) {
        // The model announced a tool call without making one; nudge it and spend another round.
        intentRetries += 1;
        appendDanglingToolIntentCorrection(pendingInput, text);
        continue;
      }
      return {
        text,
        usage: reportedUsage(),
        raw: { responses: collectedResponses, toolCallsUsed, api: "responses" },
        toolEvents,
      };
    }

    toolCallsUsed += calls.length;
    pendingInput.push(...outputItems);

    // Tools run one at a time, in the order the model requested them.
    for (const call of calls) {
      const executed = await executeToolCallAndBuildEvent(call, params);
      toolEvents.push(executed.event);
      pendingInput.push({
        type: "function_call_output",
        call_id: call.id,
        output: JSON.stringify(executed.toolResult),
      });
    }
  }

  return {
    text: "I reached the tool-call limit while gathering information. Please narrow the request and try again.",
    usage: reportedUsage(),
    raw: { responses: collectedResponses, toolCallsUsed, toolCallLimitReached: true, api: "responses" },
    toolEvents,
  };
}

/**
 * Runs a non-streaming tool loop against the Chat Completions API.
 *
 * Each round sends the conversation so far. A reply without tool calls ends the loop with its
 * text; otherwise the assistant's tool-call message is appended, every tool is executed in
 * order, and each result is appended as a `tool` message. After `MAX_TOOL_ROUNDS` rounds a fixed
 * "tool-call limit" message is returned.
 *
 * @param params Client, model, messages and sampling settings for the completion.
 * @returns Final text, accumulated usage (when any was reported), raw completions and tool events.
 */
export async function runToolAwareChatCompletions(params: ToolAwareCompletionParams): Promise<ToolAwareCompletionResult> {
  const conversation: any[] = normalizeIncomingMessages(params.messages);
  const collectedCompletions: unknown[] = [];
  const toolEvents: ToolExecutionEvent[] = [];
  const usageTotals: Required<ToolAwareUsage> = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
  let usageReported = false;
  let toolCallsUsed = 0;
  let intentRetries = 0;

  // Usage is only surfaced once at least one completion actually carried usage data.
  const reportedUsage = () => (usageReported ? usageTotals : undefined);

  for (let round = 0; round < MAX_TOOL_ROUNDS; round += 1) {
    const request: any = {
      model: params.model,
      messages: conversation,
      temperature: params.temperature,
      max_tokens: params.maxTokens,
      tools: CHAT_TOOLS,
      tool_choice: "auto",
    };
    const completion = await params.client.chat.completions.create(request);
    collectedCompletions.push(completion);
    if (mergeUsage(usageTotals, completion?.usage)) {
      usageReported = true;
    }

    const message = completion?.choices?.[0]?.message;
    if (!message) {
      // No first choice/message to work with: finish with empty text and flag it in `raw`.
      return {
        text: "",
        usage: reportedUsage(),
        raw: { responses: collectedCompletions, toolCallsUsed, missingMessage: true },
        toolEvents,
      };
    }

    const requestedCalls = Array.isArray(message.tool_calls) ? message.tool_calls : [];
    if (requestedCalls.length === 0) {
      const text = typeof message.content === "string" ? message.content : "";
      const retryAvailable = intentRetries < MAX_DANGLING_TOOL_INTENT_RETRIES;
      if (retryAvailable && looksLikeDanglingToolIntent(text)) {
        // The model announced a tool call without making one; nudge it and spend another round.
        intentRetries += 1;
        appendDanglingToolIntentCorrection(conversation, text);
        continue;
      }
      return {
        text,
        usage: reportedUsage(),
        raw: { responses: collectedCompletions, toolCallsUsed },
        toolEvents,
      };
    }

    const calls = normalizeModelToolCalls(requestedCalls, round);
    toolCallsUsed += calls.length;

    // Replay the assistant turn with the normalized ids so the `tool` replies below match it.
    const hasTextContent = typeof message.content === "string" && message.content.length;
    const assistantTurn: any = {
      role: "assistant",
      tool_calls: calls.map(({ id, name, arguments: rawArguments }) => ({
        id,
        type: "function",
        function: { name, arguments: rawArguments },
      })),
      ...(hasTextContent ? { content: message.content } : {}),
    };
    conversation.push(assistantTurn);

    // Tools run one at a time, in the order the model requested them.
    for (const call of calls) {
      const executed = await executeToolCallAndBuildEvent(call, params);
      toolEvents.push(executed.event);
      conversation.push({
        role: "tool",
        tool_call_id: call.id,
        content: JSON.stringify(executed.toolResult),
      });
    }
  }

  return {
    text: "I reached the tool-call limit while gathering information. Please narrow the request and try again.",
    usage: reportedUsage(),
    raw: { responses: collectedCompletions, toolCallsUsed, toolCallLimitReached: true },
    toolEvents,
  };
}

/**
 * Streaming tool loop against the OpenAI Responses API.
 *
 * Yields `delta` events for assistant text as it arrives, a `tool_call` event after each tool
 * execution, and exactly one `done` event carrying the final result. Runs at most
 * `MAX_TOOL_ROUNDS` rounds; when the limit is hit the `done` result carries a fixed
 * "tool-call limit" message.
 *
 * @param params Client, model, messages and sampling settings for the completion.
 * @throws Error on a stream `error` event, or when `getResponseFailureMessage` reports a failure
 *   for the round's terminal response.
 */
export async function* runToolAwareOpenAIChatStream(
  params: ToolAwareCompletionParams
): AsyncGenerator<ToolAwareStreamingEvent> {
  const input: any[] = normalizeIncomingResponsesInput(params.messages);
  // Every raw stream event of every round is retained here and returned in `raw.responses`.
  const rawResponses: unknown[] = [];
  const toolEvents: ToolExecutionEvent[] = [];
  const usageAcc: Required<ToolAwareUsage> = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
  let sawUsage = false;
  let totalToolCalls = 0;
  let danglingToolIntentRetries = 0;

  for (let round = 0; round < MAX_TOOL_ROUNDS; round += 1) {
    const stream = await params.client.responses.create({
      model: params.model,
      input,
      temperature: params.temperature,
      max_output_tokens: params.maxTokens,
      tools: RESPONSES_CHAT_TOOLS,
      tool_choice: "auto",
      parallel_tool_calls: true,
      // Tool loops pass response output items back as input; reasoning items need persistence.
      store: true,
      stream: true,
    } as any);

    // All text produced this round, whether or not it was forwarded to the consumer.
    let roundText = "";
    // The subset of roundText that has already been yielded as `delta` events.
    let streamedRoundText = "";
    let roundHasToolCalls = false;
    // Becomes true once a `message` output item starts and no function call has been seen.
    let canStreamRoundText = false;
    let completedResponse: any | null = null;
    // Output items collected from `output_item.done` events; may be sparse, hence filter(Boolean) below.
    const completedOutputItems: any[] = [];

    for await (const event of stream as any as AsyncIterable<any>) {
      rawResponses.push(event);

      if (event?.type === "response.output_text.delta" && typeof event.delta === "string") {
        roundText += event.delta;
        // Forward text only while this round looks like a plain answer (no function call so far).
        if (canStreamRoundText && !roundHasToolCalls && event.delta.length) {
          streamedRoundText += event.delta;
          yield { type: "delta", text: event.delta };
        }
      } else if (event?.type === "response.output_item.added" && event.item) {
        if (event.item.type === "function_call") {
          roundHasToolCalls = true;
          canStreamRoundText = false;
        } else if (event.item.type === "message" && !roundHasToolCalls) {
          canStreamRoundText = true;
        }
      } else if (event?.type === "response.output_item.done" && event.item) {
        // Keep items at their output_index when provided; otherwise append.
        completedOutputItems[event.output_index ?? completedOutputItems.length] = event.item;
        if (event.item.type === "function_call") {
          roundHasToolCalls = true;
          canStreamRoundText = false;
        }
      } else if (event?.type === "response.completed") {
        completedResponse = event.response;
        sawUsage = mergeResponsesUsage(usageAcc, event.response?.usage) || sawUsage;
      } else if (event?.type === "response.failed" || event?.type === "response.incomplete") {
        // Recorded like a completed response; the failure check after the loop decides whether to throw.
        completedResponse = event.response;
        sawUsage = mergeResponsesUsage(usageAcc, event.response?.usage) || sawUsage;
      } else if (event?.type === "error") {
        throw new Error(event.message ?? "OpenAI Responses stream failed.");
      }
    }

    const failureMessage = getResponseFailureMessage(completedResponse);
    if (failureMessage) {
      throw new Error(failureMessage);
    }

    // Prefer the terminal response's output; fall back to items gathered from the stream.
    const outputItems = getResponseOutputItems(completedResponse);
    const responseOutputItems = outputItems.length ? outputItems : completedOutputItems.filter(Boolean);
    const normalizedToolCalls = normalizeResponsesToolCalls(responseOutputItems, round);
    if (!normalizedToolCalls.length) {
      const text = extractResponsesText(completedResponse, roundText);
      // A corrective retry is only attempted when nothing from this round was already yielded.
      if (
        !streamedRoundText &&
        danglingToolIntentRetries < MAX_DANGLING_TOOL_INTENT_RETRIES &&
        looksLikeDanglingToolIntent(text)
      ) {
        danglingToolIntentRetries += 1;
        appendDanglingToolIntentCorrection(input, text);
        continue;
      }
      // Emit whatever part of the final text getUnstreamedText reports as not yet delivered.
      const unstreamedText = getUnstreamedText(text, streamedRoundText);
      if (unstreamedText) {
        yield { type: "delta", text: unstreamedText };
      }
      yield {
        type: "done",
        result: {
          text,
          usage: sawUsage ? usageAcc : undefined,
          raw: { streamed: true, responses: rawResponses, toolCallsUsed: totalToolCalls, api: "responses" },
          toolEvents,
        },
      };
      return;
    }

    totalToolCalls += normalizedToolCalls.length;
    // Replay the model's output items so the function_call_output entries below have matching calls.
    input.push(...responseOutputItems);

    // Tools run sequentially; each result is reported to the consumer before the next tool starts.
    for (const call of normalizedToolCalls) {
      const { event, toolResult } = await executeToolCallAndBuildEvent(call, params);
      toolEvents.push(event);
      yield { type: "tool_call", event };
      input.push({
        type: "function_call_output",
        call_id: call.id,
        output: JSON.stringify(toolResult),
      });
    }
  }

  // Round budget exhausted without a final answer.
  yield {
    type: "done",
    result: {
      text: "I reached the tool-call limit while gathering information. Please narrow the request and try again.",
      usage: sawUsage ? usageAcc : undefined,
      raw: { streamed: true, responses: rawResponses, toolCallsUsed: totalToolCalls, toolCallLimitReached: true, api: "responses" },
      toolEvents,
    },
  };
}

/**
 * Streaming tool loop against the Chat Completions API.
 *
 * Yields `delta` events for assistant text as it arrives, a `tool_call` event after each tool
 * execution, and exactly one `done` event carrying the final result. Tool calls are assembled
 * from streamed deltas: fragments are grouped by their `index`; for backends that omit `index`,
 * a delta carrying a different non-empty id than the call currently being assembled starts a
 * new call instead of being merged into it. Runs at most `MAX_TOOL_ROUNDS` rounds; when the
 * limit is hit the `done` result carries a fixed "tool-call limit" message.
 *
 * @param params Client, model, messages and sampling settings for the completion.
 */
export async function* runToolAwareChatCompletionsStream(
  params: ToolAwareCompletionParams
): AsyncGenerator<ToolAwareStreamingEvent> {
  const conversation: any[] = normalizeIncomingMessages(params.messages);
  // Every raw chunk of every round is retained here and returned in `raw.responses`.
  const rawResponses: unknown[] = [];
  const toolEvents: ToolExecutionEvent[] = [];
  const usageAcc: Required<ToolAwareUsage> = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
  let sawUsage = false;
  let totalToolCalls = 0;
  let danglingToolIntentRetries = 0;

  for (let round = 0; round < MAX_TOOL_ROUNDS; round += 1) {
    const stream = await params.client.chat.completions.create({
      model: params.model,
      messages: conversation,
      temperature: params.temperature,
      max_tokens: params.maxTokens,
      tools: CHAT_TOOLS,
      tool_choice: "auto",
      stream: true,
      stream_options: { include_usage: true },
    } as any);

    // All text produced this round, whether or not it was forwarded to the consumer.
    let roundText = "";
    // The subset of roundText that has already been yielded as `delta` events.
    let streamedRoundText = "";
    let roundHasToolCalls = false;
    const roundToolCalls = new Map<number, { id?: string; name?: string; arguments: string }>();
    // Slot that receives deltas lacking a numeric `index`. Starts at 0, which matches the
    // previous behavior for a single index-less call streamed in fragments.
    let implicitToolCallIndex = 0;

    for await (const chunk of stream as any as AsyncIterable<any>) {
      rawResponses.push(chunk);
      sawUsage = mergeUsage(usageAcc, chunk?.usage) || sawUsage;

      const choice = chunk?.choices?.[0];
      const deltaText = choice?.delta?.content ?? "";
      if (typeof deltaText === "string" && deltaText.length) {
        roundText += deltaText;
        // Text is forwarded only until the first tool-call delta of the round shows up.
        if (!roundHasToolCalls) {
          streamedRoundText += deltaText;
          yield { type: "delta", text: deltaText };
        }
      }

      const deltaToolCalls = Array.isArray(choice?.delta?.tool_calls) ? choice.delta.tool_calls : [];
      if (deltaToolCalls.length) {
        roundHasToolCalls = true;
      }
      for (const toolCall of deltaToolCalls) {
        const deltaId = typeof toolCall?.id === "string" && toolCall.id.length ? toolCall.id : undefined;

        let idx: number;
        if (typeof toolCall?.index === "number") {
          idx = toolCall.index;
        } else {
          // Without an index, distinct calls can only be told apart by id. Merging them would
          // overwrite the id and concatenate unrelated argument fragments into invalid JSON.
          const current = roundToolCalls.get(implicitToolCallIndex);
          if (deltaId !== undefined && current?.id !== undefined && current.id !== deltaId) {
            implicitToolCallIndex = Math.max(...roundToolCalls.keys()) + 1;
          }
          idx = implicitToolCallIndex;
        }

        const entry = roundToolCalls.get(idx) ?? { arguments: "" };
        if (deltaId !== undefined) {
          entry.id = deltaId;
        }
        if (typeof toolCall?.function?.name === "string" && toolCall.function.name.length) {
          entry.name = toolCall.function.name;
        }
        if (typeof toolCall?.function?.arguments === "string" && toolCall.function.arguments.length) {
          // Argument JSON arrives in fragments; append in arrival order.
          entry.arguments += toolCall.function.arguments;
        }
        roundToolCalls.set(idx, entry);
      }
    }

    // Order calls by slot and fill in defaults for anything the stream never supplied.
    const normalizedToolCalls: NormalizedToolCall[] = [...roundToolCalls.entries()]
      .sort((a, b) => a[0] - b[0])
      .map(([_, call], index) => ({
        id: call.id ?? `tool_call_${round}_${index}`,
        name: call.name ?? "unknown_tool",
        arguments: call.arguments || "{}",
      }));

    if (!normalizedToolCalls.length) {
      // A corrective retry is only attempted when nothing from this round was already yielded.
      if (
        !streamedRoundText &&
        danglingToolIntentRetries < MAX_DANGLING_TOOL_INTENT_RETRIES &&
        looksLikeDanglingToolIntent(roundText)
      ) {
        danglingToolIntentRetries += 1;
        appendDanglingToolIntentCorrection(conversation, roundText);
        continue;
      }
      // Emit whatever part of the final text getUnstreamedText reports as not yet delivered.
      const unstreamedText = getUnstreamedText(roundText, streamedRoundText);
      if (unstreamedText) {
        yield { type: "delta", text: unstreamedText };
      }
      yield {
        type: "done",
        result: {
          text: roundText,
          usage: sawUsage ? usageAcc : undefined,
          raw: { streamed: true, responses: rawResponses, toolCallsUsed: totalToolCalls },
          toolEvents,
        },
      };
      return;
    }

    totalToolCalls += normalizedToolCalls.length;
    // Replay the assistant turn with the normalized ids so the `tool` replies below match it.
    const assistantToolCallMessage: any = {
      role: "assistant",
      tool_calls: normalizedToolCalls.map((call) => ({
        id: call.id,
        type: "function",
        function: {
          name: call.name,
          arguments: call.arguments,
        },
      })),
    };
    if (roundText) {
      assistantToolCallMessage.content = roundText;
    }
    conversation.push(assistantToolCallMessage);

    // Tools run sequentially; each result is reported to the consumer before the next tool starts.
    for (const call of normalizedToolCalls) {
      const { event, toolResult } = await executeToolCallAndBuildEvent(call, params);
      toolEvents.push(event);
      yield { type: "tool_call", event };
      conversation.push({
        role: "tool",
        tool_call_id: call.id,
        content: JSON.stringify(toolResult),
      });
    }
  }

  // Round budget exhausted without a final answer.
  yield {
    type: "done",
    result: {
      text: "I reached the tool-call limit while gathering information. Please narrow the request and try again.",
      usage: sawUsage ? usageAcc : undefined,
      raw: { streamed: true, responses: rawResponses, toolCallsUsed: totalToolCalls, toolCallLimitReached: true },
      toolEvents,
    },
  };
}