adds attachment support

This commit is contained in:
2026-05-02 19:21:06 -07:00
parent 11e6875de9
commit 38da3cea72
15 changed files with 949 additions and 67 deletions

View File

@@ -9,6 +9,7 @@ import { warmModelCatalog } from "./llm/model-catalog.js";
import { registerRoutes } from "./routes.js";
const app = Fastify({
bodyLimit: 32 * 1024 * 1024,
disableRequestLogging: true,
logger: {
transport: {

View File

@@ -4,6 +4,7 @@ import { z } from "zod";
import { env } from "../env.js";
import { exaClient } from "../search/exa.js";
import { searchSearxng } from "../search/searxng.js";
import { buildOpenAIConversationMessage } from "./message-content.js";
import type { ChatMessage } from "./types.js";
const MAX_TOOL_ROUNDS = 4;
@@ -250,23 +251,7 @@ function extractHtmlTitle(html: string) {
}
function normalizeIncomingMessages(messages: ChatMessage[]) {
const normalized = messages.map((m) => {
if (m.role === "tool") {
const name = m.name?.trim() || "tool";
return {
role: "user",
content: `Tool output (${name}):\n${m.content}`,
};
}
if (m.role === "assistant" || m.role === "system" || m.role === "user") {
const out: any = { role: m.role, content: m.content };
if (m.name && (m.role === "assistant" || m.role === "user")) {
out.name = m.name;
}
return out;
}
return { role: "user", content: m.content };
});
const normalized = messages.map((message) => buildOpenAIConversationMessage(message));
return [{ role: "system", content: CHAT_TOOL_SYSTEM_PROMPT }, ...normalized];
}

View File

@@ -0,0 +1,211 @@
import type { ChatAttachment, ChatImageAttachment, ChatMessage, ChatTextAttachment } from "./types.js";
function escapeAttribute(value: string) {
return value.replace(/"/g, """);
}
/** Return only the image attachments carried by a chat message. */
function getImageAttachments(message: ChatMessage) {
  const attachments = message.attachments ?? [];
  return attachments.filter(
    (candidate): candidate is ChatImageAttachment => candidate.kind === "image"
  );
}
/** Return only the text-file attachments carried by a chat message. */
function getTextAttachments(message: ChatMessage) {
  const attachments = message.attachments ?? [];
  return attachments.filter(
    (candidate): candidate is ChatTextAttachment => candidate.kind === "text"
  );
}
/** One-line summary naming every attached image, or null when there are none. */
function buildImageSummaryText(attachments: ChatImageAttachment[]) {
  if (attachments.length === 0) {
    return null;
  }
  const names = attachments.map((attachment) => attachment.filename).join(", ");
  const label = attachments.length === 1 ? "Attached image" : "Attached images";
  return `${label}: ${names}.`;
}
/**
 * Wrap a text attachment in an <attached_file> envelope so the model can
 * tell file content apart from the user's own prose. The filename and MIME
 * type are attribute-escaped; a truncation marker is added both in the
 * human-readable header and as an attribute.
 */
function buildTextAttachmentPrompt(attachment: ChatTextAttachment) {
  const headerSuffix = attachment.truncated ? " (content truncated)" : "";
  const truncationNote = attachment.truncated ? ' truncated="true"' : "";
  const header = `Attached text file: ${attachment.filename}${headerSuffix}`;
  const openTag =
    `<attached_file filename="${escapeAttribute(attachment.filename)}"` +
    ` mime_type="${escapeAttribute(attachment.mimeType)}"${truncationNote}>`;
  return `${header}\n${openTag}\n${attachment.text}\n</attached_file>`;
}
function toOpenAIContent(message: ChatMessage) {
const imageAttachments = getImageAttachments(message);
const textAttachments = getTextAttachments(message);
if (!imageAttachments.length && !textAttachments.length) {
return message.content;
}
const parts: Array<Record<string, unknown>> = [];
for (const attachment of imageAttachments) {
parts.push({
type: "image_url",
image_url: {
url: attachment.dataUrl,
detail: "auto",
},
});
}
const imageSummary = buildImageSummaryText(imageAttachments);
if (imageSummary) {
parts.push({ type: "text", text: imageSummary });
}
for (const attachment of textAttachments) {
parts.push({ type: "text", text: buildTextAttachmentPrompt(attachment) });
}
if (message.content.trim()) {
parts.push({ type: "text", text: message.content });
}
if (parts.length === 1 && parts[0]?.type === "text" && typeof parts[0].text === "string") {
return parts[0].text;
}
return parts;
}
/**
 * Split a PNG/JPEG data URL into its media type and raw base64 payload.
 * Throws when the URL is malformed or when its media type disagrees with
 * the attachment's declared mimeType.
 */
function parseImageDataUrl(attachment: ChatImageAttachment) {
  const dataUrlPattern = /^data:(image\/(?:png|jpeg));base64,([a-z0-9+/=\s]+)$/i;
  const match = attachment.dataUrl.match(dataUrlPattern);
  if (match === null) {
    throw new Error(`Invalid image attachment data URL for '${attachment.filename}'.`);
  }
  const mediaType = match[1].toLowerCase();
  if (mediaType !== attachment.mimeType) {
    throw new Error(`Image attachment MIME type mismatch for '${attachment.filename}'.`);
  }
  // Strip whitespace that base64 line wrapping may have introduced.
  const data = match[2].replace(/\s+/g, "");
  return { mediaType, data };
}
function toAnthropicContent(message: ChatMessage) {
const imageAttachments = getImageAttachments(message);
const textAttachments = getTextAttachments(message);
if (!imageAttachments.length && !textAttachments.length) {
return message.content;
}
const blocks: Array<Record<string, unknown>> = [];
for (const attachment of imageAttachments) {
const source = parseImageDataUrl(attachment);
blocks.push({
type: "image",
source: {
type: "base64",
media_type: source.mediaType,
data: source.data,
},
});
}
const imageSummary = buildImageSummaryText(imageAttachments);
if (imageSummary) {
blocks.push({ type: "text", text: imageSummary });
}
for (const attachment of textAttachments) {
blocks.push({ type: "text", text: buildTextAttachmentPrompt(attachment) });
}
if (message.content.trim()) {
blocks.push({ type: "text", text: message.content });
}
if (blocks.length === 1 && blocks[0]?.type === "text" && typeof blocks[0].text === "string") {
return blocks[0].text;
}
return blocks;
}
/**
 * Map an internal ChatMessage onto an OpenAI-compatible conversation
 * message. Tool outputs are re-framed as plain user messages labelled with
 * the tool name; `name` is forwarded only for assistant/user roles.
 */
export function buildOpenAIConversationMessage(message: ChatMessage) {
  if (message.role === "tool") {
    const toolName = message.name?.trim() || "tool";
    return {
      role: "user",
      content: `Tool output (${toolName}):\n${message.content}`,
    };
  }
  const result: Record<string, unknown> = {
    role: message.role,
    content: toOpenAIContent(message),
  };
  const roleAcceptsName = message.role === "assistant" || message.role === "user";
  if (message.name && roleAcceptsName) {
    result.name = message.name;
  }
  return result;
}
/** Content of the first system message, or undefined when there is none. */
export function getAnthropicSystemPrompt(messages: ChatMessage[]) {
  for (const message of messages) {
    if (message.role === "system") {
      return message.content;
    }
  }
  return undefined;
}
/**
 * Map an internal ChatMessage onto an Anthropic messages-API message.
 * System messages are rejected — callers must extract them via
 * getAnthropicSystemPrompt. Tool outputs are re-framed as labelled user
 * messages; every non-assistant role becomes "user".
 */
export function buildAnthropicConversationMessage(message: ChatMessage) {
  if (message.role === "system") {
    throw new Error("System messages must be handled separately for Anthropic.");
  }
  if (message.role === "tool") {
    const toolName = message.name?.trim() || "tool";
    return {
      role: "user",
      content: `Tool output (${toolName}):\n${message.content}`,
    };
  }
  const role = message.role === "assistant" ? "assistant" : "user";
  return { role, content: toAnthropicContent(message) };
}
/**
 * Coerce an untrusted value (e.g. stored metadata.attachments JSON) into a
 * normalized ChatAttachment[] suitable for equality comparison. Entries
 * that are not plain objects, or lack the payload field for their kind
 * (dataUrl for images, text for text files), are dropped; missing scalar
 * fields fall back to neutral defaults. Property order is kept stable so
 * JSON.stringify-based comparisons stay deterministic.
 */
export function buildComparableAttachments(input: unknown): ChatAttachment[] {
  if (!Array.isArray(input)) {
    return [];
  }
  const result: ChatAttachment[] = [];
  for (const raw of input) {
    if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
      continue;
    }
    const record = raw as Record<string, unknown>;
    const id = typeof record.id === "string" ? record.id : "";
    const filename = typeof record.filename === "string" ? record.filename : "";
    const mimeType = typeof record.mimeType === "string" ? record.mimeType : "";
    const sizeBytes = typeof record.sizeBytes === "number" ? record.sizeBytes : 0;
    if (record.kind === "image" && typeof record.dataUrl === "string") {
      result.push({
        kind: "image",
        id,
        filename,
        // Anything other than PNG normalizes to JPEG, mirroring the only
        // two MIME types the image attachment type allows.
        mimeType: mimeType === "image/png" ? "image/png" : "image/jpeg",
        sizeBytes,
        dataUrl: record.dataUrl,
      });
    } else if (record.kind === "text" && typeof record.text === "string") {
      result.push({
        kind: "text",
        id,
        filename,
        mimeType,
        sizeBytes,
        text: record.text,
        truncated: record.truncated === true,
      });
    }
  }
  return result;
}

View File

@@ -2,6 +2,7 @@ import { performance } from "node:perf_hooks";
import { prisma } from "../db.js";
import { anthropicClient, openaiClient, xaiClient } from "./providers.js";
import { buildToolLogMessageData, runToolAwareOpenAIChat } from "./chat-tools.js";
import { buildAnthropicConversationMessage, getAnthropicSystemPrompt } from "./message-content.js";
import type { MultiplexRequest, MultiplexResponse, Provider } from "./types.js";
function asProviderEnum(p: Provider) {
@@ -68,11 +69,8 @@ export async function runMultiplex(req: MultiplexRequest): Promise<MultiplexResp
} else if (req.provider === "anthropic") {
const client = anthropicClient();
// Anthropic splits system prompt. We'll convert first system message into system string.
const system = req.messages.find((m) => m.role === "system")?.content;
const msgs = req.messages
.filter((m) => m.role !== "system")
.map((m) => ({ role: m.role === "assistant" ? "assistant" : "user", content: m.content }));
const system = getAnthropicSystemPrompt(req.messages);
const msgs = req.messages.filter((message) => message.role !== "system").map((message) => buildAnthropicConversationMessage(message));
const r = await client.messages.create({
model: req.model,

View File

@@ -2,6 +2,7 @@ import { performance } from "node:perf_hooks";
import { prisma } from "../db.js";
import { anthropicClient, openaiClient, xaiClient } from "./providers.js";
import { buildToolLogMessageData, runToolAwareOpenAIChatStream, type ToolExecutionEvent } from "./chat-tools.js";
import { buildAnthropicConversationMessage, getAnthropicSystemPrompt } from "./message-content.js";
import type { MultiplexRequest, Provider } from "./types.js";
export type StreamEvent =
@@ -88,10 +89,8 @@ export async function* runMultiplexStream(req: MultiplexRequest): AsyncGenerator
} else if (req.provider === "anthropic") {
const client = anthropicClient();
const system = req.messages.find((m) => m.role === "system")?.content;
const msgs = req.messages
.filter((m) => m.role !== "system")
.map((m) => ({ role: m.role === "assistant" ? "assistant" : "user", content: m.content }));
const system = getAnthropicSystemPrompt(req.messages);
const msgs = req.messages.filter((message) => message.role !== "system").map((message) => buildAnthropicConversationMessage(message));
const stream = await client.messages.create({
model: req.model,

View File

@@ -1,9 +1,31 @@
export type Provider = "openai" | "anthropic" | "xai";
export type ChatImageAttachment = {
kind: "image";
id: string;
filename: string;
mimeType: "image/png" | "image/jpeg";
sizeBytes: number;
dataUrl: string;
};
export type ChatTextAttachment = {
kind: "text";
id: string;
filename: string;
mimeType: string;
sizeBytes: number;
text: string;
truncated?: boolean;
};
export type ChatAttachment = ChatImageAttachment | ChatTextAttachment;
export type ChatMessage = {
role: "system" | "user" | "assistant" | "tool";
content: string;
name?: string;
attachments?: ChatAttachment[];
};
export type MultiplexRequest = {

View File

@@ -4,23 +4,33 @@ import type { FastifyInstance } from "fastify";
import { prisma } from "./db.js";
import { requireAdmin } from "./auth.js";
import { env } from "./env.js";
import { buildComparableAttachments } from "./llm/message-content.js";
import { runMultiplex } from "./llm/multiplexer.js";
import { runMultiplexStream } from "./llm/streaming.js";
import { getModelCatalogSnapshot } from "./llm/model-catalog.js";
import { openaiClient } from "./llm/providers.js";
import { exaClient } from "./search/exa.js";
import type { ChatAttachment } from "./llm/types.js";
type IncomingChatMessage = {
role: "system" | "user" | "assistant" | "tool";
content: string;
name?: string;
attachments?: ChatAttachment[];
};
function sameMessage(
a: { role: string; content: string; name?: string | null },
b: { role: string; content: string; name?: string | null }
a: { role: string; content: string; name?: string | null; metadata?: unknown },
b: { role: string; content: string; name?: string | null; attachments?: ChatAttachment[] }
) {
return a.role === b.role && a.content === b.content && (a.name ?? null) === (b.name ?? null);
const existingAttachments = JSON.stringify(buildComparableAttachments((a.metadata as Record<string, unknown> | null)?.attachments ?? null));
const incomingAttachments = JSON.stringify(b.attachments ?? []);
return (
a.role === b.role &&
a.content === b.content &&
(a.name ?? null) === (b.name ?? null) &&
existingAttachments === incomingAttachments
);
}
function isToolCallLogMetadata(value: unknown) {
@@ -60,10 +70,67 @@ async function storeNonAssistantMessages(chatId: string, messages: IncomingChatM
role: m.role as any,
content: m.content,
name: m.name,
metadata: m.attachments?.length ? ({ attachments: m.attachments } as any) : undefined,
})),
});
}
const MAX_CHAT_ATTACHMENTS = 8;
const MAX_IMAGE_ATTACHMENT_BYTES = 6 * 1024 * 1024;
const MAX_TEXT_ATTACHMENT_CHARS = 200_000;
const MAX_IMAGE_DATA_URL_CHARS = 8_500_000;
const ChatAttachmentSchema = z.discriminatedUnion("kind", [
z.object({
kind: z.literal("image"),
id: z.string().trim().min(1).max(128),
filename: z.string().trim().min(1).max(255),
mimeType: z.enum(["image/png", "image/jpeg"]),
sizeBytes: z.number().int().positive().max(MAX_IMAGE_ATTACHMENT_BYTES),
dataUrl: z
.string()
.max(MAX_IMAGE_DATA_URL_CHARS)
.regex(/^data:image\/(?:png|jpeg);base64,[a-z0-9+/=\s]+$/i, "Invalid image data URL"),
}),
z.object({
kind: z.literal("text"),
id: z.string().trim().min(1).max(128),
filename: z.string().trim().min(1).max(255),
mimeType: z.string().trim().min(1).max(127),
sizeBytes: z.number().int().positive().max(8 * 1024 * 1024),
text: z.string().max(MAX_TEXT_ATTACHMENT_CHARS),
truncated: z.boolean().optional(),
}),
]);
const CompletionMessageSchema = z
.object({
role: z.enum(["system", "user", "assistant", "tool"]),
content: z.string(),
name: z.string().optional(),
attachments: z.array(ChatAttachmentSchema).max(MAX_CHAT_ATTACHMENTS).optional(),
})
.superRefine((value, ctx) => {
if (value.attachments?.length && value.role === "tool") {
ctx.addIssue({
code: z.ZodIssueCode.custom,
message: "Tool messages cannot include attachments.",
path: ["attachments"],
});
}
});
/**
 * Fold an attachments array into a message's metadata object. Returns the
 * metadata untouched when there are no attachments; replaces non-object
 * (or array) metadata entirely; otherwise shallow-merges, with the new
 * attachments overwriting any existing `attachments` key.
 */
function mergeAttachmentsIntoMetadata(metadata: unknown, attachments?: ChatAttachment[]) {
  if (!attachments?.length) {
    return metadata as any;
  }
  const isMergeableObject =
    typeof metadata === "object" && metadata !== null && !Array.isArray(metadata);
  if (!isMergeableObject) {
    return { attachments };
  }
  return { ...(metadata as Record<string, unknown>), attachments };
}
const SearchRunBody = z.object({
query: z.string().trim().min(1).optional(),
title: z.string().trim().min(1).optional(),
@@ -768,6 +835,7 @@ export async function registerRoutes(app: FastifyInstance) {
content: z.string(),
name: z.string().optional(),
metadata: z.unknown().optional(),
attachments: z.array(ChatAttachmentSchema).max(MAX_CHAT_ATTACHMENTS).optional(),
});
const { chatId } = Params.parse(req.params);
@@ -779,7 +847,7 @@ export async function registerRoutes(app: FastifyInstance) {
role: body.role as any,
content: body.content,
name: body.name,
metadata: body.metadata as any,
metadata: mergeAttachmentsIntoMetadata(body.metadata, body.attachments) as any,
},
});
@@ -794,13 +862,7 @@ export async function registerRoutes(app: FastifyInstance) {
chatId: z.string().optional(),
provider: z.enum(["openai", "anthropic", "xai"]),
model: z.string().min(1),
messages: z.array(
z.object({
role: z.enum(["system", "user", "assistant", "tool"]),
content: z.string(),
name: z.string().optional(),
})
),
messages: z.array(CompletionMessageSchema),
temperature: z.number().min(0).max(2).optional(),
maxTokens: z.number().int().positive().optional(),
});
@@ -834,13 +896,7 @@ export async function registerRoutes(app: FastifyInstance) {
chatId: z.string().optional(),
provider: z.enum(["openai", "anthropic", "xai"]),
model: z.string().min(1),
messages: z.array(
z.object({
role: z.enum(["system", "user", "assistant", "tool"]),
content: z.string(),
name: z.string().optional(),
})
),
messages: z.array(CompletionMessageSchema),
temperature: z.number().min(0).max(2).optional(),
maxTokens: z.number().int().positive().optional(),
});