feat: Anthropic Messages API compat (/v1/messages)

Add a wire-compatible Anthropic endpoint alongside the existing OpenAI one so Claude Code / anthropic-sdk / Cursor Agent can hit Lingma directly.

- app/anthropic_schema.py (new): request model + content-block flattener + internal-messages adapter + affinity key helper. Handles text / image / tool_use / tool_result blocks; unknown types degrade gracefully.
- app/auth.py: add require_anthropic_key (x-api-key, Bearer fallback) and AnthropicAuthError so auth failures render in Anthropic's error envelope instead of FastAPI's {detail:...} wrapper.
- app/main.py: POST /v1/messages. Shares LingmaPool / SessionCache / InFlightGuard / StatsCollector with the OpenAI path — same api_key + same conversation prefix hits the same upstream sessionId across both protocols (KV cache carries over). Streaming emits the named Anthropic event sequence (message_start / content_block_start / content_block_delta / content_block_stop / message_delta / message_stop). No claude-* model mapping table: resolve_model's default fallback handles it.
- README.md / DESIGN.md: document the new endpoint, add decision 5.12, iteration history M5, and a 4.3b streaming flow diagram.
- Bump FastAPI app version to 0.4.0.

Made-with: Cursor
2026-04-18 15:40:43 +08:00
parent d9dffbb8ba
commit 0b08dc6573
5 changed files with 716 additions and 3 deletions
--- a/app/anthropic_schema.py
+++ b/app/anthropic_schema.py
@@ -0,0 +1,165 @@
+"""Anthropic Messages API schema + content adapters.

+Why this exists
+---------------
+The Anthropic Messages API (`POST /v1/messages`) is wire-incompatible with
+OpenAI chat completions even though it covers the same ground:

+* auth:     `x-api-key` header (not `Authorization: Bearer`)
+* system:   separate top-level field, never a message role
+* content:  `str` or array of typed blocks (`text`, `image`, `tool_use`, ...)
+* streaming: a named-event SSE protocol (`message_start`, `content_block_delta`,
+             `message_delta`, `message_stop`) rather than OpenAI's `delta.content`
+* errors:   `{"type":"error","error":{"type":"...","message":"..."}}`

+We keep a separate schema module rather than squeezing everything into
+`openai_schema.py` so both adapters stay small and auditable. Both eventually
+collapse to the same Lingma prompt shape inside `main.py`.
+"""

+# The docstring must be the first statement in the file to become the module's
+# `__doc__`; a `from __future__` import is still legal directly after it.
+from __future__ import annotations
+
+import json
+from typing import Any, Literal
+
+from pydantic import BaseModel
+
+
+# Shared type for the request-level `system` field and each message's `content`.
+# Anthropic accepts either a raw string or a list of typed content blocks;
+# `None` is additionally allowed here so both fields may be omitted.
+# We keep the list loosely typed (plain dicts) so future block kinds
+# (e.g. `thinking`, `document`) don't break the gateway — unrecognized types
+# are handled by the fallback branch of `flatten_anthropic_content` below.
+AnthropicContent = str | list[dict[str, Any]] | None
+
+
+class AnthropicMessage(BaseModel):
+    # One conversation turn of an Anthropic Messages request.
+    #
+    # Anthropic: system is a top-level field, messages only carry user/assistant.
+    # The Literal restricts `role` to exactly those two values.
+    role: Literal["user", "assistant"]
+    # Raw string, list of block dicts, or omitted (defaults to None).
+    content: AnthropicContent = None
+
+
+class AnthropicMessagesRequest(BaseModel):
+    # Request body model for the Anthropic-compatible endpoint. Only `model`
+    # and `messages` are mandatory; every other field carries a default.
+    #
+    # Model name exactly as sent by the client.
+    model: str
+    # max_tokens is REQUIRED by Anthropic. We default to a sane value so callers
+    # that forget it don't 422 — easier migration from OpenAI clients.
+    max_tokens: int = 1024
+    # Conversation turns (user / assistant only — see AnthropicMessage).
+    messages: list[AnthropicMessage]
+    # Top-level system prompt: raw string, list of block dicts, or absent.
+    system: AnthropicContent = None
+    # Whether the client asked for a streamed response; defaults to False.
+    stream: bool = False
+    # Optional sampling / stopping parameters; each defaults to None (unset).
+    temperature: float | None = None
+    top_p: float | None = None
+    top_k: int | None = None
+    stop_sequences: list[str] | None = None
+    # metadata.user_id is the official hint for per-user routing / abuse tracking.
+    # `affinity_key_for_anthropic` in this module reads it as its first choice.
+    metadata: dict[str, Any] | None = None
+    # Tools / tool_choice are accepted but we can't forward them to Lingma yet —
+    # they're preserved here so the request doesn't 422, and the flattener
+    # surfaces any tool_use blocks as `[tool_use] {...}` text so the assistant
+    # still sees the context.
+    tools: list[dict[str, Any]] | None = None
+    tool_choice: dict[str, Any] | None = None
+
+
+def flatten_anthropic_content(content: AnthropicContent) -> str:
+    """Reduce Anthropic content (string or block array) to a plain-text prompt.

+    Handled block types:
+      * text          -> verbatim text
+      * image         -> `[image]` placeholder (Lingma has no vision)
+      * tool_use      -> `[tool_use] {json}` carrying the tool name and input
+      * tool_result   -> `[tool_result] ...` (string or nested blocks; any
+                         other payload shape is dropped)
+      * unknown       -> fall back to a string `.text` / `.content`, else drop

+    Args:
+        content: `None`, a raw string, or a list of block dicts. Blocks are
+            not schema-validated, so individual fields may hold unexpected
+            types; list entries that are not dicts are stringified.

+    Returns:
+        The surviving pieces joined with newlines. An empty string means
+        nothing usable was found; per the original design note the prompt
+        builder then skips the message rather than emit a bare `[role] ` line.
+    """
+    if content is None:
+        return ""
+    if isinstance(content, str):
+        return content
+    if not isinstance(content, list):
+        return str(content)

+    parts: list[str] = []
+    for item in content:
+        if not isinstance(item, dict):
+            parts.append(str(item))
+            continue
+        block_type = item.get("type")
+        if block_type == "text":
+            text = item.get("text")
+            if text:
+                # Blocks are unvalidated dicts, so `text` may be a non-string
+                # (e.g. a number). Coerce it so the final join cannot raise
+                # TypeError and turn a malformed request into a server error.
+                parts.append(text if isinstance(text, str) else str(text))
+        elif block_type == "image":
+            parts.append("[image]")
+        elif block_type == "tool_use":
+            # Single-line JSON keeps the prompt_tokens estimate stable.
+            try:
+                payload = json.dumps(
+                    {"name": item.get("name"), "input": item.get("input")},
+                    ensure_ascii=False,
+                )
+            except (TypeError, ValueError):
+                # json.dumps raises TypeError for unserializable values and
+                # ValueError for circular references; fall back to the repr.
+                payload = str(item)
+            parts.append(f"[tool_use] {payload}")
+        elif block_type == "tool_result":
+            inner = item.get("content")
+            if isinstance(inner, str):
+                parts.append(f"[tool_result] {inner}")
+            elif isinstance(inner, list):
+                # Nested block arrays are flattened with the same rules.
+                parts.append(f"[tool_result] {flatten_anthropic_content(inner)}")
+        else:
+            fallback = item.get("text") or item.get("content")
+            if isinstance(fallback, str) and fallback:
+                parts.append(fallback)
+    return "\n".join(p for p in parts if p)
+
+
+def anthropic_to_internal_messages(req: AnthropicMessagesRequest) -> list[dict]:
+    """Convert an Anthropic request into the gateway's flat message list.

+    The result is a list of `{"role": ..., "content": "<plain text>"}` dicts:
+    an optional leading `system` entry (emitted only when the top-level
+    `system` field flattens to non-empty text) followed by one entry per
+    request message, in order. Messages whose content flattens to an empty
+    string are still included.

+    The shape is intended to match what `_messages_to_prompt` (defined outside
+    this module) consumes, so that OpenAI and Anthropic callers sending the
+    same conversation prefix produce the same internal messages.
+    """
+    system_text = flatten_anthropic_content(req.system) if req.system else ""
+    converted: list[dict] = (
+        [{"role": "system", "content": system_text}] if system_text else []
+    )
+    converted.extend(
+        {"role": turn.role, "content": flatten_anthropic_content(turn.content)}
+        for turn in req.messages
+    )
+    return converted
+
+
+def affinity_key_for_anthropic(req: AnthropicMessagesRequest) -> str | None:
+    """Derive a best-effort stable routing key from an Anthropic request.

+    Candidates are tried in order and the first usable one wins:
+      1. `metadata.user_id`, stripped of surrounding whitespace, returned as-is
+      2. `"sys:"` + first 16 hex chars of the SHA-1 of the flattened system prompt
+      3. `"first:"` + first 16 hex chars of the SHA-1 of the flattened first message

+    Returns `None` when none of the three yields non-empty text. The ordering
+    is meant to mirror the OpenAI-side helper, which lives outside this module.
+    """
+    import hashlib

+    def _short_digest(tag: str, text: str) -> str:
+        # Prefix + truncated hex digest of the UTF-8 encoded text.
+        return tag + hashlib.sha1(text.encode("utf-8")).hexdigest()[:16]

+    raw_user = req.metadata.get("user_id") if req.metadata else None
+    if isinstance(raw_user, str):
+        cleaned = raw_user.strip()
+        if cleaned:
+            return cleaned

+    system_text = flatten_anthropic_content(req.system) if req.system else ""
+    if system_text:
+        return _short_digest("sys:", system_text)

+    opening_text = (
+        flatten_anthropic_content(req.messages[0].content) if req.messages else ""
+    )
+    if opening_text:
+        return _short_digest("first:", opening_text)
+    return None