feat: Anthropic Messages API compat (/v1/messages)
Add a wire-compatible Anthropic endpoint alongside the existing OpenAI one
so Claude Code / anthropic-sdk / Cursor Agent can hit Lingma directly.
- app/anthropic_schema.py (new): request model + content-block flattener
+ internal-messages adapter + affinity key helper. Handles text / image /
tool_use / tool_result blocks; unknown types degrade gracefully.
- app/auth.py: add require_anthropic_key (x-api-key, Bearer fallback)
and AnthropicAuthError so auth failures render in Anthropic's error
envelope instead of FastAPI's {detail:...} wrapper.
- app/main.py: POST /v1/messages. Shares LingmaPool / SessionCache /
InFlightGuard / StatsCollector with the OpenAI path — same api_key +
same conversation prefix hits the same upstream sessionId across both
protocols (KV cache carries over). Streaming emits the named Anthropic
event sequence (message_start / content_block_start / content_block_delta
/ content_block_stop / message_delta / message_stop). There is no claude-*
model mapping table: claude-* model names are handled by resolve_model's
default fallback.
- README.md / DESIGN.md: document the new endpoint, add decision 5.12,
iteration history M5, and a 4.3b streaming flow diagram.
- Bump FastAPI app version to 0.4.0.
Made-with: Cursor
This commit is contained in:
165
app/anthropic_schema.py
Normal file
165
app/anthropic_schema.py
Normal file
@@ -0,0 +1,165 @@
|
||||
"""Anthropic Messages API schema + content adapters.

Why this exists
---------------
The Anthropic Messages API (`POST /v1/messages`) is wire-incompatible with
OpenAI chat completions even though it covers the same ground:

* auth: `x-api-key` header (not `Authorization: Bearer`)
* system: separate top-level field, never a message role
* content: `str` or array of typed blocks (`text`, `image`, `tool_use`, ...)
* streaming: a named-event SSE protocol (`message_start`, `content_block_delta`,
  `message_delta`, `message_stop`) rather than OpenAI's `delta.content`
* errors: `{"type":"error","error":{"type":"...","message":"..."}}`

We keep a separate schema module rather than squeezing everything into
`openai_schema.py` so both adapters stay small and auditable. Both eventually
collapse to the same Lingma prompt shape inside `main.py`.
"""

# The docstring has to be the first statement in the module to become
# `__doc__`; a `from __future__` import is still legal directly after it.
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Any, Literal
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
# Anthropic accepts either a raw string or a list of typed content blocks.
|
||||
# We keep the list loosely typed (plain dicts) so future block kinds
|
||||
# (e.g. `thinking`, `document`) don't break the gateway — they simply fall
|
||||
# into the generic flattener below.
|
||||
AnthropicContent = str | list[dict[str, Any]] | None
|
||||
|
||||
|
||||
class AnthropicMessage(BaseModel):
    # One conversation turn. Only the two conversational roles are accepted
    # here because Anthropic carries the system prompt in a top-level request
    # field rather than as a message.
    role: Literal["user", "assistant"]
    # Either a plain string or a list of typed content blocks; may be omitted.
    content: AnthropicContent = None
|
||||
|
||||
|
||||
class AnthropicMessagesRequest(BaseModel):
    # Request body accepted by the Anthropic-compatible messages endpoint.
    model: str
    # Anthropic itself treats max_tokens as mandatory. A default is supplied
    # on purpose so that clients migrating from OpenAI, which tend to omit
    # it, get a response instead of a 422 validation error.
    max_tokens: int = 1024
    messages: list[AnthropicMessage]
    # System prompt: top-level in this protocol, same string-or-blocks shape
    # as message content.
    system: AnthropicContent = None
    stream: bool = False
    # Optional sampling controls.
    temperature: float | None = None
    top_p: float | None = None
    top_k: int | None = None
    stop_sequences: list[str] | None = None
    # `metadata.user_id` is the protocol's official per-user hint (routing /
    # abuse tracking).
    metadata: dict[str, Any] | None = None
    # Tool definitions and tool_choice cannot be forwarded to Lingma yet.
    # They are still declared so a request carrying them validates; any
    # tool_use blocks inside messages are rendered by the flattener as
    # `[tool_use] {...}` text so the assistant keeps that context.
    tools: list[dict[str, Any]] | None = None
    tool_choice: dict[str, Any] | None = None
|
||||
|
||||
|
||||
def flatten_anthropic_content(content: AnthropicContent) -> str:
    """Reduce Anthropic content (string or block array) to a plain-string prompt.

    Handled block types:

    * text        -> the block's text, verbatim (non-string values are ignored)
    * image       -> ``[image]`` placeholder (Lingma has no vision)
    * tool_use    -> ``[tool_use] {json}`` carrying the tool name and input
    * tool_result -> ``[tool_result] ...``; the content may be a string or a
      nested block list (flattened recursively). A result with no usable
      content is rendered as a bare ``[tool_result]`` marker so the turn is
      not silently lost and no trailing space is emitted.
    * unknown     -> the first non-empty string found in ``text`` /
      ``content``, otherwise the block is dropped

    Args:
        content: ``None``, a plain string, or a list of content blocks.

    Returns:
        The flattened text with one line group per block, joined by newlines.
        An empty string means there was nothing to render; the prompt builder
        is expected to skip such a message rather than emit a bare role line.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        # Not a shape the protocol defines; keep whatever it stringifies to.
        return str(content)

    parts: list[str] = []
    for item in content:
        if not isinstance(item, dict):
            parts.append(str(item))
            continue

        kind = item.get("type")
        if kind == "text":
            text = item.get("text")
            # Only real strings may reach the final join; anything else
            # (numbers, dicts, ...) would make "\n".join raise TypeError.
            if isinstance(text, str) and text:
                parts.append(text)
        elif kind == "image":
            parts.append("[image]")
        elif kind == "tool_use":
            # One-line JSON keeps the prompt_tokens estimate stable.
            try:
                payload = json.dumps(
                    {"name": item.get("name"), "input": item.get("input")},
                    ensure_ascii=False,
                )
            except (TypeError, ValueError):
                # Unserializable or circular input: fall back to the repr.
                payload = str(item)
            parts.append(f"[tool_use] {payload}")
        elif kind == "tool_result":
            inner = item.get("content")
            if isinstance(inner, str):
                body = inner
            elif isinstance(inner, list):
                body = flatten_anthropic_content(inner)
            else:
                # Absent or unsupported content is treated as an empty result.
                body = ""
            parts.append(f"[tool_result] {body}" if body else "[tool_result]")
        else:
            # Unknown block kind: salvage a textual field if there is one.
            for key in ("text", "content"):
                value = item.get(key)
                if isinstance(value, str) and value:
                    parts.append(value)
                    break

    return "\n".join(p for p in parts if p)
|
||||
|
||||
|
||||
def anthropic_to_internal_messages(req: AnthropicMessagesRequest) -> list[dict]:
    """Convert an Anthropic request into the gateway's internal message list.

    The result has the shape `_messages_to_prompt` consumes:
    `[{"role": "system"|"user"|"assistant", "content": "..."}]`. Because both
    protocols end up in this one shape, session-cache hashing treats OpenAI
    and Anthropic callers alike: sending the same conversation prefix through
    either endpoint keeps the same session affinity.

    The system prompt, when it flattens to non-empty text, becomes a leading
    `system` entry. Every message in `req.messages` yields exactly one entry,
    even when its flattened content is empty.
    """
    system_text = flatten_anthropic_content(req.system) if req.system else ""
    internal: list[dict] = (
        [{"role": "system", "content": system_text}] if system_text else []
    )
    internal.extend(
        {"role": msg.role, "content": flatten_anthropic_content(msg.content)}
        for msg in req.messages
    )
    return internal
|
||||
|
||||
|
||||
def affinity_key_for_anthropic(req: AnthropicMessagesRequest) -> str | None:
    """Derive a best-effort stable routing key from an Anthropic request.

    Candidates are tried in the same order as on the OpenAI side:

    1. `metadata.user_id`, stripped, when it is a non-blank string
    2. `"sys:"` + the first 16 hex chars of the SHA-1 of the flattened
       system prompt
    3. `"first:"` + the first 16 hex chars of the SHA-1 of the flattened
       first message

    Returns `None` when none of the candidates produces any text.

    This lives next to the schema instead of in `main.py` because it depends
    on the flatten helper and the request model, which keeps `main.py`
    endpoint-shaped rather than schema-shaped.
    """
    import hashlib

    def _short_digest(prefix: str, value: str) -> str:
        return prefix + hashlib.sha1(value.encode("utf-8")).hexdigest()[:16]

    metadata = req.metadata
    if metadata:
        candidate = metadata.get("user_id")
        if isinstance(candidate, str):
            trimmed = candidate.strip()
            if trimmed:
                return trimmed

    if req.system:
        system_text = flatten_anthropic_content(req.system)
        if system_text:
            return _short_digest("sys:", system_text)

    if req.messages:
        first_text = flatten_anthropic_content(req.messages[0].content)
        if first_text:
            return _short_digest("first:", first_text)

    return None
|
||||
Reference in New Issue
Block a user