feat: Anthropic Messages API compat (/v1/messages)

Add a wire-compatible Anthropic endpoint alongside the existing OpenAI one
so Claude Code / anthropic-sdk / Cursor Agent can hit Lingma directly.

- app/anthropic_schema.py (new): request model + content-block flattener
  + internal-messages adapter + affinity key helper. Handles text / image /
  tool_use / tool_result blocks; unknown types degrade gracefully.
- app/auth.py: add require_anthropic_key (x-api-key, Bearer fallback)
  and AnthropicAuthError so auth failures render in Anthropic's error
  envelope instead of FastAPI's {detail:...} wrapper.
- app/main.py: POST /v1/messages. Shares LingmaPool / SessionCache /
  InFlightGuard / StatsCollector with the OpenAI path — same api_key +
  same conversation prefix hits the same upstream sessionId across both
  protocols (KV cache carries over). Streaming emits the named Anthropic
  event sequence (message_start / content_block_start / content_block_delta
  / content_block_stop / message_delta / message_stop). No claude-*
  model mapping table: resolve_model's default fallback handles it.
- README.md / DESIGN.md: document the new endpoint, add decision 5.12,
  iteration history M5, and a 4.3b streaming flow diagram.
- Bump FastAPI app version to 0.4.0.

Made-with: Cursor
This commit is contained in:
GitHub Actions
2026-04-18 15:40:43 +08:00
parent d9dffbb8ba
commit 0b08dc6573
5 changed files with 716 additions and 3 deletions

165
app/anthropic_schema.py Normal file
View File

@@ -0,0 +1,165 @@
from __future__ import annotations
"""Anthropic Messages API schema + content adapters.
Why this exists
---------------
The Anthropic Messages API (`POST /v1/messages`) is wire-incompatible with
OpenAI chat completions even though it covers the same ground:
* auth: `x-api-key` header (not `Authorization: Bearer`)
* system: separate top-level field, never a message role
* content: `str` or array of typed blocks (`text`, `image`, `tool_use`, ...)
* streaming: a named-event SSE protocol (`message_start`, `content_block_delta`,
`message_delta`, `message_stop`) rather than OpenAI's `delta.content`
* errors: `{"type":"error","error":{"type":"...","message":"..."}}`
We keep a separate schema module rather than squeezing everything into
`openai_schema.py` so both adapters stay small and auditable. Both eventually
collapse to the same Lingma prompt shape inside `main.py`.
"""
import json
from typing import Any, Literal
from pydantic import BaseModel
# Anthropic accepts either a raw string or a list of typed content blocks.
# We keep the list loosely typed (plain dicts) so future block kinds
# (e.g. `thinking`, `document`) don't break the gateway — they simply fall
# into the generic flattener below.
# `None` is part of the union so that fields typed with this alias can
# default to "absent" instead of failing validation when omitted.
AnthropicContent = str | list[dict[str, Any]] | None
class AnthropicMessage(BaseModel):
    # One conversation turn of an Anthropic Messages request.
    # Anthropic: system is a top-level field, messages only carry user/assistant.
    role: Literal["user", "assistant"]
    # Either a plain string or a list of block dicts; None when omitted.
    content: AnthropicContent = None
class AnthropicMessagesRequest(BaseModel):
    # Request body model for `POST /v1/messages`.
    # Model name as sent by the caller (only `model` and `messages` have no
    # default, so they are the two required fields).
    model: str
    # max_tokens is REQUIRED by Anthropic. We default to a sane value so callers
    # that forget it don't 422 — easier migration from OpenAI clients.
    max_tokens: int = 1024
    # Conversation turns; each role is limited to user/assistant.
    messages: list[AnthropicMessage]
    # Top-level system prompt: plain string or block list, same union as
    # message content.
    system: AnthropicContent = None
    # When true the caller expects an SSE stream instead of one JSON body.
    stream: bool = False
    # Sampling parameters: accepted as optional fields with no default value.
    temperature: float | None = None
    top_p: float | None = None
    top_k: int | None = None
    stop_sequences: list[str] | None = None
    # metadata.user_id is the official hint for per-user routing / abuse tracking.
    metadata: dict[str, Any] | None = None
    # Tools / tool_choice are accepted but we can't forward them to Lingma yet —
    # they're preserved here so the request doesn't 422, and the flattener
    # surfaces any tool_use blocks as `[tool_use] {...}` text so the assistant
    # still sees the context.
    tools: list[dict[str, Any]] | None = None
    tool_choice: dict[str, Any] | None = None
def flatten_anthropic_content(content: AnthropicContent) -> str:
    """Reduce Anthropic content (string or block list) to one plain string.

    Handled block types:

    * text        -> the block's text (non-string values are stringified)
    * image       -> `[image]` placeholder (Lingma has no vision)
    * tool_use    -> `[tool_use] {json}` with the tool name and input
    * tool_result -> `[tool_result] ...` (string or nested block list)
    * unknown     -> the block's string `.text` / `.content` if present,
                     otherwise the block is dropped

    Args:
        content: `None`, a string, or a list of block dicts. Anything else
            is stringified as-is.

    Returns:
        The rendered parts joined with newlines; an empty string when there
        is nothing to render, so callers can detect an empty message.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return str(content)

    parts: list[str] = []
    for item in content:
        if not isinstance(item, dict):
            parts.append(str(item))
            continue

        block_type = item.get("type")
        if block_type == "text":
            text = item.get("text")
            if text:
                # Blocks are untyped dicts, so `text` may be a number or other
                # JSON value. Stringify it: a non-string part would make the
                # final join raise TypeError and fail the whole request.
                parts.append(text if isinstance(text, str) else str(text))
        elif block_type == "image":
            parts.append("[image]")
        elif block_type == "tool_use":
            # Compact one-line JSON keeps prompt_tokens estimate stable.
            try:
                payload = json.dumps(
                    {"name": item.get("name"), "input": item.get("input")},
                    ensure_ascii=False,
                )
            except Exception:
                # Deliberate best-effort: an unserialisable input must not
                # fail the request, so fall back to the block's repr.
                payload = str(item)
            parts.append(f"[tool_use] {payload}")
        elif block_type == "tool_result":
            inner = item.get("content")
            if isinstance(inner, str):
                parts.append(f"[tool_result] {inner}")
            elif isinstance(inner, list):
                # Nested block lists are rendered by recursing.
                parts.append(f"[tool_result] {flatten_anthropic_content(inner)}")
            # Any other `content` value (missing, None, ...) drops the block.
        else:
            fallback = item.get("text") or item.get("content")
            if isinstance(fallback, str) and fallback:
                parts.append(fallback)

    return "\n".join(part for part in parts if part)
def anthropic_to_internal_messages(req: AnthropicMessagesRequest) -> list[dict]:
    """Convert an Anthropic request into the gateway's internal message list.

    The result is a list of `{"role": ..., "content": "<flattened text>"}`
    dicts: an optional leading `system` entry (only when the system prompt
    flattens to non-empty text) followed by one entry per request message,
    in order. Message entries are always emitted, even when their flattened
    content is an empty string.

    Because the shape matches what `_messages_to_prompt` consumes, OpenAI and
    Anthropic callers sending the same conversation prefix hash to the same
    session-cache key.
    """
    system_text = flatten_anthropic_content(req.system) if req.system else ""

    internal: list[dict] = []
    if system_text:
        internal.append({"role": "system", "content": system_text})

    internal.extend(
        {"role": turn.role, "content": flatten_anthropic_content(turn.content)}
        for turn in req.messages
    )
    return internal
def affinity_key_for_anthropic(req: AnthropicMessagesRequest) -> str | None:
    """Derive a best-effort stable routing key from an Anthropic request.

    Candidates are tried in this order and the first usable one wins:

    1. `metadata.user_id`, stripped, when it is a non-blank string
    2. `"sys:"` + first 16 hex chars of the SHA-1 of the flattened system prompt
    3. `"first:"` + first 16 hex chars of the SHA-1 of the flattened first message

    Returns `None` when none of the candidates yields any text. The helper
    lives next to the schema because it needs both the request type and the
    content flattener.
    """
    import hashlib

    def _tagged_digest(tag: str, flattened: str) -> str:
        # Short, stable fingerprint of the text, prefixed with its origin.
        return tag + hashlib.sha1(flattened.encode("utf-8")).hexdigest()[:16]

    user_id = req.metadata.get("user_id") if req.metadata else None
    if isinstance(user_id, str):
        trimmed = user_id.strip()
        if trimmed:
            return trimmed

    if req.system:
        system_text = flatten_anthropic_content(req.system)
        if system_text:
            return _tagged_digest("sys:", system_text)

    if req.messages:
        first_text = flatten_anthropic_content(req.messages[0].content)
        if first_text:
            return _tagged_digest("first:", first_text)

    return None

View File

@@ -98,6 +98,58 @@ def require_metrics_access(
)
class AnthropicAuthError(Exception):
    """Authentication failure on an Anthropic Messages request.

    The exception keeps the HTTP status, the Anthropic error type and the
    human-readable message so the endpoint can build the Anthropic error
    envelope (`{"type":"error","error":{"type":..., "message":...}}`).
    `HTTPException` is avoided on purpose: FastAPI would nest the payload
    under `{"detail": ...}`, which is not the Anthropic wire format.
    """

    def __init__(self, status_code: int, error_type: str, message: str) -> None:
        # Only the message goes to the base class, so str(exc) is the message.
        super().__init__(message)
        self.message = message
        self.error_type = error_type
        self.status_code = status_code
def require_anthropic_key(request: Request, api_keys: list[str]) -> None:
    """Check the credentials of a `POST /v1/messages` request.

    The token is taken from the first source that provides one:

    1. the `x-api-key` header (official Anthropic SDK / CLI / Claude Code)
    2. `Authorization: Bearer <token>` (OpenAI-shaped clients / curl)

    An empty `api_keys` list disables authentication entirely and the
    function returns immediately. The `anthropic-version` header is neither
    parsed nor validated here, so any client version is accepted.

    Raises:
        AnthropicAuthError: with status 401 and type `authentication_error`
            when no token is supplied or the token matches none of the keys.
    """
    if not api_keys:
        return

    headers = request.headers
    token = headers.get("x-api-key", "").strip()
    if not token:
        # Fall back to the OpenAI-style bearer header.
        bearer_prefix = "Bearer "
        authorization = headers.get("authorization", "")
        if authorization.startswith(bearer_prefix):
            token = authorization[len(bearer_prefix):].strip()

    if not token:
        raise AnthropicAuthError(
            status.HTTP_401_UNAUTHORIZED,
            "authentication_error",
            "missing x-api-key header (or Authorization: Bearer ...)",
        )

    if _match_any(token, api_keys):
        return

    raise AnthropicAuthError(
        status.HTTP_401_UNAUTHORIZED,
        "authentication_error",
        "invalid x-api-key",
    )
def require_admin_access(
request: Request,
api_keys: list[str],

View File

@@ -10,7 +10,18 @@ from contextlib import asynccontextmanager
from fastapi import Depends, FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse, StreamingResponse
from .auth import require_admin_access, require_bearer, require_metrics_access
from .anthropic_schema import (
AnthropicMessagesRequest,
affinity_key_for_anthropic,
anthropic_to_internal_messages,
)
from .auth import (
AnthropicAuthError,
require_admin_access,
require_anthropic_key,
require_bearer,
require_metrics_access,
)
from .concurrency import BackpressureRejected, InFlightGuard
from .config import Settings, load_settings
from .lingma_pool import LingmaPool, PoolInstance
@@ -85,7 +96,24 @@ async def lifespan(_app: FastAPI):
await pool.close()
app = FastAPI(title="Lingma OpenAI Gateway", version="0.3.0", lifespan=lifespan)
app = FastAPI(title="Lingma OpenAI Gateway", version="0.4.0", lifespan=lifespan)
@app.exception_handler(AnthropicAuthError)
async def _anthropic_auth_error_handler(_request: Request, exc: AnthropicAuthError):
    """Turn an `AnthropicAuthError` into an Anthropic-format JSON response.

    FastAPI's default error rendering nests the payload under
    `{"detail": ...}`, which Anthropic SDKs don't parse. This handler emits
    the canonical `{"type":"error","error":{"type":"...","message":"..."}}`
    body with the status code carried by the exception.
    """
    error_body = {"type": exc.error_type, "message": exc.message}
    envelope = {"type": "error", "error": error_body}
    return JSONResponse(status_code=exc.status_code, content=envelope)
@app.middleware("http")
@@ -594,6 +622,356 @@ async def v1_chat_completions(req: ChatCompletionsRequest, request: Request):
ticket.release()
def _anthropic_error(status_code: int, error_type: str, message: str) -> JSONResponse:
    """Return a JSON response carrying Anthropic's `type: error` envelope.

    The body is `{"type": "error", "error": {"type": error_type,
    "message": message}}` and the HTTP status is `status_code`.
    """
    error_detail = {"type": error_type, "message": message}
    envelope = {"type": "error", "error": error_detail}
    return JSONResponse(status_code=status_code, content=envelope)
def _anthropic_stop_reason(completion_tokens: int, max_tokens: int) -> str:
"""Approximate Anthropic `stop_reason`.
Lingma doesn't expose a `max_tokens` knob, so we can't truly enforce it;
we report `max_tokens` only when the generated length meets or exceeds
the caller's stated ceiling. Everything else is `end_turn`.
"""
if max_tokens and completion_tokens >= max_tokens:
return "max_tokens"
return "end_turn"
@app.post("/v1/messages")
async def v1_messages(req: AnthropicMessagesRequest, request: Request):
    """Anthropic Messages API compatible endpoint.

    Wire contract:

    * auth:   `x-api-key` header (fallback Authorization: Bearer)
    * body:   Anthropic Messages spec (system top-level, content blocks, ...)
    * stream: named-event SSE (message_start / content_block_delta / ...)

    Internally we:

    1. Normalise to the gateway's internal message list (`role/content` dicts)
    2. Reuse the same pool pick + session cache + backpressure guard as
       `/v1/chat/completions`. Session-cache keys include the API key, so
       Anthropic and OpenAI callers on the same key share KV-cache warmth.
    3. Re-wrap outputs in Anthropic's response / SSE format.

    Returns:
        A `StreamingResponse` of Anthropic SSE events when `req.stream` is
        true, otherwise a `JSONResponse` with the Anthropic message body.
        Failures handled here (auth, pool not ready, login, empty prompt,
        backpressure, upstream chat error) are returned as Anthropic-shaped
        error responses rather than raised.
    """
    # ------------------------------------------------------------- auth
    try:
        require_anthropic_key(request, settings.api_keys)
    except AnthropicAuthError as exc:
        return _anthropic_error(exc.status_code, exc.error_type, exc.message)

    # ------------------------------------------------------------- plumbing
    try:
        p = _require_pool()
    except HTTPException as exc:
        return _anthropic_error(exc.status_code, "overloaded_error", "gateway not ready")

    messages_dump = anthropic_to_internal_messages(req)
    # Prefer the auth token actually accepted so session-cache bucketing is
    # consistent regardless of which auth header style the caller used.
    api_key = (
        request.headers.get("x-api-key", "").strip()
        or _extract_api_key(request)
        or "-"
    )

    # ------------------------------------------------------------- session reuse
    # Anthropic clients don't expose an ask_mode, so we always run in "chat".
    ask_mode = "chat"
    reuse_eligible = (
        session_cache.enabled and ask_mode == "chat" and len(messages_dump) >= 2
    )
    lookup_key: str | None = None
    write_key: str | None = None
    cached_session_id: str | None = None
    cached_instance_name: str | None = None
    if reuse_eligible:
        # Look up by the conversation *prefix* (everything but the newest
        # message); write back under the full conversation.
        lookup_key = session_cache.build_key(api_key, messages_dump[:-1])
        write_key = session_cache.build_key(api_key, messages_dump)
        entry = await session_cache.get(lookup_key)
        if entry is not None:
            cached_session_id = entry.session_id
            cached_instance_name = entry.instance_name or None

    affinity = cached_instance_name or affinity_key_for_anthropic(req)
    inst = p.pick(affinity_key=affinity)
    if cached_instance_name and inst.name != cached_instance_name:
        # The pool handed us a different instance than the cached one, so the
        # cached upstream session cannot be reused.
        logger.info(
            "anthropic session cache instance %s unhealthy, falling back to %s",
            cached_instance_name,
            inst.name,
        )
        cached_session_id = None
        if lookup_key:
            await session_cache.invalidate(lookup_key)

    try:
        await _ensure_instance_logged_in(inst)
    except HTTPException as exc:
        # 503/401/502 from login: map to closest Anthropic kind.
        err_type = "authentication_error" if exc.status_code == 401 else "overloaded_error"
        # `exc.detail` may be a dict shaped like {"error": {"message": ...}},
        # some other dict, a plain string, or empty. Use the nested message
        # when present, otherwise the detail itself; only a truly empty
        # detail falls back to the generic text (never the literal "{}").
        detail = exc.detail
        msg = None
        if isinstance(detail, dict):
            nested = detail.get("error")
            if isinstance(nested, dict):
                msg = nested.get("message")
        if not msg and detail:
            msg = str(detail)
        return _anthropic_error(exc.status_code, err_type, msg or "upstream error")

    # ------------------------------------------------------------- prompt & model
    models = await inst.client.query_models()
    available = flatten_model_keys(models)
    name_map = build_model_name_map(models)
    # Anthropic callers send `claude-*` model names. resolve_model's
    # final fallback (default_model / first available) handles that cleanly
    # without us having to hard-code a mapping table.
    model = resolve_model(req.model, available, settings.default_model, name_map)

    if cached_session_id:
        # Continuing a cached upstream session: send only the newest user text.
        prompt = _last_user_text(messages_dump)
        is_reply = True
    else:
        prompt = _messages_to_prompt(messages_dump)
        is_reply = False
    if not prompt:
        return _anthropic_error(400, "invalid_request_error", "messages is empty")
    prompt_tokens = estimate_tokens(prompt)

    # ------------------------------------------------------------- backpressure
    try:
        ticket = await chat_guard.try_acquire()
    except BackpressureRejected as exc:
        retry_after = max(1, int(exc.retry_after))
        logger.warning("anthropic rejected by backpressure, retry_after=%ds", retry_after)
        resp = _anthropic_error(
            429,
            "overloaded_error",
            "too many in-flight requests, please retry later",
        )
        resp.headers["Retry-After"] = str(retry_after)
        return resp

    inst.in_flight += 1
    message_id = f"msg_{uuid.uuid4().hex}"
    logger.info(
        "anthropic.start inst=%s model=%s stream=%s prompt_tokens~%d reuse=%s",
        inst.name,
        model,
        req.stream,
        prompt_tokens,
        bool(cached_session_id),
        extra={
            "ctx_instance": inst.name,
            "ctx_model": model,
            "ctx_ask_mode": ask_mode,
            "ctx_stream": req.stream,
            "ctx_prompt_tokens": prompt_tokens,
            "ctx_in_flight": chat_guard.in_flight,
            "ctx_affinity": affinity,
            "ctx_session_reuse": bool(cached_session_id),
            "ctx_api": "anthropic",
        },
    )

    # Set once the streaming generator has taken over responsibility for
    # releasing the ticket and the in-flight counter.
    ticket_transferred = False

    def _sse(event: str, data: dict) -> str:
        # One named SSE frame: `event:` line, JSON `data:` line, blank line.
        return f"event: {event}\ndata: {json.dumps(data, ensure_ascii=False)}\n\n"

    try:
        if req.stream:
            completion_tokens_holder = {"n": 0}
            stream_meta: dict = {}
            max_tokens = req.max_tokens

            async def event_stream(_ticket=ticket, _inst=inst, _meta=stream_meta):
                success = False
                try:
                    # 1) message_start — Anthropic SDKs read this first to get
                    #    the message envelope (id/model/initial usage).
                    start_payload = {
                        "type": "message_start",
                        "message": {
                            "id": message_id,
                            "type": "message",
                            "role": "assistant",
                            "model": model,
                            "content": [],
                            "stop_reason": None,
                            "stop_sequence": None,
                            # input_tokens is authoritative here; output_tokens
                            # is seeded to 0 and updated in message_delta.
                            "usage": {
                                "input_tokens": prompt_tokens,
                                "output_tokens": 0,
                            },
                        },
                    }
                    yield _sse("message_start", start_payload)

                    # 2) content_block_start for a single text block (index 0).
                    yield _sse(
                        "content_block_start",
                        {
                            "type": "content_block_start",
                            "index": 0,
                            "content_block": {"type": "text", "text": ""},
                        },
                    )

                    # 3) content_block_delta stream of text tokens.
                    async for chunk in _inst.client.chat_stream(
                        prompt,
                        model,
                        ask_mode,
                        session_id=cached_session_id,
                        is_reply=is_reply,
                        out_meta=_meta,
                    ):
                        if not chunk:
                            continue
                        completion_tokens_holder["n"] += estimate_tokens(chunk)
                        yield _sse(
                            "content_block_delta",
                            {
                                "type": "content_block_delta",
                                "index": 0,
                                "delta": {"type": "text_delta", "text": chunk},
                            },
                        )

                    # 4) content_block_stop closes the single text block.
                    yield _sse(
                        "content_block_stop",
                        {"type": "content_block_stop", "index": 0},
                    )

                    # 5) message_delta carries the terminal stop_reason and
                    #    the final cumulative output_tokens count.
                    stop_reason = _anthropic_stop_reason(
                        completion_tokens_holder["n"], max_tokens
                    )
                    yield _sse(
                        "message_delta",
                        {
                            "type": "message_delta",
                            "delta": {
                                "stop_reason": stop_reason,
                                "stop_sequence": None,
                            },
                            "usage": {"output_tokens": completion_tokens_holder["n"]},
                        },
                    )

                    # 6) message_stop — terminal event, no [DONE] sentinel.
                    yield _sse("message_stop", {"type": "message_stop"})
                    success = True
                except asyncio.CancelledError:
                    logger.info("anthropic.stream cancelled (inst=%s)", _inst.name)
                    raise
                except Exception as exc:
                    logger.warning("anthropic.stream error (inst=%s): %s", _inst.name, exc)
                    # Same rule as the non-stream path: when a reused session
                    # fails upstream, drop the cache entry so the next attempt
                    # starts a fresh session instead of hitting the same one.
                    if cached_session_id and lookup_key:
                        await session_cache.invalidate(lookup_key)
                    # Best-effort error frame. Anthropic clients treat any
                    # unexpected event gracefully; we prefer visibility over
                    # silent truncation.
                    try:
                        yield _sse(
                            "error",
                            {
                                "type": "error",
                                "error": {
                                    "type": "api_error",
                                    "message": str(exc) or "upstream error",
                                },
                            },
                        )
                    except Exception:
                        # Deliberately ignored: the failure is already logged
                        # above and the stream is ending either way.
                        pass
                finally:
                    # Session write-back only on clean finish — partial streams
                    # leave Lingma's session in an indeterminate state.
                    if success and write_key:
                        sid = _meta.get("session_id")
                        if sid:
                            await session_cache.put(write_key, sid, _inst.name)
                    await stats_collector.record_chat(
                        stream=True,
                        success=success,
                        prompt_tokens=prompt_tokens,
                        completion_tokens=completion_tokens_holder["n"],
                    )
                    _inst.in_flight = max(0, _inst.in_flight - 1)
                    _ticket.release()

            # From here on the generator's `finally` owns the release.
            ticket_transferred = True
            return StreamingResponse(
                event_stream(),
                media_type="text/event-stream",
                headers={
                    "Cache-Control": "no-cache, no-transform",
                    "X-Accel-Buffering": "no",
                    "Connection": "keep-alive",
                },
            )

        # ------------------------------------------------------------- non-stream
        try:
            result = await inst.client.chat_complete(
                prompt,
                model,
                ask_mode,
                session_id=cached_session_id,
                is_reply=is_reply,
            )
        except Exception as exc:
            logger.warning("anthropic.complete error (inst=%s): %s", inst.name, exc)
            await stats_collector.record_chat(
                stream=False,
                success=False,
                prompt_tokens=prompt_tokens,
                completion_tokens=0,
            )
            # A failed call on a reused session invalidates the cache entry.
            if cached_session_id and lookup_key:
                await session_cache.invalidate(lookup_key)
            return _anthropic_error(502, "api_error", "upstream lingma error")

        text = result.get("text") or ""
        completion_tokens = estimate_tokens(text)
        await stats_collector.record_chat(
            stream=False,
            success=True,
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
        )
        if write_key:
            sid = result.get("sessionId")
            if sid:
                await session_cache.put(write_key, sid, inst.name)

        response_body: dict = {
            "id": message_id,
            "type": "message",
            "role": "assistant",
            "model": model,
            "content": [{"type": "text", "text": text}],
            "stop_reason": _anthropic_stop_reason(completion_tokens, req.max_tokens),
            "stop_sequence": None,
            "usage": {
                "input_tokens": prompt_tokens,
                "output_tokens": completion_tokens,
            },
        }
        return JSONResponse(content=response_body)
    finally:
        # Non-stream paths (and failures before the stream is handed off)
        # release here; a handed-off stream releases in its own `finally`.
        if not ticket_transferred:
            inst.in_flight = max(0, inst.in_flight - 1)
            ticket.release()
@app.post("/internal/auto-login/start", dependencies=[Depends(admin_auth_guard)])
async def internal_auto_login_start(instance: str | None = None):
p = _require_pool()