feat: add capability and admin introspection endpoints

Expose capability discovery plus admin-only config and request inspection endpoints so clients and operators can understand gateway behavior without reading code.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
mmc
2026-05-12 14:30:08 +08:00
parent 94a8025ae5
commit b719bdeaa2
5 changed files with 780 additions and 93 deletions

View File

@@ -5,6 +5,7 @@ import hashlib
import json
import time
import uuid
from collections import deque
from contextlib import asynccontextmanager
from typing import Any
@@ -15,6 +16,7 @@ from .anthropic_schema import (
AnthropicMessagesRequest,
affinity_key_for_anthropic,
anthropic_to_internal_messages,
flatten_anthropic_content,
)
from .auth import (
AnthropicAuthError,
@@ -112,6 +114,8 @@ STREAMING_RESPONSE_HEADERS = {
"Connection": "keep-alive",
}
# In-memory ring buffer of recently recorded request snapshots. The deque is
# capped at 100 entries, so once full each new entry evicts one from the
# opposite end. _record_debug_request() inserts with appendleft(), which keeps
# the most recent snapshot at index 0.
_DEBUG_REQUEST_LOG: deque[dict[str, Any]] = deque(maxlen=100)
def _require_pool() -> LingmaPool:
if pool is None:
@@ -249,6 +253,63 @@ def _log_auth_posture() -> None:
)
def _safe_setting_value(key: str, value: Any) -> Any:
key_upper = key.upper()
if any(
marker in key_upper
for marker in {"KEY", "TOKEN", "PASSWORD", "SECRET", "BUNDLE"}
):
if isinstance(value, list):
return ["***" for _ in value]
return "***"
return value
def _redact_debug_value(path: tuple[str, ...], value: Any) -> Any:
if isinstance(value, dict):
return {
k: _redact_debug_value(path + (str(k).lower(),), v)
for k, v in value.items()
}
if isinstance(value, list):
return [_redact_debug_value(path + ("[]",), item) for item in value]
if isinstance(value, str):
lowered_path = "/".join(path)
if any(marker in lowered_path for marker in ("authorization", "x-api-key", "api_key", "token", "password", "secret", "session_bundle")):
return "***"
if value.startswith("data:"):
return "[redacted-data-url]"
if "session bundle" in value.lower():
return "[redacted-session-bundle]"
if any(part in {"args", "arguments"} for part in path) and len(value) > 2048:
return value[:1024] + "... [truncated]"
return value
def _record_debug_request(protocol: str, path: str, body: dict[str, Any], request: Request) -> None:
    """Push a redacted snapshot of an incoming request onto the debug log.

    The snapshot holds the current Unix time in whole seconds, the protocol
    label and route path supplied by the caller, the ``x-request-id`` header
    (empty string when absent) and the body after ``_redact_debug_value``.
    It is inserted at the left end of ``_DEBUG_REQUEST_LOG``.
    """
    entry: dict[str, Any] = {}
    entry["timestamp"] = int(time.time())
    entry["protocol"] = protocol
    entry["path"] = path
    entry["request_id"] = request.headers.get("x-request-id", "")
    # Redaction starts from an empty path: the body is the root of the tree.
    entry["body"] = _redact_debug_value((), body)
    _DEBUG_REQUEST_LOG.appendleft(entry)
@app.get("/internal/debug/requests", dependencies=[Depends(admin_auth_guard)])
async def internal_debug_requests(limit: int = 20):
    """Return the leading entries of the debug request log.

    ``limit`` is clamped to the range 1..100 before slicing. The response
    carries ``ok``, the number of returned items as ``count`` and the items
    themselves in log order.
    """
    if limit < 1:
        safe_limit = 1
    elif limit > 100:
        safe_limit = 100
    else:
        safe_limit = limit

    items = list(_DEBUG_REQUEST_LOG)[:safe_limit]
    payload = {
        "ok": True,
        # Slicing already caps at the log length, so this equals
        # min(safe_limit, len(log)).
        "count": len(items),
        "items": items,
    }
    return JSONResponse(content=payload)
@app.get("/healthz")
async def healthz():
if pool is None:
@@ -267,6 +328,62 @@ async def healthz():
}
def _capabilities_payload() -> dict[str, Any]:
    """Assemble the capability document returned by the capabilities routes.

    The document has four top-level keys: a fixed ``service`` name, the
    application ``version``, per-protocol capability flags under
    ``protocols`` and configuration-derived values under ``features``.
    Values are read from the module-level ``settings`` and ``app`` at call
    time.
    """
    openai_caps = {
        "models": True,
        "chat_completions": True,
        "responses": True,
        "streaming": True,
        "response_tool_calls": True,
        "request_tools_forwarded": settings.tool_forward_enabled,
    }
    anthropic_caps = {
        "messages": True,
        "count_tokens": True,
        "streaming": True,
        "response_tool_use": True,
        "request_tools_forwarded": settings.tool_forward_enabled,
    }

    session_reuse = {
        "enabled": settings.session_reuse_enabled,
        "cache_max_entries": settings.session_cache_max_entries,
        "cache_ttl_sec": settings.session_cache_ttl_sec,
    }
    tooling = {
        "forward_enabled": settings.tool_forward_enabled,
        "allowlist": settings.tool_allowlist,
        "emulation_bridge_enabled": True,
    }
    pool_info = {
        "configured_instance_count": settings.instance_count,
        "default_model": settings.default_model,
        "default_ask_mode": settings.default_ask_mode,
    }
    # Only booleans are exposed here: whether keys/tokens are configured,
    # never the values themselves.
    auth_info = {
        "v1_requires_auth": bool(settings.api_keys),
        "admin_token_configured": bool(settings.admin_token),
        "metrics_public": settings.metrics_public,
    }

    payload: dict[str, Any] = {
        "service": "lingma-openai-gateway",
        "version": app.version,
        "protocols": {
            "openai": openai_caps,
            "anthropic": anthropic_caps,
        },
        "features": {
            "session_reuse": session_reuse,
            "tooling": tooling,
            "pool": pool_info,
            "auth": auth_info,
        },
    }
    return payload
@app.get("/capabilities")
async def capabilities():
    """Serve the capability document at ``/capabilities``.

    No auth dependency is attached to this route in its decorator.
    """
    payload = _capabilities_payload()
    return JSONResponse(content=payload)
@app.get("/v1/capabilities", dependencies=[Depends(anthropic_auth_guard)])
async def v1_capabilities():
    """Serve the capability document at ``/v1/capabilities``.

    Same payload as ``/capabilities``, but this route runs
    ``anthropic_auth_guard`` as a dependency first.
    """
    payload = _capabilities_payload()
    return JSONResponse(content=payload)
async def _ensure_instance_logged_in(inst: PoolInstance) -> dict:
client = inst.client
auto_login = inst.auto_login
@@ -433,6 +550,75 @@ def _messages_to_prompt(messages: list[dict]) -> str:
return "\n".join(parts).strip()
def _assistant_tool_calls_to_emulation_text(tool_calls: Any) -> str:
if not isinstance(tool_calls, list):
return ""
blocks: list[str] = []
for item in tool_calls:
if not isinstance(item, dict):
continue
fn = item.get("function") if isinstance(item.get("function"), dict) else None
name = str((fn or {}).get("name") or item.get("name") or "").strip()
if not name:
continue
arguments = (fn or {}).get("arguments")
if isinstance(arguments, str):
try:
arguments = json.loads(arguments)
except Exception:
arguments = {"raw": arguments}
if not isinstance(arguments, dict):
arguments = {}
blocks.append(
"```json action\n"
+ json.dumps(
{"tool": name, "parameters": arguments}, ensure_ascii=False, indent=2
)
+ "\n```"
)
return "\n\n".join(blocks)
def _tool_action_block(name: str, arguments: dict[str, Any]) -> str:
return (
"```json action\n"
+ json.dumps(
{"tool": name, "parameters": arguments}, ensure_ascii=False, indent=2
)
+ "\n```"
)
def _anthropic_flattened_tool_history_to_emulation_text(text: str) -> str:
if not text:
return ""
out: list[str] = []
for line in text.splitlines():
stripped = line.strip()
if stripped.startswith("[tool_use]"):
raw = stripped[len("[tool_use]") :].strip()
try:
payload = json.loads(raw)
except Exception:
out.append(line)
continue
if not isinstance(payload, dict):
out.append(line)
continue
name = str(payload.get("name") or "").strip()
arguments = payload.get("input")
if name and isinstance(arguments, dict):
out.append(_tool_action_block(name, arguments))
else:
out.append(line)
continue
if stripped.startswith("[tool_result]"):
out.append(action_output_prompt(None, stripped[len("[tool_result]") :].strip()))
continue
out.append(line)
return "\n".join(part for part in out if part).strip()
def _messages_to_emulation_prompt(
messages: list[dict[str, Any]],
*,
@@ -446,6 +632,10 @@ def _messages_to_emulation_prompt(
if role in {"system", "developer"}:
continue
text = flatten_content(message.get("content"))
if role == "assistant" and message.get("tool_calls"):
projected = _assistant_tool_calls_to_emulation_text(message.get("tool_calls"))
if projected:
text = "\n\n".join(part for part in [text, projected] if part)
if role == "tool":
text = action_output_prompt(message.get("tool_call_id"), text)
role = "user"
@@ -472,6 +662,22 @@ def _messages_to_emulation_prompt(
return "\n\n".join(parts).strip()
def _effective_tool_config_for_emulation(
tool_config: dict[str, Any] | None,
*,
use_emulation: bool,
) -> dict[str, Any] | None:
if use_emulation:
return None
return tool_config
def _emulation_tools(raw_tools: list[dict[str, Any]] | None, tool_config: dict[str, Any] | None) -> list[dict[str, Any]] | None:
if isinstance(tool_config, dict) and isinstance(tool_config.get("tools"), list):
return tool_config.get("tools")
return raw_tools
def _anthropic_messages_to_emulation_prompt(
messages: list[dict[str, Any]],
*,
@@ -483,6 +689,10 @@ def _anthropic_messages_to_emulation_prompt(
for message in messages:
role = str(message.get("role") or "").strip().lower()
text = str(message.get("content") or "").strip()
if role == "assistant" and "[tool_use]" in text:
text = _anthropic_flattened_tool_history_to_emulation_text(text)
elif role == "user" and "[tool_result]" in text:
text = _anthropic_flattened_tool_history_to_emulation_text(text)
if role == "tool":
text = action_output_prompt(message.get("tool_call_id"), text)
role = "user"
@@ -575,6 +785,7 @@ async def v1_chat_completions(req: ChatCompletionsRequest, request: Request):
p = _require_pool()
messages_dump = [m.model_dump() for m in req.messages]
_record_debug_request("openai", "/v1/chat/completions", req.model_dump(mode="json"), request)
api_key = _extract_api_key(request) or "-"
# ------------------------------------------------------------- session reuse
@@ -617,9 +828,11 @@ async def v1_chat_completions(req: ChatCompletionsRequest, request: Request):
is_reply = execution.is_reply
include_usage = _include_usage(req.stream_options)
em_tools = _em_extract_openai_tools(req.tools)
emulation_tools = _emulation_tools(req.tools, tool_config)
em_tools = _em_extract_openai_tools(emulation_tools)
em_choice = _em_extract_openai_tool_choice(req.tool_choice)
if _em_has_tool_request(em_tools, em_choice):
use_emulation = has_tooling_context
if use_emulation:
system_parts = [
flatten_content(m.content)
for m in req.messages
@@ -628,9 +841,14 @@ async def v1_chat_completions(req: ChatCompletionsRequest, request: Request):
prompt = _messages_to_emulation_prompt(
messages_dump,
system_text="\n\n".join(system_parts),
tools=req.tools,
tools=emulation_tools,
tool_choice=req.tool_choice,
)
execution.prompt = prompt
effective_tool_config = _effective_tool_config_for_emulation(
tool_config,
use_emulation=use_emulation,
)
try:
started = await start_execution(
@@ -708,7 +926,7 @@ async def v1_chat_completions(req: ChatCompletionsRequest, request: Request):
ask_mode,
session_id=cached_session_id,
is_reply=is_reply,
tool_config=tool_config,
tool_config=effective_tool_config,
out_meta=_meta,
):
if _stream_event_type(chunk) == "tool":
@@ -763,6 +981,8 @@ async def v1_chat_completions(req: ChatCompletionsRequest, request: Request):
continue
buffered_text_parts.append(text)
completion_tokens_holder["n"] += estimate_tokens(text)
if use_emulation:
continue
full_text = "".join(buffered_text_parts)
if req.tools:
@@ -855,9 +1075,6 @@ async def v1_chat_completions(req: ChatCompletionsRequest, request: Request):
buffered_text_parts.clear()
yield f"data: {json.dumps(payload, ensure_ascii=False)}\n\n"
if buffered_text_parts and forced_tool_name and saw_tool_call:
buffered_text_parts.clear()
if buffered_text_parts and req.tools and not saw_tool_call:
merged_text = "".join(buffered_text_parts)
inferred = _infer_tool_event_from_declared_tools(
@@ -924,6 +1141,11 @@ async def v1_chat_completions(req: ChatCompletionsRequest, request: Request):
yield f"data: {json.dumps(payload, ensure_ascii=False)}\n\n"
buffered_text_parts = [remaining] if remaining else []
if buffered_text_parts and saw_tool_call:
text_to_yield = "".join(buffered_text_parts)
buffered_text_parts.clear()
yield _text_payload(text_to_yield)
done_payload = {
"id": completion_id,
"object": "chat.completion.chunk",
@@ -996,7 +1218,7 @@ async def v1_chat_completions(req: ChatCompletionsRequest, request: Request):
protocol="chat",
execution=execution,
prompt_tokens=prompt_tokens,
tool_config=tool_config,
tool_config=effective_tool_config,
logger=logger,
stats_collector=stats_collector,
session_cache=session_cache,
@@ -1095,7 +1317,7 @@ async def v1_chat_completions(req: ChatCompletionsRequest, request: Request):
ask_mode,
session_id=None,
is_reply=False,
tool_config=tool_config,
tool_config=effective_tool_config,
)
retry_text = retry_result.get("text") or ""
parsed_calls, remaining = parse_action_blocks(retry_text, em_tools)
@@ -1227,6 +1449,7 @@ async def v1_messages(req: AnthropicMessagesRequest, request: Request):
)
messages_dump = anthropic_to_internal_messages(req)
_record_debug_request("anthropic", "/v1/messages", req.model_dump(mode="json"), request)
# Prefer the auth token actually accepted so session-cache bucketing is
# consistent regardless of which auth header style the caller used.
api_key = (
@@ -1284,16 +1507,23 @@ async def v1_messages(req: AnthropicMessagesRequest, request: Request):
model = execution.model
prompt = execution.prompt
is_reply = execution.is_reply
em_anthropic_tools = _em_extract_anthropic_tools(req.tools)
emulation_tools = _emulation_tools(req.tools, tool_config)
em_anthropic_tools = _em_extract_anthropic_tools(emulation_tools)
em_anthropic_choice = _em_extract_anthropic_tool_choice(req.tool_choice)
if _em_has_tool_request(em_anthropic_tools, em_anthropic_choice):
use_emulation = has_tooling_context
if use_emulation:
system_text = flatten_anthropic_content(req.system) if req.system else ""
prompt = _anthropic_messages_to_emulation_prompt(
messages_dump,
system_text=system_text,
tools=req.tools,
tools=emulation_tools,
tool_choice=req.tool_choice,
)
execution.prompt = prompt
effective_tool_config = _effective_tool_config_for_emulation(
tool_config,
use_emulation=use_emulation,
)
try:
started = await start_execution(
@@ -1372,7 +1602,7 @@ async def v1_messages(req: AnthropicMessagesRequest, request: Request):
ask_mode,
session_id=cached_session_id,
is_reply=is_reply,
tool_config=tool_config,
tool_config=effective_tool_config,
out_meta=_meta,
):
if _stream_event_type(chunk) == "tool":
@@ -1703,7 +1933,7 @@ async def v1_messages(req: AnthropicMessagesRequest, request: Request):
protocol="anthropic",
execution=execution,
prompt_tokens=prompt_tokens,
tool_config=tool_config,
tool_config=effective_tool_config,
logger=logger,
stats_collector=stats_collector,
session_cache=session_cache,
@@ -1757,10 +1987,8 @@ async def v1_messages(req: AnthropicMessagesRequest, request: Request):
text = remaining
if not saw_tool_event and em_anthropic_tools:
inferred_call = infer_declared_tool_call_from_text(text, em_anthropic_tools)
if inferred_call is None:
inferred_calls = infer_tool_calls_from_text(text, em_anthropic_tools)
inferred_call = inferred_calls[0] if inferred_calls else None
inferred_calls = infer_tool_calls_from_text(text, em_anthropic_tools)
inferred_call = inferred_calls[0] if inferred_calls else None
if inferred_call is not None:
content_blocks = [
{
@@ -1774,7 +2002,7 @@ async def v1_messages(req: AnthropicMessagesRequest, request: Request):
saw_pending_tool_use = True
text = ""
if not saw_tool_event and em_anthropic_tools:
if not saw_tool_event and em_anthropic_tools and not text.strip():
retry_prompt = f"{prompt}\n\n{force_tooling_prompt(em_anthropic_choice)}"
retry_result = await inst.client.chat_complete(
retry_prompt,
@@ -1782,53 +2010,7 @@ async def v1_messages(req: AnthropicMessagesRequest, request: Request):
ask_mode,
session_id=None,
is_reply=False,
tool_config=tool_config,
)
retry_text = retry_result.get("text") or ""
parsed_calls, remaining = parse_action_blocks(retry_text, em_anthropic_tools)
if parsed_calls:
content_blocks = []
if remaining:
content_blocks.append({"type": "text", "text": remaining})
for call in parsed_calls:
content_blocks.append(
{
"type": "tool_use",
"id": call.id,
"name": call.name,
"input": call.arguments,
}
)
saw_tool_event = True
saw_pending_tool_use = True
text = remaining
else:
inferred_call = infer_declared_tool_call_from_text(retry_text, em_anthropic_tools)
if inferred_call is None:
inferred_calls = infer_tool_calls_from_text(retry_text, em_anthropic_tools)
inferred_call = inferred_calls[0] if inferred_calls else None
if inferred_call is not None:
content_blocks = [
{
"type": "tool_use",
"id": inferred_call.id,
"name": inferred_call.name,
"input": inferred_call.arguments,
}
]
saw_tool_event = True
saw_pending_tool_use = True
text = ""
if not saw_tool_event and em_anthropic_tools and text.strip():
retry_prompt = f"{prompt}\n\n{force_tooling_prompt(em_anthropic_choice)}"
retry_result = await inst.client.chat_complete(
retry_prompt,
model,
ask_mode,
session_id=None,
is_reply=False,
tool_config=tool_config,
tool_config=effective_tool_config,
)
retry_text = retry_result.get("text") or ""
parsed_calls, remaining = parse_action_blocks(retry_text, em_anthropic_tools)
@@ -2090,6 +2272,60 @@ async def internal_stats():
}
@app.get("/internal/effective-config", dependencies=[Depends(admin_auth_guard)])
async def internal_effective_config():
    """Return the gateway's current settings with secret fields masked.

    The response contains ``ok``, a ``settings`` mapping and a
    ``feature_flags`` mapping. ``api_keys``, ``metrics_token``,
    ``admin_token`` and each account's ``password`` and
    ``session_bundle_b64`` pass through ``_safe_setting_value``; all other
    fields are copied from ``settings`` as they are.
    """
    cfg = settings

    # Field order here is the key order of the emitted "settings" object.
    ordered_fields = (
        "host",
        "port",
        "api_keys",
        "metrics_token",
        "admin_token",
        "metrics_public",
        "log_level",
        "gateway_max_in_flight",
        "gateway_queue_timeout_sec",
        "lingma_bin",
        "lingma_work_dir",
        "lingma_socket_port",
        "lingma_startup_timeout",
        "lingma_rpc_timeout",
        "default_model",
        "default_ask_mode",
        "dedicated_domain_url",
        "auto_login_enabled",
        "auto_login_headless",
        "auto_login_timeout",
        "auto_login_max_retry",
        "instance_count",
        "session_reuse_enabled",
        "session_cache_max_entries",
        "session_cache_ttl_sec",
        "tool_forward_enabled",
        "tool_allowlist",
    )
    masked_fields = ("api_keys", "metrics_token", "admin_token")

    snapshot = {}
    for field_name in ordered_fields:
        field_value = getattr(cfg, field_name)
        if field_name in masked_fields:
            field_value = _safe_setting_value(field_name, field_value)
        snapshot[field_name] = field_value

    accounts = []
    for account in cfg.accounts:
        accounts.append(
            {
                "username": account.username,
                "password": _safe_setting_value("password", account.password),
                "session_bundle_b64": _safe_setting_value(
                    "session_bundle_b64", account.session_bundle_b64
                ),
                # The bundle file path is reported verbatim; only the inline
                # bundle contents are masked.
                "session_bundle_file": account.session_bundle_file,
            }
        )
    snapshot["accounts"] = accounts

    feature_flags = {
        "tool_forward_enabled": cfg.tool_forward_enabled,
        "session_reuse_enabled": cfg.session_reuse_enabled,
        "metrics_public": cfg.metrics_public,
        "auto_login_enabled": cfg.auto_login_enabled,
    }

    return JSONResponse(
        content={
            "ok": True,
            "settings": snapshot,
            "feature_flags": feature_flags,
        }
    )
@app.get("/metrics", dependencies=[Depends(metrics_auth_guard)])
async def metrics():
base = await stats_collector.prometheus_text()