feat: harden cache reuse semantics and expand protocol regression tests

Stabilize cross-protocol ask-mode/streaming behavior and reduce session-reuse branch collisions, then add focused docs/tests for multimodal normalization and pool/stats/config paths.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
GitHub Actions
2026-04-20 14:26:11 +08:00
parent b96b91e5b7
commit 12a4d9584e
9 changed files with 441 additions and 55 deletions

View File

@@ -38,7 +38,7 @@ from .openai_schema import (
flatten_content,
)
from .session_bundle import encode_bundle, pack_workdir
from .session_cache import SessionCache
from .session_cache import SessionCache, hash_branch_context
from .stats import StatsCollector, estimate_tokens
@@ -57,6 +57,12 @@ session_cache = SessionCache(
ttl_sec=settings.session_cache_ttl_sec,
)
# Response headers attached to every "text/event-stream" reply built by
# _streaming_response(). They ask caches/intermediaries not to cache or
# transform the stream, and keep the connection open while events are sent.
# NOTE(review): "X-Accel-Buffering: no" is the nginx convention for turning
# off proxy buffering of a response -- confirm it matches the deployed proxy.
STREAMING_RESPONSE_HEADERS = {
"Cache-Control": "no-cache, no-transform",
"X-Accel-Buffering": "no",
"Connection": "keep-alive",
}
def _require_pool() -> LingmaPool:
if pool is None:
@@ -416,6 +422,43 @@ def _anthropic_has_tooling_context(req: AnthropicMessagesRequest) -> bool:
return False
def _resolve_ask_mode(model: str, has_tooling_context: bool) -> str:
model_name = (model or "").lower()
if model_name in {"lingma-agent", "agent"} or has_tooling_context:
return "agent"
return settings.default_ask_mode
async def _apply_cached_instance_or_invalidate(
    *,
    protocol: str,
    inst: PoolInstance,
    cached_instance_name: str | None,
    cached_session_id: str | None,
    lookup_key: str | None,
) -> str | None:
    """Decide whether a cached session id may be reused on the picked instance.

    Args:
        protocol: Label prefixed to the log line.
        inst: The pool instance that was actually selected for this request.
        cached_instance_name: Instance name stored in the cache entry, if any.
        cached_session_id: Session id stored in the cache entry, if any.
        lookup_key: Cache key the entry was found under; invalidated when the
            cached session cannot be reused.

    Returns:
        ``cached_session_id`` unchanged when there is no cached instance name
        or the selected instance matches it; ``None`` when the selected
        instance differs from the cached one.
    """
    # Nothing pinned, or we landed on the pinned instance: reuse as-is.
    if not cached_instance_name or inst.name == cached_instance_name:
        return cached_session_id

    # The selected instance is not the one that owns the cached session, so
    # the session id is dropped and the stale cache entry removed.
    logger.info(
        "%s session cache instance %s unhealthy, falling back to %s",
        protocol,
        cached_instance_name,
        inst.name,
    )
    if lookup_key:
        await session_cache.invalidate(lookup_key)
    return None
def _streaming_response(event_stream) -> StreamingResponse:
    """Wrap *event_stream* in a ``text/event-stream`` StreamingResponse.

    The response carries the module-level ``STREAMING_RESPONSE_HEADERS``.
    """
    sse_options = {
        "media_type": "text/event-stream",
        "headers": STREAMING_RESPONSE_HEADERS,
    }
    return StreamingResponse(event_stream, **sse_options)
def _stream_event_type(event: Any) -> str:
if isinstance(event, dict):
t = event.get("type")
@@ -595,9 +638,7 @@ async def v1_chat_completions(req: ChatCompletionsRequest, request: Request):
tool_config = _openai_tool_config(req)
has_tooling_context = _openai_has_tooling_context(req, messages_dump)
ask_mode = settings.default_ask_mode
if req.model.lower() in {"lingma-agent", "agent"} or has_tooling_context:
ask_mode = "agent"
ask_mode = _resolve_ask_mode(req.model, has_tooling_context)
reuse_eligible = (
session_cache.enabled
@@ -610,29 +651,38 @@ async def v1_chat_completions(req: ChatCompletionsRequest, request: Request):
cached_session_id: str | None = None
cached_instance_name: str | None = None
if reuse_eligible:
lookup_key = session_cache.build_key(api_key, messages_dump[:-1], tool_config=tool_config)
write_key = session_cache.build_key(api_key, messages_dump, tool_config=tool_config)
prefix_branch_context = hash_branch_context(messages_dump[:-1])
lookup_key = session_cache.build_key(
api_key,
messages_dump[:-1],
tool_config=tool_config,
branch_context=prefix_branch_context,
)
write_key = session_cache.build_key(
api_key,
messages_dump,
tool_config=tool_config,
branch_context=hash_branch_context(messages_dump),
)
entry = await session_cache.get(lookup_key)
if entry is None:
legacy_lookup_key = session_cache.build_key(api_key, messages_dump[:-1], tool_config=tool_config)
entry = await session_cache.get(legacy_lookup_key)
if entry is not None:
lookup_key = legacy_lookup_key
if entry is not None:
cached_session_id = entry.session_id
cached_instance_name = entry.instance_name or None
# Instance selection: prefer cached instance for continuity, else normal affinity.
affinity = cached_instance_name or _affinity_key_for(req)
inst = p.pick(affinity_key=affinity)
# If cache pointed at a specific instance that's no longer healthy, we already
# fell back via pool.pick -> drop the cached session since Lingma on a
# different process won't know about it.
if cached_instance_name and inst.name != cached_instance_name:
logger.info(
"session cache instance %s unhealthy, falling back to %s (dropping cached session)",
cached_instance_name,
inst.name,
)
cached_session_id = None
if lookup_key:
await session_cache.invalidate(lookup_key)
cached_session_id = await _apply_cached_instance_or_invalidate(
protocol="chat",
inst=inst,
cached_instance_name=cached_instance_name,
cached_session_id=cached_session_id,
lookup_key=lookup_key,
)
await _ensure_instance_logged_in(inst)
@@ -831,15 +881,8 @@ async def v1_chat_completions(req: ChatCompletionsRequest, request: Request):
_ticket.release()
ticket_transferred = True
return StreamingResponse(
event_stream(),
media_type="text/event-stream",
headers={
"Cache-Control": "no-cache, no-transform",
"X-Accel-Buffering": "no",
"Connection": "keep-alive",
},
)
return _streaming_response(event_stream())
try:
result = await inst.client.chat_complete(
@@ -1329,9 +1372,7 @@ async def v1_messages(req: AnthropicMessagesRequest, request: Request):
tool_config = _anthropic_tool_config(req)
has_tooling_context = _anthropic_has_tooling_context(req)
ask_mode = settings.default_ask_mode
if req.model.lower() in {"lingma-agent", "agent"} or has_tooling_context:
ask_mode = "agent"
ask_mode = _resolve_ask_mode(req.model, has_tooling_context)
reuse_eligible = (
session_cache.enabled and ask_mode == "chat" and len(messages_dump) >= 2 and not has_tooling_context
@@ -1341,9 +1382,25 @@ async def v1_messages(req: AnthropicMessagesRequest, request: Request):
cached_session_id: str | None = None
cached_instance_name: str | None = None
if reuse_eligible:
lookup_key = session_cache.build_key(api_key, messages_dump[:-1], tool_config=tool_config)
write_key = session_cache.build_key(api_key, messages_dump, tool_config=tool_config)
prefix_branch_context = hash_branch_context(messages_dump[:-1])
lookup_key = session_cache.build_key(
api_key,
messages_dump[:-1],
tool_config=tool_config,
branch_context=prefix_branch_context,
)
write_key = session_cache.build_key(
api_key,
messages_dump,
tool_config=tool_config,
branch_context=hash_branch_context(messages_dump),
)
entry = await session_cache.get(lookup_key)
if entry is None:
legacy_lookup_key = session_cache.build_key(api_key, messages_dump[:-1], tool_config=tool_config)
entry = await session_cache.get(legacy_lookup_key)
if entry is not None:
lookup_key = legacy_lookup_key
if entry is not None:
cached_session_id = entry.session_id
cached_instance_name = entry.instance_name or None
@@ -1613,15 +1670,8 @@ async def v1_messages(req: AnthropicMessagesRequest, request: Request):
_ticket.release()
ticket_transferred = True
return StreamingResponse(
event_stream(),
media_type="text/event-stream",
headers={
"Cache-Control": "no-cache, no-transform",
"X-Accel-Buffering": "no",
"Connection": "keep-alive",
},
)
return _streaming_response(event_stream())
# ------------------------------------------------------------- non-stream
try: