perf: session reuse for multi-turn latency

- Add SessionCache (LRU + TTL, per-API-key scoped) mapping
  conversation-prefix hash -> upstream Lingma sessionId.
- Hash only user/system/developer turns so client-side
  assistant reformatting doesn't invalidate the key.
- On cache hit: reuse sessionId, send only the latest user
  message with isReply=true, and pin the request to the
  instance that originally served it.
- LingmaGatewayClient.chat_complete/chat_stream accept
  session_id/is_reply and report the real finish.sessionId
  (in the returned result for chat_complete, via out_meta
  for chat_stream) so we persist what Lingma actually allocated.
- Invalidate cache on non-stream failure; skip writes on
  cancelled/partial streams.
- Expose cache stats in /internal/stats and /metrics.
- Configurable via SESSION_REUSE_ENABLED / SESSION_CACHE_MAX_ENTRIES
  / SESSION_CACHE_TTL_SEC (documented in README + .env.example).

Made-with: Cursor
This commit is contained in:
GitHub Actions
2026-04-18 08:10:39 +08:00
parent d209d8ac0b
commit dfdb7087dc
6 changed files with 360 additions and 19 deletions

View File

@@ -536,7 +536,16 @@ class LingmaGatewayClient:
# ------------------------------------------------------------------ chat
def _build_payload(self, prompt: str, model_key: str, ask_mode: str, session_id: str, request_id: str):
def _build_payload(
self,
prompt: str,
model_key: str,
ask_mode: str,
session_id: str,
request_id: str,
*,
is_reply: bool = False,
):
session_type = "developer" if ask_mode == "agent" else "chat"
return {
"requestId": request_id,
@@ -546,7 +555,7 @@ class LingmaGatewayClient:
"mode": ask_mode,
"stream": True,
"source": 1,
"isReply": False,
"isReply": is_reply,
"taskDefinitionType": "system",
"content": prompt,
"text": prompt,
@@ -579,11 +588,21 @@ class LingmaGatewayClient:
"""
await self.rpc.notify("chat/ask", payload)
async def chat_complete(self, prompt: str, model_key: str, ask_mode: str) -> dict:
async def chat_complete(
self,
prompt: str,
model_key: str,
ask_mode: str,
*,
session_id: str | None = None,
is_reply: bool = False,
) -> dict:
await self.ensure_ready()
request_id = str(uuid.uuid4())
session_id = str(uuid.uuid4())
payload = self._build_payload(prompt, model_key, ask_mode, session_id, request_id)
sid = session_id or str(uuid.uuid4())
payload = self._build_payload(
prompt, model_key, ask_mode, sid, request_id, is_reply=is_reply
)
self.rpc.create_stream(request_id)
try:
await self._kick_chat_ask(payload)
@@ -597,16 +616,37 @@ class LingmaGatewayClient:
self.rpc.pop_stream(request_id)
finish = result.get("finish") or {}
result["requestId"] = request_id
result["sessionId"] = finish.get("sessionId") or session_id
# Prefer upstream-reported sessionId so the next turn binds to whatever
# Lingma actually allocated (sometimes differs from our hint).
result["sessionId"] = finish.get("sessionId") or sid
result["model"] = model_key
result["mode"] = ask_mode
result["isReply"] = is_reply
return result
async def chat_stream(self, prompt: str, model_key: str, ask_mode: str) -> AsyncIterator[str]:
async def chat_stream(
self,
prompt: str,
model_key: str,
ask_mode: str,
*,
session_id: str | None = None,
is_reply: bool = False,
out_meta: dict | None = None,
) -> AsyncIterator[str]:
"""Stream `chat/answer` chunks.
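If `out_meta` is provided, it is filled in (best-effort; errors while
reading the stream result are swallowed) when the stream ends or is
cancelled, with: `session_id` (the final `chat/finish` payload's
sessionId, falling back to the id we sent), `finish` (the raw finish
dict), `request_id`, and `chars` (length of the stream result's text).
This is the hook the session cache uses to record the upstream
sessionId without holding a second reference to the RPC.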
"""
await self.ensure_ready()
request_id = str(uuid.uuid4())
session_id = str(uuid.uuid4())
payload = self._build_payload(prompt, model_key, ask_mode, session_id, request_id)
sid = session_id or str(uuid.uuid4())
payload = self._build_payload(
prompt, model_key, ask_mode, sid, request_id, is_reply=is_reply
)
self.rpc.create_stream(request_id)
try:
await self._kick_chat_ask(payload)
@@ -616,4 +656,14 @@ class LingmaGatewayClient:
yield chunk
finally:
# Runs on normal completion, exception, or consumer GeneratorExit (client disconnect).
if out_meta is not None:
try:
stream_result = self.rpc.get_stream_result(request_id)
finish = stream_result.get("finish") or {}
out_meta["session_id"] = finish.get("sessionId") or sid
out_meta["finish"] = finish
out_meta["request_id"] = request_id
out_meta["chars"] = len(stream_result.get("text") or "")
except Exception:
pass
self.rpc.pop_stream(request_id)