perf: session reuse for multi-turn latency
- Add SessionCache (LRU + TTL, per-API-key scoped) mapping conversation-prefix hash -> upstream Lingma sessionId.
- Hash only user/system/developer turns so client-side assistant reformatting doesn't invalidate the key.
- On cache hit: reuse sessionId, send only the latest user message with isReply=true, and stick the request to the instance that originally served it.
- LingmaGatewayClient.chat_complete/chat_stream accept session_id/is_reply and report the real finish.sessionId via out_meta so we persist what Lingma actually allocated.
- Invalidate cache on non-stream failure; skip writes on cancelled/partial streams.
- Expose cache stats in /internal/stats and /metrics.
- Configurable via SESSION_REUSE_ENABLED / SESSION_CACHE_MAX_ENTRIES / SESSION_CACHE_TTL_SEC (documented in README + .env.example).

Made-with: Cursor
This commit is contained in:
@@ -536,7 +536,16 @@ class LingmaGatewayClient:
|
||||
|
||||
# ------------------------------------------------------------------ chat
|
||||
|
||||
def _build_payload(self, prompt: str, model_key: str, ask_mode: str, session_id: str, request_id: str):
|
||||
def _build_payload(
|
||||
self,
|
||||
prompt: str,
|
||||
model_key: str,
|
||||
ask_mode: str,
|
||||
session_id: str,
|
||||
request_id: str,
|
||||
*,
|
||||
is_reply: bool = False,
|
||||
):
|
||||
session_type = "developer" if ask_mode == "agent" else "chat"
|
||||
return {
|
||||
"requestId": request_id,
|
||||
@@ -546,7 +555,7 @@ class LingmaGatewayClient:
|
||||
"mode": ask_mode,
|
||||
"stream": True,
|
||||
"source": 1,
|
||||
"isReply": False,
|
||||
"isReply": is_reply,
|
||||
"taskDefinitionType": "system",
|
||||
"content": prompt,
|
||||
"text": prompt,
|
||||
@@ -579,11 +588,21 @@ class LingmaGatewayClient:
|
||||
"""
|
||||
await self.rpc.notify("chat/ask", payload)
|
||||
|
||||
async def chat_complete(self, prompt: str, model_key: str, ask_mode: str) -> dict:
|
||||
async def chat_complete(
|
||||
self,
|
||||
prompt: str,
|
||||
model_key: str,
|
||||
ask_mode: str,
|
||||
*,
|
||||
session_id: str | None = None,
|
||||
is_reply: bool = False,
|
||||
) -> dict:
|
||||
await self.ensure_ready()
|
||||
request_id = str(uuid.uuid4())
|
||||
session_id = str(uuid.uuid4())
|
||||
payload = self._build_payload(prompt, model_key, ask_mode, session_id, request_id)
|
||||
sid = session_id or str(uuid.uuid4())
|
||||
payload = self._build_payload(
|
||||
prompt, model_key, ask_mode, sid, request_id, is_reply=is_reply
|
||||
)
|
||||
self.rpc.create_stream(request_id)
|
||||
try:
|
||||
await self._kick_chat_ask(payload)
|
||||
@@ -597,16 +616,37 @@ class LingmaGatewayClient:
|
||||
self.rpc.pop_stream(request_id)
|
||||
finish = result.get("finish") or {}
|
||||
result["requestId"] = request_id
|
||||
result["sessionId"] = finish.get("sessionId") or session_id
|
||||
# Prefer upstream-reported sessionId so the next turn binds to whatever
|
||||
# Lingma actually allocated (sometimes differs from our hint).
|
||||
result["sessionId"] = finish.get("sessionId") or sid
|
||||
result["model"] = model_key
|
||||
result["mode"] = ask_mode
|
||||
result["isReply"] = is_reply
|
||||
return result
|
||||
|
||||
async def chat_stream(self, prompt: str, model_key: str, ask_mode: str) -> AsyncIterator[str]:
|
||||
async def chat_stream(
|
||||
self,
|
||||
prompt: str,
|
||||
model_key: str,
|
||||
ask_mode: str,
|
||||
*,
|
||||
session_id: str | None = None,
|
||||
is_reply: bool = False,
|
||||
out_meta: dict | None = None,
|
||||
) -> AsyncIterator[str]:
|
||||
"""Stream `chat/answer` chunks.
|
||||
|
||||
If `out_meta` is provided, the final `chat/finish` payload's sessionId
|
||||
(and the raw finish dict) is written into it when the stream ends or is
|
||||
cancelled. This is the hook the session cache uses to record the
|
||||
upstream sessionId without holding a second reference to the RPC.
|
||||
"""
|
||||
await self.ensure_ready()
|
||||
request_id = str(uuid.uuid4())
|
||||
session_id = str(uuid.uuid4())
|
||||
payload = self._build_payload(prompt, model_key, ask_mode, session_id, request_id)
|
||||
sid = session_id or str(uuid.uuid4())
|
||||
payload = self._build_payload(
|
||||
prompt, model_key, ask_mode, sid, request_id, is_reply=is_reply
|
||||
)
|
||||
self.rpc.create_stream(request_id)
|
||||
try:
|
||||
await self._kick_chat_ask(payload)
|
||||
@@ -616,4 +656,14 @@ class LingmaGatewayClient:
|
||||
yield chunk
|
||||
finally:
|
||||
# Runs on normal completion, exception, or consumer GeneratorExit (client disconnect).
|
||||
if out_meta is not None:
|
||||
try:
|
||||
stream_result = self.rpc.get_stream_result(request_id)
|
||||
finish = stream_result.get("finish") or {}
|
||||
out_meta["session_id"] = finish.get("sessionId") or sid
|
||||
out_meta["finish"] = finish
|
||||
out_meta["request_id"] = request_id
|
||||
out_meta["chars"] = len(stream_result.get("text") or "")
|
||||
except Exception:
|
||||
pass
|
||||
self.rpc.pop_stream(request_id)
|
||||
|
||||
Reference in New Issue
Block a user