perf: stop blocking on chat/ask RPC timeout (fixes ~30s TTFB)
Lingma streams answers via chat/answer + chat/finish notifications and
never sends a JSON-RPC response for chat/ask. The old code awaited
rpc.request("chat/ask") and swallowed the TimeoutError, so every chat
was forced to wait the full rpc_timeout (default 30s) before draining
the stream queue - even though the first token was already present in
the queue within ~2s.
Effect:
- non-stream TTFB dropped from ~30s to actual upstream latency (~2-3s).
- stream first-chunk dropped from ~30s to upstream first-token latency.
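- consume_stream idle timeout no longer shrinks with rpc_timeout: it now has a
  60s floor (max(60, rpc_timeout + 30) for non-stream, max(60, rpc_timeout + 60)
  for stream), so shortening rpc_timeout no longer starves long completions.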
Switch chat/ask to rpc.notify (fire-and-forget) and rely entirely on the
existing chat/answer + chat/finish handlers for result delivery.
Made-with: Cursor
This commit is contained in:
@@ -568,6 +568,17 @@ class LingmaGatewayClient:
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async def _kick_chat_ask(self, payload: dict) -> None:
|
||||||
|
"""Fire chat/ask as a notification.
|
||||||
|
|
||||||
|
Lingma streams answers back via `chat/answer` + `chat/finish` and never
|
||||||
|
returns a JSON-RPC `result` for `chat/ask`. Waiting for one wasted
|
||||||
|
`rpc_timeout` seconds before the first byte could leave the gateway —
|
||||||
|
matching our previous 30s TTFB bug. `notify` sidesteps that entirely
|
||||||
|
by not registering a pending future.
|
||||||
|
"""
|
||||||
|
await self.rpc.notify("chat/ask", payload)
|
||||||
|
|
||||||
async def chat_complete(self, prompt: str, model_key: str, ask_mode: str) -> dict:
|
async def chat_complete(self, prompt: str, model_key: str, ask_mode: str) -> dict:
|
||||||
await self.ensure_ready()
|
await self.ensure_ready()
|
||||||
request_id = str(uuid.uuid4())
|
request_id = str(uuid.uuid4())
|
||||||
@@ -575,13 +586,10 @@ class LingmaGatewayClient:
|
|||||||
payload = self._build_payload(prompt, model_key, ask_mode, session_id, request_id)
|
payload = self._build_payload(prompt, model_key, ask_mode, session_id, request_id)
|
||||||
self.rpc.create_stream(request_id)
|
self.rpc.create_stream(request_id)
|
||||||
try:
|
try:
|
||||||
try:
|
await self._kick_chat_ask(payload)
|
||||||
await self.rpc.request("chat/ask", payload, timeout=self.rpc_timeout)
|
# Consume until chat/finish closes the stream or the upstream idles.
|
||||||
except TIMEOUT_EXCEPTIONS:
|
|
||||||
# chat/ask often returns nothing until chat/finish arrives; tolerate.
|
|
||||||
pass
|
|
||||||
async for _ in self.rpc.consume_stream(
|
async for _ in self.rpc.consume_stream(
|
||||||
request_id, timeout=max(20.0, self.rpc_timeout + 20.0)
|
request_id, timeout=max(60.0, self.rpc_timeout + 30.0)
|
||||||
):
|
):
|
||||||
pass
|
pass
|
||||||
result = self.rpc.get_stream_result(request_id)
|
result = self.rpc.get_stream_result(request_id)
|
||||||
@@ -601,12 +609,9 @@ class LingmaGatewayClient:
|
|||||||
payload = self._build_payload(prompt, model_key, ask_mode, session_id, request_id)
|
payload = self._build_payload(prompt, model_key, ask_mode, session_id, request_id)
|
||||||
self.rpc.create_stream(request_id)
|
self.rpc.create_stream(request_id)
|
||||||
try:
|
try:
|
||||||
try:
|
await self._kick_chat_ask(payload)
|
||||||
await self.rpc.request("chat/ask", payload, timeout=self.rpc_timeout)
|
|
||||||
except TIMEOUT_EXCEPTIONS:
|
|
||||||
pass
|
|
||||||
async for chunk in self.rpc.consume_stream(
|
async for chunk in self.rpc.consume_stream(
|
||||||
request_id, timeout=max(20.0, self.rpc_timeout + 40.0)
|
request_id, timeout=max(60.0, self.rpc_timeout + 60.0)
|
||||||
):
|
):
|
||||||
yield chunk
|
yield chunk
|
||||||
finally:
|
finally:
|
||||||
|
|||||||
Reference in New Issue
Block a user