diff --git a/app/lingma_client.py b/app/lingma_client.py index 9c4c80d..b5fe213 100644 --- a/app/lingma_client.py +++ b/app/lingma_client.py @@ -568,6 +568,17 @@ class LingmaGatewayClient: }, } + async def _kick_chat_ask(self, payload: dict) -> None: + """Fire chat/ask as a notification. + + Lingma streams answers back via `chat/answer` + `chat/finish` and never + returns a JSON-RPC `result` for `chat/ask`. Waiting for one wasted + `rpc_timeout` seconds before the first byte could leave the gateway — + the cause of our previous 30s TTFB bug. `notify` sidesteps that entirely + by not registering a pending future. + """ + await self.rpc.notify("chat/ask", payload) + async def chat_complete(self, prompt: str, model_key: str, ask_mode: str) -> dict: await self.ensure_ready() request_id = str(uuid.uuid4()) @@ -575,13 +586,10 @@ class LingmaGatewayClient: payload = self._build_payload(prompt, model_key, ask_mode, session_id, request_id) self.rpc.create_stream(request_id) try: - try: - await self.rpc.request("chat/ask", payload, timeout=self.rpc_timeout) - except TIMEOUT_EXCEPTIONS: - # chat/ask often returns nothing until chat/finish arrives; tolerate. - pass + await self._kick_chat_ask(payload) + # Consume until chat/finish closes the stream or the upstream idles. async for _ in self.rpc.consume_stream( - request_id, timeout=max(20.0, self.rpc_timeout + 20.0) + request_id, timeout=max(60.0, self.rpc_timeout + 30.0) ): pass result = self.rpc.get_stream_result(request_id) @@ -601,12 +609,9 @@ class LingmaGatewayClient: payload = self._build_payload(prompt, model_key, ask_mode, session_id, request_id) self.rpc.create_stream(request_id) try: - try: - await self.rpc.request("chat/ask", payload, timeout=self.rpc_timeout) - except TIMEOUT_EXCEPTIONS: - pass + await self._kick_chat_ask(payload) async for chunk in self.rpc.consume_stream( - request_id, timeout=max(20.0, self.rpc_timeout + 40.0) + request_id, timeout=max(60.0, self.rpc_timeout + 60.0) ): yield chunk finally: