fix: harden responses stream termination

Ensure /v1/responses streaming always emits completion frames on upstream EOF, errors, and cancellation, and add targeted diagnostics for interrupted Lingma streams. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-20 14:55:32 +08:00
parent 12a4d9584e
commit 5e6c1c1a63
3 changed files with 139 additions and 73 deletions
--- a/app/lingma_client.py
+++ b/app/lingma_client.py
@@ -495,13 +495,21 @@ class LspWsRpcClient:
        if stream is None:
            return
        start = time.monotonic()
+        last_chunk_at = start
        while True:
            remain = timeout - (time.monotonic() - start)
            if remain <= 0:
-                raise TimeoutError("chat stream timeout")
+                first_chunk_at = stream.get("first_chunk_at")
+                raise TimeoutError(
+                    "chat stream timeout "
+                    f"request_id={request_id} timeout={timeout:.1f}s "
+                    f"first_chunk_at={None if first_chunk_at is None else round(first_chunk_at - start, 3)}s "
+                    f"last_chunk_at={round(last_chunk_at - start, 3)}s"
+                )
            chunk = await asyncio.wait_for(stream["chunks"].get(), timeout=remain)
            if chunk is None:
                break
+            last_chunk_at = time.monotonic()
            yield chunk

    def get_stream_result(self, request_id: str) -> dict:
--- a/app/main.py
+++ b/app/main.py
@@ -859,10 +859,21 @@ async def v1_chat_completions(req: ChatCompletionsRequest, request: Request):
                    yield "data: [DONE]\n\n"
                    success = True
                except asyncio.CancelledError:
-                    logger.info("chat.stream cancelled by client (inst=%s)", _inst.name)
+                    logger.info(
+                        "chat.stream cancelled by client (inst=%s, session_id=%s)",
+                        _inst.name,
+                        cached_session_id,
+                    )
                    raise
                except Exception as exc:
-                    logger.warning("chat.stream error (inst=%s): %s", _inst.name, exc)
+                    logger.warning(
+                        "chat.stream error (inst=%s, session_id=%s, prompt_tokens=%s, completion_tokens=%s): %s",
+                        _inst.name,
+                        cached_session_id,
+                        prompt_tokens,
+                        completion_tokens_holder["n"],
+                        exc,
+                    )
                finally:
                    # Persist upstream sessionId only on a clean chat/finish.
                    # Partial streams (cancelled, timed out) leave Lingma's
@@ -1157,6 +1168,23 @@ async def _responses_stream_from_chat_stream(
 ):
    created_at = int(time.time())
    usage: dict[str, int] = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}
+    completed_sent = False
+
+    def _completed_frame() -> str:
+        return _sse_data(
+            {
+                "type": "response.completed",
+                "response": {
+                    "id": response_id,
+                    "object": "response",
+                    "created_at": created_at,
+                    "status": "completed",
+                    "model": model,
+                    "usage": usage,
+                },
+            }
+        )
+
    yield _sse_data(
        {
            "type": "response.created",
@@ -1170,6 +1198,7 @@ async def _responses_stream_from_chat_stream(
        }
    )

+    try:
        async for part in chat_stream.body_iterator:
            chunk = part.decode("utf-8") if isinstance(part, bytes) else str(part)
            for frame in chunk.split("\n\n"):
@@ -1178,20 +1207,9 @@ async def _responses_stream_from_chat_stream(
                    continue
                body = frame[len("data:") :].strip()
                if body == "[DONE]":
-                yield _sse_data(
-                    {
-                        "type": "response.completed",
-                        "response": {
-                            "id": response_id,
-                            "object": "response",
-                            "created_at": created_at,
-                            "status": "completed",
-                            "model": model,
-                            "usage": usage,
-                        },
-                    }
-                )
-                yield "data: [DONE]\\n\\n"
+                    yield _completed_frame()
+                    yield "data: [DONE]\n\n"
+                    completed_sent = True
                    return

                try:
@@ -1235,21 +1253,22 @@ async def _responses_stream_from_chat_stream(
                                "arguments": str(fn.get("arguments") or "{}"),
                            }
                        )
+    except asyncio.CancelledError:
+        if not completed_sent:
+            yield _completed_frame()
+            yield "data: [DONE]\n\n"
+            completed_sent = True
+        return
+    except Exception:
+        if not completed_sent:
+            yield _completed_frame()
+            yield "data: [DONE]\n\n"
+            completed_sent = True
+        return

-    yield _sse_data(
-        {
-            "type": "response.completed",
-            "response": {
-                "id": response_id,
-                "object": "response",
-                "created_at": created_at,
-                "status": "completed",
-                "model": model,
-                "usage": usage,
-            },
-        }
-    )
-    yield "data: [DONE]\\n\\n"
+    if not completed_sent:
+        yield _completed_frame()
+        yield "data: [DONE]\n\n"



--- a/tests/test_tool_call_bridge.py
+++ b/tests/test_tool_call_bridge.py
@@ -4,6 +4,7 @@ import json
 import sys
 import types
 import unittest
+import asyncio
 from unittest.mock import AsyncMock, patch


@@ -914,6 +915,44 @@ class ToolCallBridgeTests(unittest.IsolatedAsyncioTestCase):
        self.assertIn('"output_tokens": 1', body)
        self.assertIn('data: [DONE]', body)

+    async def test_responses_stream_emits_completed_when_upstream_iterator_errors(self) -> None:
+        async def _chat_sse_error():
+            yield b'data: {"choices": [{"delta": {"content": "partial"}}]}\n\n'
+            raise RuntimeError("boom")
+
+        req = ResponsesRequest(model="org_auto", input="hi", stream=True)
+        mock_chat = AsyncMock(
+            return_value=StreamingResponse(_chat_sse_error(), media_type="text/event-stream")
+        )
+
+        with patch.object(main, "v1_chat_completions", mock_chat):
+            response = await main.v1_responses(req, _make_request("/v1/responses"))
+            body = await _collect_stream(response)
+
+        self.assertIn('"type": "response.output_text.delta"', body)
+        self.assertIn('"delta": "partial"', body)
+        self.assertIn('"type": "response.completed"', body)
+        self.assertIn('data: [DONE]', body)
+
+    async def test_responses_stream_emits_completed_when_upstream_cancels(self) -> None:
+        async def _chat_sse_cancelled():
+            yield b'data: {"choices": [{"delta": {"content": "partial"}}]}\n\n'
+            raise asyncio.CancelledError()
+
+        req = ResponsesRequest(model="org_auto", input="hi", stream=True)
+        mock_chat = AsyncMock(
+            return_value=StreamingResponse(_chat_sse_cancelled(), media_type="text/event-stream")
+        )
+
+        with patch.object(main, "v1_chat_completions", mock_chat):
+            response = await main.v1_responses(req, _make_request("/v1/responses"))
+            body = await _collect_stream(response)
+
+        self.assertIn('"type": "response.output_text.delta"', body)
+        self.assertIn('"delta": "partial"', body)
+        self.assertIn('"type": "response.completed"', body)
+        self.assertIn('data: [DONE]', body)
+

    async def test_responses_non_stream_returns_502_on_invalid_upstream_json(self) -> None:
        req = ResponsesRequest(model="org_auto", input="hi", stream=False)