feat: M1+M2 gateway hardening and multi-instance pool

Behavior hardening (M1): - Fix `_chat_streams` memory leak: pop_stream on completion, error, and client disconnect. - Add WebSocket reconnect with state machine (stopped/starting/ready/ reconnecting/failed/closed) and exponential backoff, so a Lingma restart no longer requires restarting the gateway. - Lazy initialization: startup failure is non-fatal, first real request triggers retry, `/healthz` reflects readiness. - Migrate FastAPI on_event to lifespan. - Structured JSON logging with request_id ContextVar; `x-request-id` propagated to responses. - SSE now sets `Cache-Control: no-cache`, `X-Accel-Buffering: no` to defeat proxy buffering. - OpenAI schema compatibility: `content` accepts str | list[parts] | None, added `developer`/`function` roles, `tools/tool_choice/stream_options/ user/max_tokens` fields, and `stream_options.include_usage` emits final usage chunk. - `require_bearer` uses `hmac.compare_digest`; `/metrics` now requires Bearer when `METRICS_TOKEN` or `API_KEYS` are set. - Python 3.10/3.11 `TimeoutError` vs `asyncio.TimeoutError` unified. - Error responses no longer leak `auto_login.status()` details. Backpressure (M2 / A2): - New `InFlightGuard` with per-request ticket, queue + rejection accounting, `BackpressureRejected` raises 429 + `Retry-After` once `GATEWAY_QUEUE_TIMEOUT_SEC` elapses. - Streaming ticket ownership transfers to the generator so CancelledError from client disconnect still releases the slot. - `/internal/stats.concurrency` and `/metrics` expose in_flight/queued/ accepted_total/rejected_total/max_in_flight. Multi-instance pool (M2 / A1 + B3): - New `LingmaPool` with N processes, each with its own workDir, socket port (dynamic when N>1), and `AutoLoginManager`. - Account parser supports CSV (`u1:p1,u2:p2`) and JSON formats via `LINGMA_ACCOUNTS`; falls back to `LINGMA_USERNAME/LINGMA_PASSWORD` for backwards compatibility (N=1 keeps legacy paths/ports). - Routing: sticky affinity by `user` / system-prompt hash, then least-in-flight, finally round-robin fallback for unhealthy pool. - `/healthz` reports per-instance state and ready count. - `/internal/stats.pool` and `/metrics` expose per-instance `gateway_pool_instance_in_flight{name}` / `gateway_pool_instance_ready{name}`. - `/internal/auto-login/start?instance=inst-N` targets a specific instance; `/internal/auto-login/status` lists all instances. Compat notes: - `.env.example` adds `METRICS_TOKEN`, `LOG_LEVEL`, `GATEWAY_MAX_IN_FLIGHT`, `GATEWAY_QUEUE_TIMEOUT_SEC`, `LINGMA_ACCOUNTS`, `LINGMA_INSTANCE_COUNT`. - `.gitignore` cleaned up data/ duplication. - Existing single-instance deployments keep working without config change. Made-with: Cursor
2026-04-18 07:40:32 +08:00
parent 6114c66aed
commit 707acc9005
11 changed files with 1360 additions and 222 deletions
--- a/app/lingma_client.py
+++ b/app/lingma_client.py
@@ -9,10 +9,23 @@ import subprocess
 import time
 import uuid
 from pathlib import Path
-from typing import AsyncIterator
+from typing import AsyncIterator, Callable, Optional

 import websockets

+from .logging_config import get_logger
+
+
+logger = get_logger("lingma_gateway.client")
+
+
+# Some callers live on Python 3.10 where asyncio.TimeoutError is a distinct class,
+# while 3.11+ unifies it with the builtin TimeoutError. Always catch both.
+TIMEOUT_EXCEPTIONS: tuple[type[BaseException], ...] = (
+    asyncio.TimeoutError,
+    TimeoutError,
+)
+

 def _is_port_open(host: str, port: int, timeout_sec: float = 0.5) -> bool:
    try:
@@ -79,23 +92,37 @@ def _parse_lsp_frames(buf: bytes):


 class LspWsRpcClient:
-    def __init__(self, ws):
+    def __init__(self, ws, on_disconnect: Optional[Callable[[BaseException], None]] = None):
        self.ws = ws
        self._id = 1
        self._pending: dict[int, asyncio.Future] = {}
        self._send_lock = asyncio.Lock()
-        self._reader_task = None
+        self._reader_task: asyncio.Task | None = None
        self._rx_buffer = b""
        self._chat_streams: dict[str, dict] = {}
+        self._on_disconnect = on_disconnect
+        self._closed = False

    async def start(self):
        self._reader_task = asyncio.create_task(self._reader_loop())

    async def close(self):
+        self._closed = True
        if self._reader_task:
            self._reader_task.cancel()
            with contextlib.suppress(Exception):
                await self._reader_task
+        # Abort any pending futures so callers fail fast instead of hanging.
+        for fut in self._pending.values():
+            if not fut.done():
+                fut.set_exception(ConnectionError("lingma client closed"))
+        self._pending.clear()
+        # Signal open streams to terminate.
+        for stream in self._chat_streams.values():
+            if not stream["done"].is_set():
+                stream["done"].set()
+                stream["chunks"].put_nowait(None)
+        self._chat_streams.clear()

    async def _send(self, payload: dict):
        async with self._send_lock:
@@ -127,10 +154,23 @@ class LspWsRpcClient:
        except asyncio.CancelledError:
            pass
        except Exception as exc:
+            if not self._closed:
+                logger.warning("lingma reader loop terminated: %s", exc)
+            # Propagate failure to anyone waiting on an RPC.
            for fut in self._pending.values():
                if not fut.done():
                    fut.set_exception(exc)
            self._pending.clear()
+            # Also unblock any in-flight chat streams so consumers exit.
+            for stream in self._chat_streams.values():
+                if not stream["done"].is_set():
+                    stream["done"].set()
+                    stream["chunks"].put_nowait(None)
+            if not self._closed and self._on_disconnect is not None:
+                try:
+                    self._on_disconnect(exc)
+                except Exception:
+                    logger.exception("on_disconnect callback failed")

    async def _handle_server_message(self, msg: dict):
        method = msg.get("method")
@@ -168,7 +208,7 @@ class LspWsRpcClient:
        await self._send(payload)
        try:
            msg = await asyncio.wait_for(fut, timeout=timeout)
-        except TimeoutError:
+        except TIMEOUT_EXCEPTIONS:
            self._pending.pop(rid, None)
            raise TimeoutError(f"RPC timeout: {method}")
        if "error" in msg:
@@ -189,8 +229,20 @@ class LspWsRpcClient:
            "finish_at": None,
        }

+    def pop_stream(self, request_id: str) -> None:
+        stream = self._chat_streams.pop(request_id, None)
+        if stream is None:
+            return
+        # Drain queue so no stray future gets stuck if the consumer bailed early.
+        if not stream["done"].is_set():
+            stream["done"].set()
+            with contextlib.suppress(Exception):
+                stream["chunks"].put_nowait(None)
+
    async def consume_stream(self, request_id: str, timeout: float) -> AsyncIterator[str]:
-        stream = self._chat_streams[request_id]
+        stream = self._chat_streams.get(request_id)
+        if stream is None:
+            return
        start = time.monotonic()
        while True:
            remain = timeout - (time.monotonic() - start)
@@ -218,6 +270,19 @@ class LspWsRpcClient:


 class LingmaGatewayClient:
+    """Owns the Lingma subprocess and the LSP-over-WS connection.
+
+    Adds a small state machine + reconnect loop so the gateway can survive Lingma
+    restarts and slow cold starts without bringing down the FastAPI app.
+    """
+
+    STATE_STOPPED = "stopped"
+    STATE_STARTING = "starting"
+    STATE_READY = "ready"
+    STATE_RECONNECTING = "reconnecting"
+    STATE_FAILED = "failed"
+    STATE_CLOSED = "closed"
+
    def __init__(
        self,
        lingma_bin: str,
@@ -227,7 +292,11 @@ class LingmaGatewayClient:
        rpc_timeout: int,
        default_model: str,
        default_ask_mode: str,
+        *,
+        name: str = "lingma",
+        extra_info_paths: list[Path] | None = None,
    ):
+        self.name = name
        self.lingma_bin = Path(lingma_bin)
        self.work_dir = Path(work_dir)
        self.socket_port = socket_port
@@ -235,19 +304,115 @@ class LingmaGatewayClient:
        self.rpc_timeout = rpc_timeout
        self.default_model = default_model
        self.default_ask_mode = default_ask_mode
+        # Each pool instance should only look at its own workDir .info to avoid
+        # cross-instance clobbering via the shared ~/.lingma/.info path.
+        if extra_info_paths is None:
+            extra_info_paths = [Path.home() / ".lingma" / ".info"]
+        self._extra_info_paths = list(extra_info_paths)
        self._rpc: LspWsRpcClient | None = None
        self._ws = None
+        self._state = self.STATE_STOPPED
+        self._state_lock = asyncio.Lock()
+        self._ready_event = asyncio.Event()
+        self._reconnect_task: asyncio.Task | None = None
+        self._last_error: str = ""
+
+    # ------------------------------------------------------------------ state
+
+    @property
+    def state(self) -> str:
+        return self._state
+
+    @property
+    def last_error(self) -> str:
+        return self._last_error
+
+    def _set_state(self, state: str, err: str = "") -> None:
+        if state != self._state:
+            logger.info("lingma client state %s -> %s", self._state, state, extra={"ctx_new_state": state})
+        self._state = state
+        if err:
+            self._last_error = err
+        if state == self.STATE_READY:
+            self._ready_event.set()
+        else:
+            self._ready_event.clear()
+
+    # -------------------------------------------------------------- lifecycle
+
+    async def start(self) -> None:
+        """Initial start. Failure is non-fatal: ensure_ready() will retry later."""
+        try:
+            await self._connect(initial=True)
+        except Exception as exc:
+            self._set_state(self.STATE_FAILED, err=str(exc))
+            logger.exception("initial lingma start failed; will retry on demand")
+
+    async def close(self) -> None:
+        self._set_state(self.STATE_CLOSED)
+        if self._reconnect_task and not self._reconnect_task.done():
+            self._reconnect_task.cancel()
+            with contextlib.suppress(Exception):
+                await self._reconnect_task
+        if self._rpc:
+            await self._rpc.close()
+        if self._ws:
+            with contextlib.suppress(Exception):
+                await self._ws.close()
+
+    async def ensure_ready(self, timeout: float | None = None) -> None:
+        """Block until the RPC connection is usable, (re)connecting on demand."""
+        if self._state == self.STATE_CLOSED:
+            raise RuntimeError("lingma client is closed")
+        if self._state == self.STATE_READY and self._ws is not None:
+            return
+
+        async with self._state_lock:
+            if self._state == self.STATE_READY and self._ws is not None:
+                return
+            if self._state in (self.STATE_STOPPED, self.STATE_FAILED):
+                try:
+                    await self._connect(initial=False)
+                    return
+                except Exception as exc:
+                    self._set_state(self.STATE_FAILED, err=str(exc))
+                    raise
+
+        wait_timeout = timeout if timeout is not None else max(
+            30.0, float(self.startup_timeout) + 10.0
+        )
+        try:
+            await asyncio.wait_for(self._ready_event.wait(), timeout=wait_timeout)
+        except TIMEOUT_EXCEPTIONS:
+            raise RuntimeError(f"lingma not ready (state={self._state}, err={self._last_error})")
+
+    # --------------------------------------------------------------- connect
+
+    async def _connect(self, *, initial: bool) -> None:
+        self._set_state(self.STATE_STARTING)

-    async def start(self):
        if not self.lingma_bin.exists():
            raise FileNotFoundError(f"Lingma not found: {self.lingma_bin}")
-        if not _is_port_open("127.0.0.1", self.socket_port):
+
+        info_paths = [self.work_dir / ".info", *self._extra_info_paths]
+
+        # socket_port <= 0 is the pool-friendly "always spawn and read .info" mode.
+        port_prewarmed = self.socket_port > 0 and _is_port_open(
+            "127.0.0.1", self.socket_port
+        )
+        if not port_prewarmed:
            self.work_dir.mkdir(parents=True, exist_ok=True)
            # Remove stale info files from host-mounted workspace before boot.
-            for p in [self.work_dir / ".info", Path.home() / ".lingma" / ".info"]:
+            for p in info_paths:
                with contextlib.suppress(Exception):
                    if p.exists():
                        p.unlink()
+            logger.info(
+                "[%s] spawning lingma: %s start --workDir %s",
+                self.name,
+                self.lingma_bin,
+                self.work_dir,
+            )
            subprocess.Popen(
                [str(self.lingma_bin), "start", "--workDir", str(self.work_dir)],
                cwd=str(self.lingma_bin.parent),
@@ -255,13 +420,9 @@ class LingmaGatewayClient:
                stderr=subprocess.DEVNULL,
                start_new_session=True,
            )
-            info, _, _ = _wait_info_any(
-                [self.work_dir / ".info", Path.home() / ".lingma" / ".info"],
-                timeout_sec=self.startup_timeout,
-            )
+            info, _, _ = _wait_info_any(info_paths, timeout_sec=self.startup_timeout)
            self.socket_port = info

-        # Wait for socket to actually become connectable.
        deadline = time.time() + self.startup_timeout
        while time.time() < deadline:
            if _is_port_open("127.0.0.1", self.socket_port, timeout_sec=0.3):
@@ -270,9 +431,19 @@ class LingmaGatewayClient:
        else:
            raise TimeoutError(f"Lingma socket not open on port {self.socket_port}")

+        # Close any stale ws/rpc before creating fresh ones (reconnect path).
+        if self._rpc is not None:
+            with contextlib.suppress(Exception):
+                await self._rpc.close()
+            self._rpc = None
+        if self._ws is not None:
+            with contextlib.suppress(Exception):
+                await self._ws.close()
+            self._ws = None
+
        ws_url = f"ws://127.0.0.1:{self.socket_port}"
        self._ws = await websockets.connect(ws_url, max_size=10 * 1024 * 1024)
-        self._rpc = LspWsRpcClient(self._ws)
+        self._rpc = LspWsRpcClient(self._ws, on_disconnect=self._on_disconnect)
        await self._rpc.start()
        await self._rpc.request(
            "initialize",
@@ -286,32 +457,73 @@ class LingmaGatewayClient:
            timeout=self.rpc_timeout,
        )
        await self._rpc.notify("initialized", {})
+        self._set_state(self.STATE_READY)
+        logger.info(
+            "[%s] lingma ready on port %d (initial=%s)",
+            self.name,
+            self.socket_port,
+            initial,
+        )

-    async def close(self):
-        if self._rpc:
-            await self._rpc.close()
-        if self._ws:
-            await self._ws.close()
+    def _on_disconnect(self, exc: BaseException) -> None:
+        if self._state == self.STATE_CLOSED:
+            return
+        self._set_state(self.STATE_RECONNECTING, err=str(exc))
+        if self._reconnect_task and not self._reconnect_task.done():
+            return
+        try:
+            loop = asyncio.get_running_loop()
+        except RuntimeError:
+            return
+        self._reconnect_task = loop.create_task(self._reconnect_loop())
+
+    async def _reconnect_loop(self) -> None:
+        backoff = 1.0
+        max_backoff = 30.0
+        max_attempts = 20
+        for attempt in range(1, max_attempts + 1):
+            if self._state == self.STATE_CLOSED:
+                return
+            await asyncio.sleep(backoff)
+            try:
+                async with self._state_lock:
+                    await self._connect(initial=False)
+                logger.info("lingma reconnected after %d attempt(s)", attempt)
+                return
+            except Exception as exc:
+                self._last_error = str(exc)
+                logger.warning("lingma reconnect attempt %d failed: %s", attempt, exc)
+                backoff = min(backoff * 2, max_backoff)
+        self._set_state(self.STATE_FAILED, err="reconnect exhausted")
+
+    # ------------------------------------------------------------------ RPC

    @property
    def rpc(self) -> LspWsRpcClient:
        if self._rpc is None:
-            raise RuntimeError("Lingma RPC not initialized")
+            raise RuntimeError(f"Lingma RPC not initialized (state={self._state})")
        return self._rpc

    async def auth_status(self):
+        await self.ensure_ready()
        return await self.rpc.request("auth/status", {}, timeout=self.rpc_timeout)

    async def query_models(self):
+        await self.ensure_ready()
        return await self.rpc.request("config/queryModels", {}, timeout=self.rpc_timeout)

    async def get_endpoint(self):
+        await self.ensure_ready()
        return await self.rpc.request("config/getEndpoint", {}, timeout=self.rpc_timeout)

    async def update_endpoint(self, endpoint: str):
-        return await self.rpc.request("config/updateEndpoint", {"endpoint": endpoint}, timeout=self.rpc_timeout)
+        await self.ensure_ready()
+        return await self.rpc.request(
+            "config/updateEndpoint", {"endpoint": endpoint}, timeout=self.rpc_timeout
+        )

    async def generate_login_url(self):
+        await self.ensure_ready()
        result = await self.rpc.request("login/generateUrl", {}, timeout=self.rpc_timeout)
        if isinstance(result, str):
            return result, {"raw": result}
@@ -322,6 +534,8 @@ class LingmaGatewayClient:
            return "", result
        return "", {"raw": result}

+    # ------------------------------------------------------------------ chat
+
    def _build_payload(self, prompt: str, model_key: str, ask_mode: str, session_id: str, request_id: str):
        session_type = "developer" if ask_mode == "agent" else "chat"
        return {
@@ -355,17 +569,24 @@ class LingmaGatewayClient:
        }

    async def chat_complete(self, prompt: str, model_key: str, ask_mode: str) -> dict:
+        await self.ensure_ready()
        request_id = str(uuid.uuid4())
        session_id = str(uuid.uuid4())
        payload = self._build_payload(prompt, model_key, ask_mode, session_id, request_id)
        self.rpc.create_stream(request_id)
        try:
-            await self.rpc.request("chat/ask", payload, timeout=self.rpc_timeout)
-        except (TimeoutError, asyncio.TimeoutError):
-            pass
-        async for _ in self.rpc.consume_stream(request_id, timeout=max(20.0, self.rpc_timeout + 20.0)):
-            pass
-        result = self.rpc.get_stream_result(request_id)
+            try:
+                await self.rpc.request("chat/ask", payload, timeout=self.rpc_timeout)
+            except TIMEOUT_EXCEPTIONS:
+                # chat/ask often returns nothing until chat/finish arrives; tolerate.
+                pass
+            async for _ in self.rpc.consume_stream(
+                request_id, timeout=max(20.0, self.rpc_timeout + 20.0)
+            ):
+                pass
+            result = self.rpc.get_stream_result(request_id)
+        finally:
+            self.rpc.pop_stream(request_id)
        finish = result.get("finish") or {}
        result["requestId"] = request_id
        result["sessionId"] = finish.get("sessionId") or session_id
@@ -374,13 +595,20 @@ class LingmaGatewayClient:
        return result

    async def chat_stream(self, prompt: str, model_key: str, ask_mode: str) -> AsyncIterator[str]:
+        await self.ensure_ready()
        request_id = str(uuid.uuid4())
        session_id = str(uuid.uuid4())
        payload = self._build_payload(prompt, model_key, ask_mode, session_id, request_id)
        self.rpc.create_stream(request_id)
        try:
-            await self.rpc.request("chat/ask", payload, timeout=self.rpc_timeout)
-        except (TimeoutError, asyncio.TimeoutError):
-            pass
-        async for chunk in self.rpc.consume_stream(request_id, timeout=max(20.0, self.rpc_timeout + 40.0)):
-            yield chunk
+            try:
+                await self.rpc.request("chat/ask", payload, timeout=self.rpc_timeout)
+            except TIMEOUT_EXCEPTIONS:
+                pass
+            async for chunk in self.rpc.consume_stream(
+                request_id, timeout=max(20.0, self.rpc_timeout + 40.0)
+            ):
+                yield chunk
+        finally:
+            # Runs on normal completion, exception, or consumer GeneratorExit (client disconnect).
+            self.rpc.pop_stream(request_id)