feat: M1+M2 gateway hardening and multi-instance pool

Behavior hardening (M1):
- Fix `_chat_streams` memory leak: pop_stream on completion, error, and
  client disconnect.
- Add WebSocket reconnect with state machine (stopped/starting/ready/
  reconnecting/failed/closed) and exponential backoff, so a Lingma
  restart no longer requires restarting the gateway.
- Lazy initialization: startup failure is non-fatal, first real request
  triggers retry, `/healthz` reflects readiness.
- Migrate FastAPI on_event to lifespan.
- Structured JSON logging with request_id ContextVar; `x-request-id`
  propagated to responses.
- SSE now sets `Cache-Control: no-cache`, `X-Accel-Buffering: no` to
  defeat proxy buffering.
- OpenAI schema compatibility: `content` accepts str | list[parts] | None,
  added `developer`/`function` roles, `tools/tool_choice/stream_options/
  user/max_tokens` fields, and `stream_options.include_usage` emits final
  usage chunk.
- `require_bearer` uses `hmac.compare_digest`; `/metrics` now requires
  Bearer when `METRICS_TOKEN` or `API_KEYS` are set.
- Catch both the builtin `TimeoutError` and `asyncio.TimeoutError` (distinct
  classes on Python 3.10, the same class on 3.11+).
- Error responses no longer leak `auto_login.status()` details.

Backpressure (M2 / A2):
- New `InFlightGuard` with per-request tickets and queue/rejection
  accounting; once `GATEWAY_QUEUE_TIMEOUT_SEC` elapses a queued request is
  rejected with `BackpressureRejected` (429 + `Retry-After`).
- Streaming ticket ownership transfers to the generator so CancelledError
  from client disconnect still releases the slot.
- `/internal/stats.concurrency` and `/metrics` expose in_flight/queued/
  accepted_total/rejected_total/max_in_flight.

Multi-instance pool (M2 / A1 + B3):
- New `LingmaPool` with N processes, each with its own workDir, socket
  port (dynamic when N>1), and `AutoLoginManager`.
- Account parser supports CSV (`u1:p1,u2:p2`) and JSON formats via
  `LINGMA_ACCOUNTS`; falls back to `LINGMA_USERNAME/LINGMA_PASSWORD` for
  backwards compatibility (N=1 keeps legacy paths/ports).
- Routing: sticky affinity by `user` / system-prompt hash, then
  least-in-flight, finally round-robin fallback for unhealthy pool.
- `/healthz` reports per-instance state and ready count.
- `/internal/stats.pool` and `/metrics` expose per-instance
  `gateway_pool_instance_in_flight{name}` / `gateway_pool_instance_ready{name}`.
- `/internal/auto-login/start?instance=inst-N` targets a specific instance;
  `/internal/auto-login/status` lists all instances.

Compat notes:
- `.env.example` adds `METRICS_TOKEN`, `LOG_LEVEL`, `GATEWAY_MAX_IN_FLIGHT`,
  `GATEWAY_QUEUE_TIMEOUT_SEC`, `LINGMA_ACCOUNTS`, `LINGMA_INSTANCE_COUNT`.
- `.gitignore`: removed duplicated `data/` entries.
- Existing single-instance deployments keep working without config change.

Made-with: Cursor
This commit is contained in:
GitHub Actions
2026-04-18 07:40:32 +08:00
parent 6114c66aed
commit 707acc9005
11 changed files with 1360 additions and 222 deletions

View File

@@ -9,10 +9,23 @@ import subprocess
import time
import uuid
from pathlib import Path
from typing import AsyncIterator
from typing import AsyncIterator, Callable, Optional
import websockets
from .logging_config import get_logger
logger = get_logger("lingma_gateway.client")
# Some callers live on Python 3.10 where asyncio.TimeoutError is a distinct class,
# while 3.11+ unifies it with the builtin TimeoutError. Always catch both.
TIMEOUT_EXCEPTIONS: tuple[type[BaseException], ...] = (
asyncio.TimeoutError,
TimeoutError,
)
def _is_port_open(host: str, port: int, timeout_sec: float = 0.5) -> bool:
try:
@@ -79,23 +92,37 @@ def _parse_lsp_frames(buf: bytes):
class LspWsRpcClient:
def __init__(self, ws):
def __init__(self, ws, on_disconnect: Optional[Callable[[BaseException], None]] = None):
self.ws = ws
self._id = 1
self._pending: dict[int, asyncio.Future] = {}
self._send_lock = asyncio.Lock()
self._reader_task = None
self._reader_task: asyncio.Task | None = None
self._rx_buffer = b""
self._chat_streams: dict[str, dict] = {}
self._on_disconnect = on_disconnect
self._closed = False
async def start(self):
# Run the reader loop as a background asyncio task; the handle is kept so close() can cancel it.
self._reader_task = asyncio.create_task(self._reader_loop())
async def close(self):
    """Shut the RPC client down and release everything waiting on it.

    Marks the client as closed, cancels the reader task, fails every pending
    RPC future with ``ConnectionError`` and signals every registered chat
    stream to terminate before forgetting it.
    """
    self._closed = True
    if self._reader_task:
        self._reader_task.cancel()
        # CancelledError is a BaseException (Python 3.8+), so
        # suppress(Exception) alone lets it escape -- e.g. when the task is
        # cancelled before it ever ran -- and the cleanup below would be
        # skipped.
        with contextlib.suppress(asyncio.CancelledError, Exception):
            await self._reader_task
    # Abort any pending futures so callers fail fast instead of hanging.
    for fut in self._pending.values():
        if not fut.done():
            fut.set_exception(ConnectionError("lingma client closed"))
    self._pending.clear()
    # Signal open streams to terminate.
    for stream in self._chat_streams.values():
        if not stream["done"].is_set():
            stream["done"].set()
            stream["chunks"].put_nowait(None)
    self._chat_streams.clear()
async def _send(self, payload: dict):
async with self._send_lock:
@@ -127,10 +154,23 @@ class LspWsRpcClient:
except asyncio.CancelledError:
pass
except Exception as exc:
if not self._closed:
logger.warning("lingma reader loop terminated: %s", exc)
# Propagate failure to anyone waiting on an RPC.
for fut in self._pending.values():
if not fut.done():
fut.set_exception(exc)
self._pending.clear()
# Also unblock any in-flight chat streams so consumers exit.
for stream in self._chat_streams.values():
if not stream["done"].is_set():
stream["done"].set()
stream["chunks"].put_nowait(None)
if not self._closed and self._on_disconnect is not None:
try:
self._on_disconnect(exc)
except Exception:
logger.exception("on_disconnect callback failed")
async def _handle_server_message(self, msg: dict):
method = msg.get("method")
@@ -168,7 +208,7 @@ class LspWsRpcClient:
await self._send(payload)
try:
msg = await asyncio.wait_for(fut, timeout=timeout)
except TimeoutError:
except TIMEOUT_EXCEPTIONS:
self._pending.pop(rid, None)
raise TimeoutError(f"RPC timeout: {method}")
if "error" in msg:
@@ -189,8 +229,20 @@ class LspWsRpcClient:
"finish_at": None,
}
def pop_stream(self, request_id: str) -> None:
# Forget the stream registered under request_id; an unknown id is a no-op.
stream = self._chat_streams.pop(request_id, None)
if stream is None:
return
# Nothing is drained here: the stream is marked done if it was not already, and a
# None sentinel is queued on a best-effort basis (errors from the put are ignored).
# NOTE(review): presumably a consumer still waiting on the queue treats None as
# end-of-stream -- confirm against consume_stream.
if not stream["done"].is_set():
stream["done"].set()
with contextlib.suppress(Exception):
stream["chunks"].put_nowait(None)
async def consume_stream(self, request_id: str, timeout: float) -> AsyncIterator[str]:
stream = self._chat_streams[request_id]
stream = self._chat_streams.get(request_id)
if stream is None:
return
start = time.monotonic()
while True:
remain = timeout - (time.monotonic() - start)
@@ -218,6 +270,19 @@ class LspWsRpcClient:
class LingmaGatewayClient:
"""Owns the Lingma subprocess and the LSP-over-WS connection.
Adds a small state machine + reconnect loop so the gateway can survive Lingma
restarts and slow cold starts without bringing down the FastAPI app.
"""
STATE_STOPPED = "stopped"
STATE_STARTING = "starting"
STATE_READY = "ready"
STATE_RECONNECTING = "reconnecting"
STATE_FAILED = "failed"
STATE_CLOSED = "closed"
def __init__(
self,
lingma_bin: str,
@@ -227,7 +292,11 @@ class LingmaGatewayClient:
rpc_timeout: int,
default_model: str,
default_ask_mode: str,
*,
name: str = "lingma",
extra_info_paths: list[Path] | None = None,
):
self.name = name
self.lingma_bin = Path(lingma_bin)
self.work_dir = Path(work_dir)
self.socket_port = socket_port
@@ -235,19 +304,115 @@ class LingmaGatewayClient:
self.rpc_timeout = rpc_timeout
self.default_model = default_model
self.default_ask_mode = default_ask_mode
# Each pool instance should only look at its own workDir .info to avoid
# cross-instance clobbering via the shared ~/.lingma/.info path.
if extra_info_paths is None:
extra_info_paths = [Path.home() / ".lingma" / ".info"]
self._extra_info_paths = list(extra_info_paths)
self._rpc: LspWsRpcClient | None = None
self._ws = None
self._state = self.STATE_STOPPED
self._state_lock = asyncio.Lock()
self._ready_event = asyncio.Event()
self._reconnect_task: asyncio.Task | None = None
self._last_error: str = ""
# ------------------------------------------------------------------ state
@property
def state(self) -> str:
# Read-only view of the current lifecycle state (written by _set_state).
return self._state
@property
def last_error(self) -> str:
# Most recent recorded error message; empty string until an error has been recorded.
return self._last_error
def _set_state(self, state: str, err: str = "") -> None:
if state != self._state:
logger.info("lingma client state %s -> %s", self._state, state, extra={"ctx_new_state": state})
self._state = state
if err:
self._last_error = err
if state == self.STATE_READY:
self._ready_event.set()
else:
self._ready_event.clear()
# -------------------------------------------------------------- lifecycle
async def start(self) -> None:
"""Initial start. Failure is non-fatal: ensure_ready() will retry later."""
try:
await self._connect(initial=True)
except Exception as exc:
# Deliberately swallowed: the failure is recorded as STATE_FAILED, which is one of
# the states from which ensure_ready() attempts a fresh connect.
self._set_state(self.STATE_FAILED, err=str(exc))
logger.exception("initial lingma start failed; will retry on demand")
async def close(self) -> None:
    """Shut the client down for good.

    Switches the state to ``STATE_CLOSED``, cancels a reconnect task that is
    still running, then closes the RPC client and the websocket. Errors from
    closing the websocket are ignored.
    """
    self._set_state(self.STATE_CLOSED)
    if self._reconnect_task and not self._reconnect_task.done():
        self._reconnect_task.cancel()
        # Awaiting a cancelled task raises CancelledError, a BaseException
        # since Python 3.8. suppress(Exception) alone would let it propagate
        # out of close() and skip the rpc/ws cleanup below.
        with contextlib.suppress(asyncio.CancelledError, Exception):
            await self._reconnect_task
    if self._rpc:
        await self._rpc.close()
    if self._ws:
        with contextlib.suppress(Exception):
            await self._ws.close()
async def ensure_ready(self, timeout: float | None = None) -> None:
"""Block until the RPC connection is usable, (re)connecting on demand."""
# A closed client never becomes ready again.
if self._state == self.STATE_CLOSED:
raise RuntimeError("lingma client is closed")
# Fast path: already connected, no lock needed.
if self._state == self.STATE_READY and self._ws is not None:
return
async with self._state_lock:
# Re-check after acquiring the lock: another caller may have connected meanwhile.
if self._state == self.STATE_READY and self._ws is not None:
return
# Nothing is connecting in the background in these states, so connect inline.
if self._state in (self.STATE_STOPPED, self.STATE_FAILED):
try:
await self._connect(initial=False)
return
except Exception as exc:
self._set_state(self.STATE_FAILED, err=str(exc))
raise
# Remaining states: wait for the ready event. Without an explicit timeout the wait
# is capped at max(30s, startup_timeout + 10s).
# NOTE(review): nesting is not visible in this view. _reconnect_loop takes
# _state_lock before calling _connect, so if this wait runs while the lock is still
# held the reconnect cannot proceed and the wait can only time out -- confirm the
# wait happens after the lock has been released.
wait_timeout = timeout if timeout is not None else max(
30.0, float(self.startup_timeout) + 10.0
)
try:
await asyncio.wait_for(self._ready_event.wait(), timeout=wait_timeout)
except TIMEOUT_EXCEPTIONS:
raise RuntimeError(f"lingma not ready (state={self._state}, err={self._last_error})")
# --------------------------------------------------------------- connect
async def _connect(self, *, initial: bool) -> None:
self._set_state(self.STATE_STARTING)
async def start(self):
if not self.lingma_bin.exists():
raise FileNotFoundError(f"Lingma not found: {self.lingma_bin}")
if not _is_port_open("127.0.0.1", self.socket_port):
info_paths = [self.work_dir / ".info", *self._extra_info_paths]
# socket_port <= 0 is the pool-friendly "always spawn and read .info" mode.
port_prewarmed = self.socket_port > 0 and _is_port_open(
"127.0.0.1", self.socket_port
)
if not port_prewarmed:
self.work_dir.mkdir(parents=True, exist_ok=True)
# Remove stale info files from host-mounted workspace before boot.
for p in [self.work_dir / ".info", Path.home() / ".lingma" / ".info"]:
for p in info_paths:
with contextlib.suppress(Exception):
if p.exists():
p.unlink()
logger.info(
"[%s] spawning lingma: %s start --workDir %s",
self.name,
self.lingma_bin,
self.work_dir,
)
subprocess.Popen(
[str(self.lingma_bin), "start", "--workDir", str(self.work_dir)],
cwd=str(self.lingma_bin.parent),
@@ -255,13 +420,9 @@ class LingmaGatewayClient:
stderr=subprocess.DEVNULL,
start_new_session=True,
)
info, _, _ = _wait_info_any(
[self.work_dir / ".info", Path.home() / ".lingma" / ".info"],
timeout_sec=self.startup_timeout,
)
info, _, _ = _wait_info_any(info_paths, timeout_sec=self.startup_timeout)
self.socket_port = info
# Wait for socket to actually become connectable.
deadline = time.time() + self.startup_timeout
while time.time() < deadline:
if _is_port_open("127.0.0.1", self.socket_port, timeout_sec=0.3):
@@ -270,9 +431,19 @@ class LingmaGatewayClient:
else:
raise TimeoutError(f"Lingma socket not open on port {self.socket_port}")
# Close any stale ws/rpc before creating fresh ones (reconnect path).
if self._rpc is not None:
with contextlib.suppress(Exception):
await self._rpc.close()
self._rpc = None
if self._ws is not None:
with contextlib.suppress(Exception):
await self._ws.close()
self._ws = None
ws_url = f"ws://127.0.0.1:{self.socket_port}"
self._ws = await websockets.connect(ws_url, max_size=10 * 1024 * 1024)
self._rpc = LspWsRpcClient(self._ws)
self._rpc = LspWsRpcClient(self._ws, on_disconnect=self._on_disconnect)
await self._rpc.start()
await self._rpc.request(
"initialize",
@@ -286,32 +457,73 @@ class LingmaGatewayClient:
timeout=self.rpc_timeout,
)
await self._rpc.notify("initialized", {})
self._set_state(self.STATE_READY)
logger.info(
"[%s] lingma ready on port %d (initial=%s)",
self.name,
self.socket_port,
initial,
)
async def close(self):
if self._rpc:
await self._rpc.close()
if self._ws:
await self._ws.close()
def _on_disconnect(self, exc: BaseException) -> None:
if self._state == self.STATE_CLOSED:
return
self._set_state(self.STATE_RECONNECTING, err=str(exc))
if self._reconnect_task and not self._reconnect_task.done():
return
try:
loop = asyncio.get_running_loop()
except RuntimeError:
return
self._reconnect_task = loop.create_task(self._reconnect_loop())
async def _reconnect_loop(self) -> None:
# Background retry loop: up to 20 connect attempts, sleeping before each one. The
# delay starts at 1s and doubles after a failed attempt, capped at 30s.
backoff = 1.0
max_backoff = 30.0
max_attempts = 20
for attempt in range(1, max_attempts + 1):
# Stop retrying once the client has been closed.
if self._state == self.STATE_CLOSED:
return
await asyncio.sleep(backoff)
try:
# Connect under the state lock (the same lock ensure_ready takes).
async with self._state_lock:
await self._connect(initial=False)
logger.info("lingma reconnected after %d attempt(s)", attempt)
return
except Exception as exc:
# NOTE(review): _connect() sets STATE_STARTING on entry and the state is not reset
# here, so between retries the reported state is presumably "starting" rather than
# "reconnecting" -- confirm this is intended.
self._last_error = str(exc)
logger.warning("lingma reconnect attempt %d failed: %s", attempt, exc)
backoff = min(backoff * 2, max_backoff)
# Every attempt failed: park in FAILED.
self._set_state(self.STATE_FAILED, err="reconnect exhausted")
# ------------------------------------------------------------------ RPC
@property
def rpc(self) -> LspWsRpcClient:
if self._rpc is None:
raise RuntimeError("Lingma RPC not initialized")
raise RuntimeError(f"Lingma RPC not initialized (state={self._state})")
return self._rpc
async def auth_status(self):
# Wait for a usable connection, then return the result of the "auth/status" RPC.
await self.ensure_ready()
return await self.rpc.request("auth/status", {}, timeout=self.rpc_timeout)
async def query_models(self):
# Wait for a usable connection, then return the result of the "config/queryModels" RPC.
await self.ensure_ready()
return await self.rpc.request("config/queryModels", {}, timeout=self.rpc_timeout)
async def get_endpoint(self):
# Wait for a usable connection, then return the result of the "config/getEndpoint" RPC.
await self.ensure_ready()
return await self.rpc.request("config/getEndpoint", {}, timeout=self.rpc_timeout)
async def update_endpoint(self, endpoint: str):
return await self.rpc.request("config/updateEndpoint", {"endpoint": endpoint}, timeout=self.rpc_timeout)
await self.ensure_ready()
return await self.rpc.request(
"config/updateEndpoint", {"endpoint": endpoint}, timeout=self.rpc_timeout
)
async def generate_login_url(self):
await self.ensure_ready()
result = await self.rpc.request("login/generateUrl", {}, timeout=self.rpc_timeout)
if isinstance(result, str):
return result, {"raw": result}
@@ -322,6 +534,8 @@ class LingmaGatewayClient:
return "", result
return "", {"raw": result}
# ------------------------------------------------------------------ chat
def _build_payload(self, prompt: str, model_key: str, ask_mode: str, session_id: str, request_id: str):
session_type = "developer" if ask_mode == "agent" else "chat"
return {
@@ -355,17 +569,24 @@ class LingmaGatewayClient:
}
async def chat_complete(self, prompt: str, model_key: str, ask_mode: str) -> dict:
await self.ensure_ready()
request_id = str(uuid.uuid4())
session_id = str(uuid.uuid4())
payload = self._build_payload(prompt, model_key, ask_mode, session_id, request_id)
self.rpc.create_stream(request_id)
try:
await self.rpc.request("chat/ask", payload, timeout=self.rpc_timeout)
except (TimeoutError, asyncio.TimeoutError):
pass
async for _ in self.rpc.consume_stream(request_id, timeout=max(20.0, self.rpc_timeout + 20.0)):
pass
result = self.rpc.get_stream_result(request_id)
try:
await self.rpc.request("chat/ask", payload, timeout=self.rpc_timeout)
except TIMEOUT_EXCEPTIONS:
# chat/ask often returns nothing until chat/finish arrives; tolerate.
pass
async for _ in self.rpc.consume_stream(
request_id, timeout=max(20.0, self.rpc_timeout + 20.0)
):
pass
result = self.rpc.get_stream_result(request_id)
finally:
self.rpc.pop_stream(request_id)
finish = result.get("finish") or {}
result["requestId"] = request_id
result["sessionId"] = finish.get("sessionId") or session_id
@@ -374,13 +595,20 @@ class LingmaGatewayClient:
return result
async def chat_stream(self, prompt: str, model_key: str, ask_mode: str) -> AsyncIterator[str]:
await self.ensure_ready()
request_id = str(uuid.uuid4())
session_id = str(uuid.uuid4())
payload = self._build_payload(prompt, model_key, ask_mode, session_id, request_id)
self.rpc.create_stream(request_id)
try:
await self.rpc.request("chat/ask", payload, timeout=self.rpc_timeout)
except (TimeoutError, asyncio.TimeoutError):
pass
async for chunk in self.rpc.consume_stream(request_id, timeout=max(20.0, self.rpc_timeout + 40.0)):
yield chunk
try:
await self.rpc.request("chat/ask", payload, timeout=self.rpc_timeout)
except TIMEOUT_EXCEPTIONS:
pass
async for chunk in self.rpc.consume_stream(
request_id, timeout=max(20.0, self.rpc_timeout + 40.0)
):
yield chunk
finally:
# Runs on normal completion, exception, or consumer GeneratorExit (client disconnect).
self.rpc.pop_stream(request_id)