feat: M1+M2 gateway hardening and multi-instance pool
Behavior hardening (M1):
- Fix `_chat_streams` memory leak: pop_stream on completion, error, and
client disconnect.
- Add WebSocket reconnect with state machine (stopped/starting/ready/
reconnecting/failed/closed) and exponential backoff, so a Lingma
restart no longer requires restarting the gateway.
- Lazy initialization: startup failure is non-fatal, first real request
triggers retry, `/healthz` reflects readiness.
- Migrate FastAPI on_event to lifespan.
- Structured JSON logging with request_id ContextVar; `x-request-id`
propagated to responses.
- SSE now sets `Cache-Control: no-cache`, `X-Accel-Buffering: no` to
defeat proxy buffering.
- OpenAI schema compatibility: `content` accepts str | list[parts] | None,
added `developer`/`function` roles, `tools/tool_choice/stream_options/
user/max_tokens` fields, and `stream_options.include_usage` emits final
usage chunk.
- `require_bearer` uses `hmac.compare_digest`; `/metrics` now requires
Bearer when `METRICS_TOKEN` or `API_KEYS` are set.
- Unify handling of `TimeoutError` vs `asyncio.TimeoutError` across Python 3.10/3.11.
- Error responses no longer leak `auto_login.status()` details.
Backpressure (M2 / A2):
- New `InFlightGuard` with per-request ticket and queue + rejection
accounting; `BackpressureRejected` results in 429 + `Retry-After` once
`GATEWAY_QUEUE_TIMEOUT_SEC` elapses.
- Streaming ticket ownership transfers to the generator so CancelledError
from client disconnect still releases the slot.
- `/internal/stats.concurrency` and `/metrics` expose in_flight/queued/
accepted_total/rejected_total/max_in_flight.
Multi-instance pool (M2 / A1 + B3):
- New `LingmaPool` with N processes, each with its own workDir, socket
port (dynamic when N>1), and `AutoLoginManager`.
- Account parser supports CSV (`u1:p1,u2:p2`) and JSON formats via
`LINGMA_ACCOUNTS`; falls back to `LINGMA_USERNAME/LINGMA_PASSWORD` for
backwards compatibility (N=1 keeps legacy paths/ports).
- Routing: sticky affinity by `user` / system-prompt hash, then
least-in-flight, finally round-robin fallback for unhealthy pool.
- `/healthz` reports per-instance state and ready count.
- `/internal/stats.pool` and `/metrics` expose per-instance
`gateway_pool_instance_in_flight{name}` / `gateway_pool_instance_ready{name}`.
- `/internal/auto-login/start?instance=inst-N` targets a specific instance;
`/internal/auto-login/status` lists all instances.
Compat notes:
- `.env.example` adds `METRICS_TOKEN`, `LOG_LEVEL`, `GATEWAY_MAX_IN_FLIGHT`,
`GATEWAY_QUEUE_TIMEOUT_SEC`, `LINGMA_ACCOUNTS`, `LINGMA_INSTANCE_COUNT`.
- `.gitignore` cleaned up data/ duplication.
- Existing single-instance deployments keep working without config change.
Made-with: Cursor
This commit is contained in:
121
app/concurrency.py
Normal file
121
app/concurrency.py
Normal file
@@ -0,0 +1,121 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
|
||||
from .logging_config import get_logger
|
||||
|
||||
|
||||
# Module-level logger; used below to report backpressure rejections.
logger = get_logger("lingma_gateway.concurrency")
|
||||
|
||||
|
||||
class BackpressureRejected(Exception):
    """Signal that no in-flight slot could be acquired before the timeout.

    Attributes:
        retry_after: Retry hint in seconds, stored exactly as passed in. The
            exception message renders it with one decimal place.
    """

    def __init__(self, retry_after: float):
        # Build the message up front so str(exc) carries the formatted hint.
        message = f"backpressure rejected, retry_after={retry_after:.1f}s"
        super().__init__(message)
        self.retry_after = retry_after
|
||||
|
||||
|
||||
class InFlightTicket:
    """One-shot handle for a single occupied in-flight slot.

    The first `release()` notifies the parent guard (if any); every later
    call is a no-op. This lets callers release defensively from several
    cleanup paths (stream finally + outer exception handler) without
    double-freeing the slot. The ticket can also be used with `async with`,
    in which case it is released on exit.
    """

    __slots__ = ("_parent", "_released")

    def __init__(self, parent: "InFlightGuard | None"):
        # A ticket without a parent is inert: releasing it only flips the flag.
        self._released = False
        self._parent = parent

    def release(self) -> None:
        """Hand the slot back to the parent guard, at most once."""
        first_release = not self._released
        # Mark as released before notifying so a re-entrant call is a no-op.
        self._released = True
        owner = self._parent
        if first_release and owner is not None:
            owner._on_release()

    async def __aenter__(self) -> "InFlightTicket":
        """Enter the async context; the ticket itself is the managed value."""
        return self

    async def __aexit__(self, *_exc) -> None:
        """Release the slot on exit, whether or not an exception occurred."""
        self.release()
|
||||
|
||||
|
||||
class InFlightGuard:
    """Async semaphore wrapper with queue/reject accounting and Prometheus hooks.

    - `max_in_flight <= 0` disables limiting (back-compat, unlimited): no
      semaphore is created and every `try_acquire()` succeeds immediately.
    - `queue_timeout_sec > 0` bounds how long a request may wait for a slot.
      On timeout, `try_acquire()` raises `BackpressureRejected`.
    - `queue_timeout_sec <= 0` makes `try_acquire()` wait for a slot with no
      time limit.

    Counters (`in_flight`, `queued`, `accepted_total`, `rejected_total`) are
    plain integers exposed through `stats()` and `prometheus_lines()`.
    """

    def __init__(self, max_in_flight: int, queue_timeout_sec: float):
        """Create a guard.

        Args:
            max_in_flight: Maximum number of concurrently held tickets;
                negative values are clamped to 0 (unlimited).
            queue_timeout_sec: Maximum seconds to wait for a slot; negative
                values are clamped to 0.0 (wait without limit).
        """
        self.max = max(0, int(max_in_flight))
        self.queue_timeout = max(0.0, float(queue_timeout_sec))
        # No semaphore at all in unlimited mode; callers check for None.
        self._sem: asyncio.Semaphore | None = (
            asyncio.Semaphore(self.max) if self.max > 0 else None
        )
        self.in_flight = 0
        self.queued = 0
        self.accepted_total = 0
        self.rejected_total = 0

    def _grant(self) -> InFlightTicket:
        """Record an accepted request and return the ticket that owns its slot."""
        self.in_flight += 1
        self.accepted_total += 1
        return InFlightTicket(parent=self)

    async def try_acquire(self) -> InFlightTicket:
        """Acquire an in-flight slot, waiting up to the configured timeout.

        Returns:
            A ticket whose `release()` frees the slot again.

        Raises:
            BackpressureRejected: If limiting is enabled, a positive queue
                timeout is configured, and no slot became free in time.
        """
        if self._sem is None:
            return self._grant()

        self.queued += 1
        try:
            if self.queue_timeout <= 0:
                await self._sem.acquire()
            else:
                try:
                    await asyncio.wait_for(
                        self._sem.acquire(), timeout=self.queue_timeout
                    )
                except (asyncio.TimeoutError, TimeoutError):
                    self.rejected_total += 1
                    logger.warning(
                        "backpressure rejected: in_flight=%d queued=%d max=%d",
                        self.in_flight,
                        # This request is still counted in `queued` until the
                        # finally block runs, so exclude it from the report.
                        self.queued - 1,
                        self.max,
                    )
                    # The timeout itself carries no useful detail; suppress
                    # it so the rejection surfaces without a chained traceback.
                    raise BackpressureRejected(
                        retry_after=self.queue_timeout
                    ) from None
        finally:
            # Runs on success, rejection, and cancellation alike.
            self.queued -= 1

        return self._grant()

    def _on_release(self) -> None:
        """Free one slot; invoked by `InFlightTicket.release()`."""
        self.in_flight -= 1
        if self._sem is not None:
            self._sem.release()

    def stats(self) -> dict:
        """Return a snapshot of the configuration and counters as a dict."""
        return {
            "max_in_flight": self.max,
            "in_flight": self.in_flight,
            "queued": self.queued,
            "accepted_total": self.accepted_total,
            "rejected_total": self.rejected_total,
            "queue_timeout_sec": self.queue_timeout,
        }

    def prometheus_lines(self) -> list[str]:
        """Return the counters as Prometheus text-format lines (TYPE + sample)."""
        return [
            "# TYPE gateway_in_flight gauge",
            f"gateway_in_flight {self.in_flight}",
            "# TYPE gateway_queued gauge",
            f"gateway_queued {self.queued}",
            "# TYPE gateway_max_in_flight gauge",
            f"gateway_max_in_flight {self.max}",
            "# TYPE gateway_accepted_total counter",
            f"gateway_accepted_total {self.accepted_total}",
            "# TYPE gateway_rejected_total counter",
            f"gateway_rejected_total {self.rejected_total}",
        ]
|
||||
Reference in New Issue
Block a user