prod hardening: admin/metrics authz split, subprocess lifecycle, parallel pool start, HEALTHCHECK
- authz: new ADMIN_TOKEN gates /internal/*; METRICS_PUBLIC=false by default, so /metrics returns 503 when neither METRICS_TOKEN nor API_KEYS is set (previously it leaked pool topology). Startup logs loudly if API_KEYS is empty or admin falls back to chat keys.
- lingma_client: keep a Popen handle instead of orphaning Lingma with start_new_session, drain stderr to the logger at DEBUG, and SIGTERM -> 5s grace -> SIGKILL on shutdown. Fixes the zombie-process leak on container reload.
- pool: use asyncio.gather to start N instances concurrently; for an N=2 pool this shaves ~startup_timeout seconds off boot.
- Dockerfile: HEALTHCHECK hits /healthz and greps for pool_ready>0 so Docker / compose orchestrators see "stuck on login" as unhealthy.

Made-with: Cursor
This commit is contained in:
@@ -316,6 +316,11 @@ class LingmaGatewayClient:
|
||||
self._ready_event = asyncio.Event()
|
||||
self._reconnect_task: asyncio.Task | None = None
|
||||
self._last_error: str = ""
|
||||
# Lingma subprocess handle. Kept so we can reap on shutdown and read
|
||||
# stderr for debugging (pre-v0.4 we forked with DEVNULL + new_session
|
||||
# which orphaned the process and hid crash logs).
|
||||
self._proc: subprocess.Popen | None = None
|
||||
self._stderr_task: asyncio.Task | None = None
|
||||
|
||||
# ------------------------------------------------------------------ state
|
||||
|
||||
@@ -359,6 +364,76 @@ class LingmaGatewayClient:
|
||||
if self._ws:
|
||||
with contextlib.suppress(Exception):
|
||||
await self._ws.close()
|
||||
await self._terminate_proc()
|
||||
if self._stderr_task and not self._stderr_task.done():
|
||||
self._stderr_task.cancel()
|
||||
with contextlib.suppress(Exception):
|
||||
await self._stderr_task
|
||||
|
||||
async def _drain_stderr(self, proc: subprocess.Popen) -> None:
    """Forward the child's stderr to ``logger.debug``, one line at a time.

    ``readline`` blocks, so the read loop runs in a worker thread via
    ``asyncio.to_thread``. Each non-empty line is decoded as UTF-8 (bad
    bytes replaced), right-stripped and logged at DEBUG, so the output is
    only visible when debug logging is enabled.

    Args:
        proc: The subprocess whose ``stderr`` pipe should be drained. If it
            has no stderr pipe this coroutine returns immediately.

    Cancelling the awaiting task is swallowed here, so awaiting a cancelled
    drain task completes without raising.
    """
    stream = proc.stderr
    if stream is None:
        return

    # Capture the name up front so the worker thread never touches ``self``.
    tag = self.name

    def pump() -> None:
        try:
            # Any falsy read (EOF) ends the loop.
            while chunk := stream.readline():
                message = chunk.decode("utf-8", errors="replace").rstrip()
                if not message:
                    continue
                logger.debug("[%s] lingma stderr: %s", tag, message)
        except Exception as err:  # pragma: no cover -- defensive
            logger.debug("[%s] stderr drain aborted: %s", tag, err)

    # Cancellation only abandons the await; the thread itself ends once
    # readline reports EOF or raises.
    with contextlib.suppress(asyncio.CancelledError):
        await asyncio.to_thread(pump)
|
||||
|
||||
async def _terminate_proc(self) -> None:
    """Reap the Lingma subprocess we spawned.

    Calls ``terminate()`` first and waits up to 5 seconds; if the process
    is still alive it is ``kill()``-ed and given up to 3 more seconds.
    The blocking waits run in a worker thread so they do not stall the
    event loop. Idempotent: returns immediately when no process handle is
    stored, and the handle is cleared before any waiting starts.

    The timeout is passed to ``Popen.wait`` itself rather than wrapping an
    unbounded ``wait()`` in ``asyncio.wait_for``. Timing out the awaitable
    does not stop the worker thread, so an unbounded wait would leave that
    thread blocked for as long as the child refuses to exit.

    The child's stderr pipe is always closed at the end, whether or not
    the process exited, so the stderr drain thread can finish.
    """
    proc = self._proc
    if proc is None:
        return
    # Drop the handle first so a repeated call is a no-op.
    self._proc = None
    try:
        if proc.poll() is None:
            try:
                proc.terminate()
            except Exception as exc:
                logger.warning("[%s] proc.terminate failed: %s", self.name, exc)
            try:
                # Bounded wait: the worker thread returns after 5s at most.
                await asyncio.to_thread(proc.wait, 5.0)
            except subprocess.TimeoutExpired:
                logger.warning(
                    "[%s] lingma (pid=%s) didn't exit in 5s, sending SIGKILL",
                    self.name,
                    proc.pid,
                )
                with contextlib.suppress(Exception):
                    proc.kill()
                # Best effort: a second timeout is deliberately ignored.
                with contextlib.suppress(Exception):
                    await asyncio.to_thread(proc.wait, 3.0)
    finally:
        # Close stderr pipe so the drain thread can exit cleanly.
        if proc.stderr is not None:
            with contextlib.suppress(Exception):
                proc.stderr.close()
|
||||
|
||||
async def ensure_ready(self, timeout: float | None = None) -> None:
|
||||
"""Block until the RPC connection is usable, (re)connecting on demand."""
|
||||
@@ -413,12 +488,26 @@ class LingmaGatewayClient:
|
||||
self.lingma_bin,
|
||||
self.work_dir,
|
||||
)
|
||||
subprocess.Popen(
|
||||
# Reap any old proc from a previous connect attempt before spawning
|
||||
# a fresh one so we never accumulate zombie Lingma instances.
|
||||
await self._terminate_proc()
|
||||
if self._stderr_task and not self._stderr_task.done():
|
||||
self._stderr_task.cancel()
|
||||
with contextlib.suppress(Exception):
|
||||
await self._stderr_task
|
||||
self._stderr_task = None
|
||||
|
||||
self._proc = subprocess.Popen(
|
||||
[str(self.lingma_bin), "start", "--workDir", str(self.work_dir)],
|
||||
cwd=str(self.lingma_bin.parent),
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL,
|
||||
start_new_session=True,
|
||||
stderr=subprocess.PIPE,
|
||||
)
|
||||
logger.info(
|
||||
"[%s] lingma spawned (pid=%s)", self.name, self._proc.pid
|
||||
)
|
||||
self._stderr_task = asyncio.create_task(
|
||||
self._drain_stderr(self._proc)
|
||||
)
|
||||
info, _, _ = _wait_info_any(info_paths, timeout_sec=self.startup_timeout)
|
||||
self.socket_port = info
|
||||
|
||||
Reference in New Issue
Block a user