prod hardening: admin/metrics authz split, subprocess lifecycle, parallel pool start, HEALTHCHECK

- authz: new ADMIN_TOKEN gates /internal/*; METRICS_PUBLIC=false by default, so
  /metrics returns 503 when neither METRICS_TOKEN nor API_KEYS is set
  (previously leaked pool topology). Startup logs loudly if API_KEYS is empty
  or admin falls back to chat keys.
- lingma_client: keep a Popen handle instead of orphaning Lingma with
  start_new_session, drain stderr to logger at DEBUG, SIGTERM -> 5s grace ->
  SIGKILL on shutdown. Fixes the zombie-process leak on container reload.
- pool: start all N instances concurrently with asyncio.gather; for an N=2
  pool this shaves about startup_timeout seconds off boot.
- Dockerfile: HEALTHCHECK hits /healthz and greps for pool_ready>0 so Docker
  / compose orchestrators see "stuck on login" as unhealthy.

Made-with: Cursor
This commit is contained in:
GitHub Actions
2026-04-18 10:22:13 +08:00
parent 3130533888
commit 2febc37c2c
8 changed files with 248 additions and 28 deletions

View File

@@ -29,7 +29,8 @@ def _match_any(token: str, candidates: list[str]) -> bool:
def require_bearer(request: Request, api_keys: list[str]) -> None:
# Empty api_keys means auth is disabled (keeps the old behavior).
# Empty api_keys means auth is disabled (kept for local dev). The startup
# logger warns loudly in that case so it can't go unnoticed in prod.
if not api_keys:
return
token = _extract_bearer(request)
@@ -47,19 +48,42 @@ def require_bearer(request: Request, api_keys: list[str]) -> None:
def require_metrics_access(
request: Request, api_keys: list[str], metrics_token: str
request: Request,
api_keys: list[str],
metrics_token: str,
*,
public: bool = False,
) -> None:
"""Allow metrics if any of: METRICS_TOKEN matches, or any API_KEYS match.
"""Gate /metrics.
If neither METRICS_TOKEN nor API_KEYS are configured, metrics is public
(backwards compatible default).
Resolution order:
1. `public=True` (METRICS_PUBLIC) — wide open, explicit opt-in for
sidecar scrapers on a private network.
2. `METRICS_TOKEN` configured — must match.
3. `API_KEYS` configured — any configured API key works.
4. Nothing configured at all — 503 (scraping disabled) so we don't
silently leak the pool topology on an un-hardened deployment.
"""
if public:
return
accepted: list[str] = []
if metrics_token:
accepted.append(metrics_token)
accepted.extend(api_keys)
if not accepted:
return
raise HTTPException(
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
detail={
"error": {
"message": (
"metrics scraping is disabled: set METRICS_TOKEN, "
"API_KEYS, or METRICS_PUBLIC=true"
),
"type": "service_unavailable",
"code": "metrics_disabled",
}
},
)
token = _extract_bearer(request)
if not _match_any(token, accepted):
raise HTTPException(
@@ -72,3 +96,52 @@ def require_metrics_access(
}
},
)
def require_admin_access(
    request: Request,
    api_keys: list[str],
    admin_token: str,
) -> None:
    """Authorize a request for the /internal/* admin endpoints.

    Which credentials are accepted:
      * If ``admin_token`` is non-empty, it is the only accepted credential;
        ``api_keys`` are ignored in that case.
      * Otherwise every entry in ``api_keys`` is accepted, so deployments
        that only configure API_KEYS keep working.
      * If neither is configured the endpoints are treated as disabled.

    Args:
        request: Incoming request; the presented token is obtained from it
            via ``_extract_bearer``.
        api_keys: Regular API keys, used only as a fallback when no admin
            token is configured.
        admin_token: Dedicated admin credential; empty means "not set".

    Raises:
        HTTPException: 503 with code ``admin_disabled`` when no credential
            is configured at all; 401 with code ``invalid_api_key`` when the
            presented token is not accepted by ``_match_any``.
    """
    # A configured admin token takes over completely rather than being
    # added on top of the chat keys.
    allowed: list[str] = [admin_token] if admin_token else list(api_keys)

    # Nothing configured: refuse instead of serving admin routes unauthenticated.
    if not allowed:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail={
                "error": {
                    "message": (
                        "admin endpoints disabled: configure ADMIN_TOKEN "
                        "(recommended) or API_KEYS"
                    ),
                    "type": "service_unavailable",
                    "code": "admin_disabled",
                }
            },
        )

    presented = _extract_bearer(request)
    if _match_any(presented, allowed):
        return

    raise HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail={
            "error": {
                "message": "Invalid admin token",
                "type": "invalid_request_error",
                "code": "invalid_api_key",
            }
        },
    )