prod hardening: admin/metrics authz split, subprocess lifecycle, parallel pool start, HEALTHCHECK

- authz: new ADMIN_TOKEN gates /internal/*; METRICS_PUBLIC=false by default, so /metrics returns 503 when neither METRICS_TOKEN nor API_KEYS is set (previously it leaked the pool topology). Startup logs loudly if API_KEYS is empty or if admin access falls back to the chat keys.
- lingma_client: keep a Popen handle instead of orphaning Lingma with start_new_session, drain stderr to the logger at DEBUG, and on shutdown send SIGTERM -> 5s grace -> SIGKILL. Fixes the zombie-process leak on container reload.
- pool: use asyncio.gather to start N instances concurrently; with N=2 the pool boots roughly startup_timeout seconds faster.
- Dockerfile: HEALTHCHECK hits /healthz and greps for pool_ready>0 so Docker / compose orchestrators see "stuck on login" as unhealthy.

Made-with: Cursor
2026-04-18 10:22:13 +08:00
parent 3130533888
commit 2febc37c2c
8 changed files with 248 additions and 28 deletions
--- a/app/auth.py
+++ b/app/auth.py
@@ -29,7 +29,8 @@ def _match_any(token: str, candidates: list[str]) -> bool:


 def require_bearer(request: Request, api_keys: list[str]) -> None:
-    # Empty api_keys means auth is disabled (keeps the old behavior).
+    # Empty api_keys means auth is disabled (kept for local dev). The startup
+    # logger warns loudly in that case so it can't go unnoticed in prod.
    if not api_keys:
        return
    token = _extract_bearer(request)
@@ -47,19 +48,42 @@ def require_bearer(request: Request, api_keys: list[str]) -> None:


 def require_metrics_access(
-    request: Request, api_keys: list[str], metrics_token: str
+    request: Request,
+    api_keys: list[str],
+    metrics_token: str,
+    *,
+    public: bool = False,
 ) -> None:
-    """Allow metrics if any of: METRICS_TOKEN matches, or any API_KEYS match.
+    """Gate /metrics.

-    If neither METRICS_TOKEN nor API_KEYS are configured, metrics is public
-    (backwards compatible default).
+    Resolution order:
+      1. `public=True` (METRICS_PUBLIC) — wide open, explicit opt-in for
+         sidecar scrapers on a private network.
+      2. `METRICS_TOKEN` configured — accepted as a bearer token.
+      3. `API_KEYS` configured — any API key works too, even alongside 2.
+      4. Nothing configured at all — 503 (scraping disabled) so we don't
+         silently leak the pool topology on an un-hardened deployment.
    """
+    if public:
+        return
    accepted: list[str] = []
    if metrics_token:
        accepted.append(metrics_token)
    accepted.extend(api_keys)
    if not accepted:
-        return
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail={
+                "error": {
+                    "message": (
+                        "metrics scraping is disabled: set METRICS_TOKEN, "
+                        "API_KEYS, or METRICS_PUBLIC=true"
+                    ),
+                    "type": "service_unavailable",
+                    "code": "metrics_disabled",
+                }
+            },
+        )
    token = _extract_bearer(request)
    if not _match_any(token, accepted):
        raise HTTPException(
@@ -72,3 +96,52 @@ def require_metrics_access(
                }
            },
        )
+
+
+def require_admin_access(
+    request: Request,
+    api_keys: list[str],
+    admin_token: str,
+) -> None:
+    """Gate /internal/* admin endpoints.
+
+    Resolution order:
+      1. `ADMIN_TOKEN` configured — must match exactly.
+      2. Otherwise fall back to the regular API_KEYS (single-tenant deploys).
+      3. If nothing is configured — 503 so we never expose auto-login /
+         session-export on an unauthenticated gateway.
+
+    Backwards compat: existing deployments that only set `API_KEYS` keep
+    working; add ADMIN_TOKEN in .env when you want a dedicated split.
+    """
+    accepted: list[str] = []
+    if admin_token:
+        accepted.append(admin_token)
+    else:
+        accepted.extend(api_keys)
+    if not accepted:
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail={
+                "error": {
+                    "message": (
+                        "admin endpoints disabled: configure ADMIN_TOKEN "
+                        "(recommended) or API_KEYS"
+                    ),
+                    "type": "service_unavailable",
+                    "code": "admin_disabled",
+                }
+            },
+        )
+    token = _extract_bearer(request)
+    if not _match_any(token, accepted):
+        raise HTTPException(
+            status_code=status.HTTP_401_UNAUTHORIZED,
+            detail={
+                "error": {
+                    "message": "Invalid admin token",
+                    "type": "invalid_request_error",
+                    "code": "invalid_api_key",
+                }
+            },
+        )