refactor: share request execution lifecycle
Extract the shared request startup, completion, and cleanup flow so OpenAI and Anthropic routes keep the same wire behavior with less duplicated orchestration. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -3,6 +3,7 @@ from __future__ import annotations
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Any, Awaitable, Callable
|
from typing import Any, Awaitable, Callable
|
||||||
|
|
||||||
|
from ..concurrency import InFlightGuard
|
||||||
from ..lingma_pool import LingmaPool, PoolInstance
|
from ..lingma_pool import LingmaPool, PoolInstance
|
||||||
from ..model_map import build_model_name_map, flatten_model_keys, resolve_model
|
from ..model_map import build_model_name_map, flatten_model_keys, resolve_model
|
||||||
from ..session_cache import SessionCache, hash_branch_context
|
from ..session_cache import SessionCache, hash_branch_context
|
||||||
@@ -21,6 +22,22 @@ class ExecutionContext:
|
|||||||
affinity: str | None
|
affinity: str | None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class StartedExecution:
    """Outcome of a successful ``start_execution`` call.

    Bundles the two values a route handler needs after startup so it can
    unpack them instead of re-deriving them.
    """

    # Slot handle returned by ``chat_guard.try_acquire()``; the holder must
    # eventually call ``release()`` on it.
    ticket: Any
    # Value returned by the ``estimate_tokens`` callable for the prompt.
    prompt_tokens: int
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class CompletedExecution:
    """Outcome of a successful non-streaming ``complete_execution`` call."""

    # Raw mapping returned by the upstream ``chat_complete`` call.
    result: dict[str, Any]
    # Value returned by ``estimate_tokens`` for the result's "text" entry.
    completion_tokens: int
|
||||||
|
|
||||||
|
|
||||||
|
class UpstreamExecutionError(Exception):
    """Raised by ``complete_execution`` when the upstream chat call fails.

    The original exception is attached as ``__cause__``; failure statistics
    and session-cache invalidation have already been handled by the time
    this is raised.
    """
|
||||||
|
|
||||||
|
|
||||||
def _resolve_ask_mode(model: str, has_tooling_context: bool, *, default_ask_mode: str) -> str:
|
def _resolve_ask_mode(model: str, has_tooling_context: bool, *, default_ask_mode: str) -> str:
|
||||||
model_name = (model or "").lower()
|
model_name = (model or "").lower()
|
||||||
if model_name in {"lingma-agent", "agent"} or has_tooling_context:
|
if model_name in {"lingma-agent", "agent"} or has_tooling_context:
|
||||||
@@ -146,3 +163,119 @@ async def prepare_execution_context(
|
|||||||
is_reply=is_reply,
|
is_reply=is_reply,
|
||||||
affinity=affinity,
|
affinity=affinity,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def start_execution(
    *,
    protocol: str,
    execution: ExecutionContext,
    stream: bool,
    chat_guard: InFlightGuard,
    logger: Any,
    estimate_tokens: Callable[[str], int],
    extra_log_context: dict[str, Any] | None = None,
) -> StartedExecution:
    """Validate the prompt, take a backpressure slot and log the request start.

    Args:
        protocol: Prefix for the log message (``"<protocol>.start ..."``).
        execution: Prepared context; reads ``prompt``, ``inst``, ``model``,
            ``ask_mode``, ``affinity`` and ``cached_session_id``.
        stream: Whether the request is a streaming one (logged only).
        chat_guard: Guard whose ``try_acquire()`` yields the slot ticket.
        logger: Logger receiving the structured ``.start`` record.
        estimate_tokens: Callable mapping the prompt text to a token estimate.
        extra_log_context: Optional extra ``ctx_*`` fields; entries here
            override the default fields with the same key.

    Returns:
        A ``StartedExecution`` carrying the acquired ticket and the prompt
        token estimate. The caller owns the ticket from that point on.

    Raises:
        ValueError: If ``execution.prompt`` is empty. No slot is taken.
        Exception: Whatever ``chat_guard.try_acquire()`` raises is propagated
            unchanged (the route handlers catch ``BackpressureRejected``).
    """
    if not execution.prompt:
        raise ValueError("messages is empty")

    prompt_tokens = estimate_tokens(execution.prompt)
    # Acquire only after the cheap validation so saturated servers reject fast.
    ticket = await chat_guard.try_acquire()
    execution.inst.in_flight += 1
    try:
        log_extra = {
            "ctx_instance": execution.inst.name,
            "ctx_model": execution.model,
            "ctx_ask_mode": execution.ask_mode,
            "ctx_stream": stream,
            "ctx_prompt_tokens": prompt_tokens,
            "ctx_in_flight": chat_guard.in_flight,
            "ctx_affinity": execution.affinity,
            "ctx_session_reuse": bool(execution.cached_session_id),
        }
        if extra_log_context:
            log_extra.update(extra_log_context)
        logger.info(
            "%s.start inst=%s model=%s ask_mode=%s stream=%s prompt_tokens~%d reuse=%s",
            protocol,
            execution.inst.name,
            execution.model,
            execution.ask_mode,
            stream,
            prompt_tokens,
            bool(execution.cached_session_id),
            extra=log_extra,
        )
        return StartedExecution(ticket=ticket, prompt_tokens=prompt_tokens)
    except BaseException:
        # The caller never receives the ticket if we fail here (e.g. logging
        # rejects a reserved key in ``extra``), so undo the bookkeeping
        # ourselves; otherwise the slot and the counter would leak.
        execution.inst.in_flight = max(0, execution.inst.in_flight - 1)
        ticket.release()
        raise
|
||||||
|
|
||||||
|
|
||||||
|
async def complete_execution(
    *,
    protocol: str,
    execution: ExecutionContext,
    prompt_tokens: int,
    tool_config: dict[str, Any] | None,
    logger: Any,
    stats_collector: Any,
    session_cache: SessionCache,
    estimate_tokens: Callable[[str], int],
) -> CompletedExecution:
    """Run one non-streaming upstream chat call and do its bookkeeping.

    On success the call is recorded in ``stats_collector`` and, when the
    context has a ``write_key`` and the result carries a ``"sessionId"``, the
    session is stored in ``session_cache``.

    On failure a warning is logged, a failed call is recorded, and a reused
    cached session is invalidated so the next turn does not hit the same
    dead session.

    Returns:
        ``CompletedExecution`` with the raw upstream result and the
        completion token estimate of its ``"text"`` entry.

    Raises:
        UpstreamExecutionError: If the upstream call raises ``Exception``;
            the original error is chained as the cause.
    """
    try:
        upstream_result = await execution.inst.client.chat_complete(
            execution.prompt,
            execution.model,
            execution.ask_mode,
            session_id=execution.cached_session_id,
            is_reply=execution.is_reply,
            tool_config=tool_config,
        )
    except Exception as exc:
        logger.warning("%s.complete error (inst=%s): %s", protocol, execution.inst.name, exc)
        failure_stats = {
            "stream": False,
            "success": False,
            "prompt_tokens": prompt_tokens,
            "completion_tokens": 0,
        }
        await stats_collector.record_chat(**failure_stats)
        # Only a session we actually reused can be the stale one.
        if execution.cached_session_id and execution.lookup_key:
            await session_cache.invalidate(execution.lookup_key)
        raise UpstreamExecutionError from exc

    reply_text = upstream_result.get("text") or ""
    completion_tokens = estimate_tokens(reply_text)
    success_stats = {
        "stream": False,
        "success": True,
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
    }
    await stats_collector.record_chat(**success_stats)

    # Without a write key there is nowhere to remember the session.
    new_session_id = upstream_result.get("sessionId") if execution.write_key else None
    if new_session_id:
        await session_cache.put(execution.write_key, new_session_id, execution.inst.name)

    return CompletedExecution(result=upstream_result, completion_tokens=completion_tokens)
|
||||||
|
|
||||||
|
|
||||||
|
async def finalize_stream_execution(
    *,
    success: bool,
    write_key: str | None,
    session_id: str | None,
    inst: PoolInstance,
    ticket: Any,
    session_cache: SessionCache,
    stats_collector: Any,
    prompt_tokens: int,
    completion_tokens: int,
) -> None:
    """Do the end-of-stream bookkeeping and always give the slot back.

    Args:
        success: Whether the stream finished cleanly.
        write_key: Session-cache key to store under, if any.
        session_id: Upstream session id observed during the stream, if any.
        inst: Pool instance that served the stream.
        ticket: Slot handle obtained when the request started.
        session_cache: Cache that remembers the session for later turns.
        stats_collector: Receives one ``record_chat(stream=True, ...)`` call.
        prompt_tokens: Prompt token estimate for the stats record.
        completion_tokens: Completion token estimate for the stats record.

    Any exception from the cache or the stats collector still propagates,
    but only after the ticket and the instance counter have been released.
    """
    try:
        # Session write-back only on a clean finish: a partial stream leaves
        # the upstream session in an indeterminate state.
        if success and write_key and session_id:
            await session_cache.put(write_key, session_id, inst.name)
        await stats_collector.record_chat(
            stream=True,
            success=success,
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
        )
    finally:
        # This runs from the stream generator's own ``finally``; if an await
        # above fails or is cancelled, skipping the release would leak the
        # backpressure slot permanently.
        release_execution(ticket=ticket, inst=inst)


def release_execution(*, ticket: Any, inst: PoolInstance) -> None:
    """Decrement the instance's in-flight counter and release the ticket.

    The counter is clamped at zero so a double release cannot drive it
    negative.
    """
    inst.in_flight = max(0, inst.in_flight - 1)
    ticket.release()
|
||||||
|
|||||||
207
app/main.py
207
app/main.py
@@ -28,7 +28,12 @@ from .config import Settings, load_settings
|
|||||||
from .http.execution_core import (
|
from .http.execution_core import (
|
||||||
_apply_cached_instance_or_invalidate as _shared_apply_cached_instance_or_invalidate,
|
_apply_cached_instance_or_invalidate as _shared_apply_cached_instance_or_invalidate,
|
||||||
_resolve_ask_mode as _shared_resolve_ask_mode,
|
_resolve_ask_mode as _shared_resolve_ask_mode,
|
||||||
|
UpstreamExecutionError,
|
||||||
|
complete_execution,
|
||||||
|
finalize_stream_execution,
|
||||||
prepare_execution_context,
|
prepare_execution_context,
|
||||||
|
release_execution,
|
||||||
|
start_execution,
|
||||||
)
|
)
|
||||||
from .http.openai_responses import handle_responses
|
from .http.openai_responses import handle_responses
|
||||||
from .http.tool_bridge import (
|
from .http.tool_bridge import (
|
||||||
@@ -472,27 +477,29 @@ async def v1_chat_completions(req: ChatCompletionsRequest, request: Request):
|
|||||||
messages_to_prompt=_messages_to_prompt,
|
messages_to_prompt=_messages_to_prompt,
|
||||||
)
|
)
|
||||||
ask_mode = execution.ask_mode
|
ask_mode = execution.ask_mode
|
||||||
lookup_key = execution.lookup_key
|
|
||||||
write_key = execution.write_key
|
write_key = execution.write_key
|
||||||
cached_session_id = execution.cached_session_id
|
cached_session_id = execution.cached_session_id
|
||||||
inst = execution.inst
|
inst = execution.inst
|
||||||
model = execution.model
|
model = execution.model
|
||||||
prompt = execution.prompt
|
prompt = execution.prompt
|
||||||
is_reply = execution.is_reply
|
is_reply = execution.is_reply
|
||||||
affinity = execution.affinity
|
|
||||||
|
|
||||||
if not prompt:
|
include_usage = _include_usage(req.stream_options)
|
||||||
|
|
||||||
|
try:
|
||||||
|
started = await start_execution(
|
||||||
|
protocol="chat",
|
||||||
|
execution=execution,
|
||||||
|
stream=req.stream,
|
||||||
|
chat_guard=chat_guard,
|
||||||
|
logger=logger,
|
||||||
|
estimate_tokens=estimate_tokens,
|
||||||
|
)
|
||||||
|
except ValueError:
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
status_code=400,
|
status_code=400,
|
||||||
detail={"error": {"message": "messages is empty", "type": "invalid_request_error"}},
|
detail={"error": {"message": "messages is empty", "type": "invalid_request_error"}},
|
||||||
)
|
)
|
||||||
prompt_tokens = estimate_tokens(prompt)
|
|
||||||
include_usage = _include_usage(req.stream_options)
|
|
||||||
|
|
||||||
# Backpressure: acquire a slot *after* the cheap validation but before any
|
|
||||||
# upstream call. This ensures we reject quickly when saturated.
|
|
||||||
try:
|
|
||||||
ticket = await chat_guard.try_acquire()
|
|
||||||
except BackpressureRejected as exc:
|
except BackpressureRejected as exc:
|
||||||
retry_after = max(1, int(exc.retry_after))
|
retry_after = max(1, int(exc.retry_after))
|
||||||
logger.warning("chat rejected by backpressure, retry_after=%ds", retry_after)
|
logger.warning("chat rejected by backpressure, retry_after=%ds", retry_after)
|
||||||
@@ -508,26 +515,8 @@ async def v1_chat_completions(req: ChatCompletionsRequest, request: Request):
|
|||||||
headers={"Retry-After": str(retry_after)},
|
headers={"Retry-After": str(retry_after)},
|
||||||
)
|
)
|
||||||
|
|
||||||
inst.in_flight += 1
|
ticket = started.ticket
|
||||||
logger.info(
|
prompt_tokens = started.prompt_tokens
|
||||||
"chat.start inst=%s model=%s ask_mode=%s stream=%s prompt_tokens~%d reuse=%s",
|
|
||||||
inst.name,
|
|
||||||
model,
|
|
||||||
ask_mode,
|
|
||||||
req.stream,
|
|
||||||
prompt_tokens,
|
|
||||||
bool(cached_session_id),
|
|
||||||
extra={
|
|
||||||
"ctx_instance": inst.name,
|
|
||||||
"ctx_model": model,
|
|
||||||
"ctx_ask_mode": ask_mode,
|
|
||||||
"ctx_stream": req.stream,
|
|
||||||
"ctx_prompt_tokens": prompt_tokens,
|
|
||||||
"ctx_in_flight": chat_guard.in_flight,
|
|
||||||
"ctx_affinity": affinity,
|
|
||||||
"ctx_session_reuse": bool(cached_session_id),
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
ticket_transferred = False
|
ticket_transferred = False
|
||||||
|
|
||||||
@@ -715,59 +704,40 @@ async def v1_chat_completions(req: ChatCompletionsRequest, request: Request):
|
|||||||
exc,
|
exc,
|
||||||
)
|
)
|
||||||
finally:
|
finally:
|
||||||
if success and write_key:
|
await finalize_stream_execution(
|
||||||
sid = _meta.get("session_id")
|
|
||||||
if sid:
|
|
||||||
await session_cache.put(write_key, sid, _inst.name)
|
|
||||||
await stats_collector.record_chat(
|
|
||||||
stream=True,
|
|
||||||
success=success,
|
success=success,
|
||||||
|
write_key=write_key,
|
||||||
|
session_id=_meta.get("session_id"),
|
||||||
|
inst=_inst,
|
||||||
|
ticket=_ticket,
|
||||||
|
session_cache=session_cache,
|
||||||
|
stats_collector=stats_collector,
|
||||||
prompt_tokens=prompt_tokens,
|
prompt_tokens=prompt_tokens,
|
||||||
completion_tokens=completion_tokens_holder["n"],
|
completion_tokens=completion_tokens_holder["n"],
|
||||||
)
|
)
|
||||||
_inst.in_flight = max(0, _inst.in_flight - 1)
|
|
||||||
_ticket.release()
|
|
||||||
|
|
||||||
ticket_transferred = True
|
ticket_transferred = True
|
||||||
return _streaming_response(event_stream())
|
return _streaming_response(event_stream())
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = await inst.client.chat_complete(
|
completed = await complete_execution(
|
||||||
prompt,
|
protocol="chat",
|
||||||
model,
|
execution=execution,
|
||||||
ask_mode,
|
|
||||||
session_id=cached_session_id,
|
|
||||||
is_reply=is_reply,
|
|
||||||
tool_config=tool_config,
|
|
||||||
)
|
|
||||||
except Exception as exc:
|
|
||||||
logger.warning("chat.complete error (inst=%s): %s", inst.name, exc)
|
|
||||||
await stats_collector.record_chat(
|
|
||||||
stream=False,
|
|
||||||
success=False,
|
|
||||||
prompt_tokens=prompt_tokens,
|
prompt_tokens=prompt_tokens,
|
||||||
completion_tokens=0,
|
tool_config=tool_config,
|
||||||
|
logger=logger,
|
||||||
|
stats_collector=stats_collector,
|
||||||
|
session_cache=session_cache,
|
||||||
|
estimate_tokens=estimate_tokens,
|
||||||
)
|
)
|
||||||
# If we used a cached session and the call blew up, drop it so the
|
except UpstreamExecutionError:
|
||||||
# next turn can start fresh instead of hitting the same dead session.
|
|
||||||
if cached_session_id and lookup_key:
|
|
||||||
await session_cache.invalidate(lookup_key)
|
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
status_code=502,
|
status_code=502,
|
||||||
detail={"error": {"message": "upstream lingma error", "type": "upstream_error"}},
|
detail={"error": {"message": "upstream lingma error", "type": "upstream_error"}},
|
||||||
)
|
)
|
||||||
|
|
||||||
completion_tokens = estimate_tokens(result.get("text") or "")
|
result = completed.result
|
||||||
await stats_collector.record_chat(
|
completion_tokens = completed.completion_tokens
|
||||||
stream=False,
|
|
||||||
success=True,
|
|
||||||
prompt_tokens=prompt_tokens,
|
|
||||||
completion_tokens=completion_tokens,
|
|
||||||
)
|
|
||||||
if write_key:
|
|
||||||
sid = result.get("sessionId")
|
|
||||||
if sid:
|
|
||||||
await session_cache.put(write_key, sid, inst.name)
|
|
||||||
forced_tool_name = _openai_forced_tool_name(req.tool_choice)
|
forced_tool_name = _openai_forced_tool_name(req.tool_choice)
|
||||||
tool_events = _allowed_tool_events(
|
tool_events = _allowed_tool_events(
|
||||||
result.get("toolEvents"),
|
result.get("toolEvents"),
|
||||||
@@ -823,8 +793,7 @@ async def v1_chat_completions(req: ChatCompletionsRequest, request: Request):
|
|||||||
return JSONResponse(content=data)
|
return JSONResponse(content=data)
|
||||||
finally:
|
finally:
|
||||||
if not ticket_transferred:
|
if not ticket_transferred:
|
||||||
inst.in_flight = max(0, inst.in_flight - 1)
|
release_execution(ticket=ticket, inst=inst)
|
||||||
ticket.release()
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -949,22 +918,25 @@ async def v1_messages(req: AnthropicMessagesRequest, request: Request):
|
|||||||
msg = (detail.get("error") or {}).get("message") or str(detail) or "upstream error"
|
msg = (detail.get("error") or {}).get("message") or str(detail) or "upstream error"
|
||||||
return _anthropic_error(exc.status_code, err_type, msg)
|
return _anthropic_error(exc.status_code, err_type, msg)
|
||||||
ask_mode = execution.ask_mode
|
ask_mode = execution.ask_mode
|
||||||
lookup_key = execution.lookup_key
|
|
||||||
write_key = execution.write_key
|
write_key = execution.write_key
|
||||||
cached_session_id = execution.cached_session_id
|
cached_session_id = execution.cached_session_id
|
||||||
inst = execution.inst
|
inst = execution.inst
|
||||||
model = execution.model
|
model = execution.model
|
||||||
prompt = execution.prompt
|
prompt = execution.prompt
|
||||||
is_reply = execution.is_reply
|
is_reply = execution.is_reply
|
||||||
affinity = execution.affinity
|
|
||||||
|
|
||||||
if not prompt:
|
|
||||||
return _anthropic_error(400, "invalid_request_error", "messages is empty")
|
|
||||||
|
|
||||||
prompt_tokens = estimate_tokens(prompt)
|
|
||||||
# ------------------------------------------------------------- backpressure
|
|
||||||
try:
|
try:
|
||||||
ticket = await chat_guard.try_acquire()
|
started = await start_execution(
|
||||||
|
protocol="anthropic",
|
||||||
|
execution=execution,
|
||||||
|
stream=req.stream,
|
||||||
|
chat_guard=chat_guard,
|
||||||
|
logger=logger,
|
||||||
|
estimate_tokens=estimate_tokens,
|
||||||
|
extra_log_context={"ctx_api": "anthropic"},
|
||||||
|
)
|
||||||
|
except ValueError:
|
||||||
|
return _anthropic_error(400, "invalid_request_error", "messages is empty")
|
||||||
except BackpressureRejected as exc:
|
except BackpressureRejected as exc:
|
||||||
retry_after = max(1, int(exc.retry_after))
|
retry_after = max(1, int(exc.retry_after))
|
||||||
logger.warning("anthropic rejected by backpressure, retry_after=%ds", retry_after)
|
logger.warning("anthropic rejected by backpressure, retry_after=%ds", retry_after)
|
||||||
@@ -976,27 +948,9 @@ async def v1_messages(req: AnthropicMessagesRequest, request: Request):
|
|||||||
resp.headers["Retry-After"] = str(retry_after)
|
resp.headers["Retry-After"] = str(retry_after)
|
||||||
return resp
|
return resp
|
||||||
|
|
||||||
inst.in_flight += 1
|
ticket = started.ticket
|
||||||
|
prompt_tokens = started.prompt_tokens
|
||||||
message_id = f"msg_{uuid.uuid4().hex}"
|
message_id = f"msg_{uuid.uuid4().hex}"
|
||||||
logger.info(
|
|
||||||
"anthropic.start inst=%s model=%s stream=%s prompt_tokens~%d reuse=%s",
|
|
||||||
inst.name,
|
|
||||||
model,
|
|
||||||
req.stream,
|
|
||||||
prompt_tokens,
|
|
||||||
bool(cached_session_id),
|
|
||||||
extra={
|
|
||||||
"ctx_instance": inst.name,
|
|
||||||
"ctx_model": model,
|
|
||||||
"ctx_ask_mode": ask_mode,
|
|
||||||
"ctx_stream": req.stream,
|
|
||||||
"ctx_prompt_tokens": prompt_tokens,
|
|
||||||
"ctx_in_flight": chat_guard.in_flight,
|
|
||||||
"ctx_affinity": affinity,
|
|
||||||
"ctx_session_reuse": bool(cached_session_id),
|
|
||||||
"ctx_api": "anthropic",
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
ticket_transferred = False
|
ticket_transferred = False
|
||||||
|
|
||||||
@@ -1175,59 +1129,39 @@ async def v1_messages(req: AnthropicMessagesRequest, request: Request):
|
|||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
finally:
|
finally:
|
||||||
# Session write-back only on clean finish — partial streams
|
await finalize_stream_execution(
|
||||||
# leave Lingma's session in an indeterminate state.
|
|
||||||
if success and write_key:
|
|
||||||
sid = _meta.get("session_id")
|
|
||||||
if sid:
|
|
||||||
await session_cache.put(write_key, sid, _inst.name)
|
|
||||||
await stats_collector.record_chat(
|
|
||||||
stream=True,
|
|
||||||
success=success,
|
success=success,
|
||||||
|
write_key=write_key,
|
||||||
|
session_id=_meta.get("session_id"),
|
||||||
|
inst=_inst,
|
||||||
|
ticket=_ticket,
|
||||||
|
session_cache=session_cache,
|
||||||
|
stats_collector=stats_collector,
|
||||||
prompt_tokens=prompt_tokens,
|
prompt_tokens=prompt_tokens,
|
||||||
completion_tokens=completion_tokens_holder["n"],
|
completion_tokens=completion_tokens_holder["n"],
|
||||||
)
|
)
|
||||||
_inst.in_flight = max(0, _inst.in_flight - 1)
|
|
||||||
_ticket.release()
|
|
||||||
|
|
||||||
ticket_transferred = True
|
ticket_transferred = True
|
||||||
return _streaming_response(event_stream())
|
return _streaming_response(event_stream())
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------- non-stream
|
|
||||||
try:
|
try:
|
||||||
result = await inst.client.chat_complete(
|
completed = await complete_execution(
|
||||||
prompt,
|
protocol="anthropic",
|
||||||
model,
|
execution=execution,
|
||||||
ask_mode,
|
|
||||||
session_id=cached_session_id,
|
|
||||||
is_reply=is_reply,
|
|
||||||
tool_config=tool_config,
|
|
||||||
)
|
|
||||||
except Exception as exc:
|
|
||||||
logger.warning("anthropic.complete error (inst=%s): %s", inst.name, exc)
|
|
||||||
await stats_collector.record_chat(
|
|
||||||
stream=False,
|
|
||||||
success=False,
|
|
||||||
prompt_tokens=prompt_tokens,
|
prompt_tokens=prompt_tokens,
|
||||||
completion_tokens=0,
|
tool_config=tool_config,
|
||||||
|
logger=logger,
|
||||||
|
stats_collector=stats_collector,
|
||||||
|
session_cache=session_cache,
|
||||||
|
estimate_tokens=estimate_tokens,
|
||||||
)
|
)
|
||||||
if cached_session_id and lookup_key:
|
except UpstreamExecutionError:
|
||||||
await session_cache.invalidate(lookup_key)
|
|
||||||
return _anthropic_error(502, "api_error", "upstream lingma error")
|
return _anthropic_error(502, "api_error", "upstream lingma error")
|
||||||
|
|
||||||
|
result = completed.result
|
||||||
text = result.get("text") or ""
|
text = result.get("text") or ""
|
||||||
completion_tokens = estimate_tokens(text)
|
completion_tokens = completed.completion_tokens
|
||||||
await stats_collector.record_chat(
|
|
||||||
stream=False,
|
|
||||||
success=True,
|
|
||||||
prompt_tokens=prompt_tokens,
|
|
||||||
completion_tokens=completion_tokens,
|
|
||||||
)
|
|
||||||
if write_key:
|
|
||||||
sid = result.get("sessionId")
|
|
||||||
if sid:
|
|
||||||
await session_cache.put(write_key, sid, inst.name)
|
|
||||||
|
|
||||||
content_blocks: list[dict[str, Any]] = []
|
content_blocks: list[dict[str, Any]] = []
|
||||||
if text:
|
if text:
|
||||||
@@ -1286,8 +1220,7 @@ async def v1_messages(req: AnthropicMessagesRequest, request: Request):
|
|||||||
return JSONResponse(content=response_body)
|
return JSONResponse(content=response_body)
|
||||||
finally:
|
finally:
|
||||||
if not ticket_transferred:
|
if not ticket_transferred:
|
||||||
inst.in_flight = max(0, inst.in_flight - 1)
|
release_execution(ticket=ticket, inst=inst)
|
||||||
ticket.release()
|
|
||||||
|
|
||||||
|
|
||||||
@app.post("/internal/auto-login/start", dependencies=[Depends(admin_auth_guard)])
|
@app.post("/internal/auto-login/start", dependencies=[Depends(admin_auth_guard)])
|
||||||
|
|||||||
Reference in New Issue
Block a user