# lingma-openai-gateway/app/openai_schema.py

from __future__ import annotations

from typing import Any, Literal

from pydantic import BaseModel, Field


# Type of a chat message's ``content`` field. Kept deliberately permissive:
# OpenAI clients routinely send a list of parts (multi-modal) or None (for tool
# calls) instead of a plain string. The list elements are untyped dicts, so the
# individual part shapes are not validated here; ``flatten_content`` in this
# module reduces any of the three forms to plain text.
MessageContent = str | list[dict[str, Any]] | None


class ChatMessage(BaseModel):
    # One entry of ``ChatCompletionsRequest.messages``. Only ``role`` is
    # required; every other field defaults to None.

    # OpenAI supports "developer" on newer API versions in addition to the classic set.
    # Any role outside this literal set fails validation.
    role: Literal["system", "user", "assistant", "tool", "developer", "function"]
    # Plain string, list of content parts, or None (see MessageContent).
    content: MessageContent = None
    # Optional name attached to the message.
    name: str | None = None
    # Presumably ties a "tool" message to the tool call it answers (OpenAI
    # convention) — not interpreted anywhere in this module.
    tool_call_id: str | None = None
    # Tool calls carried by the message, kept as untyped dicts (their inner
    # structure is not validated here).
    tool_calls: list[dict[str, Any]] | None = None


class ChatCompletionsRequest(BaseModel):
    # Request model for a chat-completions call. ``model`` and ``messages`` are
    # required; all remaining fields are optional.

    # Identifier of the model the client asks for.
    model: str
    # Conversation messages, each validated as a ChatMessage.
    messages: list[ChatMessage]
    # Streaming flag; defaults to False when the client omits it.
    stream: bool = False
    # Sampling / length parameters; None means the client did not send them.
    temperature: float | None = None
    top_p: float | None = None
    max_tokens: int | None = None
    # Optional caller-supplied user identifier.
    user: str | None = None
    # Free-form streaming options; the keys are not validated here.
    stream_options: dict[str, Any] | None = None
    # Tool definitions as untyped dicts; their structure is not validated here.
    tools: list[dict[str, Any]] | None = None
    # Accepted as-is: typed Any, so any JSON value passes validation.
    tool_choice: Any | None = None


class ModelData(BaseModel):
    # One entry of ``ModelsResponse.data``. Only ``id`` is required.

    # Model identifier.
    id: str
    # Optional display name; None when not provided.
    name: str | None = None
    # Object-type tag; defaults to "model".
    object: str = "model"
    # Creation timestamp as an integer; defaults to 0 when not supplied.
    created: int = 0
    # Owner label; defaults to "lingma".
    owned_by: str = "lingma"


class ModelsResponse(BaseModel):
    # Envelope for a list of models.

    # Object-type tag; defaults to "list".
    object: str = "list"
    # The model entries (required).
    data: list[ModelData]


class ChatCompletionChoice(BaseModel):
    # One entry of ``ChatCompletionResponse.choices``. Every field has a
    # default, so the model can be constructed with no arguments.

    # Position of this choice; defaults to 0.
    index: int = 0
    # Defaults to "stop"; None is also accepted.
    finish_reason: str | None = "stop"
    # Message payload as a plain dict (not validated against ChatMessage).
    # default_factory gives each instance its own empty dict.
    message: dict = Field(default_factory=dict)
    # Untyped log-probability payload; None by default.
    logprobs: Any | None = None


class ChatCompletionResponse(BaseModel):
    # Response model for a chat completion. ``id``, ``created``, ``model`` and
    # ``choices`` must be supplied by the caller.

    # Completion identifier.
    id: str
    # Object-type tag; defaults to "chat.completion".
    object: str = "chat.completion"
    # Creation timestamp as an integer (required; no default here, unlike
    # ModelData.created).
    created: int
    # Model name reported back to the client.
    model: str
    # Generated choices.
    choices: list[ChatCompletionChoice]
    # Optional fingerprint string; None when not set.
    system_fingerprint: str | None = None


def flatten_content(content: MessageContent) -> str:
    """Reduce OpenAI multi-part content to a plain string prompt for Lingma.

    Args:
        content: ``None``, a plain string, or a list of content parts. Parts
            are normally dicts carrying a ``"type"`` key; list elements that
            are not dicts are stringified with ``str()``.

    Returns:
        ``""`` for ``None``; the string itself for a string; for a list, the
        non-empty text of each part joined with newlines. Image parts
        (``"image_url"`` / ``"input_image"``) become ``"[image]"`` and
        ``"input_audio"`` parts become ``"[audio]"``. Parts of any other type
        contribute their ``"text"`` value, or failing that their ``"content"``
        value. A text payload that is empty or not a string is skipped, so a
        malformed part can never make the final join raise. Any other input
        type is returned as ``str(content)``.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        # Not expected for validated input; stringify instead of failing.
        return str(content)

    parts: list[str] = []
    for item in content:
        if not isinstance(item, dict):
            parts.append(str(item))
            continue

        kind = item.get("type")
        if kind in ("image_url", "input_image"):
            # Lingma does not support multi-modal input; degrade to a
            # placeholder so the prompt still signals that an image was there.
            parts.append("[image]")
            continue
        if kind == "input_audio":
            parts.append("[audio]")
            continue

        if kind == "text":
            text = item.get("text")
        else:
            # Unknown part types: use whichever textual field is present.
            text = item.get("text") or item.get("content")

        # Only non-empty strings are kept. Parts are untyped dicts, so "text"
        # may hold a dict or number; appending that would break str.join.
        if isinstance(text, str) and text:
            parts.append(text)

    # Stringified non-dict items may be empty, hence the final filter.
    return "\n".join(p for p in parts if p)