diff --git a/docs/tool-emulation-checklist.md b/docs/tool-emulation-checklist.md new file mode 100644 index 0000000..92cf1c4 --- /dev/null +++ b/docs/tool-emulation-checklist.md @@ -0,0 +1,194 @@ +# Tool Emulation Checklist + +This checklist is for implementation work. + +It is not meant to explain the theory again. It breaks plain-chat tool emulation into concrete surfaces that can be implemented and validated incrementally. + +## 1. Prompt Contract + +- tell the model that tools are available +- list tool names, short descriptions, and schema summaries +- define a fixed action format +- define multi-turn rules +- encode `tool_choice` constraints +- include at least one valid action example +- ideally include one example where a tool result arrives and the model decides what to do next + +Acceptance: + +- the first turn reliably emits a valid action block +- later turns do not collapse into plain explanation after a tool result + +## 2. Request Normalization + +- OpenAI: + - parse `tools` + - parse `tool_choice` + - parse `assistant.tool_calls` + - parse `tool` +- Anthropic: + - parse `tools` + - parse `tool_choice` + - parse `tool_use` + - parse `tool_result` +- normalize everything into one internal structure +- detect tool history even when the current turn does not repeat `tools` + +Acceptance: + +- emulation stays active on later turns without repeated tool definitions + +## 3. Tool History Projection + +- project historical assistant tool calls back into action text +- do not pass downstream protocol-specific history directly to the upstream model +- preserve tool name, arguments, and call id where useful + +Acceptance: + +- the model can “see” its own previous actions in later turns + +## 4. 
Tool Result Continuation + +- do not feed raw tool output back without framing +- wrap tool results into an explicit continuation message +- handle empty, partial, and error outputs consistently + +Acceptance: + +- after a tool result, the model can either call another tool or finish naturally + +## 5. Parser Contract + +- recognize both ` ```json action ` and plain ` ```json ` +- tolerate smart quotes, trailing commas, and stringified argument JSON +- extract `tool`, `name`, `parameters`, `arguments`, or `input` +- support multiple blocks in one reply +- strip action blocks from normal assistant text + +Acceptance: + +- multiple action blocks can be parsed reliably + +## 6. Retry Policy + +- trigger when: + - a tool call was expected but no action block was produced + - refusal language is detected + - `tool_choice=any` + - `tool_choice=tool` +- retry with a stricter message +- bound retry count +- log retry reason + +Acceptance: + +- refusal-style replies can be corrected without infinite loops + +## 7. Refusal Detection + +- maintain a refusal phrase set +- detect both hard refusals and soft “environment limitation” answers +- distinguish between: + - a legitimate no-tool answer + - a failed tool-use turn + +Acceptance: + +- common “tools are unavailable” replies trigger retry when appropriate + +## 8. Response Re-encoding + +- OpenAI: + - emit `message.tool_calls` + - set `finish_reason = tool_calls` +- Anthropic: + - emit `content[].tool_use` + - set `stop_reason = tool_use` +- preserve normal text when no tool call is present + +Acceptance: + +- downstream clients remain unaware that the upstream lacks native tools + +## 9. 
Streaming Strategy + +- OpenAI: + - role chunk + - text deltas + - tool call deltas +- Anthropic: + - `message_start` + - `content_block_start` + - `content_block_delta` + - `content_block_stop` + - `message_delta` + - `message_stop` +- document clearly when streaming is synthesized from a completed non-stream result + +Acceptance: + +- downstream stream consumers receive protocol-valid event sequences + +## 10. Multi-turn State Machine + +- distinguish at least: + - first decision + - tool call emitted + - waiting for tool result + - tool result received, next decision pending + - final answer +- derive state from message history, not only the current payload +- do not confuse “tool history exists” with “another tool call is mandatory” + +Acceptance: + +- agent loops remain stable across more than one turn + +## 11. Observability + +- log: + - whether emulation is active + - how many tool calls were parsed + - whether retry fired + - which refusal signal matched +- ideally log whether: + - the prompt contract was injected + - tool history was detected + +Acceptance: + +- failures can be localized to prompt, parser, retry, or state management + +## 12. Test Matrix + +- OpenAI: + - single-turn tool call + - multi-turn tool result continuation + - later turn without repeated `tools` + - forced tool + - `tool_choice=any` +- Anthropic: + - single-turn `tool_use` + - multi-turn `tool_result` continuation + - later turn without repeated `tools` + - streaming `tool_use` +- error cases: + - refusal + - invalid JSON + - multiple action blocks + - plain-text final answer + +Acceptance: + +- both “first tool turn” and “second-turn continuation” are covered + +## 13. Recommended Next Priorities + +If the system already works, the highest-value next improvements are: + +1. stronger few-shot for “tool result arrives, then call another tool” +2. better history-aware retry policy +3. finer refusal categories +4. stronger parser tolerance +5. 
richer streaming behavior diff --git a/docs/tool-emulation-checklist.zh-CN.md b/docs/tool-emulation-checklist.zh-CN.md new file mode 100644 index 0000000..9c77ccc --- /dev/null +++ b/docs/tool-emulation-checklist.zh-CN.md @@ -0,0 +1,241 @@ +# Tool Emulation 实现清单 + +这份清单是给后续迭代用的。 + +目标不是解释原理,而是把“纯聊天 API 模拟 tools 调用”拆成可逐项完成、可逐项验证的实现面。 + +## 1. Prompt Contract + +- 明确告诉模型当前有可用工具,不要声称“工具不可用” +- 列出全部工具: + - 名称 + - 简短描述 + - 参数 schema 摘要 +- 固定动作输出格式: + - ` ```json action ... ``` ` +- 明确多轮规则: + - 独立动作可并行 + - 依赖动作要等 tool result + - 无需工具时才输出普通文本 +- 明确 `tool_choice` 约束: + - `any` + - 指定 tool +- 给至少一个合法 action block 示例 +- 最好再给一个“tool result 回来后继续决策”的 few-shot + +验收标准: + +- 模型第一轮能稳定输出合法 action block +- 第二轮收到 tool result 后,不会轻易掉回普通解释文本 + +## 2. Request Normalization + +- OpenAI: + - 解析 `tools` + - 解析 `tool_choice` + - 解析 `assistant.tool_calls` + - 解析 `tool` +- Anthropic: + - 解析 `tools` + - 解析 `tool_choice` + - 解析 `tool_use` + - 解析 `tool_result` +- 统一归一化成内部结构: + - tools + - choice + - messages + - history state +- 识别“当前轮没带 tools,但历史里已有 tool 调用”的场景 + +验收标准: + +- 第二轮即使不重复传 `tools`,也能继续走 emulation + +## 3. Tool History Projection + +- 把历史 assistant 工具调用重投影成 action text +- 不要把结构化历史原样丢给上游模型 +- 保留: + - tool name + - arguments + - call id +- 投影结果应和真实 action block 尽量一致 + +验收标准: + +- 模型在多轮中能“看到”自己之前做过什么动作 + +## 4. Tool Result Continuation + +- tool result 不要裸塞回去 +- 包装成明确续写指令: + - 当前哪个 call 的结果回来了 + - 基于结果继续下一步动作 +- 对空结果、错误结果、部分结果做统一包装 + +验收标准: + +- 模型收到 tool result 后能继续: + - 再发起新工具调用 + - 或输出最终答案 + +## 5. Parser Contract + +- 识别: + - ` ```json action ` + - 普通 ` ```json ` +- 容忍: + - 智能引号 + - 尾逗号 + - 参数是字符串化 JSON +- 支持提取: + - `tool` + - `name` + - `parameters` + - `arguments` + - `input` +- 能从正文里剥离 action block +- 支持多 block + +验收标准: + +- 同一回复里多个 action block 都能被解析 +- 正文和动作块可以正确拆分 + +## 6. 
Retry Policy + +- 触发条件: + - 明确要求工具调用但没产出 action block + - 命中 refusal 文本 + - `tool_choice=any` + - `tool_choice=tool` +- retry 消息要更强约束: + - 必须输出 action block + - 不要解释 + - 必要时必须调用指定工具 +- 控制 retry 次数 +- 记录 retry 原因 + +验收标准: + +- refusal 回复能被纠偏 +- retry 不会无限循环 + +## 7. Refusal Detection + +- 维护 refusal 关键词表: + - `I don't have tools` + - `tools are unavailable` + - `没有可用的工具` + - `无法调用工具` +- 识别“软拒答”: + - 只解释、不行动 + - 强调环境限制 +- 区分: + - 真正不该调用工具 + - 本该调用工具却在推脱 + +验收标准: + +- 常见“我没有工具”类回复能稳定触发 retry + +## 8. Response Re-encoding + +- OpenAI: + - `message.tool_calls` + - `finish_reason = tool_calls` +- Anthropic: + - `content[].tool_use` + - `stop_reason = tool_use` +- 无工具时回普通文本 +- 文本和工具调用共存时保持协议兼容 + +验收标准: + +- 下游客户端无需知道上游其实不支持 native tools + +## 9. Streaming Strategy + +- OpenAI stream: + - 先发 role chunk + - 再发 text delta + - 再发 tool_calls delta +- Anthropic stream: + - `message_start` + - `content_block_start` + - `content_block_delta` + - `content_block_stop` + - `message_delta` + - `message_stop` +- 如果当前实现是“先完整拿结果再合成流”,文档里要明确说明 + +验收标准: + +- 下游看到的流式协议字段合法 + +## 10. Multi-turn State Machine + +- 状态至少区分: + - 等待模型首次决策 + - 已发起工具调用 + - 等待 tool result + - 收到 tool result,等待下一轮决策 + - 最终回答完成 +- 状态切换依据应来自消息历史,而不是只看本轮字段 +- 不要把“工具历史存在”误判成“必须再调工具” + +验收标准: + +- 一轮以上的 agent loop 稳定 + +## 11. Observability + +- 打日志: + - 是否进入 emulation + - 解析到几个 tool calls + - 是否触发 retry + - refusal 命中原因 +- 最好记录: + - prompt contract 是否注入 + - tool history 是否被识别 + +验收标准: + +- 出问题时能判断是: + - prompt 不够强 + - parser 失败 + - retry 没触发 + - 状态机断了 + +## 12. 测试矩阵 + +- OpenAI: + - 单轮 tool call + - 多轮 tool result 回灌 + - 第二轮不重复传 `tools` + - 指定 tool + - `tool_choice=any` +- Anthropic: + - 单轮 tool_use + - 多轮 tool_result 回灌 + - 第二轮不重复传 `tools` + - 流式 tool_use +- 异常场景: + - refusal + - 无效 JSON + - 多 action block + - 普通文本结束 + +验收标准: + +- 至少覆盖“第一轮调用工具”和“第二轮继续决策”两大关键场景 + +## 13. 下一步优先级 + +如果当前系统已经能跑,最值得优先继续做的是: + +1. 多轮再次发起新工具调用的 few-shot +2. 基于历史状态的 retry 强化 +3. 更细的 refusal 分类 +4. parser 容错增强 +5. 
流式工具事件细化 diff --git a/docs/tool-emulation-methodology.md b/docs/tool-emulation-methodology.md new file mode 100644 index 0000000..68db4e6 --- /dev/null +++ b/docs/tool-emulation-methodology.md @@ -0,0 +1,131 @@ +# Methodology: Simulating Tool Calls over a Plain Chat API + +This document describes a practical pattern for supporting tool calling when the upstream model only exposes a plain chat API. + +The core idea is: + +1. Convert downstream tool definitions into a prompt-level contract. +2. Ask the model to emit structured action text. +3. Parse that action text in the proxy. +4. Re-encode it back into standard protocol fields such as OpenAI `tool_calls` or Anthropic `tool_use`. + +## Core Pattern + +When the upstream model does not support native tool calls, do not rely on blindly forwarding `tools`. + +Instead: + +- treat the model as a text generator +- define a stable action DSL +- keep the proxy responsible for state, retries, parsing, and protocol mapping + +In this project the action DSL is a fenced block: + +```text +```json action +{"tool":"NAME","parameters":{"key":"value"}} +``` +``` + +## What the Proxy Must Do + +The proxy is not a passive transport anymore. Once tool emulation is enabled, it should: + +- inject tool definitions into the prompt +- preserve tool history across turns +- project historical tool calls back into action text +- wrap tool results into a continuation prompt +- detect refusal patterns such as “I don't have tools” +- retry with a stronger instruction when a tool call was expected but missing +- map parsed actions back into downstream protocol fields + +## Multi-turn Tool Calling + +Single-turn emulation is not enough. A useful agent loop looks like this: + +1. model emits a tool call +2. external executor runs the tool +3. tool result is fed back into the conversation +4. 
model decides whether to call another tool or finish + +To make this stable: + +- do not feed tool results back as raw text only +- wrap them in a continuation message that clearly asks for the next action +- keep emulation active even when later turns do not repeat the original `tools` field + +That last point matters. Many clients send `tools` only on the first turn. The proxy should still keep the conversation in emulation mode when it sees tool history. + +## Few-shot Guidance + +The minimum few-shot should teach the model the output shape. + +A better few-shot also teaches state transitions: + +- when to call a tool +- when to wait for the tool result +- when to call another tool +- when to answer normally + +For complex agent loops, a multi-step example with: + +- user request +- assistant tool call +- tool result +- assistant next action + +is usually more effective than a single static action example. + +## Retry Guidance + +Retry is useful when: + +- a tool call was expected but no action block was produced +- the model says tools are unavailable +- the request forces tool usage + +A retry prompt should be explicit and procedural, for example: + +```text +Your last response did not include any ```json action``` block. +You must respond with at least one valid action block now. +Do not explain. Output the action block directly. +``` + +Retries should be bounded. A small retry budget plus stronger instructions per retry is usually enough. 
+ +## Protocol Mapping + +OpenAI side: + +- input may contain `tools`, `tool_choice`, `assistant.tool_calls`, and `tool` +- output should map back into `message.tool_calls` and `finish_reason = "tool_calls"` + +Anthropic side: + +- input may contain `tools`, `tool_choice`, `tool_use`, and `tool_result` +- output should map back into `content[].tool_use` and `stop_reason = "tool_use"` + +## Common Failure Modes + +- only supporting the first tool turn +- losing emulation state on later turns +- not projecting historical tool calls back into text +- feeding back raw tool results without continuation instructions +- missing refusal detection +- using a parser that is too brittle for real model output + +## In This Repository + +The implementation here follows exactly this pattern: + +- downstream tool schemas are rewritten into prompt instructions +- the model emits `json action` blocks +- the proxy parses them +- the proxy re-encodes them as OpenAI or Anthropic tool protocol fields +- later turns can continue from tool history even when `tools` are not repeated + +Implementation checklist: + +- [tool-emulation-checklist.md](./tool-emulation-checklist.md) + diff --git a/docs/tool-emulation-methodology.zh-CN.md b/docs/tool-emulation-methodology.zh-CN.md new file mode 100644 index 0000000..fa5504a --- /dev/null +++ b/docs/tool-emulation-methodology.zh-CN.md @@ -0,0 +1,378 @@ +# 纯聊天 API 模拟 Tools 调用的方法论 + +这份文档总结的是一种通用做法: + +- 上游模型只有普通聊天接口 +- 不原生支持 `tools` / `tool_calls` / `tool_use` +- 但下游调用方希望继续走 OpenAI 或 Anthropic 风格的工具调用协议 + +核心思路不是“骗上游说自己支持 tools”,而是: + +1. 在代理层把工具定义改写成一套稳定的提示词契约 +2. 让模型用约定的结构化文本输出动作 +3. 再由代理把结构化文本还原成标准协议里的 `tool_calls` 或 `tool_use` + +## 核心原则 + +### 1. 不依赖上游原生能力 + +如果上游不支持原生工具调用,最稳的路线不是继续透传 `tools` 字段,而是把工具定义下沉成提示词层协议。 + +换句话说: + +- 对模型来说,它看到的是“你有这些动作,可以按某种格式发起调用” +- 对客户端来说,它看到的仍然是标准 OpenAI / Anthropic 工具协议 + +代理层负责做两次映射。 + +### 2. 
工具调用必须降维成可解析文本 + +一个可落地的格式必须满足: + +- 模型容易学会 +- 人容易读 +- 代理容易解析 +- 多轮场景里不容易歧义 + +本项目采用的是 fenced block: + +```text +```json action +{"tool":"NAME","parameters":{"key":"value"}} +``` +``` + +这个格式比“自然语言里自己说我要调用某个工具”稳定很多。 + +### 3. 代理是状态机,不只是转发器 + +一旦进入 emulation 模式,代理就不能再只是简单透传。 + +它至少要承担这些职责: + +- 注入工具说明 +- 把历史工具调用改写回上下文 +- 把工具结果回灌成下一轮提示 +- 识别拒答和跑偏 +- 必要时做 retry +- 把文本动作重新编码成标准工具协议 + +## 一条完整链路 + +### 输入侧 + +客户端发来: + +- OpenAI `tools` / `tool_choice` +- 或 Anthropic `tools` / `tool_choice` + +代理做三件事: + +1. 抽取工具名称、描述、参数 schema +2. 归一化 tool choice +3. 判断是否进入 emulation 模式 + +进入 emulation 后,不再把原始 `tools` 直接交给上游,而是改写系统提示词。 + +### 提示词侧 + +提示词里至少要包含: + +- 你有工具可用,不要声称“工具不可用” +- 工具列表 +- 固定动作格式 +- 多轮规则 +- `tool_choice` 约束 +- 一个有效示例 + +建议的约束重点: + +- 需要工具时必须输出 `json action` +- 独立动作可以一次输出多个 block +- 依赖动作必须等工具结果回来再继续 +- 不需要工具时才允许输出普通文本 +- 不要解释“为什么不能调用工具” + +### 输出侧 + +模型回复后,代理扫描 `json action` block: + +- 解析出 `tool` +- 解析出 `parameters` +- 从正文里剥离 action block + +然后映射回: + +- OpenAI `message.tool_calls` +- Anthropic `content[].tool_use` + +如果没有解析到动作,就把剩余文本当普通 assistant 回复。 + +## 多轮工具调用 + +这是最容易做坏的部分。 + +### 单轮模拟并不够 + +只做第一轮 `tool_calls` 很容易,但这还不是真正的 agent loop。 + +真正有用的是: + +1. 第一轮模型发起工具调用 +2. 外部执行工具 +3. 把工具结果回灌 +4. 模型继续决策 +5. 可能再次发起工具调用 +6. 或输出最终回答 + +### 回灌工具结果时,不要只塞原始结果 + +稳定做法是把工具结果包装成明确的续写指令,而不是只把结果裸塞回去。 + +例如: + +```text +Tool result for call_1: +pong + +Based on the tool result above, continue with the next appropriate action using the structured format. 
+``` + +这样模型更清楚当前处于“继续 agent loop”的阶段,而不是另起一轮普通问答。 + +### 第二轮不应强依赖重复传 tools + +复杂客户端并不一定会在每一轮都重复把 `tools` 发回来。 + +因此代理应把这些历史也视作“仍处于 emulation 会话中”的信号: + +- OpenAI: + - assistant 消息里已有 `tool_calls` + - 后续有 `tool` 角色消息 +- Anthropic: + - 历史里已有 `tool_use` + - 后续有 `tool_result` + +只要这些历史存在,即使当前轮未重新传 `tools`,代理也应继续以 emulation 方式处理。 + +### 历史里的工具调用要重新投影成动作文本 + +模型并不理解 OpenAI / Anthropic 的结构化历史字段。 + +因此代理要把历史里的: + +- `assistant.tool_calls` +- `assistant tool_use` + +重新投影成: + +```text +```json action +{ + "tool": "ping", + "parameters": { + "value": "123" + } +} +``` +``` + +这样模型才能在多轮里看到自己“之前做过什么动作”。 + +## Few-shot 怎么设计 + +### 最小 few-shot + +至少给一个合法动作示例: + +```text +```json action +{ + "tool": "read_file", + "parameters": { + "path": "README.md" + } +} +``` +``` + +它的作用不是示范业务逻辑,而是强制模型学会“输出形状”。 + +### 更稳的 few-shot + +如果目标是复杂 agent loop,推荐再补一个“工具结果回来后再次决策”的 few-shot。 + +例如三段式: + +1. 用户请求 +2. assistant 发起工具调用 +3. user 提供 tool result +4. assistant 再次发起新工具调用或结束 + +这个 few-shot 能显著减少模型在第二轮以后掉回普通文本解释。 + +### few-shot 要突出状态转换 + +最重要的不是工具本身,而是让模型明确以下三种状态: + +- 该调用工具 +- 该等待工具结果 +- 该输出最终回答 + +复杂 loop 不稳,通常就是状态转换没教明白。 + +## Retry 怎么设计 + +### Retry 的触发条件 + +比较实用的触发条件: + +- 本轮本应调用工具,但没有解析出 action block +- 模型回复了“没有工具”“工具不可用”“我无法调用” +- `tool_choice=any` +- `tool_choice=tool` + +### Retry 的方式 + +不要只重发原请求。应显式补一条纠偏消息,例如: + +```text +Your last response did not include any ```json action``` block. +You must respond with at least one valid action block now. +Do not explain. Output the action block directly. +``` + +如果是强制指定某个工具,再额外加: + +```text +You must call "ping". 
+``` + +### Retry 不要无限循环 + +建议设置: + +- 小次数重试 +- 每次 retry 都更强约束 +- 只在明确需要工具调用时触发 + +否则很容易把普通自然回复误判成失败。 + +## 协议映射建议 + +### OpenAI + +输入: + +- `tools` +- `tool_choice` +- `assistant.tool_calls` +- `tool` + +输出: + +- `finish_reason = "tool_calls"` +- `message.tool_calls` + +### Anthropic + +输入: + +- `tools` +- `tool_choice` +- `content[].tool_use` +- `content[].tool_result` + +输出: + +- `stop_reason = "tool_use"` +- `content[].tool_use` + +流式时,再映射成对应的 SSE 事件。 + +## 常见坑 + +### 1. 只做第一轮 + +这会让你看起来“支持 tools”,但一进入 agent loop 就断掉。 + +### 2. 历史工具调用没有重投影 + +模型看不到自己的历史动作,多轮就不稳。 + +### 3. 工具结果回灌过于裸 + +只把 `pong` 塞回去,模型不一定知道自己该继续决策。 + +### 4. 没有 refusal 检测 + +很多模型会下意识说: + +- 我没有工具 +- 当前环境无法调用 +- 我只能提供建议 + +不识别这类模式,就不会进入纠偏 retry。 + +### 5. 文本解析规则太脆弱 + +解析器至少要容忍: + +- ` ```json action ` 或普通 ` ```json ` +- 智能引号 +- 末尾逗号 +- 参数对象有时是字符串化 JSON + +## 推荐的最小实现 + +如果要做一个最小可用版,建议先只做: + +1. 工具定义注入 +2. `json action` 解析 +3. refusal 检测 +4. 一次 retry +5. OpenAI 非流式返回 + +然后再逐步补: + +1. Anthropic 非流式 +2. OpenAI 流式 +3. Anthropic 流式 +4. 多轮 tool history 投影 +5. 更强 few-shot + +## 适用边界 + +这套方法适合: + +- 上游不支持原生 tools +- 你又必须对外兼容标准工具协议 +- 目标任务以工程类、文件类、检索类工具为主 + +它不适合: + +- 对工具调用正确率极高要求的强生产场景 +- 上游已经支持原生 tools,但你还硬要绕一层文本模拟 + +如果上游能原生支持工具调用,优先使用原生协议。 + +## 本项目里的落地经验 + +在 `lingma-ipc-proxy` 里,这套方法最终证明了两点: + +1. 只靠透传 `tools` 给 Lingma 不够,模型会继续说“没有可用工具” +2. 
代理层做 emulation 后,可以稳定还原出: + - OpenAI `tool_calls` + - Anthropic `tool_use` + - 多轮 tool result 回灌后的继续决策 + +进一步要增强稳定性,最值得继续打磨的是: + +- 多轮再次发起新工具调用的 few-shot +- 基于历史状态的更细 retry 策略 +- 不同工具类别的专用示例 + +配套实现清单: + +- [tool-emulation-checklist.zh-CN.md](./tool-emulation-checklist.zh-CN.md) + diff --git a/internal/httpapi/server_test.go b/internal/httpapi/server_test.go new file mode 100644 index 0000000..1d0bccc --- /dev/null +++ b/internal/httpapi/server_test.go @@ -0,0 +1,114 @@ +package httpapi + +import ( + "strings" + "testing" +) + +func TestNormalizeOpenAIRequestKeepsEmulationForToolHistoryWithoutTools(t *testing.T) { + s := &Server{cfg: Config{EmulateToolCalls: true}} + req := openAIChatRequest{ + Model: "test-model", + Messages: []rawMessage{ + { + Role: "user", + Content: "Call ping once, then after the tool result reply FINAL_OK.", + }, + { + Role: "assistant", + Content: nil, + ToolCalls: []rawToolCall{ + { + ID: "call_1", + Type: "function", + Function: struct { + Name string `json:"name"` + Arguments string `json:"arguments"` + }{ + Name: "ping", + Arguments: "{\"value\":\"x\"}", + }, + }, + }, + }, + { + Role: "tool", + ToolCallID: "call_1", + Content: "pong", + }, + }, + } + + normalized, err := s.normalizeOpenAIRequest(req) + if err != nil { + t.Fatalf("normalizeOpenAIRequest() error = %v", err) + } + if !normalized.Emulated { + t.Fatalf("expected emulation to stay enabled when tool history exists") + } + if len(normalized.ChatRequest.Messages) != 3 { + t.Fatalf("message count = %d", len(normalized.ChatRequest.Messages)) + } + if normalized.ChatRequest.Messages[1].Role != "assistant" { + t.Fatalf("assistant message role = %q", normalized.ChatRequest.Messages[1].Role) + } + if !strings.Contains(normalized.ChatRequest.Messages[1].Text, "json action") || !strings.Contains(normalized.ChatRequest.Messages[1].Text, "\"tool\": \"ping\"") { + t.Fatalf("assistant tool call was not rewritten into action format: %q", normalized.ChatRequest.Messages[1].Text) + } + if 
normalized.ChatRequest.Messages[2].Role != "user" { + t.Fatalf("tool result role = %q", normalized.ChatRequest.Messages[2].Role) + } + if !strings.Contains(normalized.ChatRequest.Messages[2].Text, "pong") { + t.Fatalf("tool result was not converted into a follow-up prompt: %q", normalized.ChatRequest.Messages[2].Text) + } +} + +func TestNormalizeAnthropicRequestKeepsEmulationForToolHistoryWithoutTools(t *testing.T) { + s := &Server{cfg: Config{EmulateToolCalls: true}} + req := anthropicRequest{ + Model: "test-model", + Messages: []rawMessage{ + { + Role: "user", + Content: []any{ + map[string]any{"type": "text", "text": "Use ping, then after the tool result reply FINAL_OK."}, + }, + }, + { + Role: "assistant", + Content: []any{ + map[string]any{"type": "tool_use", "id": "call_1", "name": "ping", "input": map[string]any{"value": "x"}}, + }, + }, + { + Role: "user", + Content: []any{ + map[string]any{"type": "tool_result", "tool_use_id": "call_1", "content": []any{map[string]any{"type": "text", "text": "pong"}}}, + }, + }, + }, + } + + normalized, err := s.normalizeAnthropicRequest(req) + if err != nil { + t.Fatalf("normalizeAnthropicRequest() error = %v", err) + } + if !normalized.Emulated { + t.Fatalf("expected emulation to stay enabled when anthropic tool history exists") + } + if len(normalized.ChatRequest.Messages) != 3 { + t.Fatalf("message count = %d", len(normalized.ChatRequest.Messages)) + } + if normalized.ChatRequest.Messages[1].Role != "assistant" { + t.Fatalf("assistant message role = %q", normalized.ChatRequest.Messages[1].Role) + } + if !strings.Contains(normalized.ChatRequest.Messages[1].Text, "json action") || !strings.Contains(normalized.ChatRequest.Messages[1].Text, "\"tool\": \"ping\"") { + t.Fatalf("assistant tool_use was not rewritten into action format: %q", normalized.ChatRequest.Messages[1].Text) + } + if normalized.ChatRequest.Messages[2].Role != "user" { + t.Fatalf("tool result role = %q", normalized.ChatRequest.Messages[2].Role) + } + if 
!strings.Contains(normalized.ChatRequest.Messages[2].Text, "pong") { + t.Fatalf("tool result was not converted into a follow-up prompt: %q", normalized.ChatRequest.Messages[2].Text) + } +} diff --git a/internal/toolemulation/toolemulation.go b/internal/toolemulation/toolemulation.go new file mode 100644 index 0000000..f8a5e23 --- /dev/null +++ b/internal/toolemulation/toolemulation.go @@ -0,0 +1,648 @@ +package toolemulation + +import ( + "encoding/json" + "strconv" + "strings" + "sync/atomic" +) + +type ToolDef struct { + Name string + Description string + InputSchema map[string]any +} + +type ToolChoice struct { + Mode string + Name string +} + +type ToolCall struct { + ID string + Name string + Arguments map[string]any +} + +type Config struct { + MaxScanBytes int +} + +func ExtractTools(raw any) []ToolDef { + items, ok := raw.([]any) + if !ok { + return nil + } + + out := make([]ToolDef, 0, len(items)) + for _, item := range items { + m, ok := item.(map[string]any) + if !ok { + continue + } + fn, ok := m["function"].(map[string]any) + if !ok { + continue + } + name := strings.TrimSpace(stringFromAny(fn["name"])) + if name == "" { + continue + } + schema, _ := fn["parameters"].(map[string]any) + out = append(out, ToolDef{ + Name: name, + Description: strings.TrimSpace(stringFromAny(fn["description"])), + InputSchema: cloneMap(schema), + }) + } + return out +} + +func ExtractAnthropicTools(raw any) []ToolDef { + items, ok := raw.([]any) + if !ok { + return nil + } + + out := make([]ToolDef, 0, len(items)) + for _, item := range items { + m, ok := item.(map[string]any) + if !ok { + continue + } + name := strings.TrimSpace(stringFromAny(m["name"])) + if name == "" { + continue + } + schema, _ := m["input_schema"].(map[string]any) + out = append(out, ToolDef{ + Name: name, + Description: strings.TrimSpace(stringFromAny(m["description"])), + InputSchema: cloneMap(schema), + }) + } + return out +} + +func ExtractToolChoice(raw any) ToolChoice { + if raw == nil { + 
return ToolChoice{Mode: "auto"} + } + if s, ok := raw.(string); ok { + s = strings.TrimSpace(s) + switch s { + case "", "auto", "none": + return ToolChoice{Mode: "auto"} + case "required", "any": + return ToolChoice{Mode: "any"} + default: + return ToolChoice{Mode: "tool", Name: s} + } + } + + m, ok := raw.(map[string]any) + if !ok { + return ToolChoice{Mode: "auto"} + } + typeName := strings.TrimSpace(stringFromAny(m["type"])) + switch typeName { + case "function", "tool": + if fn, ok := m["function"].(map[string]any); ok { + if name := strings.TrimSpace(stringFromAny(fn["name"])); name != "" { + return ToolChoice{Mode: "tool", Name: name} + } + } + if name := strings.TrimSpace(stringFromAny(m["name"])); name != "" { + return ToolChoice{Mode: "tool", Name: name} + } + case "required", "any": + return ToolChoice{Mode: "any"} + case "auto", "none": + return ToolChoice{Mode: "auto"} + } + return ToolChoice{Mode: "auto"} +} + +func ExtractAnthropicToolChoice(raw any) ToolChoice { + if raw == nil { + return ToolChoice{Mode: "auto"} + } + m, ok := raw.(map[string]any) + if !ok { + return ExtractToolChoice(raw) + } + switch strings.TrimSpace(stringFromAny(m["type"])) { + case "", "auto", "none": + return ToolChoice{Mode: "auto"} + case "any", "required": + return ToolChoice{Mode: "any"} + case "tool": + name := strings.TrimSpace(stringFromAny(m["name"])) + if name != "" { + return ToolChoice{Mode: "tool", Name: name} + } + } + return ToolChoice{Mode: "auto"} +} + +func HasToolRequest(tools []ToolDef, choice ToolChoice) bool { + return len(tools) > 0 || choice.Mode != "" && choice.Mode != "auto" +} + +func InjectTooling(system string, tools []ToolDef, choice ToolChoice) string { + system = strings.TrimSpace(system) + if len(tools) == 0 { + return system + } + + toolLines := make([]string, 0, len(tools)) + for _, tool := range tools { + name := strings.TrimSpace(tool.Name) + if name == "" { + continue + } + sig := compactSchema(tool.InputSchema) + line := name + "(" + sig 
+ ")" + if desc := strings.TrimSpace(truncate(tool.Description, 120)); desc != "" { + line += " - " + desc + } + toolLines = append(toolLines, line) + } + + var b strings.Builder + b.WriteString("You are a capable AI assistant operating inside an IDE with tool access.\n\n") + b.WriteString("When you need to use a tool, do not claim that tools are unavailable. ") + b.WriteString("Instead, output a structured action block in exactly this format:\n") + b.WriteString("```json action\n{\"tool\":\"NAME\",\"parameters\":{\"key\":\"value\"}}\n```\n\n") + b.WriteString("Available tools:\n") + b.WriteString(strings.Join(toolLines, "\n")) + b.WriteString("\n\n") + b.WriteString("Rules:\n") + b.WriteString("- Use one or more ```json action``` blocks for tool calls.\n") + b.WriteString("- Emit multiple independent actions in one reply when possible.\n") + b.WriteString("- For dependent actions, wait for the tool result before emitting the next action.\n") + b.WriteString("- If no tool is needed, reply with normal plain text.\n") + b.WriteString("- Do not say that tools are unavailable.\n") + b.WriteString(forceConstraint(choice)) + + example := ActionBlockExample(tools) + if example != "" { + b.WriteString("\n\nExample valid action block:\n") + b.WriteString(example) + } + + tooling := strings.TrimSpace(b.String()) + if system == "" { + return tooling + } + return system + "\n\n---\n\n" + tooling +} + +func AssistantToolCallsToText(content string, calls []ToolCall) string { + content = strings.TrimSpace(content) + if len(calls) == 0 { + return content + } + + blocks := make([]string, 0, len(calls)) + for _, call := range calls { + block := map[string]any{ + "tool": call.Name, + "parameters": call.Arguments, + } + b, err := json.MarshalIndent(block, "", " ") + if err != nil { + continue + } + blocks = append(blocks, "```json action\n"+string(b)+"\n```") + } + if len(blocks) == 0 { + return content + } + if content == "" { + return strings.Join(blocks, "\n\n") + } + return 
// ActionOutputPrompt wraps a tool result in an explicit continuation message
// so the model is asked for its next action instead of receiving bare output.
//
// toolCallID, when non-blank, is named in the header so the model can tell
// which call the result belongs to. output is trimmed; when nothing remains
// (a tool that succeeded without printing anything) a "(empty output)"
// placeholder is used so the result turn is still framed. Returning an empty
// string in that case would give the model no sign that the call completed.
func ActionOutputPrompt(toolCallID string, output string) string {
	const followUp = "\n\nBased on the tool result above, continue with the next appropriate action using the structured format."

	output = strings.TrimSpace(output)
	if output == "" {
		output = "(empty output)"
	}

	header := "Tool result:\n"
	if id := strings.TrimSpace(toolCallID); id != "" {
		header = "Tool result for " + id + ":\n"
	}
	return header + output + followUp
}
// LooksLikeRefusal reports whether text contains one of the known phrases a
// model uses to claim it has no tools or cannot call them (English and
// Chinese variants).
//
// Matching is a case-insensitive substring search. Typographic apostrophes
// (U+2018, U+2019) are folded to ASCII "'" first, because models often write
// "don’t" / "can’t" with a curly apostrophe, which would otherwise never
// match the ASCII needles. Blank input is never a refusal.
func LooksLikeRefusal(text string) bool {
	t := strings.ToLower(strings.TrimSpace(text))
	if t == "" {
		return false
	}
	// Fold curly apostrophes so "i don’t have tools" matches "i don't have tools".
	t = strings.NewReplacer("\u2019", "'", "\u2018", "'").Replace(t)

	needles := []string{
		"i don't have tools",
		"i do not have tools",
		"tools are unavailable",
		"cannot call tools",
		"can't call tools",
		"没有可用的工具",
		"无法调用",
		"工具不可用",
		"不能调用工具",
		"我不具备",
		"受限于当前环境",
	}
	for _, needle := range needles {
		if strings.Contains(t, needle) {
			return true
		}
	}
	return false
}
// normalizeJSON turns the almost-JSON a model tends to emit inside an action
// block into strict JSON, trying the least invasive option first:
//
//  1. Input that is already valid JSON is returned as is (after trimming), so
//     typographic quotes or ", }" sequences inside string values are never
//     rewritten.
//  2. Otherwise a string-aware repair is tried: typographic double quotes used
//     as string delimiters become '"', and a comma followed only by
//     whitespace and a closing '}' or ']' is dropped, whatever the
//     indentation.
//  3. If that still is not valid JSON, the blanket textual substitution is
//     applied (typographic quotes replaced everywhere plus a few fixed
//     trailing-comma patterns), which also covers mixed straight/curly
//     delimiters.
//
// The result is not guaranteed to be valid JSON; callers must still check the
// unmarshal error.
func normalizeJSON(text string) string {
	text = strings.TrimSpace(text)
	if json.Valid([]byte(text)) {
		return text
	}
	if repaired := repairLooseJSON(text); json.Valid([]byte(repaired)) {
		return repaired
	}
	return strings.NewReplacer(
		"\u201c", "\"", "\u201d", "\"",
		",\n}", "\n}",
		",\n]", "\n]",
		", }", " }",
		", ]", " ]",
	).Replace(text)
}

// repairLooseJSON rewrites typographic string delimiters and removes trailing
// commas while leaving the contents of string literals untouched.
func repairLooseJSON(text string) string {
	runes := []rune(text)

	var b strings.Builder
	b.Grow(len(text))

	inString := false // currently inside a string literal
	curly := false    // the current string was opened by a typographic quote
	escaped := false  // the previous rune was a backslash inside a string

	for i, r := range runes {
		if inString {
			switch {
			case escaped:
				escaped = false
				b.WriteRune(r)
			case r == '\\':
				escaped = true
				b.WriteRune(r)
			case r == '"', curly && isCurlyDoubleQuote(r):
				// A typographic quote only terminates a string that was
				// opened by one; inside a straight-quoted string it is data.
				inString = false
				b.WriteByte('"')
			default:
				b.WriteRune(r)
			}
			continue
		}

		switch {
		case r == '"' || isCurlyDoubleQuote(r):
			inString = true
			curly = r != '"'
			b.WriteByte('"')
		case r == ',' && commaPrecedesCloser(runes[i+1:]):
			// Trailing comma before a closing bracket: drop it.
		default:
			b.WriteRune(r)
		}
	}
	return b.String()
}

// isCurlyDoubleQuote reports whether r is a left or right typographic double quote.
func isCurlyDoubleQuote(r rune) bool {
	return r == '\u201c' || r == '\u201d'
}

// commaPrecedesCloser reports whether rest holds only whitespace before the
// next '}' or ']'.
func commaPrecedesCloser(rest []rune) bool {
	for _, r := range rest {
		switch r {
		case ' ', '\t', '\r', '\n':
			continue
		case '}', ']':
			return true
		default:
			return false
		}
	}
	return false
}
{ + return "" + } + + required := map[string]bool{} + if rawRequired, ok := schema["required"].([]any); ok { + for _, item := range rawRequired { + name := strings.TrimSpace(stringFromAny(item)) + if name != "" { + required[name] = true + } + } + } + + keys := make([]string, 0, len(props)) + for key := range props { + keys = append(keys, key) + } + sortStrings(keys) + + parts := make([]string, 0, len(keys)) + for _, key := range keys { + part := key + if !required[key] { + part += "?" + } + parts = append(parts, part) + } + return strings.Join(parts, ", ") +} + +func truncate(text string, max int) string { + if max <= 0 { + return "" + } + runes := []rune(strings.TrimSpace(text)) + if len(runes) <= max { + return string(runes) + } + return string(runes[:max]) + "..." +} + +func selectExampleTool(tools []ToolDef) (ToolDef, bool) { + if len(tools) == 0 { + return ToolDef{}, false + } + for _, tool := range tools { + name := strings.ToLower(strings.TrimSpace(tool.Name)) + if strings.Contains(name, "read") || strings.Contains(name, "file") { + return tool, true + } + } + for _, tool := range tools { + name := strings.ToLower(strings.TrimSpace(tool.Name)) + if strings.Contains(name, "bash") || strings.Contains(name, "shell") || strings.Contains(name, "command") { + return tool, true + } + } + return tools[0], true +} + +func exampleParameters(toolName string, schema map[string]any) map[string]any { + props, _ := schema["properties"].(map[string]any) + if len(props) == 0 { + return map[string]any{} + } + + required := requiredKeys(schema) + keys := make([]string, 0, 2) + for _, key := range required { + keys = append(keys, key) + if len(keys) >= 2 { + break + } + } + if len(keys) == 0 { + for key := range props { + keys = append(keys, key) + break + } + } + + out := map[string]any{} + for _, key := range keys { + prop, _ := props[key].(map[string]any) + out[key] = exampleValueForKey(toolName, key, prop) + } + return out +} + +func requiredKeys(schema map[string]any) 
[]string { + items, ok := schema["required"].([]any) + if !ok { + return nil + } + out := make([]string, 0, len(items)) + for _, item := range items { + name := strings.TrimSpace(stringFromAny(item)) + if name != "" { + out = append(out, name) + } + } + return out +} + +func exampleValueForKey(toolName string, key string, prop map[string]any) any { + if enum, ok := prop["enum"].([]any); ok && len(enum) > 0 { + return enum[0] + } + valueType := strings.ToLower(strings.TrimSpace(stringFromAny(prop["type"]))) + lowerKey := strings.ToLower(strings.TrimSpace(key)) + lowerTool := strings.ToLower(strings.TrimSpace(toolName)) + + switch valueType { + case "integer": + return 1 + case "number": + return 1 + case "boolean": + return true + case "array": + return []any{} + case "object": + return map[string]any{} + } + + switch { + case strings.Contains(lowerKey, "path") || strings.Contains(lowerKey, "file"): + return "README.md" + case strings.Contains(lowerKey, "command") || strings.Contains(lowerTool, "bash") || strings.Contains(lowerTool, "shell"): + return "pwd" + case strings.Contains(lowerKey, "url"): + return "https://example.com" + default: + return "value" + } +} + +func forceConstraint(choice ToolChoice) string { + switch choice.Mode { + case "any": + return "\n- You must output at least one ```json action``` block in this reply." + case "tool": + if strings.TrimSpace(choice.Name) != "" { + return "\n- You must call \"" + strings.TrimSpace(choice.Name) + "\" in this reply." 
+ } + } + return "" +} + +func cloneMap(src map[string]any) map[string]any { + if src == nil { + return nil + } + dst := make(map[string]any, len(src)) + for key, value := range src { + dst[key] = value + } + return dst +} + +func stringFromAny(value any) string { + switch typed := value.(type) { + case string: + return typed + case json.Number: + return typed.String() + default: + return "" + } +} + +func sortStrings(values []string) { + if len(values) < 2 { + return + } + for i := 0; i < len(values)-1; i++ { + for j := i + 1; j < len(values); j++ { + if values[j] < values[i] { + values[i], values[j] = values[j], values[i] + } + } + } +} + +var callSeq uint64 + +func newCallID() string { + seq := atomic.AddUint64(&callSeq, 1) + return "call_" + strconv.FormatUint(seq, 10) +}