From 95722c97e46e6d9719f3992c240c0423f9f3dc98 Mon Sep 17 00:00:00 2001
From: Wang Defa
Date: Wed, 10 Dec 2025 17:40:43 +0800
Subject: [PATCH] Cherry Studio client optimization
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .env.example                              |  90 ++-
 README.md                                 |  80 ++-
 docker-compose.yml                        |   3 +
 docs/CHERRY_STUDIO_OPTIMIZATION.md        | 354 +++++++++++
 docs/ENVIRONMENT_VARIABLES.md             | 750 +++++++++++++++++++++++
 {script => docs}/OCI-SETUP-GUIDE.md       |   3 +-
 src/api/routers/chat.py                   | 123 +++-
 src/api/schemas.py                        |   1 +
 src/core/oci_client.py                    |  27 +-
 tests/test_cherry_studio_optimization.sh  | 153 +++++
 10 files changed, 1515 insertions(+), 69 deletions(-)
 create mode 100644 docs/CHERRY_STUDIO_OPTIMIZATION.md
 create mode 100644 docs/ENVIRONMENT_VARIABLES.md
 rename {script => docs}/OCI-SETUP-GUIDE.md (99%)
 create mode 100755 tests/test_cherry_studio_optimization.sh

diff --git a/.env.example b/.env.example
index 99764ae..f5e6952 100644
--- a/.env.example
+++ b/.env.example
@@ -1,23 +1,37 @@
-# API Settings
+# ============================================
+# API service settings
+# ============================================
+# Title of the API service (shown in the OpenAPI docs)
 API_TITLE=OCI GenAI to OpenAI API Gateway
+# API version number
 API_VERSION=0.0.1
+# API route prefix (follows the OpenAI API convention; changing it is not recommended)
 API_PREFIX=/v1
+# Port the service listens on
 API_PORT=8000
+# Listen address (0.0.0.0 means all network interfaces)
 API_HOST=0.0.0.0
+# Debug mode (should be false in production)
 DEBUG=false

-# Authentication
-# Comma-separated list of API keys for authentication
-# These are the keys clients will use in Authorization: Bearer
+# ============================================
+# Authentication settings
+# ============================================
+# List of API keys (JSON array format)
+# Clients authenticate via the Authorization: Bearer <key> header
+# Multiple keys can be configured, e.g. for different clients or applications
+# Examples:
+#   Single key:    API_KEYS=["sk-your-secret-key"]
+#   Multiple keys: API_KEYS=["sk-admin-key","sk-user-key","sk-app-key"]
 API_KEYS=["sk-oci-genai-default-key"]

 # ============================================
-# OCI Configuration
+# OCI configuration
 # ============================================
-# Path to OCI config file (usually ~/.oci/config)
+# Path to the OCI config file (usually ~/.oci/config)
 OCI_CONFIG_FILE=~/.oci/config

-# Profile names in the OCI config file
+# Profile name(s) in the OCI config file
 # A single profile or multiple profiles are supported; separate multiple profiles with commas
 # With multiple profiles, round-robin load balancing is applied automatically
 # Examples:
 #   OCI_CONFIG_PROFILE=DEFAULT
 #   OCI_CONFIG_PROFILE=DEFAULT,CHICAGO,ASHBURN
 # Note: every profile in ~/.oci/config must contain region and tenancy (used as compartment_id)
 OCI_CONFIG_PROFILE=DEFAULT

-# Authentication type: api_key or instance_principal
+# Authentication type: api_key or instance_principal
 OCI_AUTH_TYPE=api_key

-# OCI Client Timeout Settings
-# Connect timeout: Maximum time (in seconds) to establish connection to OCI API
+# OCI client timeout settings
+# Connect timeout: maximum time (in seconds) to establish a connection to the OCI API
 OCI_CONNECT_TIMEOUT=10
-# Read timeout: Maximum time (in seconds) to wait for OCI API response
-# Increase this value for long-running requests (e.g., complex conversations)
+# Read timeout: maximum time (in seconds) to wait for an OCI API response
+# Increase this value for long-running requests (e.g. complex conversations)
 OCI_READ_TIMEOUT=360

-# Optional: Direct endpoint for dedicated models
+# Optional: direct endpoint for dedicated models
 # GENAI_ENDPOINT=https://your-dedicated-endpoint

-# Model Settings
-# Note: Available models are dynamically loaded from OCI at startup
-# Use GET /v1/models to see all available models
-MAX_TOKENS=4096
+# ============================================
+# Model settings
+# ============================================
+# Note: available models are loaded dynamically from OCI at startup
+# Use GET /v1/models to list all available models
+MAX_TOKENS=8192
 TEMPERATURE=0.7

-# Embedding Settings
-# Truncate strategy for embeddings: END or START
+# ============================================
+# Embedding settings
+# ============================================
+# Truncation strategy for embeddings: END (keep the beginning, truncate the end) or START (keep the end, truncate the beginning)
 EMBED_TRUNCATE=END

-# Streaming Settings
-# Global streaming on/off switch
-# Set to false to disable streaming for all requests (overrides client stream=true)
+# ============================================
+# Streaming settings
+# ============================================
+# Global streaming on/off switch
+# Set to false to disable streaming for all requests (overrides the client's stream=true)
 ENABLE_STREAMING=true

-# Chunk size for simulated streaming (fallback mode only)
-# Only used when OCI returns non-streaming response
+# Chunk size for simulated streaming (fallback mode only)
+# Only used when OCI returns a non-streaming response
 STREAM_CHUNK_SIZE=1024

-# Logging
-# Log level: DEBUG, INFO, WARNING, ERROR, CRITICAL
+# ============================================
+# Logging settings
+# ============================================
+# Log level: DEBUG, INFO, WARNING, ERROR, CRITICAL
 LOG_LEVEL=INFO

-# Enable detailed request/response logging for debugging
-# LOG_REQUESTS: Print incoming request details (method, URL, headers, body)
-# LOG_RESPONSES: Print outgoing response details (status, headers, body)
-# Note: Sensitive data (like API keys) are automatically filtered from logs
+# Enable detailed request/response logging for debugging
+# LOG_REQUESTS: print incoming request details (method, URL, headers, body)
+# LOG_RESPONSES: print outgoing response details (status, headers, body)
+# LOG_STREAMING: print streamed response content (⚠️ increases memory usage and log size)
+# Note: sensitive data (such as API keys) is automatically filtered from logs
 LOG_REQUESTS=false
 LOG_RESPONSES=false
+LOG_STREAMING=true

-# Log file path (optional, if not set logs only to console)
+# Log file path (optional; if not set, logs go to the console only)
 LOG_FILE=./logs/app.log
-# Max log file size in MB (default: 10)
+# Maximum log file size in MB (default: 10)
 LOG_FILE_MAX_SIZE=10
-# Number of backup log files to keep (default: 5)
+# Number of backup log files to keep (default: 5)
 LOG_FILE_BACKUP_COUNT=5

diff --git a/README.md b/README.md
index 6ddd710..894c1f1 100644
--- a/README.md
+++ b/README.md
@@ -19,6 +19,8 @@
 - ⚡ **Real streaming**: true edge-to-edge streaming responses, TTFB < 200 ms
 - 🔒 **Security**: sensitive information is filtered automatically (OCIDs, request-ids, endpoint URLs)
 - 🎯 **Performance**: client connection pooling for a significant performance boost
+- 🎨 **Advanced parameter support**: parameters such as reasoning_effort
+- 🍒 **Cherry Studio optimization**: automatic thinking_budget mapping, client name recognition

 ## 🚀 Quick start

@@ -153,6 +155,68 @@ response = client.chat.completions.create(
 )
 ```

+## 🚀 Advanced features
+
+### Advanced parameter support
+
+The gateway supports advanced parameters that enhance model responses:
+
+#### reasoning_effort - controlling reasoning depth
+
+Controls how much reasoning the model performs, which affects response quality:
+
+```python
+response = client.chat.completions.create(
+    model="google.gemini-2.5-pro",
+    messages=[{"role": "user", "content": "Solve this complex problem"}],
+    extra_body={"reasoning_effort": "high"}  # low, medium, high
+)
+```
+
+See the [advanced parameter documentation](docs/ADVANCED_PARAMETERS.md) for details.
+
+### Cherry Studio client optimization
+
+The gateway provides dedicated optimizations for the Cherry Studio client:
+
+#### Automatic thinking_budget mapping
+
+Cherry Studio's `thinking_budget` parameter is mapped automatically to OCI's `reasoning_effort`:
+
+- thinking_budget ≤ 1760 → `reasoning_effort: low`
+- 1760 < thinking_budget ≤ 16448 → `reasoning_effort: medium`
+- thinking_budget > 16448 → `reasoning_effort: high`
+
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer sk-oci-genai-default-key" \
+  -H "x-title: Cherry Studio" \
+  -d '{
+    "model": "google.gemini-2.5-pro",
+    "messages": [{"role": "user", "content": "Complex problem..."}],
+    "extra_body": {
+      "google": {
+        "thinking_config": {
+          "thinking_budget": 10000
+        }
+      }
+    }
+  }'
+```
+
+#### Client name recognition
+
+The client is identified via the `x-title` request header, which makes log tracing and debugging easier:
+
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "x-title: Cherry Studio" \
+  ...
+```
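+
+Both mechanisms can also be used together from Python. Below is a minimal sketch using the official OpenAI SDK (`default_headers` and `extra_body` are standard SDK parameters; the URL and key are the quick-start defaults):
+
+```python
+from openai import OpenAI
+
+# x-title identifies the client in the gateway logs; extra_body carries
+# the Cherry Studio-style thinking_config.
+client = OpenAI(
+    base_url="http://localhost:8000/v1",
+    api_key="sk-oci-genai-default-key",
+    default_headers={"x-title": "Cherry Studio"},
+)
+
+# Note the nesting: the SDK merges this dict into the top-level JSON body,
+# so the gateway receives {"extra_body": {"google": ...}} exactly as in the
+# curl example above.
+response = client.chat.completions.create(
+    model="google.gemini-2.5-pro",
+    messages=[{"role": "user", "content": "Complex problem..."}],
+    extra_body={
+        "extra_body": {
+            "google": {"thinking_config": {"thinking_budget": 10000}}
+        }
+    },
+)
+print(response.choices[0].message.content)
+```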
+
+See the [Cherry Studio client optimization documentation](docs/CHERRY_STUDIO_OPTIMIZATION.md) for details.
+
 ## 📋 Supported endpoints

 | Endpoint | Method | Description |
@@ -188,7 +252,9 @@
 | `ENABLE_STREAMING` | Global streaming switch | `true` |
 | `LOG_LEVEL` | Log level | `INFO` |

-For the full configuration, see [.env.example](.env.example)
+**📖 Full configuration reference**:
+- [Environment variable reference](docs/ENVIRONMENT_VARIABLES.md) - detailed descriptions, usage scenarios, and configuration examples for every environment variable
+- [.env.example](.env.example) - sample environment variable configuration file

 ## 🌐 Multi-region load balancing

@@ -214,8 +280,16 @@ docker run -p 8000:8000 --env-file .env oci-genai-gateway

 ## 📚 Documentation

-- [CLAUDE.md](CLAUDE.md) - complete development documentation, including architecture notes, a development guide, and debugging tips
-- [.env.example](.env.example) - sample environment variable configuration
+### Core documentation
+
+- [Environment variable reference](docs/ENVIRONMENT_VARIABLES.md) - detailed descriptions and configuration examples for all environment variables
+- [.env.example](.env.example) - sample environment variable configuration file
+
+### Feature documentation
+
+- [Advanced parameter support](docs/ADVANCED_PARAMETERS.md) - the reasoning_effort parameter in detail
+- [Cherry Studio client optimization](docs/CHERRY_STUDIO_OPTIMIZATION.md) - thinking_budget mapping and client recognition
+- [OCI access setup](docs/OCI-SETUP-GUIDE.md) - automated configuration of OCI GenAI access permissions

 ## 🔧 Troubleshooting

diff --git a/docker-compose.yml b/docker-compose.yml
index a97955d..fbdb23c 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,8 +1,11 @@
 services:
   oci-genai-gateway:
+    # Build the image from the local Dockerfile
     build:
       context: .
       dockerfile: Dockerfile
+    # Use a pre-built image instead (uncomment if needed)
+    # image: gitea.bcde.io/wangdefa/oracle-openai:latest
     container_name: oci-genai-gateway
     ports:
       - "8000:8000"
diff --git a/docs/CHERRY_STUDIO_OPTIMIZATION.md b/docs/CHERRY_STUDIO_OPTIMIZATION.md
new file mode 100644
index 0000000..50553c7
--- /dev/null
+++ b/docs/CHERRY_STUDIO_OPTIMIZATION.md
@@ -0,0 +1,354 @@
+# Cherry Studio client optimization
+
+This document describes the optimizations dedicated to the Cherry Studio client.
+
+## What is optimized
+
+### 1. Client name in logs
+
+**Description**:
+- The client name is extracted from the `x-title` request header
+- Client information is shown in the logs, which makes tracing and debugging easier
+- Works for any client that sets the `x-title` header, not only Cherry Studio
+
+**Log format**:
+```
+2025-12-10 15:09:17 - api.routers.chat - INFO - Chat completion request for model: google.gemini-2.5-pro, client: Cherry Studio
+```
+
+**Implementation**:
+- [src/api/routers/chat.py](../src/api/routers/chat.py#L295-L296)
+
+**Usage example**:
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer sk-oci-genai-default-key" \
+  -H "x-title: Cherry Studio" \
+  -d '{
+    "model": "google.gemini-2.5-pro",
+    "messages": [{"role": "user", "content": "Hello"}]
+  }'
+```
+### 2. Automatic mapping from thinking_budget to reasoning_effort
+
+**Description**:
+- Cherry Studio uses Google Gemini's `thinking_budget` parameter to control reasoning depth
+- The gateway maps `thinking_budget` to the OCI SDK's `reasoning_effort` parameter automatically
+- Supported for models from the meta, xai, google, and openai providers (not supported for Cohere)
+- Transparent to other clients; standard OpenAI API compatibility is unaffected
+
+**Mapping rules**:
+
+| thinking_budget value | reasoning_effort | Notes |
+|-----------------------|------------------|-------|
+| ≤ 1760 | `low` | Fast responses, little reasoning |
+| 1760 < thinking_budget ≤ 16448 | `medium` | Balances speed and reasoning depth |
+| > 16448 | `high` | Deep reasoning, more complete answers |
+| -1 | None | Use the model default |
+
+**extra_body structure**:
+
+Cherry Studio passes Google Gemini-specific configuration through `extra_body`:
+
+```json
+{
+  "model": "google.gemini-2.5-pro",
+  "messages": [...],
+  "extra_body": {
+    "google": {
+      "thinking_config": {
+        "thinking_budget": 1760,
+        "include_thoughts": true
+      }
+    }
+  }
+}
+```
+
+**Implementation**:
+- Mapping functions: [src/api/routers/chat.py](../src/api/routers/chat.py#L37-L102)
+  - `map_thinking_budget_to_reasoning_effort()` - maps the numeric thinking_budget to a reasoning_effort enum value
+  - `extract_reasoning_effort_from_extra_body()` - extracts thinking_budget from extra_body and performs the mapping
+- OCI client: [src/core/oci_client.py](../src/core/oci_client.py#L333-L336)
+
+**Log output**:
+```
+2025-12-10 15:09:17 - api.routers.chat - INFO - Chat completion request for model: google.gemini-2.5-pro, client: Cherry Studio
+2025-12-10 15:09:17 - api.routers.chat - INFO - Cherry Studio thinking_budget 1760 mapped to reasoning_effort: low
+2025-12-10 15:09:17 - core.oci_client - INFO - Setting reasoning_effort to LOW for google model
+```
+
+## Cherry Studio usage examples
+
+### Basic conversation
+
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer sk-oci-genai-default-key" \
+  -H "x-title: Cherry Studio" \
+  -d '{
+    "model": "google.gemini-2.5-pro",
+    "messages": [
+      {"role": "user", "content": "Hello, how are you?"}
+    ]
+  }'
+```
+
+### Using thinking_budget (low reasoning depth)
+
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer sk-oci-genai-default-key" \
+  -H "x-title: Cherry Studio" \
+  -d '{
+    "model": "google.gemini-2.5-pro",
+    "messages": [
+      {"role": "user", "content": "What is 2+2?"}
+    ],
+    "extra_body": {
+      "google": {
+        "thinking_config": {
+          "thinking_budget": 1000
+        }
+      }
+    }
+  }'
+```
+
+### Using thinking_budget (medium reasoning depth)
+
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer sk-oci-genai-default-key" \
+  -H "x-title: Cherry Studio" \
+  -d '{
+    "model": "google.gemini-2.5-pro",
+    "messages": [
+      {"role": "user", "content": "Explain quantum entanglement"}
+    ],
+    "extra_body": {
+      "google": {
+        "thinking_config": {
+          "thinking_budget": 5000
+        }
+      }
+    }
+  }'
+```
+
+### Using thinking_budget (high reasoning depth)
+
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer sk-oci-genai-default-key" \
+  -H "x-title: Cherry Studio" \
+  -d '{
+    "model": "google.gemini-2.5-pro",
+    "messages": [
+      {"role": "user", "content": "Solve this complex math problem: ..."}
+    ],
+    "extra_body": {
+      "google": {
+        "thinking_config": {
+          "thinking_budget": 20000
+        }
+      }
+    }
+  }'
+```
+
+## Verifying via logs
+
+Start the service and watch the logs to verify the Cherry Studio optimizations:
+
+```bash
+# Start the service (development mode)
+cd src
+python main.py
+
+# Watch the logs (in another terminal)
+tail -f logs/app.log | grep -E "(client:|thinking_budget|reasoning_effort)"
+```
+
+Expected log lines:
+```
+2025-12-10 15:09:17 - api.routers.chat - INFO - Chat completion request for model: google.gemini-2.5-pro, client: Cherry Studio
+2025-12-10 15:09:17 - api.routers.chat - INFO - Cherry Studio thinking_budget 1760 mapped to reasoning_effort: low
+2025-12-10 15:09:17 - core.oci_client - INFO - Setting reasoning_effort to LOW for google model
+```
+
+## Technical implementation
+
+### Schema change
+
+An `extra_body` field was added in [src/api/schemas.py](../src/api/schemas.py):
+
+```python
+class ChatCompletionRequest(BaseModel):
+    # ... other fields ...
+    extra_body: Optional[Dict[str, Any]] = None  # Cherry Studio and other client extensions
+```
+
+### Mapping functions
+
+Two helper functions handle Cherry Studio's thinking_budget:
+
+1. **map_thinking_budget_to_reasoning_effort**: maps the numeric thinking_budget to a reasoning_effort enum value
+2. **extract_reasoning_effort_from_extra_body**: extracts thinking_budget from extra_body and performs the mapping
+
+```python
+def map_thinking_budget_to_reasoning_effort(thinking_budget: int) -> Optional[str]:
+    """Map Cherry Studio's thinking_budget to OCI's reasoning_effort parameter."""
+    if thinking_budget == -1:
+        return None
+    elif thinking_budget <= 1760:
+        return "low"
+    elif thinking_budget <= 16448:
+        return "medium"
+    else:
+        return "high"
+
+def extract_reasoning_effort_from_extra_body(extra_body: Optional[dict]) -> Optional[str]:
+    """Extract reasoning_effort from Cherry Studio's extra_body parameter."""
+    if not extra_body:
+        return None
+
+    try:
+        google_config = extra_body.get("google", {})
+        thinking_config = google_config.get("thinking_config", {})
+        thinking_budget = thinking_config.get("thinking_budget")
+
+        if thinking_budget is not None and isinstance(thinking_budget, (int, float)):
+            effort = map_thinking_budget_to_reasoning_effort(int(thinking_budget))
+            if effort:
+                logger.info(f"Cherry Studio thinking_budget {thinking_budget} mapped to reasoning_effort: {effort}")
+                return effort
+    except (AttributeError, TypeError, KeyError) as e:
+        logger.debug(f"Failed to extract thinking_budget from extra_body: {e}")
+
+    return None
+```
+
+### OCI SDK integration
+
+The `OCIGenAIClient.chat()` and `_build_generic_request()` methods were updated to pass the `reasoning_effort` parameter through to the OCI SDK's `GenericChatRequest`.
+
+## Compatibility
+
+### Supported models
+
+**reasoning_effort support** (via the thinking_budget mapping):
+
+- ✅ Google Gemini models (google.gemini-2.5-pro, google.gemini-2.0-flash-exp)
+- ✅ Meta Llama models (meta.llama-3.1-405b-instruct, meta.llama-3.2-90b-vision-instruct)
+- ✅ xAI models
+- ✅ OpenAI models
+- ❌ Cohere models (reasoning_effort is not supported)
+
+**Note**: reasoning_effort is optional; if a model does not support it, it is ignored and a warning is logged.
+
+### Backward compatibility
+
+- ✅ Without `extra_body`, behavior is exactly as before
+- ✅ Without `x-title`, the client name is shown as "Unknown"
+- ✅ Other clients are unaffected and keep working normally
+- ✅ Standard OpenAI API compatibility is fully preserved
+
+### Compatibility with other clients
+
+Although this optimization is designed for Cherry Studio, the implementation guarantees that:
+
+1. **Other clients are unaffected**: clients that do not use `extra_body.google.thinking_config` are completely unaffected
+2. **Standard API compatibility**: all standard OpenAI API features keep working
+
+## Troubleshooting
+
+### Problem 1: the thinking_budget parameter has no effect
+
+**Symptom**: no "mapped to reasoning_effort" message in the logs
+
+**Solution**:
+1. Confirm the `extra_body` structure is correct; the nested path is `extra_body.google.thinking_config.thinking_budget`
+2. Confirm a supported model is used (meta, xai, google, openai; Cohere is not supported)
+3. Check that the thinking_budget value is valid (a non-null number)
+4. Look for errors or warnings in the logs
+
+**Validating the extra_body structure**:
+```jsonc
+// The correct structure
+{
+  "extra_body": {
+    "google": {                    // must be the "google" key
+      "thinking_config": {         // must be the "thinking_config" key
+        "thinking_budget": 5000    // must be the "thinking_budget" key, with a numeric value
+      }
+    }
+  }
+}
+```
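+
+If the structure is correct but the mapping still does not fire, request logging shows the body exactly as the gateway receives it. A quick way to check (`LOG_REQUESTS` and `LOG_LEVEL` are described in [ENVIRONMENT_VARIABLES.md](ENVIRONMENT_VARIABLES.md)):
+
+```bash
+# Temporarily enable request logging, restart, resend the request,
+# then inspect the logged request body for the extra_body structure.
+cd src
+LOG_REQUESTS=true LOG_LEVEL=DEBUG python main.py
+```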
+
+### Problem 2: the client name is shown as "Unknown"
+
+**Symptom**: the logs show "Unknown" as the client instead of "Cherry Studio"
+
+**Solution**:
+1. Confirm the request includes the `x-title` header
+2. Check that Cherry Studio is configured to set the custom request header
+3. Try adding the header manually for testing
+
+**Test command**:
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "x-title: Cherry Studio" \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer sk-oci-genai-default-key" \
+  -d '{"model": "google.gemini-2.5-pro", "messages": [{"role": "user", "content": "test"}]}'
+```
+
+### Problem 3: thinking_budget maps to the wrong reasoning_effort
+
+**Symptom**: the actual reasoning_effort differs from what you expected
+
+**Verify the mapping rules**:
+- thinking_budget ≤ 1760 → low
+- 1760 < thinking_budget ≤ 16448 → medium
+- thinking_budget > 16448 → high
+- thinking_budget = -1 → None (model default)
+
+**Examples**:
+```python
+# thinking_budget = 1000  → low ✓
+# thinking_budget = 5000  → medium ✓
+# thinking_budget = 20000 → high ✓
+# thinking_budget = -1    → None (default) ✓
+```
+
+## Testing
+
+### Automated tests
+
+Run the Cherry Studio optimization test script:
+
+```bash
+./tests/test_cherry_studio_optimization.sh
+```
+
+The script covers the following scenarios:
+1. thinking_budget = 1000 → reasoning_effort = low
+2. thinking_budget = 5000 → reasoning_effort = medium
+3. thinking_budget = 20000 → reasoning_effort = high
+4. thinking_budget = -1 → model default
+5. No extra_body (a normal request)
+6. Different client names (verifying x-title recognition)
+
+## References
+
+- [OCI GenAI Python SDK - GenericChatRequest](https://docs.oracle.com/en-us/iaas/tools/python/latest/api/generative_ai_inference/models/oci.generative_ai_inference.models.GenericChatRequest.html)
+- [OpenAI API - Reasoning Models](https://platform.openai.com/docs/guides/reasoning)
+- [Google Gemini - Thinking](https://ai.google.dev/gemini-api/docs/thinking)
diff --git a/docs/ENVIRONMENT_VARIABLES.md b/docs/ENVIRONMENT_VARIABLES.md
new file mode 100644
index 0000000..5cf9d9a
--- /dev/null
+++ b/docs/ENVIRONMENT_VARIABLES.md
@@ -0,0 +1,750 @@
+# Environment variable reference
+
+This document describes all environment variables supported by the OCI GenAI gateway and how to configure them.
+
+## 📋 Contents
+
+- [Quick configuration](#quick-configuration)
+- [API settings](#api-settings)
+- [Authentication settings](#authentication-settings)
+- [OCI configuration](#oci-configuration)
+- [Model settings](#model-settings)
+- [Embedding settings](#embedding-settings)
+- [Streaming settings](#streaming-settings)
+- [Logging settings](#logging-settings)
+- [Configuration examples](#configuration-examples)
+- [Common configuration scenarios](#common-configuration-scenarios)
+
+## Quick configuration
+
+1. Copy the sample configuration file:
+   ```bash
+   cp .env.example .env
+   ```
+
+2. Edit `.env` and configure at least the required entries:
+   ```bash
+   API_KEYS=["sk-your-secret-key"]
+   OCI_CONFIG_PROFILE=DEFAULT
+   ```
+
+3. Make sure the OCI config file exists:
+   ```bash
+   cat ~/.oci/config
+   ```
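+
+4. (Optional) Start the service and confirm the gateway answers. A quick check against the models endpoint (adjust the port and key to your configuration):
+   ```bash
+   curl http://localhost:8000/v1/models \
+     -H "Authorization: Bearer sk-your-secret-key"
+   ```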
+
+## API settings
+
+### API_TITLE
+
+- **Description**: title of the API service, shown in the OpenAPI docs
+- **Type**: string
+- **Default**: `OCI GenAI to OpenAI API Gateway`
+- **Example**:
+  ```bash
+  API_TITLE=My AI Gateway
+  ```
+
+### API_VERSION
+
+- **Description**: version number of the API service
+- **Type**: string
+- **Default**: `0.0.1`
+- **Example**:
+  ```bash
+  API_VERSION=1.0.0
+  ```
+
+### API_PREFIX
+
+- **Description**: API route prefix, following the OpenAI API convention
+- **Type**: string
+- **Default**: `/v1`
+- **Allowed values**: any valid URL path
+- **Note**: changing it is not recommended, to stay compatible with the OpenAI SDK
+- **Example**:
+  ```bash
+  API_PREFIX=/v1
+  ```
+
+### API_PORT
+
+- **Description**: port the service listens on
+- **Type**: integer
+- **Default**: `8000`
+- **Range**: 1-65535
+- **Example**:
+  ```bash
+  API_PORT=8080
+  ```
+
+### API_HOST
+
+- **Description**: address the service listens on
+- **Type**: string
+- **Default**: `0.0.0.0` (all network interfaces)
+- **Allowed values**:
+  - `0.0.0.0` - listen on all interfaces (production)
+  - `127.0.0.1` - local access only (development)
+  - a specific IP address
+- **Example**:
+  ```bash
+  API_HOST=127.0.0.1
+  ```
+
+### DEBUG
+
+- **Description**: enable debug mode
+- **Type**: boolean
+- **Default**: `false`
+- **Allowed values**: `true` / `false`
+- **Effects**:
+  - detailed error stack traces are shown
+  - code changes are reloaded automatically
+  - FastAPI's interactive docs are enabled
+- **Note**: should be `false` in production
+- **Example**:
+  ```bash
+  DEBUG=true
+  ```
+
+## Authentication settings
+
+### API_KEYS
+
+- **Description**: list of API keys used for client authentication
+- **Type**: JSON array
+- **Default**: `["sk-oci-genai-default-key"]`
+- **Format**: JSON array string
+- **Usage**: clients authenticate via the `Authorization: Bearer <api-key>` header
+- **Security recommendations**:
+  - use strong keys (at least 32 characters)
+  - rotate keys regularly
+  - use different keys per environment
+  - never commit keys to version control
+- **Examples**:
+  ```bash
+  # Single key
+  API_KEYS=["sk-prod-a1b2c3d4e5f6g7h8"]
+
+  # Multiple keys (for different clients)
+  API_KEYS=["sk-admin-key123","sk-user-key456","sk-app-key789"]
+  ```
+
+## OCI configuration
+
+### OCI_CONFIG_FILE
+
+- **Description**: path to the OCI config file
+- **Type**: string (file path)
+- **Default**: `~/.oci/config`
+- **Usage**: location of the config file used by the OCI SDK
+- **Config file format**:
+  ```ini
+  [DEFAULT]
+  user=ocid1.user.oc1...
+  fingerprint=aa:bb:cc:dd...
+  key_file=~/.oci/oci_api_key.pem
+  tenancy=ocid1.tenancy.oc1...
+  region=us-chicago-1
+  ```
+- **Examples**:
+  ```bash
+  OCI_CONFIG_FILE=~/.oci/config
+  OCI_CONFIG_FILE=/custom/path/to/oci_config
+  ```
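+- **Multi-profile example** (for the round-robin load balancing described under OCI_CONFIG_PROFILE below; a sketch with placeholder OCIDs — each profile needs its own `region` and `tenancy`):
+  ```ini
+  [DEFAULT]
+  user=ocid1.user.oc1...
+  fingerprint=aa:bb:cc:dd...
+  key_file=~/.oci/oci_api_key.pem
+  tenancy=ocid1.tenancy.oc1...
+  region=us-chicago-1
+
+  [ASHBURN]
+  user=ocid1.user.oc1...
+  fingerprint=aa:bb:cc:dd...
+  key_file=~/.oci/oci_api_key.pem
+  tenancy=ocid1.tenancy.oc1...
+  region=us-ashburn-1
+  ```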
+
+### OCI_CONFIG_PROFILE
+
+- **Description**: profile name(s) in the OCI config file
+- **Type**: string (multiple comma-separated values supported)
+- **Default**: `DEFAULT`
+- **Usage**:
+  - single profile: use that OCI configuration
+  - multiple profiles: automatic round-robin load balancing
+- **Requirement**: every profile must contain the `region` and `tenancy` fields
+- **Examples**:
+  ```bash
+  # Single profile
+  OCI_CONFIG_PROFILE=DEFAULT
+
+  # Multiple profiles (load balancing)
+  OCI_CONFIG_PROFILE=DEFAULT,CHICAGO,ASHBURN
+
+  # Cross-region configuration
+  OCI_CONFIG_PROFILE=US_WEST,US_EAST,EU_FRANKFURT
+  ```
+
+### OCI_AUTH_TYPE
+
+- **Description**: OCI authentication type
+- **Type**: string
+- **Default**: `api_key`
+- **Allowed values**:
+  - `api_key` - API key authentication (recommended for local development)
+  - `instance_principal` - instance principal authentication (recommended on OCI instances)
+- **Scenarios**:
+  - **api_key**: local development, Docker containers, non-OCI environments
+  - **instance_principal**: OCI Compute instances, Container Engine, Functions
+- **Examples**:
+  ```bash
+  OCI_AUTH_TYPE=api_key
+  OCI_AUTH_TYPE=instance_principal
+  ```
+
+### OCI_CONNECT_TIMEOUT
+
+- **Description**: OCI API connect timeout (seconds)
+- **Type**: integer
+- **Default**: `10`
+- **Range**: 1-300
+- **Usage**: caps how long establishing a connection to the OCI API may take
+- **Tuning**:
+  - stable network: keep the default (10 s)
+  - unstable network: raise to 20-30 s
+  - fail fast: lower to 5 s
+- **Examples**:
+  ```bash
+  OCI_CONNECT_TIMEOUT=10
+  OCI_CONNECT_TIMEOUT=30  # slow networks
+  ```
+
+### OCI_READ_TIMEOUT
+
+- **Description**: OCI API read timeout (seconds)
+- **Type**: integer
+- **Default**: `360` (6 minutes)
+- **Range**: 30-600
+- **Usage**: caps how long to wait for an OCI API response
+- **Tuning**:
+  - simple queries: 120 s
+  - complex conversations: 300-360 s
+  - long document processing: 600 s
+- **Note**: setting it too low may time out long-running requests
+- **Examples**:
+  ```bash
+  OCI_READ_TIMEOUT=360
+  OCI_READ_TIMEOUT=600  # long documents
+  ```
+
+### GENAI_ENDPOINT
+
+- **Description**: dedicated model endpoint (optional)
+- **Type**: string (URL)
+- **Default**: none (built automatically from the region)
+- **Usage**: specify a custom OCI GenAI endpoint
+- **Scenarios**:
+  - dedicated endpoints
+  - test environments
+  - private enterprise deployments
+- **Note**: usually unnecessary; the correct endpoint is used automatically
+- **Example**:
+  ```bash
+  GENAI_ENDPOINT=https://your-dedicated-endpoint.oraclecloud.com
+  ```
+
+## Model settings
+
+### MAX_TOKENS
+
+- **Description**: default maximum number of tokens
+- **Type**: integer
+- **Default**: `4096`
+- **Range**: 1 up to the model maximum
+- **Usage**: used when the client does not specify `max_tokens`
+- **Limits of different models**:
+  - Cohere Command R+: 128k
+  - Meta Llama 3.1 405B: 128k
+  - Google Gemini 2.5 Pro: 2M
+- **Note**: the effective limit depends on the specific model
+- **Examples**:
+  ```bash
+  MAX_TOKENS=4096
+  MAX_TOKENS=8192  # long conversations
+  ```
+
+### TEMPERATURE
+
+- **Description**: default temperature
+- **Type**: float
+- **Default**: `0.7`
+- **Range**: 0.0-2.0
+- **Usage**: controls the randomness of the generated text
+- **Effect**:
+  - 0.0: deterministic output (good for factual queries)
+  - 0.7: balances creativity and accuracy (default)
+  - 1.0-2.0: more creative (good for creative writing)
+- **Examples**:
+  ```bash
+  TEMPERATURE=0.7
+  TEMPERATURE=0.0  # factual Q&A
+  TEMPERATURE=1.2  # creative writing
+  ```
+
+## Embedding settings
+
+### EMBED_TRUNCATE
+
+- **Description**: truncation strategy for embedding input
+- **Type**: string
+- **Default**: `END`
+- **Allowed values**:
+  - `END` - keep the beginning of the text, truncate the end
+  - `START` - keep the end of the text, truncate the beginning
+- **Usage**: how input longer than the model limit is handled
+- **Scenarios**:
+  - **END**: search queries, document summaries (what matters is at the start)
+  - **START**: conversation history, log analysis (what matters is at the end)
+- **Examples**:
+  ```bash
+  EMBED_TRUNCATE=END
+  EMBED_TRUNCATE=START
+  ```
+
+## Streaming settings
+
+### ENABLE_STREAMING
+
+- **Description**: global streaming switch
+- **Type**: boolean
+- **Default**: `true`
+- **Allowed values**: `true` / `false`
+- **Usage**: controls whether streaming responses are allowed
+- **Behavior**:
+  - `true`: streaming is allowed (the client must still send `stream=true`)
+  - `false`: streaming is force-disabled (even if the client sends `stream=true`)
+- **Scenarios**:
+  - enabled: interactive chat, real-time responses
+  - disabled: batch processing, API integration tests
+- **Note**: `false` overrides the client's streaming request
+- **Examples**:
+  ```bash
+  ENABLE_STREAMING=true
+  ENABLE_STREAMING=false  # debugging or batch processing
+  ```
+
+### STREAM_CHUNK_SIZE
+
+- **Description**: chunk size for simulated streaming (characters)
+- **Type**: integer
+- **Default**: `1024`
+- **Range**: 100-4096
+- **Usage**: only used when OCI returns a non-streaming response (fallback mode)
+- **Tuning**:
+  - fast networks: 1024-2048
+  - slow networks: 512-1024
+  - smoother visual effect: 256-512
+- **Note**: does not affect the performance of real streaming
+- **Examples**:
+  ```bash
+  STREAM_CHUNK_SIZE=1024
+  STREAM_CHUNK_SIZE=512  # more frequent updates
+  ```
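+
+How the `ENABLE_STREAMING=false` override behaves in practice: a client that asks for `stream: true` still receives one complete JSON completion instead of an SSE stream (per the logic in `src/api/routers/chat.py`; the URL and key below are the defaults):
+
+```bash
+# With ENABLE_STREAMING=false in .env, this request falls back to a
+# regular (non-streaming) chat completion response.
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer sk-oci-genai-default-key" \
+  -d '{
+    "model": "google.gemini-2.5-pro",
+    "messages": [{"role": "user", "content": "Hello"}],
+    "stream": true
+  }'
+```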
+
+## Logging settings
+
+### LOG_LEVEL
+
+- **Description**: log level
+- **Type**: string
+- **Default**: `INFO`
+- **Allowed values**:
+  - `DEBUG` - detailed debug information (everything)
+  - `INFO` - general information (recommended for production)
+  - `WARNING` - warnings
+  - `ERROR` - errors
+  - `CRITICAL` - critical errors
+- **Scenarios**:
+  - development: `DEBUG`
+  - production: `INFO` or `WARNING`
+  - minimal logging: `ERROR`
+- **Examples**:
+  ```bash
+  LOG_LEVEL=INFO
+  LOG_LEVEL=DEBUG  # development debugging
+  ```
+
+### LOG_REQUESTS
+
+- **Description**: enable detailed request logging
+- **Type**: boolean
+- **Default**: `false`
+- **Allowed values**: `true` / `false`
+- **Usage**: logs details of every incoming request
+- **Contents**:
+  - HTTP method and URL
+  - query parameters
+  - request headers (sensitive information is filtered automatically)
+  - request body (pretty-printed JSON)
+- **Performance impact**: slight (mostly log writing)
+- **Security**: API keys and other sensitive data are filtered automatically
+- **Examples**:
+  ```bash
+  LOG_REQUESTS=false
+  LOG_REQUESTS=true  # when debugging API integrations
+  ```
+
+### LOG_RESPONSES
+
+- **Description**: enable detailed response logging
+- **Type**: boolean
+- **Default**: `false`
+- **Allowed values**: `true` / `false`
+- **Usage**: logs details of every outgoing response
+- **Contents**:
+  - HTTP status code
+  - response processing time
+  - response headers
+  - response body (pretty-printed JSON)
+- **Note**: for streaming responses the full body is not logged
+- **Examples**:
+  ```bash
+  LOG_RESPONSES=false
+  LOG_RESPONSES=true  # when debugging response formats
+  ```
+
+### LOG_STREAMING
+
+- **Description**: log the content of streamed responses (see [.env.example](../.env.example), which enables it with `LOG_STREAMING=true`)
+- **Type**: boolean
+- **Allowed values**: `true` / `false`
+- **Note**: ⚠️ increases memory usage and log size; sensitive data is still filtered
+
+### LOG_FILE
+
+- **Description**: log file path
+- **Type**: string (file path)
+- **Default**: `./logs/app.log`
+- **Usage**: where log files are written
+- **Behavior**:
+  - if unset, logs go to the console only
+  - if set, logs go to both the file and the console
+- **Note**: the directory must exist or be creatable
+- **Examples**:
+  ```bash
+  LOG_FILE=./logs/app.log
+  LOG_FILE=/var/log/oci-genai/app.log
+  ```
+
+### LOG_FILE_MAX_SIZE
+
+- **Description**: maximum size of a single log file (MB)
+- **Type**: integer
+- **Default**: `10`
+- **Range**: 1-1000
+- **Usage**: size threshold for log file rotation
+- **Behavior**: a new file is started automatically once the limit is exceeded
+- **Suggested values**:
+  - low traffic: 10 MB
+  - medium traffic: 50 MB
+  - high traffic: 100-200 MB
+- **Examples**:
+  ```bash
+  LOG_FILE_MAX_SIZE=10
+  LOG_FILE_MAX_SIZE=50  # high-traffic scenarios
+  ```
+
+### LOG_FILE_BACKUP_COUNT
+
+- **Description**: number of rotated backup log files to keep
+- **Type**: integer
+- **Default**: `5`
+- **Range**: 0-100
+- **Usage**: how many historical files rotation keeps
+- **Storage estimate**: total space = MAX_SIZE × (BACKUP_COUNT + 1); with the defaults, 10 MB × (5 + 1) = 60 MB
+- **Examples**:
+  ```bash
+  LOG_FILE_BACKUP_COUNT=5
+  LOG_FILE_BACKUP_COUNT=10  # longer history needed
+  ```
+
+## Configuration examples
+
+### Development environment
+
+```bash
+# Development - local debugging
+DEBUG=true
+LOG_LEVEL=DEBUG
+LOG_REQUESTS=true
+LOG_RESPONSES=true
+
+API_PORT=8000
+API_HOST=127.0.0.1
+
+API_KEYS=["sk-dev-key-123"]
+OCI_CONFIG_PROFILE=DEFAULT
+OCI_AUTH_TYPE=api_key
+
+MAX_TOKENS=4096
+TEMPERATURE=0.7
+
+ENABLE_STREAMING=true
+STREAM_CHUNK_SIZE=512
+
+LOG_FILE=./logs/dev.log
+LOG_FILE_MAX_SIZE=10
+LOG_FILE_BACKUP_COUNT=3
+```
+
+### Production environment
+
+```bash
+# Production - multi-region load balancing
+DEBUG=false
+LOG_LEVEL=INFO
+LOG_REQUESTS=false
+LOG_RESPONSES=false
+
+API_PORT=8000
+API_HOST=0.0.0.0
+
+# Use a strong key
+API_KEYS=["sk-prod-a1b2c3d4e5f6g7h8i9j0k1l2m3n4o5p6"]
+
+# Multi-region configuration
+OCI_CONFIG_PROFILE=DEFAULT,CHICAGO,ASHBURN
+OCI_AUTH_TYPE=api_key
+
+# Timeouts
+OCI_CONNECT_TIMEOUT=15
+OCI_READ_TIMEOUT=360
+
+# Model settings
+MAX_TOKENS=4096
+TEMPERATURE=0.7
+
+# Streaming settings
+ENABLE_STREAMING=true
+STREAM_CHUNK_SIZE=1024
+
+# Logging
+LOG_FILE=/var/log/oci-genai/app.log
+LOG_FILE_MAX_SIZE=50
+LOG_FILE_BACKUP_COUNT=10
+```
+
+### Docker container
+
+```bash
+# Docker environment
+DEBUG=false
+LOG_LEVEL=INFO
+
+API_PORT=8000
+API_HOST=0.0.0.0
+
+API_KEYS=["sk-docker-key-abc123"]
+OCI_CONFIG_FILE=/app/.oci/config
+OCI_CONFIG_PROFILE=DEFAULT
+OCI_AUTH_TYPE=api_key
+
+# Generous timeouts
+OCI_CONNECT_TIMEOUT=20
+OCI_READ_TIMEOUT=360
+
+ENABLE_STREAMING=true
+
+# Log path inside the container
+LOG_FILE=/app/logs/app.log
+LOG_FILE_MAX_SIZE=20
+LOG_FILE_BACKUP_COUNT=5
+```
+
+### OCI instance
+
+```bash
+# OCI Compute instance - instance principal authentication
+DEBUG=false
+LOG_LEVEL=INFO
+
+API_PORT=8000
+API_HOST=0.0.0.0
+
+API_KEYS=["sk-instance-key-xyz789"]
+
+# Instance principal authentication
+OCI_AUTH_TYPE=instance_principal
+# Note: OCI_CONFIG_FILE is not needed with instance principals
+
+ENABLE_STREAMING=true
+
+LOG_FILE=/var/log/oci-genai/app.log
+LOG_FILE_MAX_SIZE=50
+LOG_FILE_BACKUP_COUNT=10
+```
+
+## Common configuration scenarios
+
+### Scenario 1: simple single-region deployment
+
+```bash
+API_KEYS=["sk-simple-key"]
+OCI_CONFIG_PROFILE=DEFAULT
+OCI_AUTH_TYPE=api_key
+LOG_LEVEL=INFO
+```
+
+### Scenario 2: multi-region high availability
+
+```bash
+API_KEYS=["sk-ha-key-primary","sk-ha-key-backup"]
+OCI_CONFIG_PROFILE=US_EAST,US_WEST,EU_FRANKFURT
+OCI_AUTH_TYPE=api_key
+OCI_CONNECT_TIMEOUT=20
+OCI_READ_TIMEOUT=360
+LOG_LEVEL=WARNING
+```
+
+### Scenario 3: debugging and development
+
+```bash
+DEBUG=true
+LOG_LEVEL=DEBUG
+LOG_REQUESTS=true
+LOG_RESPONSES=true
+API_HOST=127.0.0.1
+STREAM_CHUNK_SIZE=256
+```
+
+### Scenario 4: high-performance production
+
+```bash
+DEBUG=false
+LOG_LEVEL=WARNING
+LOG_REQUESTS=false
+LOG_RESPONSES=false
+OCI_CONFIG_PROFILE=DEFAULT,REGION2,REGION3
+ENABLE_STREAMING=true
+MAX_TOKENS=8192
+OCI_READ_TIMEOUT=600
+LOG_FILE_MAX_SIZE=100
+LOG_FILE_BACKUP_COUNT=20
+```
+
+### Scenario 5: batch processing / API testing
+
+```bash
+ENABLE_STREAMING=false
+MAX_TOKENS=2048
+TEMPERATURE=0.0
+LOG_LEVEL=INFO
+LOG_REQUESTS=true
+LOG_RESPONSES=true
+```
+
+## Environment variable precedence
+
+Configuration is loaded in this order (later entries override earlier ones):
+
+1. Application defaults (defined in code)
+2. The `.env` file
+3. System environment variables
+4. The OCI config file (`~/.oci/config`)
+
+**Example**:
+
+```bash
+# In .env
+LOG_LEVEL=INFO
+
+# Overridden on the command line
+LOG_LEVEL=DEBUG python main.py
+```
+
+## Validating the configuration
+
+### Checking that the configuration took effect
+
+Start the service and inspect the logs:
+
+```bash
+cd src
+python main.py
+```
+
+Confirm the configuration in the startup log:
+
+```
+2025-12-10 10:00:00 - INFO - Starting OCI GenAI Gateway
+2025-12-10 10:00:00 - INFO - API Port: 8000
+2025-12-10 10:00:00 - INFO - OCI Profiles: DEFAULT, CHICAGO
+2025-12-10 10:00:00 - INFO - Streaming: Enabled
+2025-12-10 10:00:00 - INFO - Log Level: INFO
+```
+
+### Common configuration mistakes
+
+1. **Wrong API_KEYS format**
+   ```bash
+   # Wrong
+   API_KEYS=sk-key-123
+
+   # Correct
+   API_KEYS=["sk-key-123"]
+   ```
+
+2. **Wrong boolean format**
+   ```bash
+   # Wrong
+   DEBUG=True
+   ENABLE_STREAMING=yes
+
+   # Correct
+   DEBUG=true
+   ENABLE_STREAMING=true
+   ```
+
+3. **Wrong paths**
+   ```bash
+   # Wrong (ambiguous relative path)
+   OCI_CONFIG_FILE=oci/config
+
+   # Correct
+   OCI_CONFIG_FILE=~/.oci/config
+   OCI_CONFIG_FILE=/absolute/path/to/config
+   ```
+
+## Security recommendations
+
+1. **Protect API keys**
+   - use strong keys (at least 32 characters)
+   - do not commit `.env` to version control
+   - rotate keys regularly
+
+2. **Production settings**
+   - `DEBUG=false`
+   - `LOG_LEVEL=INFO` or `WARNING`
+   - `LOG_REQUESTS=false`
+   - `LOG_RESPONSES=false`
+
+3. **Log management**
+   - clean up old logs regularly
+   - cap log file sizes
+   - make sure logs contain no sensitive information
+
+## Troubleshooting
+
+### Configuration not taking effect
+
+1. Check that the `.env` file is in the right place
+2. Confirm the variable names are spelled correctly
+3. Check the value formats (JSON, booleans, etc.)
+4. Check the startup log to confirm what was loaded
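+
+If it is still unclear which values are being picked up, listing the active (non-comment) lines of `.env` makes the format mistakes above easy to spot (a simple shell check):
+
+```bash
+# Show only the effective assignments in .env
+grep -v '^#' .env | grep -v '^$'
+```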
+
+### Connection timeouts
+
+```bash
+# Increase the timeouts
+OCI_CONNECT_TIMEOUT=30
+OCI_READ_TIMEOUT=600
+```
+
+### Log file cannot be created
+
+```bash
+# Check that the directory exists
+mkdir -p logs
+
+# Check the permissions
+chmod 755 logs
+```
+
+## References
+
+- [.env.example](../.env.example) - complete sample configuration file
+- [OCI SDK configuration](https://docs.oracle.com/en-us/iaas/Content/API/Concepts/sdkconfig.htm) - OCI config file format
+- [FastAPI Settings](https://fastapi.tiangolo.com/advanced/settings/) - settings management in FastAPI
diff --git a/script/OCI-SETUP-GUIDE.md b/docs/OCI-SETUP-GUIDE.md
similarity index 99%
rename from script/OCI-SETUP-GUIDE.md
rename to docs/OCI-SETUP-GUIDE.md
index 905d145..81ff499 100644
--- a/script/OCI-SETUP-GUIDE.md
+++ b/docs/OCI-SETUP-GUIDE.md
@@ -342,8 +342,7 @@ Service generativeai is not available in region us-sanjose-1
 - [OCI Generative AI official documentation](https://docs.oracle.com/en-us/iaas/Content/generative-ai/home.htm)
 - [OCI CLI configuration guide](https://docs.oracle.com/en-us/iaas/Content/API/Concepts/sdkconfig.htm)
 - [OCI IAM policy reference](https://docs.oracle.com/en-us/iaas/Content/Identity/Concepts/policygetstarted.htm)
-- [Project README](README.md)
-- [Development documentation CLAUDE.md](CLAUDE.md)
+- [Project README](../README.md)

 ## 🆘 Getting help

diff --git a/src/api/routers/chat.py b/src/api/routers/chat.py
index dbcb7ac..c342432 100644
--- a/src/api/routers/chat.py
+++ b/src/api/routers/chat.py
@@ -5,8 +5,8 @@ import asyncio
 import logging
 import os
 import uuid
-from typing import AsyncIterator, Union
-from fastapi import APIRouter, Depends, HTTPException
+from typing import AsyncIterator, Union, Optional
+from fastapi import APIRouter, Depends, HTTPException, Request
 from fastapi.responses import StreamingResponse
 from oci.exceptions import ServiceError

@@ -34,6 +34,74 @@ router = APIRouter(
 )


+def map_thinking_budget_to_reasoning_effort(thinking_budget: int) -> Optional[str]:
+    """
+    Map Cherry Studio's thinking_budget to OCI's reasoning_effort parameter.
+
+    Mapping rules:
+    - thinking_budget ≤ 1760: "low"
+    - 1760 < thinking_budget ≤ 16448: "medium"
+    - thinking_budget > 16448: "high"
+    - thinking_budget == -1: None (use model default)
+
+    Args:
+        thinking_budget: The thinking budget value from Cherry Studio
+
+    Returns:
+        The corresponding reasoning_effort value or None
+    """
+    if thinking_budget == -1:
+        return None
+    elif thinking_budget <= 1760:
+        return "low"
+    elif thinking_budget <= 16448:
+        return "medium"
+    else:
+        return "high"
+
+
+def extract_reasoning_effort_from_extra_body(extra_body: Optional[dict]) -> Optional[str]:
+    """
+    Extract reasoning_effort from Cherry Studio's extra_body parameter.
+
+    Example extra_body structure:
+    {
+        "google": {
+            "thinking_config": {
+                "thinking_budget": 1760,
+                "include_thoughts": true
+            }
+        }
+    }
+
+    Args:
+        extra_body: The extra_body dict from the request
+
+    Returns:
+        The mapped reasoning_effort value or None
+    """
+    if not extra_body:
+        return None
+
+    try:
+        # Navigate through the nested structure
+        google_config = extra_body.get("google", {})
+        thinking_config = google_config.get("thinking_config", {})
+        thinking_budget = thinking_config.get("thinking_budget")
+
+        if thinking_budget is not None and isinstance(thinking_budget, (int, float)):
+            effort = map_thinking_budget_to_reasoning_effort(int(thinking_budget))
+            if effort:
+                logger.info(f"Cherry Studio thinking_budget {thinking_budget} mapped to reasoning_effort: {effort}")
+            else:
+                logger.info(f"Cherry Studio thinking_budget {thinking_budget} set to -1, using model default")
+            return effort
+    except (AttributeError, TypeError, KeyError) as e:
+        logger.debug(f"Failed to extract thinking_budget from extra_body: {e}")
+
+    return None
+
+
 def extract_delta_from_chunk(chunk) -> str:
     """
     Extract delta text content from OCI streaming chunk.
@@ -166,29 +234,35 @@ def extract_content_from_response(chat_response) -> str:

 @router.post("/completions", response_model=ChatCompletionResponse)
-async def create_chat_completion(request: ChatCompletionRequest):
+async def create_chat_completion(
+    chat_request: ChatCompletionRequest,
+    request: Request
+):
     """
     Create a chat completion using OCI Generative AI.

     Args:
-        request: Chat completion request
+        chat_request: Chat completion request
+        request: FastAPI Request object for accessing headers

     Returns:
         Chat completion response
     """
-    logger.info(f"Chat completion request for model: {request.model}")
+    # Extract client name from x-title header
+    client_name = request.headers.get("x-title", "Unknown")
+    logger.info(f"Chat completion request for model: {chat_request.model}, client: {client_name}")

     settings = get_settings()

     # Validate model exists
-    model_config = get_model_config(request.model)
+    model_config = get_model_config(chat_request.model)
     if not model_config:
-        raise ModelNotFoundException(request.model)
+        raise ModelNotFoundException(chat_request.model)

     # Validate model type is chat (ondemand or dedicated)
     if model_config.type not in ("ondemand", "dedicated"):
         raise InvalidModelTypeException(
-            model_id=request.model,
+            model_id=chat_request.model,
             expected_type="chat",
             actual_type=model_config.type
         )
@@ -197,24 +271,27 @@ async def create_chat_completion(request: ChatCompletionRequest):
     # If a model doesn't support certain content types, it will raise an error
     # For example, Cohere models will raise ValueError for non-text content

+    # Extract reasoning_effort from Cherry Studio's extra_body
+    reasoning_effort = extract_reasoning_effort_from_extra_body(chat_request.extra_body)
+
     # Get OCI client from manager (round-robin load balancing)
     client_manager = get_client_manager()
     oci_client = client_manager.get_client()

     # Adapt messages
-    messages = adapt_chat_messages([msg.dict() for msg in request.messages])
+    messages = adapt_chat_messages([msg.dict() for msg in chat_request.messages])

     # Extract parameters
-    params = extract_chat_params(request)
+    params = extract_chat_params(chat_request)

     # Determine streaming mode
-    # Priority: request.stream (client) > settings.enable_streaming (global)
+    # Priority: chat_request.stream (client) > settings.enable_streaming (global)
     # Only enable streaming if BOTH conditions are met:
     # 1. Client explicitly requests stream=true (default is false per OpenAI standard)
     # 2. Global streaming is enabled via ENABLE_STREAMING
-    enable_stream = request.stream is True and settings.enable_streaming
+    enable_stream = chat_request.stream is True and settings.enable_streaming

-    if request.stream is True and not settings.enable_streaming:
+    if chat_request.stream is True and not settings.enable_streaming:
         logger.info("Streaming requested by client but globally disabled via ENABLE_STREAMING=false")

     # Handle streaming
@@ -230,13 +307,14 @@
             response = await loop.run_in_executor(
                 None,
                 lambda: oci_client.chat(
-                    model_id=request.model,
+                    model_id=chat_request.model,
                     messages=messages,
                     temperature=params["temperature"],
                     max_tokens=params["max_tokens"],
                     top_p=params["top_p"],
                     stream=True,  # Enable real streaming
                     tools=params.get("tools"),
+                    reasoning_effort=reasoning_effort,
                 )
             )

@@ -264,7 +342,7 @@ ...
                 iterator = stream_data

             # Send first chunk with role and empty content (OpenAI format)
-            yield adapt_streaming_chunk("", request.model, request_id, 0, is_first=True)
+            yield adapt_streaming_chunk("", chat_request.model, request_id, 0, is_first=True)

             # Use queue for thread-safe chunk forwarding
             import queue
@@ -307,7 +385,7 @@ ...
                     delta_text = extract_delta_from_chunk(chunk)

                     if delta_text:
-                        yield adapt_streaming_chunk(delta_text, request.model, request_id, 0, is_first=False)
+                        yield adapt_streaming_chunk(delta_text, chat_request.model, request_id, 0, is_first=False)

                     # Try to extract usage from chunk (typically in final chunk)
                     # Handle both SSE Event format and object format
@@ -334,7 +412,7 @@ ...
                     }

                 # Send done message with usage
-                yield adapt_streaming_done(request.model, request_id, usage=accumulated_usage)
+                yield adapt_streaming_done(chat_request.model, request_id, usage=accumulated_usage)

             else:
                 # Fallback: non-streaming response, simulate streaming
@@ -355,14 +433,14 @@ ...
                 # Simulate streaming by chunking
                 # First send empty chunk with role (OpenAI format)
-                yield adapt_streaming_chunk("", request.model, request_id, 0, is_first=True)
+                yield adapt_streaming_chunk("", chat_request.model, request_id, 0, is_first=True)

                 chunk_size = settings.stream_chunk_size
                 for i in range(0, len(content), chunk_size):
                     chunk = content[i:i + chunk_size]
-                    yield adapt_streaming_chunk(chunk, request.model, request_id, 0, is_first=False)
+                    yield adapt_streaming_chunk(chunk, chat_request.model, request_id, 0, is_first=False)

-                yield adapt_streaming_done(request.model, request_id, usage=accumulated_usage)
+                yield adapt_streaming_done(chat_request.model, request_id, usage=accumulated_usage)

         except TypeError as te:
             # Handle case where response is not iterable at all
@@ -397,17 +475,18 @@ ...
     # Non-streaming response
     try:
         response = oci_client.chat(
-            model_id=request.model,
+            model_id=chat_request.model,
             messages=messages,
             temperature=params["temperature"],
             max_tokens=params["max_tokens"],
             top_p=params["top_p"],
             stream=False,
             tools=params.get("tools"),
+            reasoning_effort=reasoning_effort,
         )

         # Adapt response to OpenAI format
-        openai_response = adapt_chat_response(response, request.model)
+        openai_response = adapt_chat_response(response, chat_request.model)

         if settings.log_responses:
             logger.debug(f"Response: {openai_response}")
diff --git a/src/api/schemas.py b/src/api/schemas.py
index fd9c56c..647884c 100644
--- a/src/api/schemas.py
+++ b/src/api/schemas.py
@@ -32,6 +32,7 @@ class ChatCompletionRequest(BaseModel):
     user: Optional[str] = None
     tools: Optional[List[Dict[str, Any]]] = None
     tool_choice: Optional[Union[str, Dict[str, Any]]] = None
+    extra_body: Optional[Dict[str, Any]] = None  # Cherry Studio and other client extensions


 class ChatCompletionChoice(BaseModel):
diff --git a/src/core/oci_client.py b/src/core/oci_client.py
index bc1d936..e9898a7 100644
--- a/src/core/oci_client.py
+++ b/src/core/oci_client.py
@@ -184,6 +184,7 @@ class OCIGenAIClient:
         top_p: float = 1.0,
         stream: bool = False,
         tools: Optional[list] = None,
+        reasoning_effort: Optional[str] = None,
     ):
         """Send a chat completion request to OCI GenAI."""
         model_config = get_model_config(model_id)
@@ -208,7 +209,7 @@ ...
             )
         elif model_config.provider in ["meta", "xai", "google", "openai"]:
             chat_request = self._build_generic_request(
-                messages, temperature, max_tokens, top_p, tools, model_config.provider, stream
+                messages, temperature, max_tokens, top_p, tools, model_config.provider, stream, reasoning_effort
             )
         else:
             raise ValueError(f"Unsupported provider: {model_config.provider}")
@@ -278,7 +279,7 @@ ...
         )

     def _build_generic_request(
-        self, messages: list, temperature: float, max_tokens: int, top_p: float, tools: Optional[list], provider: str, stream: bool = False
+        self, messages: list, temperature: float, max_tokens: int, top_p: float, tools: Optional[list], provider: str, stream: bool = False, reasoning_effort: Optional[str] = None
     ) -> GenericChatRequest:
         """Build Generic chat request for Llama and other models."""
         # Convert messages to Generic format
@@ -318,13 +319,21 @@ ...
             )
         )

-        return GenericChatRequest(
-            messages=generic_messages,
-            temperature=temperature,
-            max_tokens=max_tokens,
-            top_p=top_p,
-            is_stream=stream,
-        )
+        # Build request parameters
+        request_params = {
+            "messages": generic_messages,
+            "temperature": temperature,
+            "max_tokens": max_tokens,
+            "top_p": top_p,
+            "is_stream": stream,
+        }
+
+        # Add reasoning_effort if provided (only for generic models)
+        if reasoning_effort:
+            request_params["reasoning_effort"] = reasoning_effort.upper()
+            logger.info(f"Setting reasoning_effort to {reasoning_effort.upper()} for {provider} model")
+
+        return GenericChatRequest(**request_params)

     def embed(
         self,
diff --git a/tests/test_cherry_studio_optimization.sh b/tests/test_cherry_studio_optimization.sh
new file mode 100755
index 0000000..cacc871
--- /dev/null
+++ b/tests/test_cherry_studio_optimization.sh
@@ -0,0 +1,153 @@
+#!/bin/bash
+
+# Tests for the Cherry Studio client optimizations
+# 1. Client name display (x-title request header)
+# 2. Mapping from thinking_budget to reasoning_effort
+
+API_URL="http://localhost:8000/v1/chat/completions"
+API_KEY="sk-oci-genai-default-key"
+
+echo "=========================================="
+echo "Test 1: thinking_budget = 1000 (should map to low)"
+echo "=========================================="
+curl -s -X POST "$API_URL" \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer $API_KEY" \
+  -H "x-title: Cherry Studio" \
+  -d '{
+    "model": "google.gemini-2.5-pro",
+    "messages": [
+      {"role": "user", "content": "Hello, how are you?"}
+    ],
+    "temperature": 0.7,
+    "max_tokens": 100,
+    "extra_body": {
+      "google": {
+        "thinking_config": {
+          "thinking_budget": 1000,
+          "include_thoughts": true
+        }
+      }
+    }
+  }' | jq .
+
+echo ""
+echo "=========================================="
+echo "Test 2: thinking_budget = 5000 (should map to medium)"
+echo "=========================================="
+curl -s -X POST "$API_URL" \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer $API_KEY" \
+  -H "x-title: Cherry Studio" \
+  -d '{
+    "model": "google.gemini-2.5-pro",
+    "messages": [
+      {"role": "user", "content": "What is 2+2?"}
+    ],
+    "temperature": 0.7,
+    "max_tokens": 100,
+    "extra_body": {
+      "google": {
+        "thinking_config": {
+          "thinking_budget": 5000,
+          "include_thoughts": true
+        }
+      }
+    }
+  }' | jq .
+
+echo ""
+echo "=========================================="
+echo "Test 3: thinking_budget = 20000 (should map to high)"
+echo "=========================================="
+curl -s -X POST "$API_URL" \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer $API_KEY" \
+  -H "x-title: Cherry Studio" \
+  -d '{
+    "model": "google.gemini-2.5-pro",
+    "messages": [
+      {"role": "user", "content": "Explain quantum computing"}
+    ],
+    "temperature": 0.7,
+    "max_tokens": 100,
+    "extra_body": {
+      "google": {
+        "thinking_config": {
+          "thinking_budget": 20000,
+          "include_thoughts": true
+        }
+      }
+    }
+  }' | jq .
+
+echo ""
+echo "=========================================="
+echo "Test 4: thinking_budget = -1 (should use the model default)"
+echo "=========================================="
+curl -s -X POST "$API_URL" \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer $API_KEY" \
+  -H "x-title: Cherry Studio" \
+  -d '{
+    "model": "google.gemini-2.5-pro",
+    "messages": [
+      {"role": "user", "content": "Tell me a joke"}
+    ],
+    "temperature": 0.7,
+    "max_tokens": 100,
+    "extra_body": {
+      "google": {
+        "thinking_config": {
+          "thinking_budget": -1,
+          "include_thoughts": true
+        }
+      }
+    }
+  }' | jq .
+
+echo ""
+echo "=========================================="
+echo "Test 5: no extra_body (a normal request)"
+echo "=========================================="
+curl -s -X POST "$API_URL" \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer $API_KEY" \
+  -H "x-title: Cherry Studio" \
+  -d '{
+    "model": "google.gemini-2.5-pro",
+    "messages": [
+      {"role": "user", "content": "Hi there!"}
+    ],
+    "temperature": 0.7,
+    "max_tokens": 100
+  }' | jq .
+
+echo ""
+echo "=========================================="
+echo "Test 6: a different client name (Postman)"
+echo "=========================================="
+curl -s -X POST "$API_URL" \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer $API_KEY" \
+  -H "x-title: Postman" \
+  -d '{
+    "model": "google.gemini-2.5-pro",
+    "messages": [
+      {"role": "user", "content": "Test from Postman"}
+    ],
+    "temperature": 0.7,
+    "max_tokens": 100
+  }' | jq .
+
+echo ""
+echo "=========================================="
+echo "All tests finished!"
+echo "Check the server logs and verify that:"
+echo "1. The client name is shown correctly (Cherry Studio / Postman)"
+echo "2. thinking_budget is mapped to reasoning_effort correctly"
+echo "   - thinking_budget = 1000  → reasoning_effort = LOW"
+echo "   - thinking_budget = 5000  → reasoning_effort = MEDIUM"
+echo "   - thinking_budget = 20000 → reasoning_effort = HIGH"
+echo "   - thinking_budget = -1    → model default (no reasoning_effort log line)"
+echo "=========================================="