系统架构
数学解题器(含自动验证)
import anthropic
import sympy
from sympy.parsing.latex import parse_latex
import re

# Shared Anthropic client used by every agent below.
# NOTE(review): presumably picks up ANTHROPIC_API_KEY from the environment —
# confirm against deployment configuration.
client = anthropic.Anthropic()
class MathReasoningAgent:
    """Math reasoning agent: solve with extended thinking, then auto-verify."""

    # \boxed{...} extractor that tolerates one level of nested braces
    # (e.g. \boxed{\frac{1}{2}}), which the naive [^}]+ pattern truncates.
    BOXED_RE = re.compile(r'\\boxed\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}')

    def solve(self, problem: str, verify: bool = True) -> dict:
        """Solve *problem* and return a result dict.

        Keys: problem, thinking_tokens (rough word count, not exact tokens),
        solution (full model text), answer (contents of \\boxed{}), and
        verified (True/False, or None when verification was skipped/failed).
        """
        # Step 1: reason through the problem with extended thinking enabled.
        response = client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=16000,
            thinking={"type": "enabled", "budget_tokens": 10000},
            system="""你是数学解题专家。
最终答案必须用 \\boxed{} 格式,例如 \\boxed{42} 或 \\boxed{x=3}。
如果是多步骤计算,最后一行必须是 \\boxed{答案}。""",
            messages=[{"role": "user", "content": problem}]
        )
        thinking = next((b.thinking for b in response.content if b.type == "thinking"), "")
        solution = next(b.text for b in response.content if b.type == "text")
        # Extract the final answer from the \boxed{} marker.
        answer = self.extract_boxed_answer(solution)
        result = {
            "problem": problem,
            # Whitespace word count is a cheap proxy for thinking length.
            "thinking_tokens": len(thinking.split()),
            "solution": solution,
            "answer": answer,
            "verified": None
        }
        # Step 2: symbolic sanity check with sympy.
        if verify and answer:
            result["verified"] = self.verify_with_sympy(problem, answer)
        return result

    def extract_boxed_answer(self, text: str) -> str:
        """Return the contents of the first \\boxed{...} in *text*, or ""."""
        match = self.BOXED_RE.search(text)
        return match.group(1) if match else ""

    def verify_with_sympy(self, problem: str, answer: str) -> "bool | None":
        """Best-effort check that *answer* is parseable LaTeX.

        Returns True/False when parsing succeeded, or None when the answer
        could not be parsed at all (i.e. verification was inconclusive) —
        the original annotated `-> bool` but actually returned None here.
        """
        try:
            expr = parse_latex(answer)
            return expr is not None
        except Exception:  # parse_latex raises assorted errors on bad LaTeX
            return None  # could not verify
代码调试 Agent(执行反馈循环)
import subprocess
import tempfile
import os
class CodeDebuggingAgent:
    """Code debugging agent: reason about a failure -> fix -> execute -> iterate."""

    MAX_ITERATIONS = 3

    def debug(self, code: str, error: str = "") -> dict:
        """Iteratively ask the model to repair *code* until it runs or budget runs out.

        Returns {"status": "fixed", "final_code", "iterations", "history"} on
        success, or {"status": "failed", "history"} after MAX_ITERATIONS.
        """
        iteration = 0
        current_code = code
        history = []
        while iteration < self.MAX_ITERATIONS:
            iteration += 1
            # 1. Reason about the failure and propose a complete fix.
            fix_prompt = f"""分析以下代码的问题并修复:
```python
{current_code}
```
{"错误信息:" + error if error else ""}
输出修复后的完整代码(只输出代码,不要说明):"""
            response = client.messages.create(
                model="claude-sonnet-4-6",
                max_tokens=8000,
                thinking={"type": "enabled", "budget_tokens": 5000},
                messages=[{"role": "user", "content": fix_prompt}]
            )
            fixed_code = self.extract_code(
                next(b.text for b in response.content if b.type == "text")
            )
            # 2. Execute the candidate fix to validate it.
            exec_result = self.safe_execute(fixed_code)
            history.append({
                "iteration": iteration,
                "fixed_code": fixed_code,
                "exec_result": exec_result
            })
            if exec_result["success"]:
                return {"status": "fixed", "final_code": fixed_code,
                        "iterations": iteration, "history": history}
            # Feed the new error into the next iteration.
            current_code = fixed_code
            error = exec_result["error"]
        return {"status": "failed", "history": history}

    def extract_code(self, text: str) -> str:
        """Strip a markdown code fence from the model reply; fall back to raw text.

        debug() called this method but it was previously undefined, so every
        debugging run raised AttributeError on the first iteration.
        """
        match = re.search(r"```(?:[a-zA-Z0-9_+-]*)\s*\n(.*?)```", text, re.DOTALL)
        return match.group(1).strip() if match else text.strip()

    def safe_execute(self, code: str) -> dict:
        """Run *code* in a subprocess sandbox with a 10-second timeout.

        Returns {"success": bool, "stdout": str, "error": str}.
        """
        import sys  # local import so this block stays self-contained

        # utf-8 explicitly: the platform default encoding may reject
        # non-ASCII source (e.g. Chinese comments) on Windows.
        with tempfile.NamedTemporaryFile(suffix=".py", mode="w",
                                         encoding="utf-8", delete=False) as f:
            f.write(code)
            tmpfile = f.name
        try:
            # sys.executable guarantees the same interpreter as the host,
            # unlike a bare "python" resolved via PATH.
            result = subprocess.run(
                [sys.executable, tmpfile],
                capture_output=True, text=True, timeout=10
            )
            return {
                "success": result.returncode == 0,
                "stdout": result.stdout,
                "error": result.stderr
            }
        except subprocess.TimeoutExpired:
            # Same key shape as the success path so history entries are uniform.
            return {"success": False, "stdout": "", "error": "Timeout (>10s)"}
        finally:
            os.unlink(tmpfile)
主入口:统一 Agent
class ReasoningAssistant:
    """Unified entry point: classify each query and route it to the right agent."""

    # Fenced code block: ```lang\n ... ``` (language tag optional).
    CODE_FENCE_RE = re.compile(r"```(?:[a-zA-Z0-9_+-]*)\s*\n(.*?)```", re.DOTALL)

    def __init__(self):
        self.math_agent = MathReasoningAgent()
        self.code_agent = CodeDebuggingAgent()

    def classify_task(self, query: str) -> str:
        """Classify *query* as math/code/general using a cheap, fast model."""
        response = client.messages.create(
            model="claude-haiku-4-5-20251001",
            max_tokens=10,
            messages=[{"role": "user", "content":
                f"分类为 math/code/general(只输出一个词):{query[:200]}"}]
        )
        return response.content[0].text.strip().lower()

    def extract_code_from_query(self, query: str) -> str:
        """Pull the first fenced code block out of *query*; fall back to raw text.

        process() called this method but it was previously undefined, so the
        "code" route raised AttributeError.
        """
        match = self.CODE_FENCE_RE.search(query)
        return match.group(1).strip() if match else query.strip()

    def process(self, query: str) -> dict:
        """Route *query* to the math, code, or general-reasoning path."""
        task_type = self.classify_task(query)
        if "math" in task_type:
            return {"type": "math", "result": self.math_agent.solve(query)}
        elif "code" in task_type:
            code = self.extract_code_from_query(query)
            return {"type": "code", "result": self.code_agent.debug(code)}
        else:
            # General reasoning Q&A with a moderate thinking budget.
            response = client.messages.create(
                model="claude-sonnet-4-6",
                max_tokens=4000,
                thinking={"type": "enabled", "budget_tokens": 3000},
                messages=[{"role": "user", "content": query}]
            )
            return {"type": "general",
                    "result": next(b.text for b in response.content if b.type == "text")}
# Usage example: route a quadratic-equation query through the unified assistant.
agent = ReasoningAssistant()
result = agent.process("求解:x² - 5x + 6 = 0")
print(result)
科学推导助手(第三类任务)
除了数学和代码,推理模型在科学推导类任务上同样出色:
class ScienceReasoningAgent:
    """Science derivation agent: stepwise reasoning with explicit principles."""

    SYSTEM_PROMPT = """你是一位知识渊博的科学导师,擅长物理、化学、生物。
对于科学推导题,请:
1. 明确标出所用的定律/公式及其适用条件
2. 逐步推导,每步注明物理/化学意义
3. 最终答案包含数值结果和单位
4. 检查量纲(单位)是否一致"""

    def solve(self, problem: str) -> dict:
        """Send *problem* through the reasoning model and package the reply."""
        reply = client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=14000,
            thinking={"type": "enabled", "budget_tokens": 8000},
            system=self.SYSTEM_PROMPT,
            messages=[{"role": "user", "content": problem}],
        )
        # First thinking block if present; the answer text is mandatory
        # (StopIteration propagates if the model returned no text block).
        reasoning = next(
            (blk.thinking for blk in reply.content if blk.type == "thinking"), ""
        )
        answer_text = next(blk.text for blk in reply.content if blk.type == "text")
        return {
            "problem": problem,
            "solution": answer_text,
            # First 300 chars of the thinking trace, for quick inspection.
            "thinking_preview": reasoning[:300] + "..." if reasoning else "",
            "output_tokens": reply.usage.output_tokens,
        }
生产级部署注意事项
完整项目 + FastAPI 接口
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
import json
# Module-level singletons: the FastAPI app and the shared routing assistant.
app = FastAPI(title="推理 Agent API")
assistant = ReasoningAssistant()
@app.post("/solve")
async def solve_problem(request: dict) -> dict:
    """Non-streaming endpoint: block until the full result is available."""
    query = request.get("query", "")
    # Guard clause: reject empty queries up front.
    if not query:
        return {"error": "query is required"}
    return assistant.process(query)
@app.post("/solve/stream")
async def solve_stream(request: dict):
    """SSE endpoint: stream classification, thinking status, and answer deltas."""
    query = request.get("query", "")

    def sse(payload: dict) -> str:
        # Server-Sent Events framing: one JSON object per "data:" line.
        return f"data: {json.dumps(payload)}\n\n"

    async def generate():
        task_type = assistant.classify_task(query)
        yield sse({"type": "classify", "task_type": task_type})
        # NOTE(review): client.messages.stream is the synchronous client —
        # inside an async generator it blocks the event loop while streaming;
        # confirm whether the async Anthropic client should be used here.
        with client.messages.stream(
            model="claude-sonnet-4-6",
            max_tokens=14000,
            thinking={"type": "enabled", "budget_tokens": 8000},
            messages=[{"role": "user", "content": query}],
        ) as stream:
            block_type = None
            for event in stream:
                if not hasattr(event, "type"):
                    continue
                if event.type == "content_block_start":
                    block_type = event.content_block.type
                    yield sse({"type": "block_start", "block_type": block_type})
                elif event.type == "content_block_delta":
                    if block_type == "thinking":
                        # Heartbeat only: thinking content is not forwarded.
                        yield sse({"type": "thinking_delta"})
                    elif block_type == "text":
                        yield sse({"type": "text", "content": event.delta.text})

    return StreamingResponse(generate(), media_type="text/event-stream")
# 启动:uvicorn main:app --host 0.0.0.0 --port 8000
流量突增导致账单爆炸:推理模型单次调用成本是普通 LLM 的 5-30x。一个看似合理的"每用户每天 10 次查询"限制,在 1 万用户时意味着每天 10 万次推理调用,成本可能超过预期 10 倍。必须设置每用户每日成本硬上限(不是次数上限),并在接近上限时降级为普通 LLM。
思考过程不应存储在数据库:Extended Thinking 的 thinking block 可能有几千到几万 token,如果将其完整存储到数据库用于多轮对话,存储成本迅速累积。多轮对话中只需要传递 thinking block 的 hash 或摘要,而不是完整内容(Claude API 多轮对话传递 thinking block 时有特殊要求,详见第4章)。
推理模型 + 流式输出的 UX 问题:推理模型开始输出最终答案前,用户可能等待 5-30 秒的"空白"(思考中)。如果不在前端显示"思考中..."的状态指示,用户会以为系统卡死。必须通过 SSE 推送"thinking"状态,或展示流式的 thinking block 内容,让用户感知到系统在工作。
后续可扩展的方向:1. 加入 LaTeX 渲染(KaTeX.js)展示数学公式;2. 增加 Docker 容器沙箱隔离代码执行;3. 实现向量缓存(相似问题复用答案);4. 添加多轮对话历史管理(LangGraph + MemorySaver);5. 接入监控(LangSmith / OpenTelemetry)跟踪推理质量和成本;6. 加入自定义评估集(第8章方法),持续监控生产质量。
恭喜完成《AI 推理模型开发》全部 10 章!你已经掌握了:慢思考原理与双系统理论(第1章)、Chain-of-Thought 完整工程体系(第2章)、DeepSeek-R1 的 GRPO 训练原理(第3章)、Claude Extended Thinking API(第4章)、推理模型专用提示工程(第5章)、Plan-and-Execute Agent 架构(第6章)、推理+结构化输出的解耦方案(第7章)、推理模型评估体系(第8章)、成本控制与智能路由(第9章)、生产级推理 Agent 实战(第10章)。AI 推理时代已经到来,这套知识将成为你构建下一代 AI 应用的核心竞争力。