系统架构
数学解题器(含自动验证)
import anthropic
import sympy
from sympy.parsing.latex import parse_latex
import re

# Shared Anthropic client used by every agent below.
# NOTE(review): presumably picks up ANTHROPIC_API_KEY from the environment —
# confirm against deployment configuration.
client = anthropic.Anthropic()
class MathReasoningAgent:
    """Math reasoning agent: solve with extended thinking, then auto-verify."""

    # \boxed{...} extractor that tolerates one level of nested braces
    # (e.g. \boxed{\frac{1}{2}}), which the naive [^}]+ pattern truncates.
    BOXED_RE = re.compile(r'\\boxed\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}')

    def solve(self, problem: str, verify: bool = True) -> dict:
        """Solve *problem* and return a result dict.

        Keys: problem, thinking_tokens (rough word count, not exact tokens),
        solution (full model text), answer (contents of \\boxed{}), and
        verified (True/False, or None when verification was skipped/failed).
        """
        # Step 1: reason through the problem with extended thinking enabled.
        response = client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=16000,
            thinking={"type": "enabled", "budget_tokens": 10000},
            system="""你是数学解题专家。
最终答案必须用 \\boxed{} 格式,例如 \\boxed{42} 或 \\boxed{x=3}。
如果是多步骤计算,最后一行必须是 \\boxed{答案}。""",
            messages=[{"role": "user", "content": problem}]
        )
        thinking = next((b.thinking for b in response.content if b.type == "thinking"), "")
        solution = next(b.text for b in response.content if b.type == "text")
        # Extract the final answer from the \boxed{} marker.
        answer = self.extract_boxed_answer(solution)
        result = {
            "problem": problem,
            # Whitespace word count is a cheap proxy for thinking length.
            "thinking_tokens": len(thinking.split()),
            "solution": solution,
            "answer": answer,
            "verified": None
        }
        # Step 2: symbolic sanity check with sympy.
        if verify and answer:
            result["verified"] = self.verify_with_sympy(problem, answer)
        return result

    def extract_boxed_answer(self, text: str) -> str:
        """Return the contents of the first \\boxed{...} in *text*, or ""."""
        match = self.BOXED_RE.search(text)
        return match.group(1) if match else ""

    def verify_with_sympy(self, problem: str, answer: str) -> "bool | None":
        """Best-effort check that *answer* is parseable LaTeX.

        Returns True/False when parsing succeeded, or None when the answer
        could not be parsed at all (i.e. verification was inconclusive) —
        the original annotated `-> bool` but actually returned None here.
        """
        try:
            expr = parse_latex(answer)
            return expr is not None
        except Exception:  # parse_latex raises assorted errors on bad LaTeX
            return None  # could not verify
代码调试 Agent(执行反馈循环)
import subprocess
import tempfile
import os
class CodeDebuggingAgent:
    """Code debugging agent: reason about a failure -> fix -> execute -> iterate."""

    MAX_ITERATIONS = 3

    def debug(self, code: str, error: str = "") -> dict:
        """Iteratively ask the model to repair *code* until it runs or budget runs out.

        Returns {"status": "fixed", "final_code", "iterations", "history"} on
        success, or {"status": "failed", "history"} after MAX_ITERATIONS.
        """
        iteration = 0
        current_code = code
        history = []
        while iteration < self.MAX_ITERATIONS:
            iteration += 1
            # 1. Reason about the failure and propose a complete fix.
            fix_prompt = f"""分析以下代码的问题并修复:
```python
{current_code}
```
{"错误信息:" + error if error else ""}
输出修复后的完整代码(只输出代码,不要说明):"""
            response = client.messages.create(
                model="claude-sonnet-4-6",
                max_tokens=8000,
                thinking={"type": "enabled", "budget_tokens": 5000},
                messages=[{"role": "user", "content": fix_prompt}]
            )
            fixed_code = self.extract_code(
                next(b.text for b in response.content if b.type == "text")
            )
            # 2. Execute the candidate fix to validate it.
            exec_result = self.safe_execute(fixed_code)
            history.append({
                "iteration": iteration,
                "fixed_code": fixed_code,
                "exec_result": exec_result
            })
            if exec_result["success"]:
                return {"status": "fixed", "final_code": fixed_code,
                        "iterations": iteration, "history": history}
            # Feed the new error into the next iteration.
            current_code = fixed_code
            error = exec_result["error"]
        return {"status": "failed", "history": history}

    def extract_code(self, text: str) -> str:
        """Strip a markdown code fence from the model reply; fall back to raw text.

        debug() called this method but it was previously undefined, so every
        debugging run raised AttributeError on the first iteration.
        """
        match = re.search(r"```(?:[a-zA-Z0-9_+-]*)\s*\n(.*?)```", text, re.DOTALL)
        return match.group(1).strip() if match else text.strip()

    def safe_execute(self, code: str) -> dict:
        """Run *code* in a subprocess sandbox with a 10-second timeout.

        Returns {"success": bool, "stdout": str, "error": str}.
        """
        import sys  # local import so this block stays self-contained

        # utf-8 explicitly: the platform default encoding may reject
        # non-ASCII source (e.g. Chinese comments) on Windows.
        with tempfile.NamedTemporaryFile(suffix=".py", mode="w",
                                         encoding="utf-8", delete=False) as f:
            f.write(code)
            tmpfile = f.name
        try:
            # sys.executable guarantees the same interpreter as the host,
            # unlike a bare "python" resolved via PATH.
            result = subprocess.run(
                [sys.executable, tmpfile],
                capture_output=True, text=True, timeout=10
            )
            return {
                "success": result.returncode == 0,
                "stdout": result.stdout,
                "error": result.stderr
            }
        except subprocess.TimeoutExpired:
            # Same key shape as the success path so history entries are uniform.
            return {"success": False, "stdout": "", "error": "Timeout (>10s)"}
        finally:
            os.unlink(tmpfile)
主入口:统一 Agent
class ReasoningAssistant:
    """Unified entry point: classify each query and route it to the right agent."""

    # Fenced code block: ```lang\n ... ``` (language tag optional).
    CODE_FENCE_RE = re.compile(r"```(?:[a-zA-Z0-9_+-]*)\s*\n(.*?)```", re.DOTALL)

    def __init__(self):
        self.math_agent = MathReasoningAgent()
        self.code_agent = CodeDebuggingAgent()

    def classify_task(self, query: str) -> str:
        """Classify *query* as math/code/general using a cheap, fast model."""
        response = client.messages.create(
            model="claude-haiku-4-5-20251001",
            max_tokens=10,
            messages=[{"role": "user", "content":
                f"分类为 math/code/general(只输出一个词):{query[:200]}"}]
        )
        return response.content[0].text.strip().lower()

    def extract_code_from_query(self, query: str) -> str:
        """Pull the first fenced code block out of *query*; fall back to raw text.

        process() called this method but it was previously undefined, so the
        "code" route raised AttributeError.
        """
        match = self.CODE_FENCE_RE.search(query)
        return match.group(1).strip() if match else query.strip()

    def process(self, query: str) -> dict:
        """Route *query* to the math, code, or general-reasoning path."""
        task_type = self.classify_task(query)
        if "math" in task_type:
            return {"type": "math", "result": self.math_agent.solve(query)}
        elif "code" in task_type:
            code = self.extract_code_from_query(query)
            return {"type": "code", "result": self.code_agent.debug(code)}
        else:
            # General reasoning Q&A with a moderate thinking budget.
            response = client.messages.create(
                model="claude-sonnet-4-6",
                max_tokens=4000,
                thinking={"type": "enabled", "budget_tokens": 3000},
                messages=[{"role": "user", "content": query}]
            )
            return {"type": "general",
                    "result": next(b.text for b in response.content if b.type == "text")}
# Usage example: route a quadratic-equation query through the unified assistant.
agent = ReasoningAssistant()
result = agent.process("求解:x² - 5x + 6 = 0")
print(result)
科学推导助手(第三类任务)
除了数学和代码,推理模型在科学推导类任务上同样出色:
class ScienceReasoningAgent:
    """Science derivation agent: stepwise reasoning with explicit principles."""

    SYSTEM_PROMPT = """你是一位知识渊博的科学导师,擅长物理、化学、生物。
对于科学推导题,请:
1. 明确标出所用的定律/公式及其适用条件
2. 逐步推导,每步注明物理/化学意义
3. 最终答案包含数值结果和单位
4. 检查量纲(单位)是否一致"""

    def solve(self, problem: str) -> dict:
        """Send *problem* through the reasoning model and package the reply."""
        reply = client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=14000,
            thinking={"type": "enabled", "budget_tokens": 8000},
            system=self.SYSTEM_PROMPT,
            messages=[{"role": "user", "content": problem}],
        )
        # First thinking block if present; the answer text is mandatory
        # (StopIteration propagates if the model returned no text block).
        reasoning = next(
            (blk.thinking for blk in reply.content if blk.type == "thinking"), ""
        )
        answer_text = next(blk.text for blk in reply.content if blk.type == "text")
        return {
            "problem": problem,
            "solution": answer_text,
            # First 300 chars of the thinking trace, for quick inspection.
            "thinking_preview": reasoning[:300] + "..." if reasoning else "",
            "output_tokens": reply.usage.output_tokens,
        }
生产级部署注意事项
完整项目 + FastAPI 接口
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
import json
# Module-level singletons: the FastAPI app and the shared routing assistant.
app = FastAPI(title="推理 Agent API")
assistant = ReasoningAssistant()
@app.post("/solve")
async def solve_problem(request: dict) -> dict:
    """Non-streaming endpoint: block until the full result is available."""
    query = request.get("query", "")
    # Guard clause: reject empty queries up front.
    if not query:
        return {"error": "query is required"}
    return assistant.process(query)
@app.post("/solve/stream")
async def solve_stream(request: dict):
    """SSE endpoint: stream classification, thinking status, and answer deltas."""
    query = request.get("query", "")

    def sse(payload: dict) -> str:
        # Server-Sent Events framing: one JSON object per "data:" line.
        return f"data: {json.dumps(payload)}\n\n"

    async def generate():
        task_type = assistant.classify_task(query)
        yield sse({"type": "classify", "task_type": task_type})
        # NOTE(review): client.messages.stream is the synchronous client —
        # inside an async generator it blocks the event loop while streaming;
        # confirm whether the async Anthropic client should be used here.
        with client.messages.stream(
            model="claude-sonnet-4-6",
            max_tokens=14000,
            thinking={"type": "enabled", "budget_tokens": 8000},
            messages=[{"role": "user", "content": query}],
        ) as stream:
            block_type = None
            for event in stream:
                if not hasattr(event, "type"):
                    continue
                if event.type == "content_block_start":
                    block_type = event.content_block.type
                    yield sse({"type": "block_start", "block_type": block_type})
                elif event.type == "content_block_delta":
                    if block_type == "thinking":
                        # Heartbeat only: thinking content is not forwarded.
                        yield sse({"type": "thinking_delta"})
                    elif block_type == "text":
                        yield sse({"type": "text", "content": event.delta.text})

    return StreamingResponse(generate(), media_type="text/event-stream")
# 启动:uvicorn main:app --host 0.0.0.0 --port 8000
流量突增导致账单爆炸:推理模型单次调用成本是普通 LLM 的 5-30x。一个看似合理的"每用户每天 10 次查询"限制,在 1 万用户时意味着每天 10 万次推理调用,成本可能超过预期 10 倍。必须设置每用户每日成本硬上限(不是次数上限),并在接近上限时降级为普通 LLM。
思考过程不应存储在数据库:Extended Thinking 的 thinking block 可能有几千到几万 token,如果将其完整存储到数据库用于多轮对话,存储成本迅速累积。多轮对话中只需要传递 thinking block 的 hash 或摘要,而不是完整内容(Claude API 多轮对话传递 thinking block 时有特殊要求,详见第4章)。
推理模型 + 流式输出的 UX 问题:推理模型开始输出最终答案前,用户可能等待 5-30 秒的"空白"(思考中)。如果不在前端显示"思考中..."的状态指示,用户会以为系统卡死。必须通过 SSE 推送"thinking"状态,或展示流式的 thinking block 内容,让用户感知到系统在工作。
后续可扩展的方向:1. 加入 LaTeX 渲染(KaTeX.js)展示数学公式;2. 增加 Docker 容器沙箱隔离代码执行;3. 实现向量缓存(相似问题复用答案);4. 添加多轮对话历史管理(LangGraph + MemorySaver);5. 接入监控(LangSmith / OpenTelemetry)跟踪推理质量和成本;6. 加入自定义评估集(第8章方法),持续监控生产质量。
恭喜完成《AI 推理模型开发》全部 10 章!你已经掌握了:慢思考原理与双系统理论(第1章)、Chain-of-Thought 完整工程体系(第2章)、DeepSeek-R1 的 GRPO 训练原理(第3章)、Claude Extended Thinking API(第4章)、推理模型专用提示工程(第5章)、Plan-and-Execute Agent 架构(第6章)、推理+结构化输出的解耦方案(第7章)、推理模型评估体系(第8章)、成本控制与智能路由(第9章)、生产级推理 Agent 实战(第10章)。AI 推理时代已经到来,这套知识将成为你构建下一代 AI 应用的核心竞争力。