目标:配置 Claude Code 的可观测性,实现全链路监控和性能分析
预计时间:30 分钟
对应官方文档:Monitoring Usage、Agent SDK Observability
为什么需要可观测性?
在团队和企业环境中,你需要了解:
- 📊 用量:谁在用?花了多少?
- 🔍 性能:响应有多快?瓶颈在哪里?
- 🐛 问题:为什么出错?如何修复?
- 📈 趋势:使用量增长如何?
OpenTelemetry 集成
架构
配置 Claude Code
# Enable OpenTelemetry export.
# Telemetry is opt-in: nothing is exported unless this flag is set.
export CLAUDE_CODE_ENABLE_TELEMETRY=1
# Select the exporters explicitly; the OTLP endpoint below only applies to OTLP exporters.
export OTEL_METRICS_EXPORTER=otlp
export OTEL_LOGS_EXPORTER=otlp
# The endpoint targets port 4317 (OTLP over gRPC), so state the protocol to match.
export OTEL_EXPORTER_OTLP_PROTOCOL=grpc
export OTEL_EXPORTER_OTLP_ENDPOINT=http://collector:4317
export OTEL_SERVICE_NAME=claude-code-team
export OTEL_RESOURCE_ATTRIBUTES="team=backend,env=production"
# Start Claude Code
claude
Agent SDK 配置
from claude_agent_sdk import Agent, ObservabilityConfig

# NOTE(review): confirm that `Agent` and `ObservabilityConfig` are exported by the
# installed claude_agent_sdk version before relying on this snippet.
# Create an agent configured with an OTLP exporter pointing at the collector.
agent = Agent(
    model="claude-sonnet-4-6",
    observability=ObservabilityConfig(
        exporter="otlp",
        endpoint="http://otel-collector:4317",
        service_name="code-review-agent",
        # Custom attributes supplied alongside the service name.
        attributes=dict(team="platform", project="auth-service"),
    ),
)
收集的遥测数据
Traces(调用链)
{
  "trace_id": "abc123",
  "span_id": "def456",
  "name": "claude.task",
  "start_time": "2025-06-18T10:00:00Z",
  "end_time": "2025-06-18T10:00:05Z",
  "attributes": {
    "claude.model": "claude-sonnet-4-6",
    "claude.input_tokens": 4520,
    "claude.output_tokens": 890,
    "claude.tool_calls": 3,
    "claude.files_read": 5,
    "claude.files_written": 2
  },
  "events": [
    {
      "time": "10:00:01",
      "name": "thinking_start"
    },
    {
      "time": "10:00:03",
      "name": "tool_call",
      "attributes": {
        "tool": "file_read"
      }
    },
    {
      "time": "10:00:05",
      "name": "completion"
    }
  ]
}
Metrics(指标)
| 指标名 | 类型 | 说明 |
|---|---|---|
| `claude.tokens.input` | Counter | 输入 token 数 |
| `claude.tokens.output` | Counter | 输出 token 数 |
| `claude.requests.duration` | Histogram | 请求耗时 |
| `claude.tool.calls` | Counter | 工具调用次数 |
| `claude.errors` | Counter | 错误次数 |
Logs(日志)
{
  "timestamp": "2025-06-18T10:00:05Z",
  "level": "INFO",
  "message": "Task completed",
  "attributes": {
    "session_id": "sess_abc",
    "task": "refactor auth",
    "duration_ms": 5200,
    "success": true
  }
}
监控仪表盘
Grafana 配置
# docker-compose.yaml
# Minimal stack: Grafana for dashboards, Prometheus for metrics, Jaeger for traces.
version: '3'

services:
  grafana:
    image: grafana/grafana:latest
    ports:
      - "3000:3000"
    volumes:
      # Dashboard provisioning files are read from the local ./grafana/dashboards folder.
      - ./grafana/dashboards:/etc/grafana/provisioning/dashboards

  prometheus:
    image: prom/prometheus:latest
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml

  jaeger:
    image: jaegertracing/all-in-one:latest
    ports:
      # Jaeger web UI
      - "16686:16686"
关键看板
┌─────────────────────────────────────────────┐
│ Claude Code 监控仪表盘 │
├─────────────────────────────────────────────┤
│ │
│ 今日用量: $45.20 │ 活跃会话: 12 │
│ 较昨日: +15% │ 等待确认: 3 │
│ │
├─────────────────────────────────────────────┤
│ 请求耗时分布 │
│ ▓▓▓▓▓▓▓░░░ 平均 3.2s │
│ P50: 2.1s P95: 8.5s P99: 15.2s │
│ │
├─────────────────────────────────────────────┤
│ 错误率趋势 │
│ ──────╱╲────── 2.3% │
│ │
├─────────────────────────────────────────────┤
│ Top 5 用户 Top 5 项目 │
│ 1. alice $12.50 1. backend $18.20 │
│ 2. bob $10.30 2. mobile $15.40 │
│ ... │
└─────────────────────────────────────────────┘
告警配置
Prometheus Rules
# alerting-rules.yaml
groups:
  - name: claude-alerts
    rules:
      # Spend over the last 24h is more than double the spend of the preceding 24h.
      # The cost series is a cumulative counter, so compare per-window increases
      # rather than raw counter values.
      - alert: HighCostSpike
        expr: |
          (
            sum(increase(claude_cost_usd_total[1d]))
            / sum(increase(claude_cost_usd_total[1d] offset 1d))
          ) > 2
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "Claude Code 成本激增"

      # More than 5% of requests failed over the last 5 minutes.
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(claude_errors_total[5m]))
            / sum(rate(claude_requests_total[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "错误率超过 5%"

      # Overall P95 latency above 30s; buckets are aggregated by `le` so the
      # quantile is computed across all series instead of once per label set.
      - alert: SlowResponse
        expr: |
          histogram_quantile(0.95,
            sum by (le) (rate(claude_request_duration_seconds_bucket[5m]))
          ) > 30
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "P95 响应时间超过 30 秒"
调试技巧
查看详细 Trace
# Export the trace of the current session to trace.json.
# NOTE(review): confirm that the `export-trace` subcommand exists in the installed CLI version.
claude export-trace --session sess_abc123 --output trace.json
# Analyze the exported file
jq '.spans[] | {name, duration_ms}' trace.json
性能分析
# Find the most time-consuming operations.
# NOTE(review): confirm that the `profile` subcommand exists in the installed CLI version.
claude profile --last-hour
# Sample output:
# Operation    Count  Avg(ms)  Total(ms)
# file_read       45      120       5400
# bash_exec       12      850      10200
# model_call 8 3200 25600
完整的 Grafana 仪表盘配置
{
  "dashboard": {
    "title": "Claude Code 企业监控",
    "tags": ["claude", "ai", "monitoring"],
    "timezone": "browser",
    "panels": [
      {
        "title": "实时用量概览",
        "type": "stat",
        "targets": [
          {
            "expr": "sum(claude_tokens_input_total) + sum(claude_tokens_output_total)",
            "legendFormat": "总 Token 数"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "short",
            "color": {
              "mode": "thresholds"
            },
            "thresholds": {
              "mode": "absolute",
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 1000000},
                {"color": "red", "value": 5000000}
              ]
            }
          }
        }
      },
      {
        "title": "Top 10 活跃用户",
        "type": "table",
        "targets": [
          {
            "expr": "topk(10, sum by (user) (claude_cost_usd_total))",
            "format": "table"
          }
        ]
      },
      {
        "title": "模型使用分布",
        "type": "piechart",
        "targets": [
          {
            "expr": "sum by (model) (claude_requests_total)",
            "legendFormat": "{{model}}"
          }
        ]
      },
      {
        "title": "错误率趋势",
        "type": "timeseries",
        "targets": [
          {
            "expr": "sum(rate(claude_errors_total[5m])) / sum(rate(claude_requests_total[5m]))",
            "legendFormat": "错误率"
          }
        ]
      },
      {
        "title": "请求耗时热力图",
        "type": "heatmap",
        "targets": [
          {
            "expr": "sum by (le) (rate(claude_request_duration_seconds_bucket[5m]))",
            "format": "heatmap"
          }
        ]
      }
    ]
  }
}
Docker Compose 一键部署
# observability-stack.yaml
# Full stack: OTel Collector -> Prometheus (metrics), Jaeger (traces), Loki (logs),
# visualized in Grafana.
# NOTE: every image uses the floating `latest` tag; pin explicit versions for
# reproducible deployments.
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:latest
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - prometheus-data:/prometheus
    ports:
      - "9090:9090"

  jaeger:
    image: jaegertracing/all-in-one:latest
    ports:
      - "16686:16686"
      - "14268:14268"

  loki:
    image: grafana/loki:latest
    ports:
      - "3100:3100"
    volumes:
      - ./loki-config.yml:/etc/loki/local-config.yaml

  grafana:
    image: grafana/grafana:latest
    ports:
      - "3000:3000"
    volumes:
      - ./grafana/dashboards:/etc/grafana/provisioning/dashboards
      - ./grafana/datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml
      - grafana-data:/var/lib/grafana
    environment:
      # Do not ship a hard-coded admin password on a published port: the value is
      # read from the shell or .env file, and compose aborts if it is unset.
      GF_SECURITY_ADMIN_PASSWORD: '${GRAFANA_ADMIN_PASSWORD:?set GRAFANA_ADMIN_PASSWORD}'

  otel-collector:
    image: otel/opentelemetry-collector-contrib:latest
    command: ["--config=/etc/otel-collector-config.yaml"]
    volumes:
      - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml
    ports:
      # OTLP receivers: 4317 = gRPC, 4318 = HTTP
      - "4317:4317"
      - "4318:4318"

# Named volumes keep Prometheus and Grafana data across container restarts.
volumes:
  prometheus-data: {}
  grafana-data: {}
高级教程完成!
恭喜完成所有高级教程!你已经掌握了:
- ✅ Agent SDK 开发
- ✅ Hooks 自动化
- ✅ 插件开发与市场
- ✅ 动态工作流编排
- ✅ 企业部署管理
- ✅ 安全加固
- ✅ CI/CD 集成
- ✅ 可观测性监控
继续探索
- 📖 完整参考:官方文档
- 💬 社区:Discord
- 🐙 示例代码:GitHub Examples
本教程持续更新中,欢迎反馈和建议!