1. 防中断机制
1.1 中断场景分类
| 中断类型 | 触发原因 | 恢复难度 | 关键机制 |
|---|---|---|---|
| 网络/API 故障 | API 超时、连接断开 | 低 | 自动重试、指数退避 |
| 系统崩溃 | 内存不足、进程被杀 | 中 | 状态持久化、检查点 |
| 用户手动停止 | Ctrl+C、任务取消 | 低 | 优雅关闭、状态保存 |
| 资源耗尽 | Token 用完、磁盘满 | 高 | 资源监控、提前告警 |
| 超时终止 | 执行时间超过限制 | 中 | 分段检查点、断点续传 |
1.2 三层状态持久化架构
flowchart TB
A[Tier 1: 会话状态<br/>内存中] --> B[Tier 2: 文件状态<br/>JSON/MD]
B --> C[Tier 3: 事件日志<br/>Append-only]
Tier 2 状态文件(.current_status.json):
{
"version": "1.0.0",
"session": {
"id": "session_1710928200_a3f9",
"platform": "opencode",
"model": "glm-4-flash"
},
"task": {
"name": "研究报告生成",
"progress": 65,
"status": "in_progress"
},
"state": {
"current_task": "撰写技术架构章节",
"completed_tasks": ["研究背景", "文献综述"],
"pending_tasks": ["技术架构", "案例分析", "结论"]
},
"metrics": {
"loop_count": 12,
"api_calls": 24,
"last_update": "2026-03-20T11:45:00Z"
},
"safety": {
"circuit_breaker_state": "closed",
"no_progress_count": 0
}
}
1.3 检查点机制
#!/bin/bash
# checkpoint_manager.sh
#######################################
# Snapshot .current_status.json into a new checkpoint-<id>.json file in the
# current directory and prune old checkpoints so that only the five most
# recently modified ones remain.
# Arguments: none
# Outputs:   a confirmation line on stdout; errors on stderr
# Returns:   0 on success, 1 if the status file is missing/empty or the
#            checkpoint could not be written
#######################################
create_checkpoint() {
  local status_file=".current_status.json"
  local suffix checkpoint_id checkpoint_file tmp_file
  local timestamp git_hash state stale

  # Without a status snapshot the checkpoint would contain `"state": ,`,
  # which is not valid JSON and can never be restored.
  if [[ ! -s "$status_file" ]]; then
    echo "错误: 状态文件 $status_file 不存在或为空,无法创建检查点" >&2
    return 1
  fi
  state=$(cat -- "$status_file") || return 1

  # Random suffix keeps ids unique within the same second; fall back to
  # $RANDOM when openssl is unavailable.
  suffix=$(openssl rand -hex 4 2>/dev/null) || suffix=""
  if [[ -z "$suffix" ]]; then
    printf -v suffix '%04x%04x' "$RANDOM" "$RANDOM"
  fi
  checkpoint_id="cp_$(date +%s)_${suffix}"
  checkpoint_file="checkpoint-${checkpoint_id}.json"
  tmp_file="${checkpoint_file}.tmp"

  # `date -Iseconds` is a GNU extension; fall back to a portable UTC format.
  timestamp=$(date -Iseconds 2>/dev/null) \
    || timestamp=$(date -u +%Y-%m-%dT%H:%M:%SZ)

  # --verify --quiet prints nothing on failure, so the fallback can never be
  # appended to partial output (which would put a newline inside the string).
  git_hash=$(git rev-parse --verify --quiet HEAD 2>/dev/null) || git_hash=""
  [[ -n "$git_hash" ]] || git_hash="none"

  # Write to a temp name first so an interrupted write never leaves a
  # truncated file matching checkpoint-*.json.
  if ! printf '{\n"checkpoint_id": "%s",\n"timestamp": "%s",\n"state": %s,\n"git_hash": "%s"\n}\n' \
    "$checkpoint_id" "$timestamp" "$state" "$git_hash" > "$tmp_file"; then
    rm -f -- "$tmp_file"
    return 1
  fi
  mv -- "$tmp_file" "$checkpoint_file" || return 1

  # Keep the 5 newest checkpoints. File names are generated above and contain
  # no whitespace, so line-wise parsing of `ls -t` is safe here.
  ls -t checkpoint-*.json 2>/dev/null | tail -n +6 \
    | while IFS= read -r stale; do
        rm -f -- "$stale"
      done

  echo "检查点已创建: $checkpoint_file"
}
#######################################
# Restore .current_status.json from the "state" member of a checkpoint file.
# Arguments: $1 - path to the checkpoint JSON file
# Outputs:   a status or error line on stdout
# Returns:   0 on success; 1 if the file is missing, is not valid JSON, has
#            no object-valued "state", or the status file cannot be written
#######################################
recover_from_checkpoint() {
  local checkpoint_file="$1"
  local status_file=".current_status.json"
  local tmp_file="${status_file}.tmp"

  if [[ ! -f "$checkpoint_file" ]]; then
    echo "错误:检查点文件不存在"
    return 1
  fi

  # Syntax check.
  if ! jq empty "$checkpoint_file" 2>/dev/null; then
    echo "错误:检查点文件损坏"
    return 1
  fi

  # `jq empty` also accepts an empty file or a document without .state;
  # restoring from those would overwrite the live status with `null`.
  if ! jq -e '.state | type == "object"' "$checkpoint_file" >/dev/null 2>&1; then
    echo "错误:检查点缺少有效的 state 字段"
    return 1
  fi

  # Extract into a temp file first: redirecting straight into the status file
  # would truncate it before jq has produced any output.
  if ! jq '.state' "$checkpoint_file" > "$tmp_file"; then
    rm -f -- "$tmp_file"
    echo "错误:检查点文件损坏"
    return 1
  fi
  mv -- "$tmp_file" "$status_file" || return 1

  echo "已从检查点恢复: $checkpoint_file"
}
1.4 指数退避重试
#!/bin/bash
# retry_with_backoff.sh
#######################################
# Run a command, retrying with exponential backoff plus jitter on failure.
# Arguments: $1   - maximum number of attempts (default 5)
#            $2.. - the command; the words are joined with spaces and run
#                   through eval, so callers must pass trusted input only
# Outputs:   progress messages on stdout, plus whatever the command prints
# Returns:   0 as soon as the command succeeds
#            1 if the command exits with status 1 (treated as permanent) or
#              when all attempts are exhausted
#            2 if $1 is not a non-negative integer
#######################################
retry_with_backoff() {
  local max_retries="${1:-5}"
  local command="${*:2}"
  local base_delay=1
  local max_delay=300 # cap: 5 minutes
  local attempt=0
  local exit_code delay jitter_ms total_delay

  if [[ ! "$max_retries" =~ ^[0-9]+$ ]]; then
    echo "错误: 重试次数必须是非负整数: $max_retries" >&2
    return 2
  fi

  while ((attempt < max_retries)); do
    # Capture the status inside the else branch: after `fi` the value of $?
    # is that of the `if` compound itself, which is 0 when no branch ran.
    if eval "$command"; then
      return 0
    else
      exit_code=$?
    fi
    attempt=$((attempt + 1))

    # Exit status 1 marks a permanent error: stop retrying.
    if ((exit_code == 1)); then
      echo "永久性错误,停止重试"
      return "$exit_code"
    fi

    # Nothing follows the last attempt, so do not sleep after it.
    if ((attempt >= max_retries)); then
      break
    fi

    # Exponential backoff. 2^9 already exceeds the cap, so skip the power for
    # larger exponents to avoid 64-bit overflow.
    if ((attempt - 1 >= 9)); then
      delay=$max_delay
    else
      delay=$((base_delay * (2 ** (attempt - 1))))
      if ((delay > max_delay)); then
        delay=$max_delay
      fi
    fi

    # Sub-second jitter. Integer division made the original jitter always 0,
    # so it is carried as milliseconds and rendered as a decimal fraction.
    # NOTE: relies on sleep(1) accepting fractional seconds (GNU/BSD do).
    if ((delay >= max_delay)); then
      jitter_ms=0
    else
      jitter_ms=$((RANDOM % 1000))
    fi
    printf -v total_delay '%d.%03d' "$delay" "$jitter_ms"

    echo "尝试 $attempt/$max_retries 失败,${total_delay}秒后重试..."
    sleep "$total_delay"
  done

  echo "达到最大重试次数"
  return 1
}
2. 防循环陷阱策略
2.1 五层保护架构
flowchart TB
A[Layer 1: 双重退出检测] --> B{完成?}
B -->|否| C[Layer 2: 断路器]
C --> D{停滞?}
D -->|否| E[Layer 3: 速率限制]
E --> F{超限?}
F -->|否| G[Layer 4: 最大迭代]
G --> H{超限?}
H -->|否| I[Layer 5: 内容检测]
I --> J{重复?}
J -->|否| K[继续循环]
2.2 双重退出检测实现
#!/bin/bash
# dual_exit_detection.sh
# Minimum completion-keyword count that should_exit requires, in addition to
# an explicit exit signal, before it allows the loop to stop.
MIN_COMPLETION_INDICATORS=2
#######################################
# Tally completion-keyword hits in a piece of agent output.
# For each keyword the tally grows by the number of LINES that contain it
# (case-insensitive substring match): grep -c counts matching lines, not
# individual occurrences.
# Arguments: $1 - text to scan
# Outputs:   the total tally on stdout
#######################################
detect_completion_indicators() {
  local output="$1"
  local -a vocabulary=(complete done finished ready implemented)
  local total=0
  local word hits

  for word in "${vocabulary[@]}"; do
    hits=$(grep -oic "$word" <<< "$output" | head -1)
    total=$((total + hits))
  done

  echo "$total"
}
#######################################
# Report whether the output carries an explicit "finished" signal.
# Two forms are recognised: a JSON-style "EXIT_SIGNAL": true pair, or the
# literal <<COMPLETED>> marker.
# Arguments: $1 - text to scan
# Outputs:   "true" or "false" on stdout
#######################################
extract_exit_signal() {
  local output="$1"
  local verdict="false"

  if grep -qE '"EXIT_SIGNAL":\s*true' <<< "$output" \
    || grep -q "<<COMPLETED>>" <<< "$output"; then
    verdict="true"
  fi

  echo "$verdict"
}
#######################################
# Decide whether the loop may stop. Both conditions must hold: enough
# completion keywords AND an explicit exit signal.
# Globals:   MIN_COMPLETION_INDICATORS (read)
# Arguments: $1 - text to evaluate
# Returns:   0 when exiting is allowed, 1 to keep looping
#######################################
should_exit() {
  local output="$1"
  local indicators=$(detect_completion_indicators "$output")
  local exit_signal=$(extract_exit_signal "$output")

  # Guard 1: not enough completion keywords.
  if ! [ "$indicators" -ge "$MIN_COMPLETION_INDICATORS" ]; then
    return 1
  fi

  # Guard 2: no explicit exit signal.
  if [ "$exit_signal" != "true" ]; then
    return 1
  fi

  return 0
}
2.3 断路器实现
#!/bin/bash
# circuit_breaker.sh
# JSON file in which the circuit breaker persists its state and counters.
CB_STATE_FILE=".circuit_breaker_state"
# No-progress count (calls with no changed files) at which the breaker opens.
CB_NO_PROGRESS_THRESHOLD=3
# Same-error count (identical error fingerprint) at which the breaker opens.
CB_SAME_ERROR_THRESHOLD=5
# Cooldown in minutes. Not referenced by the functions visible in this script;
# presumably consumed by OPEN -> HALF_OPEN recovery logic elsewhere (TODO confirm).
CB_COOLDOWN_MINUTES=30
#######################################
# Create the circuit-breaker state file with a CLOSED state and zeroed
# counters. An existing state file is left untouched.
# Globals:   CB_STATE_FILE (read)
#######################################
init_circuit_breaker() {
  # Already initialised: nothing to do.
  [ -f "$CB_STATE_FILE" ] && return 0

  printf '%s\n' \
    '{' \
    '"state": "CLOSED",' \
    '"no_progress_count": 0,' \
    '"same_error_count": 0,' \
    '"last_error_pattern": ""' \
    '}' > "$CB_STATE_FILE"
}
#######################################
# Update the circuit-breaker counters for one loop iteration and open the
# breaker when a threshold is reached.
# Globals:   CB_STATE_FILE, CB_NO_PROGRESS_THRESHOLD,
#            CB_SAME_ERROR_THRESHOLD (read; defaults: .circuit_breaker_state,
#            3 and 5)
# Arguments: $1 - number of files changed in this iteration (non-numeric
#                 values count as 0, i.e. no progress)
#            $2 - "true" if the iteration produced errors
#            $3 - iteration output, used to fingerprint the error lines
# Outputs:   a notice on stdout when the breaker opens; warnings on stderr
# Returns:   0 while the thresholds are not reached, 1 when the breaker opens
#######################################
check_circuit_breaker() {
  local files_changed="${1:-0}"
  local has_errors="${2:-false}"
  local output="${3:-}"
  local state_file="${CB_STATE_FILE:-.circuit_breaker_state}"
  local np_threshold="${CB_NO_PROGRESS_THRESHOLD:-3}"
  local se_threshold="${CB_SAME_ERROR_THRESHOLD:-5}"
  local tmp_file="${state_file}.tmp"
  local base='{"state": "CLOSED"}'
  local existing no_progress same_error last_error current_error
  local tripped="false"

  # Start from the persisted state when it is a readable JSON object;
  # otherwise from a fresh CLOSED state. A missing or empty file used to
  # yield empty counters, a failing jq and a clobbered state file.
  if [[ -s "$state_file" ]] \
    && existing=$(jq -c 'select(type == "object")' "$state_file" 2>/dev/null) \
    && [[ -n "$existing" ]]; then
    base=$existing
  fi

  no_progress=$(jq -r '.no_progress_count // 0' <<< "$base" 2>/dev/null)
  same_error=$(jq -r '.same_error_count // 0' <<< "$base" 2>/dev/null)
  last_error=$(jq -r '.last_error_pattern // ""' <<< "$base" 2>/dev/null)
  [[ "$no_progress" =~ ^[0-9]+$ ]] || no_progress=0
  [[ "$same_error" =~ ^[0-9]+$ ]] || same_error=0
  [[ "$files_changed" =~ ^[0-9]+$ ]] || files_changed=0

  # Progress detection: any changed file resets both counters.
  if ((10#$files_changed > 0)); then
    no_progress=0
    same_error=0
  else
    no_progress=$((no_progress + 1))
  fi

  # Error detection: fingerprint the first three error-looking lines and
  # compare with the fingerprint stored by the previous call.
  if [[ "$has_errors" == "true" ]]; then
    current_error=$(printf '%s\n' "$output" \
      | grep -iE "error|exception|failed" \
      | head -3 \
      | md5sum \
      | cut -d' ' -f1)
    if [[ "$current_error" == "$last_error" ]]; then
      same_error=$((same_error + 1))
    else
      same_error=1
    fi
    last_error=$current_error
  fi

  if ((no_progress >= np_threshold || same_error >= se_threshold)); then
    tripped="true"
  fi

  # One write for all fields; the state file is replaced only if jq
  # succeeded, so a failed update never destroys the previous state.
  if jq --argjson np "$no_progress" \
    --argjson se "$same_error" \
    --arg err "$last_error" \
    --argjson tripped "$tripped" \
    '.state //= "CLOSED"
     | .no_progress_count = $np
     | .same_error_count = $se
     | .last_error_pattern = $err
     | if $tripped then .state = "OPEN" else . end' \
    <<< "$base" > "$tmp_file"; then
    mv -- "$tmp_file" "$state_file"
  else
    rm -f -- "$tmp_file"
    echo "警告: 无法更新断路器状态文件" >&2
  fi

  if [[ "$tripped" == "true" ]]; then
    echo "断路器已打开"
    return 1
  fi
  return 0
}
2.4 速率限制实现
#!/bin/bash
# rate_limiter.sh
# Call count at which check_rate_limit waits for the current window to end.
MAX_API_CALLS_PER_HOUR=100
# Length of one rate-limit window in seconds (added to the start time to get hour_end).
RATE_LIMIT_COOLDOWN=3600
#######################################
# Create .rate_limit_state with a zero call count and a window that starts
# now. An existing state file is left untouched.
# Globals:   RATE_LIMIT_COOLDOWN (read; window length in seconds, default 3600)
# Returns:   0 on success or when the file already exists
#######################################
init_rate_limiter() {
  local state_file=".rate_limit_state"
  local window="${RATE_LIMIT_COOLDOWN:-3600}"
  local now

  if [[ -f "$state_file" ]]; then
    return 0
  fi

  # Read the clock once: two separate `date` calls can straddle a second
  # boundary and produce a window that is not exactly $window long.
  now=$(date +%s) || return 1

  printf '{\n"call_count": 0,\n"hour_start": %s,\n"hour_end": %s\n}\n' \
    "$now" "$((now + window))" > "$state_file"
}
# Write a fresh rate-limit window starting at epoch second $1.
_reset_rate_limit_window() {
  local now="$1"
  local window="${RATE_LIMIT_COOLDOWN:-3600}"

  printf '{\n"call_count": 0,\n"hour_start": %s,\n"hour_end": %s\n}\n' \
    "$now" "$((now + window))" > .rate_limit_state
}

#######################################
# Enforce the hourly API budget stored in .rate_limit_state.
# - missing/unreadable state or expired window: start a fresh window
# - budget exhausted: sleep until the window ends, then start a fresh window
# Globals:   MAX_API_CALLS_PER_HOUR (read, default 100),
#            RATE_LIMIT_COOLDOWN (read, default 3600)
# Outputs:   a notice on stdout when it has to wait
# Returns:   0
#######################################
check_rate_limit() {
  local state_file=".rate_limit_state"
  local max_calls="${MAX_API_CALLS_PER_HOUR:-100}"
  local call_count="" hour_end=""
  local current_time wait_seconds

  current_time=$(date +%s)

  if [[ -s "$state_file" ]]; then
    call_count=$(jq -r '.call_count // empty' "$state_file" 2>/dev/null) \
      || call_count=""
    hour_end=$(jq -r '.hour_end // empty' "$state_file" 2>/dev/null) \
      || hour_end=""
  fi

  # A missing or corrupt state file used to cause `[` errors and a silent
  # return 0 without ever creating a window; start a fresh window instead.
  if [[ ! "$call_count" =~ ^[0-9]+$ || ! "$hour_end" =~ ^[0-9]+$ ]]; then
    _reset_rate_limit_window "$current_time"
    return 0
  fi

  # Window expired: reset.
  if ((current_time >= 10#$hour_end)); then
    _reset_rate_limit_window "$current_time"
    return 0
  fi

  # Budget exhausted: wait out the remainder of the window, then reset.
  if ((10#$call_count >= max_calls)); then
    wait_seconds=$((10#$hour_end - current_time))
    echo "达到 API 速率限制,等待 $((wait_seconds / 60)) 分钟..."
    sleep "$wait_seconds"
    _reset_rate_limit_window "$(date +%s)"
  fi

  return 0
}
#######################################
# Increment call_count in .rate_limit_state by one.
# Outputs:   an error on stderr when the state file is missing or empty
# Returns:   0 on success; 1 if the state file is missing, empty, or could
#            not be updated (the existing file is then left unchanged)
#######################################
record_api_call() {
  local state_file=".rate_limit_state"
  local tmp_file="${state_file}.tmp"

  if [[ ! -s "$state_file" ]]; then
    echo "错误: 速率限制状态文件不存在" >&2
    return 1
  fi

  # Increment inside jq and replace the state file only when jq succeeded;
  # an unconditional mv would overwrite it with an empty file on failure.
  if jq '.call_count = ((.call_count // 0) + 1)' "$state_file" > "$tmp_file"; then
    mv -- "$tmp_file" "$state_file"
  else
    rm -f -- "$tmp_file"
    return 1
  fi
}
3. 监控与告警
3.1 监控面板
#!/bin/bash
# monitor.sh
#######################################
# Render a text dashboard from .current_status.json, .api_calls and
# .circuit_breaker_state, refreshing every 2 seconds.
# Globals:   MAX_API_CALLS_PER_HOUR (read; API budget shown, default 100)
# Arguments: $1 - optional number of refreshes to render before returning;
#                 0, omitted or non-numeric means loop until interrupted
# Outputs:   the dashboard on stdout
# Returns:   0 after the requested number of refreshes
#######################################
display_dashboard() {
  local max_refreshes="${1:-0}"
  local api_limit="${MAX_API_CALLS_PER_HOUR:-100}"
  local bar_width=50
  local refreshes=0
  local status progress loop_count current_task
  local api_calls api_percent filled bar cb_state i

  [[ "$max_refreshes" =~ ^[0-9]+$ ]] || max_refreshes=0
  [[ "$api_limit" =~ ^[1-9][0-9]*$ ]] || api_limit=100

  while true; do
    clear
    echo "╔════════════════════════════════════════════════════════╗"
    echo "║ Auto-Research 实时监控面板 ║"
    echo "╚════════════════════════════════════════════════════════╝"
    echo ""

    status=$(cat .current_status.json 2>/dev/null) || status='{}'
    progress=$(echo "$status" | jq -r '.task.progress // 0')
    loop_count=$(echo "$status" | jq -r '.metrics.loop_count // 0')
    current_task=$(echo "$status" | jq -r '.state.current_task // "未知"')
    echo "📊 进度: ${progress}% | 🔄 循环: $loop_count | 📝 任务: $current_task"
    echo ""

    # API usage. An empty or non-numeric .api_calls counts as 0 so the
    # arithmetic below cannot fail.
    api_calls=$(cat .api_calls 2>/dev/null) || api_calls=0
    [[ "$api_calls" =~ ^[0-9]+$ ]] || api_calls=0
    api_calls=$((10#$api_calls))
    api_percent=$((api_calls * 100 / api_limit))
    echo "📞 API 调用: $api_calls/$api_limit (${api_percent}%)"

    # Progress bar, built glyph by glyph. `printf '█%.0s' $(seq 1 0)` prints
    # one glyph even with no arguments, which made the bar wrong at 0% and
    # 100% and let it overflow above 100%.
    filled=$((api_percent * bar_width / 100))
    if ((filled > bar_width)); then
      filled=$bar_width
    fi
    bar=""
    for ((i = 0; i < bar_width; i++)); do
      if ((i < filled)); then
        bar+="█"
      else
        bar+="░"
      fi
    done
    printf '[%s]\n' "$bar"
    echo ""

    # Circuit-breaker state.
    cb_state=$(jq -r '.state' .circuit_breaker_state 2>/dev/null || echo "unknown")
    case "$cb_state" in
      "CLOSED") echo "🟢 断路器: 正常" ;;
      "HALF_OPEN") echo "🟡 断路器: 监控" ;;
      "OPEN") echo "🔴 断路器: 打开" ;;
      *) echo "⚪ 断路器: 未知" ;;
    esac
    echo ""
    echo "按 Ctrl+C 退出监控"

    refreshes=$((refreshes + 1))
    if ((10#$max_refreshes > 0 && refreshes >= 10#$max_refreshes)); then
      return 0
    fi
    sleep 2
  done
}
# Script entry point: start the dashboard when monitor.sh is run.
display_dashboard
4. 故障恢复
4.1 自动恢复流程
#!/bin/bash
# auto_recovery.sh
#######################################
# Apply the recovery action that matches a failure category.
# Arguments: $1 - failure type: "network", "api_rate_limit" or
#                 "session_expired"
# Outputs:   a progress or error message on stdout
# Returns:   0 for the three known types, 1 for anything else
#######################################
auto_recover() {
  local failure_type="$1"

  if [[ "$failure_type" == "network" ]]; then
    echo "等待 10 秒后重试..."
    sleep 10
    return 0
  elif [[ "$failure_type" == "api_rate_limit" ]]; then
    # Waits until the rate-limit window resets.
    check_rate_limit
    return 0
  elif [[ "$failure_type" == "session_expired" ]]; then
    echo "重新初始化会话..."
    # A new session ID would be generated here.
    return 0
  fi

  echo "未知故障类型,无法自动恢复"
  return 1
}
4.2 恢复验证检查清单
| 检查项 | 验证方法 | 通过标准 |
|---|---|---|
| 状态文件完整性 | jq empty .current_status.json | 无 JSON 错误 |
| 引用的文件存在 | ls -la 检查 | 所有文件存在 |
| Git 仓库状态 | git status | 无冲突 |
| API 密钥有效 | 测试调用 | 返回 200 |
| 断路器状态 | jq -r '.state' | CLOSED 或 HALF_OPEN |
| 速率限制 | cat .api_calls | < MAX_API_CALLS |
5. 关键配置参数
| 参数 | 推荐值 | 说明 |
|---|---|---|
| MAX_ITERATIONS | 50 | 最大循环次数 |
| MAX_API_CALLS_PER_HOUR | 100 | 每小时 API 调用上限 |
| CB_NO_PROGRESS_THRESHOLD | 3 | 断路器:无进展阈值 |
| CB_SAME_ERROR_THRESHOLD | 5 | 断路器:相同错误阈值 |
| MIN_COMPLETION_INDICATORS | 2 | 最小完成指示器数 |
| CB_COOLDOWN_MINUTES | 30 | 断路器冷却时间 |