关键代码验证
技术研究 LLM 置信度评分
核心代码实现、集成方式与关键配置
一、Temperature Scaling 实现
1.1 基础实现(2 行代码)
import math

import torch
import torch.nn.functional as F
class TemperatureScaling:
    """Post-hoc confidence calibration via temperature scaling.

    A single scalar T rescales the logits before softmax: T > 1 softens
    (lowers) confidences, T < 1 sharpens them.
    """

    def __init__(self, temperature=1.0):
        # T = 1.0 is the identity (uncalibrated) transform.
        self.temperature = temperature

    def calibrate(self, logits):
        """Return temperature-scaled softmax probabilities for `logits`."""
        return F.softmax(logits / self.temperature, dim=-1)

    def fit(self, logits, labels):
        """Learn the optimal temperature on a validation set by minimizing NLL.

        Args:
            logits: array-like [N, num_classes] of raw model scores.
            labels: array-like of N integer class indices.

        Returns:
            self, with `self.temperature` set to the fitted value.
        """
        from scipy.optimize import minimize_scalar

        logits_t = torch.as_tensor(logits, dtype=torch.float32)
        labels_t = torch.as_tensor(labels, dtype=torch.long)

        def nll_loss(T):
            # Bug fix: the original computed calibrate(logits / T), dividing
            # by T here AND by self.temperature inside calibrate — only
            # correct while self.temperature happened to be 1.0. Use
            # log_softmax directly for a single, numerically stable division.
            log_probs = F.log_softmax(logits_t / T, dim=-1)
            return -log_probs[torch.arange(len(labels_t)), labels_t].mean().item()

        # NLL as a function of T is unimodal, so bounded scalar
        # minimization (golden-section) converges quickly.
        result = minimize_scalar(nll_loss, bounds=(0.1, 10.0), method='bounded')
        self.temperature = result.x
        return self
1.2 使用示例
# Prepare a held-out validation set.
# NOTE(review): `model` and `get_labels` are not defined in this file —
# presumably provided by the surrounding project; verify before running.
val_logits = model.get_logits(val_texts)  # expected shape [N, num_classes]
val_labels = get_labels(val_texts)
# Fit the calibrator once on validation data, then reuse it at inference.
calibrator = TemperatureScaling()
calibrator.fit(val_logits, val_labels)
# Apply calibration at inference time.
test_logits = model.get_logits(["测试问题"])
calibrated_probs = calibrator.calibrate(torch.tensor(test_logits))
confidence = calibrated_probs.max().item()  # calibrated confidence score
1.3 关键配置
| 参数 | 推荐值 | 说明 |
|---|---|---|
| 验证集大小 | 100-500 | 过小易过拟合,过大浪费 |
| 温度范围 | [0.1, 10.0] | 覆盖大部分校准需求 |
| 优化方法 | Bounded (Golden Section) | 单峰函数,快速收敛 |
二、Direct Prompting 实现
2.1 基础置信度 Elicitation
from openai import OpenAI
client = OpenAI()
def get_confidence_with_direct_prompting(question: str) -> dict:
    """Answer `question` with GPT-4 and parse the self-reported confidence.

    Returns a dict with the answer text, the parsed confidence in [0, 1]
    (None when the model omitted the score), and the raw response.
    """
    import re

    prompt = f"""请回答以下问题,并在最后提供你的置信度评分。
问题:{question}
请在回答的最后,按照以下格式提供置信度:
置信度:<0-100 的整数>%
理由:<简要说明为什么是这个置信度>"""
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,  # low temperature for a deterministic answer
    )
    answer = response.choices[0].message.content

    # Pull "置信度:NN%" out of the reply; None when the format was ignored.
    found = re.search(r'置信度:\s*(\d+)%', answer)
    score = int(found.group(1)) / 100 if found else None

    return {
        "answer": answer,
        "confidence": score,
        "raw_response": answer,
    }
2.2 数值尺度校准
def calibrated_direct_prompting(question: str, scale: str = "0-100") -> dict:
    """Elicit a confidence score on a calibrated numeric scale.

    Research finding: models are better calibrated on a "0-100" scale
    than on a "0-10" scale.
    """
    scale_instructions = {
        "0-100": "0=完全不确定,100=绝对确定",
        "0-10": "0=完全不确定,10=绝对确定",
        "percentage": "0%=完全不确定,100%=绝对确定",
    }
    # System message nudges the model toward honest (less overconfident) scores.
    system_msg = "你是一个诚实的助手。当不确定时,请如实报告低置信度。过度自信会导致用户信任受损。"
    user_msg = f"""问题:{question}
请在回答后提供置信度评分({scale_instructions[scale]})。
置信度评分应反映你答案正确的概率。
例如:如果你有 80% 的把握答案正确,评分应为 80。"""

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ],
        temperature=0.0,
    )
    return parse_confidence_response(response.choices[0].message.content, scale)
2.3 两阶段 Elicitation(更准确)
def two_stage_confidence_elicitation(question: str) -> dict:
    """Two-stage confidence elicitation.

    Stage 1 generates an answer; stage 2 asks the model to judge the
    probability that the answer is correct. Research finds this beats
    single-stage elicitation on QA calibration.
    """
    # Stage 1: produce the answer deterministically.
    first = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": f"请回答:{question}"}],
        temperature=0.0,
    )
    answer = first.choices[0].message.content

    # Stage 2: score the answer. max_tokens is tiny because only a
    # short numeric reply is expected.
    judge_prompt = f"""问题:{question}
答案:{answer}
请评估上述答案正确的概率(0-100%)。
考虑以下因素:
1. 你对相关知识的掌握程度
2. 答案中是否存在不确定表述
3. 是否有可能是错误但听起来合理
置信度(0-100%):"""
    second = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": judge_prompt}],
        temperature=0.0,
        max_tokens=10,
    )

    return {
        "answer": answer,
        "confidence": parse_confidence(second.choices[0].message.content),
    }
三、Self-Consistency 实现
3.1 基础版本
from collections import Counter
from openai import OpenAI
client = OpenAI()
def self_consistency(question: str, n_samples: int = 10) -> dict:
    """Sample multiple reasoning paths and majority-vote the final answer.

    Confidence is the vote share of the winning answer.
    """
    sampled = [
        extract_answer(
            client.chat.completions.create(
                model="gpt-4",
                messages=[{"role": "user", "content": f"请逐步推理并回答:{question}"}],
                temperature=0.7,  # higher temperature encourages diverse paths
            ).choices[0].message.content
        )
        for _ in range(n_samples)
    ]

    # Majority vote over the extracted answers.
    tally = Counter(sampled)
    winner, wins = tally.most_common(1)[0]

    return {
        "answer": winner,
        "confidence": wins / n_samples,
        "all_answers": sampled,
        "distribution": dict(tally),
    }
def extract_answer(response: str) -> str:
    """Extract the final answer from a chain-of-thought response.

    Scans lines from the end for one containing "答案"/"answer" and returns
    the text after the last colon; falls back to the last non-empty line.
    """
    lines = response.strip().split('\n')
    for line in reversed(lines):
        if '答案' in line or 'answer' in line.lower():
            # Bug fix: the original split only on the ASCII ':', but Chinese
            # model output typically uses the full-width ':' (the sibling
            # extract_answer_with_confidence already accepts both).
            return line.replace(':', ':').split(':')[-1].strip()
    return lines[-1].strip()
3.2 CISC 改进版本(置信度加权)
def cisc_self_consistency(question: str, n_samples: int = 10) -> dict:
    """Confidence-Informed Self-Consistency (CISC).

    Each sampled reasoning path also self-reports a confidence; votes are
    weighted by that confidence, which cuts the sample budget by 40%+
    relative to standard self-consistency.
    """
    prompt = f"""请逐步推理并回答:{question}
在回答的最后,请提供:
1. 最终答案
2. 你对这个答案的置信度(0-100%)"""

    weighted_votes = {}
    for _ in range(n_samples):
        reply = client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
        )
        answer, confidence = extract_answer_with_confidence(
            reply.choices[0].message.content
        )
        # Accumulate confidence-weighted votes per distinct answer.
        weighted_votes[answer] = weighted_votes.get(answer, 0) + confidence

    # Winner = highest total confidence weight; its share is the confidence.
    winner = max(weighted_votes, key=weighted_votes.get)
    total = sum(weighted_votes.values())

    return {
        "answer": winner,
        "confidence": weighted_votes[winner] / total,
        "weighted_votes": weighted_votes,
    }
def extract_answer_with_confidence(response: str) -> tuple:
    """Parse (answer, confidence) from a CISC-style response.

    Falls back to the whole response as the answer and 0.5 as the
    confidence when the expected markers are missing.
    """
    import re
    # Bug fix: the original patterns required a literal space before the
    # colon ('答案 [::]'), so typical "答案:X" output never matched.
    answer_match = re.search(r'答案\s*[::]\s*(.+)', response)
    answer = answer_match.group(1).strip() if answer_match else response
    conf_match = re.search(r'置信度\s*[::]\s*(\d+)%?', response)
    confidence = int(conf_match.group(1)) / 100 if conf_match else 0.5
    return answer, confidence
3.3 自适应采样(提前停止)
def adaptive_self_consistency(question: str, max_samples: int = 40) -> dict:
    """Self-consistency with early stopping.

    Stops sampling once the leading answer's margin over the runner-up
    exceeds 30% of the samples drawn so far (checked every 5 samples,
    after at least 10), saving roughly 30-50% of the compute cost.
    """
    collected = []
    for i in range(max_samples):
        reply = client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": f"请回答:{question}"}],
            temperature=0.7,
        )
        collected.append(extract_answer(reply.choices[0].message.content))

        # Early-stop check on every 5th sample once we have at least 10.
        if (i + 1) % 5 == 0 and i >= 9:
            ranking = Counter(collected).most_common(2)
            if len(ranking) >= 2:
                margin = (ranking[0][1] - ranking[1][1]) / (i + 1)
                if margin > 0.3:  # leader ahead by more than 30%
                    print(f"提前停止于 {i+1} 次采样")
                    break

    tally = Counter(collected)
    best = tally.most_common(1)[0][0]
    return {
        "answer": best,
        "confidence": tally[best] / len(collected),
        "samples_used": len(collected),
    }
四、CoT-UQ 实现
4.1 关键词提取与重要性评分
import re
from typing import List, Dict
class CoT_UQ:
    """Chain-of-Thought uncertainty quantification (CoT-UQ).

    Scores the uncertainty of a CoT response by extracting keywords from
    each reasoning step, rating their importance, and aggregating
    (1 - mean importance) across steps, with later steps weighted more.
    """

    def __init__(self, model_client):
        # Kept for API symmetry; the current heuristic implementation
        # never calls the client.
        self.client = model_client

    def compute_uq(self, question: str, cot_response: str) -> float:
        """Return an uncertainty score in [0, 1] for `cot_response`.

        Pipeline: extract steps -> extract per-step keywords -> rate
        keyword importance -> aggregate step uncertainties.
        """
        steps = self.extract_steps(cot_response)
        step_uncertainties = []
        for step in steps:
            keywords = self.extract_keywords(step)
            importance_scores = self.rate_importance(question, step, keywords)
            # Step uncertainty = 1 - mean importance: low-importance
            # keywords suggest a weakly grounded step.
            if importance_scores:
                step_uq = 1 - sum(importance_scores) / len(importance_scores)
                step_uncertainties.append(step_uq)
        if not step_uncertainties:
            return 0.5  # no signal: fall back to maximum ambiguity
        # sqrt-increasing weights: later steps influence the score more,
        # since errors near the conclusion matter most.
        weights = [i ** 0.5 for i in range(1, len(step_uncertainties) + 1)]
        weighted_uq = sum(u * w for u, w in zip(step_uncertainties, weights)) / sum(weights)
        return weighted_uq

    def extract_steps(self, response: str) -> List[str]:
        """Split a CoT response on numbered step markers ("步骤1:", "Step 2.")."""
        # Bug fix: the original pattern '(?:步骤 | step)' had stray spaces
        # inside the alternation, so "\nstep 1:" and "\n步骤1:" never matched.
        steps = re.split(r'\n(?:步骤|step)\s*\d+[::.]\s*', response, flags=re.IGNORECASE)
        return [s.strip() for s in steps if s.strip()]

    def extract_keywords(self, text: str) -> List[str]:
        """Extract up to 10 candidate keywords (ASCII words of length >= 4).

        Simple heuristic; RAKE or YAKE could replace this. Note it only
        matches Latin-script words, so Chinese terms are not captured.
        """
        keywords = re.findall(r'\b[A-Za-z]{4,}\b', text)
        return keywords[:10]  # cap to bound downstream work

    def rate_importance(self, question: str, step: str, keywords: List[str]) -> List[float]:
        """Score each keyword's importance as the mean of relevance and position."""
        importance_scores = []
        for kw in keywords:
            relevance = self.compute_relevance(question, kw)
            position_score = self.compute_position_score(step, kw)
            importance_scores.append((relevance + position_score) / 2)
        return importance_scores

    def compute_relevance(self, question: str, keyword: str) -> float:
        """Relevance of `keyword` to the question: 1.0 on co-occurrence, else 0.5.

        Simplified co-occurrence check; embedding similarity would be a
        finer-grained replacement.
        """
        if keyword.lower() in question.lower():
            return 1.0
        return 0.5

    def compute_position_score(self, text: str, keyword: str) -> float:
        """Position-based importance: earlier occurrences score higher.

        Returns 0.0 when the keyword does not occur in `text`.
        """
        idx = text.lower().find(keyword.lower())
        if idx == -1:
            return 0.0
        return 1.0 / (1.0 + idx / len(text))
4.2 使用示例
# Initialize the scorer (`client` is the OpenAI client created above).
cot_uq = CoT_UQ(client)
# Generate a chain-of-thought answer for a sample reasoning question.
question = "如果 3 个工人 3 小时能挖 3 个洞,那么 9 个工人挖 9 个洞需要多少小时?"
cot_response = client.chat.completions.create(
    model="gpt-4",
    messages=[{
        "role": "user",
        "content": f"请逐步推理:{question}"
    }],
    temperature=0.0
).choices[0].message.content
# Convert the uncertainty score into a confidence score.
uq_score = cot_uq.compute_uq(question, cot_response)
confidence = 1 - uq_score
print(f"置信度:{confidence:.2f}")
print(f"不确定性:{uq_score:.2f}")
五、集成到现有系统
5.1 LangChain 集成
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.prompts import ChatPromptTemplate
class ConfidenceLLMChain(LLMChain):
    """LLMChain variant that attaches a confidence score to each result."""

    def __init__(self, confidence_method: str = "direct", **kwargs):
        # NOTE(review): LLMChain is a pydantic model in LangChain; assigning
        # an undeclared attribute after super().__init__ may be rejected —
        # confirm against the installed LangChain version.
        super().__init__(**kwargs)
        self.confidence_method = confidence_method

    def call(self, inputs, return_confidence: bool = True):
        """Run the chain; optionally augment the result dict with a confidence."""
        # Base execution via the parent chain.
        result = super().call(inputs)
        if return_confidence:
            # Dispatch on the configured elicitation method.
            if self.confidence_method == "direct":
                confidence_result = self._get_direct_confidence(inputs)
            elif self.confidence_method == "logprobs":
                confidence_result = self._get_logprobs_confidence(inputs)
            else:
                # Unknown method: fall back to a neutral 0.5.
                confidence_result = {"confidence": 0.5}
            result.update(confidence_result)
        return result

    def _get_direct_confidence(self, inputs):
        """Ask the LLM to rate the answer's probability of being correct."""
        # NOTE(review): reads inputs["answer"], which the caller must supply;
        # `parse_confidence` is defined elsewhere — verify it exists.
        confidence_prompt = ChatPromptTemplate.from_messages([
            ("user", "评估以下答案的正确概率(0-100%):{answer}")
        ])
        confidence_chain = LLMChain(llm=self.llm, prompt=confidence_prompt)
        confidence = confidence_chain.run(answer=inputs.get("answer", ""))
        return {"confidence": parse_confidence(confidence)}

    def _get_logprobs_confidence(self, inputs):
        """Derive a confidence from token log-probabilities."""
        # NOTE(review): `call_as_api` and `extract_max_prob` are not defined
        # in this file — presumably project helpers; confirm they exist and
        # that the underlying model supports logprobs.
        response = self.llm.call_as_api(
            inputs["question"],
            logprobs=True
        )
        # Confidence = probability mass of the top token.
        confidence = extract_max_prob(response.logprobs)
        return {"confidence": confidence}
# Usage example: build a chain with direct confidence elicitation.
llm = ChatOpenAI(model="gpt-4", temperature=0)
chain = ConfidenceLLMChain(llm=llm, confidence_method="direct")
result = chain({"question": "地球到月球的距离是多少?"})
print(f"答案:{result['text']}")
print(f"置信度:{result['confidence']}")
5.2 关键配置参数
| 参数 | 推荐值 | 影响 |
|---|---|---|
temperature | 0.0 (直接回答) / 0.7 (采样) | 影响输出多样性和置信度 |
n_samples | 10-20 (CISC) / 40 (SC) | 采样次数,影响精度和成本 |
confidence_threshold | 0.7-0.8 | 低于此阈值触发人工审核 |
logprobs | True (如支持) | 启用可获得更细粒度置信度 |
六、生产部署注意事项
6.1 置信度校准流水线
class ProductionConfidencePipeline:
    """Wraps a base model and calibrates its self-reported confidences.

    Calibration maps each scalar confidence c to a 2-class logit pair
    [log(1-c), log(c)] (class 1 = "answer is correct") and fits a
    TemperatureScaling model against observed correctness labels.
    """

    # Clamp raw confidences away from 0/1 so log() stays finite.
    _EPS = 1e-6

    def __init__(self, base_model=None):
        # Bug fix: the original stored the scaler as `self.temperature_scaler`
        # while calibrate()/predict() read `self.calibrator`, so calibration
        # always raised AttributeError.
        self.calibrator = TemperatureScaling()
        self.calibrated = False
        # Bug fix: the original referenced `self.base_model` without ever
        # setting it; accept it via the constructor (default None keeps the
        # old no-argument signature working).
        self.base_model = base_model

    def _to_logits(self, confidence: float):
        """Map a scalar confidence to the 2-class logit pair [log(1-c), log(c)]."""
        c = min(max(confidence, self._EPS), 1 - self._EPS)
        return [math.log(1 - c), math.log(c)]

    def calibrate(self, validation_data: List[dict]):
        """Fit the calibrator on held-out (confidence, correctness) pairs.

        Each item needs "input" and "ground_truth" keys; the base model's
        answer is compared to ground truth to produce a 0/1 label.
        """
        logits = []
        labels = []
        for item in validation_data:
            result = self.predict(item["input"])
            logits.append(self._to_logits(result["confidence"]))
            # Label 1 = correct answer (matches the "correct" class index).
            labels.append(1 if result["answer"] == item["ground_truth"] else 0)
        self.calibrator.fit(logits, labels)
        self.calibrated = True

    def predict(self, input_text: str) -> dict:
        """Run the base model; attach `calibrated_confidence` when fitted."""
        result = self.base_model(input_text)
        if self.calibrated:
            scaled = self.calibrator.calibrate(
                torch.tensor([self._to_logits(result["confidence"])])
            )
            # Probability of the "correct" class (index 1) after scaling.
            result["calibrated_confidence"] = scaled[0, 1].item()
        return result
6.2 监控与告警
def monitor_confidence_distribution(predictions: List[dict]):
    """Check a batch of predictions for anomalous confidence patterns.

    Sends an alert when the mean confidence drops below 0.6 or more than
    30% of predictions fall under 0.5. No-op on an empty batch (the
    original raised ZeroDivisionError).
    """
    if not predictions:
        return
    confidences = [p["confidence"] for p in predictions]
    avg_confidence = sum(confidences) / len(confidences)
    low_confidence_ratio = sum(1 for c in confidences if c < 0.5) / len(confidences)
    # Alert thresholds; `send_alert` is assumed to be provided elsewhere.
    if avg_confidence < 0.6:
        send_alert("平均置信度过低", avg_confidence)
    if low_confidence_ratio > 0.3:
        send_alert("低置信度预测比例过高", low_confidence_ratio)