Logo
热心市民王先生

关键代码验证

技术研究 LLM 置信度评分

核心代码实现、集成方式与关键配置

一、Temperature Scaling 实现

1.1 基础实现(核心校准仅一行 softmax)

import torch
import torch.nn.functional as F

class TemperatureScaling:
    """Post-hoc confidence calibration via temperature scaling.

    Divides logits by a learned scalar temperature T before softmax:
    T > 1 softens the distribution (less confident), T < 1 sharpens it.
    """

    def __init__(self, temperature=1.0):
        # T = 1.0 means "no rescaling" until fit() is called.
        self.temperature = temperature

    def calibrate(self, logits):
        """Return calibrated probabilities: softmax(logits / T)."""
        return F.softmax(logits / self.temperature, dim=-1)

    def fit(self, logits, labels):
        """Learn the optimal temperature on a validation set.

        Args:
            logits: array-like of shape [N, num_classes].
            labels: integer class labels, length N.

        Returns:
            self, with `temperature` set to the NLL-minimizing value.
        """
        from scipy.optimize import minimize_scalar

        logits_t = torch.as_tensor(logits, dtype=torch.float32)
        labels_t = torch.as_tensor(labels, dtype=torch.long)

        def nll_loss(T):
            # Scale by the candidate T directly. The original called
            # self.calibrate(logits / T), which divides by BOTH T and
            # self.temperature — double-scaling whenever temperature != 1.
            probs = F.softmax(logits_t / T, dim=-1)
            picked = probs[torch.arange(len(labels_t)), labels_t]
            # Clamp to avoid log(0) on extremely confident wrong samples.
            return -torch.log(picked.clamp_min(1e-12)).mean().item()

        result = minimize_scalar(nll_loss, bounds=(0.1, 10.0), method='bounded')
        self.temperature = result.x
        return self

1.2 使用示例

# Prepare the validation set (model / get_labels are defined elsewhere).
val_logits = model.get_logits(val_texts)  # [N, num_classes]
val_labels = get_labels(val_texts)

# Fit the temperature once on validation data, then reuse the calibrator.
calibrator = TemperatureScaling()
calibrator.fit(val_logits, val_labels)

# Apply calibration at inference time.
test_logits = model.get_logits(["测试问题"])
calibrated_probs = calibrator.calibrate(torch.tensor(test_logits))
confidence = calibrated_probs.max().item()  # calibrated confidence score

1.3 关键配置

| 参数 | 推荐值 | 说明 |
| --- | --- | --- |
| 验证集大小 | 100-500 | 过小易过拟合,过大浪费 |
| 温度范围 | [0.1, 10.0] | 覆盖大部分校准需求 |
| 优化方法 | Bounded (Golden Section) | 单峰函数,快速收敛 |

二、Direct Prompting 实现

2.1 基础置信度 Elicitation

from openai import OpenAI

# Module-level client shared by the elicitation helpers below.
client = OpenAI()

def get_confidence_with_direct_prompting(question: str) -> dict:
    """Ask the model to answer and self-report a confidence score.

    Returns:
        dict with keys:
            answer: full model response text.
            confidence: parsed score in [0, 1], or None if parsing failed.
            raw_response: same text as `answer` (kept for compatibility).
    """
    import re

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {
                "role": "user",
                "content": f"""请回答以下问题,并在最后提供你的置信度评分。
                
问题:{question}

请在回答的最后,按照以下格式提供置信度:
置信度:<0-100 的整数>%
理由:<简要说明为什么是这个置信度>"""
            }
        ],
        temperature=0.0  # deterministic decoding for reproducible answers
    )

    answer = response.choices[0].message.content

    # Accept both full-width (:) and half-width (:) colons and an optional
    # trailing '%': the original pattern required ':' plus a literal '%',
    # so replies like "置信度: 80" were silently dropped (confidence=None).
    match = re.search(r'置信度\s*[::]\s*(\d+)\s*%?', answer)
    confidence = int(match.group(1)) / 100 if match else None

    return {
        "answer": answer,
        "confidence": confidence,
        "raw_response": answer
    }

2.2 数值尺度校准

def calibrated_direct_prompting(question: str, scale: str = "0-100") -> dict:
    """Elicit a confidence score on a calibrated numeric scale.

    Research note: models tend to be better calibrated on a "0-100"
    scale than on a "0-10" scale.
    """
    scale_instructions = {
        "0-100": "0=完全不确定,100=绝对确定",
        "0-10": "0=完全不确定,10=绝对确定",
        "percentage": "0%=完全不确定,100%=绝对确定"
    }

    system_message = {
        "role": "system",
        "content": "你是一个诚实的助手。当不确定时,请如实报告低置信度。过度自信会导致用户信任受损。"
    }
    user_message = {
        "role": "user",
        "content": f"""问题:{question}

请在回答后提供置信度评分({scale_instructions[scale]})。
置信度评分应反映你答案正确的概率。
例如:如果你有 80% 的把握答案正确,评分应为 80。"""
    }

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[system_message, user_message],
        temperature=0.0
    )

    reply_text = response.choices[0].message.content
    return parse_confidence_response(reply_text, scale)

2.3 两阶段 Elicitation(更准确)

def two_stage_confidence_elicitation(question: str) -> dict:
    """Two-stage confidence elicitation.

    Stage 1 generates an answer; stage 2 asks the model to judge how
    likely that answer is correct. In QA settings this tends to be
    better calibrated than one-shot elicitation.
    """
    def _chat(messages, **extra):
        # Small helper: deterministic GPT-4 call returning just the text.
        return client.chat.completions.create(
            model="gpt-4", messages=messages, temperature=0.0, **extra
        ).choices[0].message.content

    # Stage 1: produce the answer.
    answer = _chat([{"role": "user", "content": f"请回答:{question}"}])

    # Stage 2: rate the probability that the answer is correct.
    eval_prompt = f"""问题:{question}
答案:{answer}

请评估上述答案正确的概率(0-100%)。
考虑以下因素:
1. 你对相关知识的掌握程度
2. 答案中是否存在不确定表述
3. 是否有可能是错误但听起来合理

置信度(0-100%):"""
    eval_text = _chat(
        [{"role": "user", "content": eval_prompt}],
        max_tokens=10
    )

    return {
        "answer": answer,
        "confidence": parse_confidence(eval_text)
    }

三、Self-Consistency 实现

3.1 基础版本

from collections import Counter
from openai import OpenAI

# Shared client for the sampling-based methods in this section.
client = OpenAI()

def self_consistency(question: str, n_samples: int = 10) -> dict:
    """Self-consistency: sample several reasoning paths, then majority-vote."""
    prompt = f"请逐步推理并回答:{question}"
    answers = [
        extract_answer(
            client.chat.completions.create(
                model="gpt-4",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7  # higher temperature encourages diverse paths
            ).choices[0].message.content
        )
        for _ in range(n_samples)
    ]

    # Majority vote over the sampled answers.
    tally = Counter(answers)
    top_answer, top_count = tally.most_common(1)[0]

    return {
        "answer": top_answer,
        # Confidence = share of samples agreeing with the majority answer.
        "confidence": top_count / n_samples,
        "all_answers": answers,
        "distribution": dict(tally)
    }

def extract_answer(response: str) -> str:
    """Extract the final answer from a chain-of-thought response.

    Scans lines from the bottom for one containing '答案' or 'answer'
    and returns the text after the colon; otherwise falls back to the
    last non-empty line.
    """
    import re

    lines = response.strip().split('\n')
    for line in reversed(lines):
        if '答案' in line or 'answer' in line.lower():
            # Split on either half-width ':' or full-width ':'; the
            # original split only on ':' and returned Chinese lines like
            # "答案:42" unparsed.
            return re.split(r'[::]', line)[-1].strip()
    return lines[-1].strip()

3.2 CISC 改进版本(置信度加权)

def cisc_self_consistency(question: str, n_samples: int = 10) -> dict:
    """Confidence-Informed Self-Consistency (CISC).

    Improvements over plain self-consistency:
    1. Each sampled path also self-reports a confidence.
    2. The final vote is weighted by those confidences.
    3. Needs fewer samples (40%+ reduction vs. standard SC).
    """
    prompt = f"""请逐步推理并回答:{question}

在回答的最后,请提供:
1. 最终答案
2. 你对这个答案的置信度(0-100%)"""

    weighted_votes = {}
    for _ in range(n_samples):
        reply = client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7
        ).choices[0].message.content

        answer, confidence = extract_answer_with_confidence(reply)
        # Accumulate confidence-weighted votes per distinct answer.
        weighted_votes[answer] = weighted_votes.get(answer, 0) + confidence

    # Winner = answer with the highest total weight; confidence = its
    # share of the overall weight mass.
    best = max(weighted_votes, key=weighted_votes.get)
    total_weight = sum(weighted_votes.values())

    return {
        "answer": best,
        "confidence": weighted_votes[best] / total_weight,
        "weighted_votes": weighted_votes
    }

def extract_answer_with_confidence(response: str) -> tuple:
    """Extract (answer, confidence) from a CISC-style response.

    Returns:
        (answer, confidence): answer is the text after '答案:' (or the
        whole response if no marker is found); confidence is in [0, 1],
        defaulting to 0.5 when no score is present.
    """
    import re

    # The original patterns required a literal space before the colon
    # ('答案 [::]'), so normal replies like "答案:42" never matched.
    answer_match = re.search(r'答案\s*[::]\s*(.+)', response)
    answer = answer_match.group(1).strip() if answer_match else response

    conf_match = re.search(r'置信度\s*[::]\s*(\d+)\s*%?', response)
    confidence = int(conf_match.group(1)) / 100 if conf_match else 0.5

    return answer, confidence

3.3 自适应采样(提前停止)

def adaptive_self_consistency(question: str, max_samples: int = 40) -> dict:
    """Adaptive self-consistency with early stopping.

    Stops sampling once the leading answer is far enough ahead of the
    runner-up, saving roughly 30-50% of the sampling cost.
    """
    answers = []
    for sample_idx in range(1, max_samples + 1):
        reply = client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": f"请回答:{question}"}],
            temperature=0.7
        )
        answers.append(extract_answer(reply.choices[0].message.content))

        # Check for early stop every 5 samples, once at least 10 are drawn.
        if sample_idx % 5 == 0 and sample_idx >= 10:
            ranked = Counter(answers).most_common(2)
            if len(ranked) >= 2:
                # Lead = (top count - runner-up count) / samples so far.
                lead = (ranked[0][1] - ranked[1][1]) / sample_idx
                if lead > 0.3:  # leader ahead by more than 30%
                    print(f"提前停止于 {sample_idx} 次采样")
                    break

    tally = Counter(answers)
    winner = tally.most_common(1)[0][0]

    return {
        "answer": winner,
        "confidence": tally[winner] / len(answers),
        "samples_used": len(answers)
    }

四、CoT-UQ 实现

4.1 关键词提取与重要性评分

import re
from typing import List, Dict

class CoT_UQ:
    """Chain-of-Thought uncertainty quantification (CoT-UQ).

    Scores the uncertainty of a CoT response by splitting it into
    reasoning steps, extracting keywords per step, rating keyword
    importance, and aggregating (1 - importance) with later steps
    weighted more heavily.
    """

    def __init__(self, model_client):
        # Kept for API compatibility; the current heuristics are purely
        # local and never call the client.
        self.client = model_client

    def compute_uq(self, question: str, cot_response: str) -> float:
        """Compute an uncertainty score in [0, 1] for a CoT response.

        Steps:
        1. Split the response into reasoning steps.
        2. Extract keywords from each step.
        3. Rate each keyword's importance.
        4. Aggregate per-step uncertainty with increasing weights.
        """
        steps = self.extract_steps(cot_response)

        step_uncertainties = []
        for step in steps:
            keywords = self.extract_keywords(step)
            importance_scores = self.rate_importance(question, step, keywords)

            # Step uncertainty = 1 - mean importance: the less important
            # a step's keywords look, the more uncertain the step.
            if importance_scores:
                step_uq = 1 - sum(importance_scores) / len(importance_scores)
                step_uncertainties.append(step_uq)

        # No scorable steps (e.g. no extractable keywords): neutral 0.5.
        if not step_uncertainties:
            return 0.5

        # sqrt(i) weights: later steps count more, with gentle growth.
        weights = [i ** 0.5 for i in range(1, len(step_uncertainties) + 1)]
        weighted_uq = sum(u * w for u, w in zip(step_uncertainties, weights)) / sum(weights)

        return weighted_uq

    def extract_steps(self, response: str) -> List[str]:
        """Split a response into steps on '步骤 N' / 'step N' markers.

        The original pattern '(?:步骤 | step)' embedded mandatory spaces,
        so common unspaced markers like '步骤1:' or '\\nstep 1:' never
        split; '\\s*' now absorbs any optional spacing instead.
        """
        steps = re.split(r'\n(?:步骤|step)\s*\d+[::.]\s*', response, flags=re.IGNORECASE)
        return [s.strip() for s in steps if s.strip()]

    def extract_keywords(self, text: str) -> List[str]:
        """Extract up to 10 keyword candidates (Latin words of 4+ letters).

        Simple heuristic; RAKE or YAKE would be better in production.
        Note this matches Latin-script words only, so purely Chinese
        steps yield no keywords and are skipped by compute_uq.
        """
        keywords = re.findall(r'\b[A-Za-z]{4,}\b', text)
        return keywords[:10]  # cap the number of keywords

    def rate_importance(self, question: str, step: str, keywords: List[str]) -> List[float]:
        """Rate each keyword's importance as the mean of its relevance
        to the question and its positional score within the step."""
        importance_scores = []

        for kw in keywords:
            relevance = self.compute_relevance(question, kw)
            position_score = self.compute_position_score(step, kw)
            importance_scores.append((relevance + position_score) / 2)

        return importance_scores

    def compute_relevance(self, question: str, keyword: str) -> float:
        """Relevance by co-occurrence: 1.0 if the keyword appears in the
        question, else a neutral 0.5 (embedding similarity would be finer)."""
        if keyword.lower() in question.lower():
            return 1.0
        return 0.5

    def compute_position_score(self, text: str, keyword: str) -> float:
        """Position-based importance: earlier occurrences score closer
        to 1.0; keywords absent from the text score 0.0."""
        idx = text.lower().find(keyword.lower())
        if idx == -1:
            return 0.0
        return 1.0 / (1.0 + idx / len(text))

4.2 使用示例

# Initialize the uncertainty scorer with the shared OpenAI client.
cot_uq = CoT_UQ(client)

# Generate a chain-of-thought answer (temperature 0 for determinism).
question = "如果 3 个工人 3 小时能挖 3 个洞,那么 9 个工人挖 9 个洞需要多少小时?"
cot_response = client.chat.completions.create(
    model="gpt-4",
    messages=[{
        "role": "user",
        "content": f"请逐步推理:{question}"
    }],
    temperature=0.0
).choices[0].message.content

# Score the response: confidence is the complement of uncertainty.
uq_score = cot_uq.compute_uq(question, cot_response)
confidence = 1 - uq_score

print(f"置信度:{confidence:.2f}")
print(f"不确定性:{uq_score:.2f}")

五、集成到现有系统

5.1 LangChain 集成

from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.prompts import ChatPromptTemplate

class ConfidenceLLMChain(LLMChain):
    """LLMChain variant that attaches a confidence estimate to its result.

    NOTE(review): LLMChain is a pydantic model in LangChain; assigning
    `self.confidence_method` after `super().__init__` may be rejected
    unless extra fields are allowed — confirm against the installed
    LangChain version.
    """
    
    def __init__(self, confidence_method: str = "direct", **kwargs):
        super().__init__(**kwargs)
        # Estimator selection: "direct" asks the LLM to self-rate,
        # "logprobs" derives confidence from token probabilities.
        self.confidence_method = confidence_method
    
    def call(self, inputs, return_confidence: bool = True):
        """Run the chain; optionally merge a confidence estimate into the result."""
        # Base chain execution (answer generation).
        result = super().call(inputs)
        
        if return_confidence:
            # Dispatch on the configured estimation method.
            if self.confidence_method == "direct":
                confidence_result = self._get_direct_confidence(inputs)
            elif self.confidence_method == "logprobs":
                confidence_result = self._get_logprobs_confidence(inputs)
            else:
                # Unknown method: fall back to a neutral 0.5.
                confidence_result = {"confidence": 0.5}
            
            result.update(confidence_result)
        
        return result
    
    def _get_direct_confidence(self, inputs):
        """Ask the LLM to rate the answer's probability of being correct.

        NOTE(review): reads inputs["answer"], which `call` never adds —
        verify callers actually supply it, otherwise the prompt gets "".
        """
        confidence_prompt = ChatPromptTemplate.from_messages([
            ("user", "评估以下答案的正确概率(0-100%):{answer}")
        ])
        confidence_chain = LLMChain(llm=self.llm, prompt=confidence_prompt)
        confidence = confidence_chain.run(answer=inputs.get("answer", ""))
        # parse_confidence is expected to be defined elsewhere in this file.
        return {"confidence": parse_confidence(confidence)}
    
    def _get_logprobs_confidence(self, inputs):
        """Derive confidence from token log-probabilities.

        NOTE(review): `call_as_api` is not a standard LangChain LLM
        method — confirm the wrapper in use actually exposes it with a
        `logprobs` flag.
        """
        # Requires a backend that returns logprobs.
        response = self.llm.call_as_api(
            inputs["question"],
            logprobs=True
        )
        # Take the maximum top-token probability as the confidence proxy.
        confidence = extract_max_prob(response.logprobs)
        return {"confidence": confidence}

# Example wiring: deterministic GPT-4 chain with direct confidence elicitation.
llm = ChatOpenAI(model="gpt-4", temperature=0)
chain = ConfidenceLLMChain(llm=llm, confidence_method="direct")
result = chain({"question": "地球到月球的距离是多少?"})
print(f"答案:{result['text']}")
print(f"置信度:{result['confidence']}")

5.2 关键配置参数

| 参数 | 推荐值 | 影响 |
| --- | --- | --- |
| temperature | 0.0 (直接回答) / 0.7 (采样) | 影响输出多样性和置信度 |
| n_samples | 10-20 (CISC) / 40 (SC) | 采样次数,影响精度和成本 |
| confidence_threshold | 0.7-0.8 | 低于此阈值触发人工审核 |
| logprobs | True (如支持) | 启用可获得更细粒度置信度 |

六、生产部署注意事项

6.1 置信度校准流水线

class ProductionConfidencePipeline:
    """Production pipeline that calibrates raw model confidences.

    Workflow: run `calibrate()` once on labeled validation data before
    deployment; afterwards `predict()` adds a `calibrated_confidence`
    field to each result.

    NOTE(review): `self.base_model` is never set here — deployment code
    must assign a callable `base_model(input_text) -> dict` returning
    "answer" and "confidence" keys before this pipeline is used.
    """

    def __init__(self):
        # One consistent attribute name: the original stored
        # `self.temperature_scaler` while calibrate()/predict() read
        # `self.calibrator`, so calibration always raised AttributeError.
        self.calibrator = TemperatureScaling()
        self.calibrated = False

    def calibrate(self, validation_data: List[dict]):
        """Fit the calibrator on validation items before deployment.

        Each item needs "input" and "ground_truth" keys; whether the
        base model's answer matches ground truth is the binary label.
        """
        import math

        confidences = []
        labels = []

        for item in validation_data:
            result = self.predict(item["input"])
            confidences.append(result["confidence"])
            labels.append(1 if result["answer"] == item["ground_truth"] else 0)

        # Temperature scaling operates on logits, not probabilities: map
        # each scalar confidence p to two-class pseudo-logits
        # [log(1-p), log(p)] so softmax recovers [1-p, p].
        eps = 1e-6
        logits = [
            [math.log(max(1.0 - c, eps)), math.log(max(c, eps))]
            for c in confidences
        ]
        self.calibrator.fit(logits, labels)
        self.calibrated = True

    def predict(self, input_text: str) -> dict:
        """Run the base model; add `calibrated_confidence` once calibrated."""
        import math

        result = self.base_model(input_text)

        if self.calibrated:
            eps = 1e-6
            c = result["confidence"]
            pseudo_logits = torch.tensor(
                [[math.log(max(1.0 - c, eps)), math.log(max(c, eps))]]
            )
            # Index [0, 1] = calibrated probability of being correct.
            result["calibrated_confidence"] = self.calibrator.calibrate(
                pseudo_logits
            )[0, 1].item()

        return result

6.2 监控与告警

def monitor_confidence_distribution(predictions: List[dict]):
    """Monitor the confidence distribution of recent predictions and
    send alerts when it drifts into risky territory.

    Args:
        predictions: dicts each carrying a "confidence" float in [0, 1].
    """
    # Guard: an empty batch would otherwise raise ZeroDivisionError.
    if not predictions:
        return

    confidences = [p["confidence"] for p in predictions]
    n = len(confidences)

    # Distribution statistics used as drift signals.
    avg_confidence = sum(confidences) / n
    low_confidence_ratio = sum(1 for c in confidences if c < 0.5) / n

    # Alert conditions.
    if avg_confidence < 0.6:
        send_alert("平均置信度过低", avg_confidence)

    if low_confidence_ratio > 0.3:
        send_alert("低置信度预测比例过高", low_confidence_ratio)