Key Code Verification
1. Post-Training Quantization (PTQ) Workflow
1.1 Quantize to INT4 GPTQ with AutoGPTQ
```python
# Install dependencies
# pip install auto-gptq transformers optimum accelerate
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from transformers import AutoTokenizer

# Define the quantization config
quantize_config = BaseQuantizeConfig(
    bits=4,                 # quantization bit width
    group_size=128,         # group size (smaller groups -> higher accuracy, slightly larger model)
    damp_percent=0.01,      # damping percent (stabilizes the quantization solve)
    desc_act=False,         # act-order quantization (True can improve accuracy, may slow inference)
    sym=True,               # symmetric quantization
    true_sequential=True,   # quantize layers sequentially
)

# Load the original model
model_path = "meta-llama/Llama-2-7b-hf"
model = AutoGPTQForCausalLM.from_pretrained(
    model_path,
    quantize_config=quantize_config,
    use_safetensors=True
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Prepare calibration data (~128 samples); each example needs input_ids / attention_mask
from datasets import load_dataset
calib_dataset = load_dataset("allenai/c4", "en", split="validation", streaming=True)
calib_samples = [
    tokenizer(sample["text"][:2048], return_tensors="pt")
    for sample in calib_dataset.take(128)
]

# Run quantization
model.quantize(
    calib_samples,
    batch_size=1,
    use_triton=False  # whether to use the Triton kernels (NVIDIA GPUs)
)

# Save the quantized model
save_path = "llama-2-7b-gptq-4bit"
model.save_quantized(save_path, use_safetensors=True)
tokenizer.save_pretrained(save_path)
print(f"Quantized model saved to: {save_path}")
```
1.2 Quantize to INT4 AWQ with AutoAWQ
```python
# Install dependencies
# pip install autoawq transformers
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Quantization config
quant_config = {
    "zero_point": True,    # asymmetric quantization with a zero point
    "q_group_size": 128,   # group size
    "w_bit": 4,            # weight bit width
    "version": "GEMM"      # kernel version (GEMM or GEMV)
}

# Load the model to quantize
model_path = "meta-llama/Llama-2-7b-hf"
model = AutoAWQForCausalLM.from_pretrained(
    model_path,
    device_map="cuda",     # load onto the GPU
    safetensors=True
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Quantize (AutoAWQ runs its own calibration dataset internally)
model.quantize(tokenizer, quant_config=quant_config)

# Save the quantized model
save_path = "llama-2-7b-awq-4bit"
model.save_quantized(save_path)
tokenizer.save_pretrained(save_path)
print(f"AWQ quantized model saved to: {save_path}")
```
2. Format Conversion Workflows
2.1 Convert to GGUF Format (CPU inference)
```bash
# Clone llama.cpp
git clone https://github.com/ggerganov/llama.cpp.git
cd llama.cpp

# Build (CUDA support optional)
make

# Install Python dependencies
pip install -r requirements.txt

# Download the original model (HuggingFace format)
# git lfs install
# git clone https://huggingface.co/meta-llama/Llama-2-7b-hf

# Convert to GGUF (FP16); the model directory is a positional argument
python convert-hf-to-gguf.py ../Llama-2-7b-hf \
    --outfile llama-2-7b-f16.gguf \
    --outtype f16

# Quantize to Q4_K_M (recommended 4-bit scheme)
./quantize \
    llama-2-7b-f16.gguf \
    llama-2-7b-q4_k_m.gguf \
    Q4_K_M

# Verify the conversion
./main -m llama-2-7b-q4_k_m.gguf \
    --prompt "Hello, world!" \
    --n-predict 50
```
Quantization type notes:
- Q4_K_M: recommended balance of accuracy and speed
- Q4_K_S: more aggressive quantization (smaller file)
- Q5_K_M: higher accuracy (slightly larger file)
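To make the trade-off concrete, produce each variant with `./quantize` as above (swapping `Q4_K_M` for `Q4_K_S` or `Q5_K_M`) and compare the resulting file sizes. A minimal Python sketch; the `q4_k_s` and `q5_k_m` filenames are hypothetical outputs of those extra runs:

```python
import os

# Hypothetical outputs of running ./quantize with different type arguments
gguf_files = [
    "llama-2-7b-f16.gguf",
    "llama-2-7b-q4_k_m.gguf",
    "llama-2-7b-q4_k_s.gguf",
    "llama-2-7b-q5_k_m.gguf",
]

for path in gguf_files:
    if os.path.exists(path):
        size_gb = os.path.getsize(path) / 1e9
        print(f"{path}: {size_gb:.2f} GB")
    else:
        print(f"{path}: not found (run ./quantize for this type first)")
```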
2.2 Convert to MLX Format (Apple Silicon)
```python
# Install MLX
# pip install mlx mlx-lm

# Option 1: command-line conversion
# python -m mlx_lm.convert --hf-path meta-llama/Llama-2-7b-hf -q

# Option 2: Python API
from mlx_lm import convert

# Convert and quantize to INT4
convert(
    hf_path="meta-llama/Llama-2-7b-hf",
    mlx_path="llama-2-7b-mlx-4bit",
    quantize=True,      # enable quantization
    q_bits=4,           # quantization bit width
    q_group_size=128,   # group size
    dtype="float16"     # storage dtype for non-quantized tensors
)

# Load and run inference
from mlx_lm import load, generate

model, tokenizer = load("llama-2-7b-mlx-4bit")

# Generate text
response = generate(
    model,
    tokenizer,
    prompt="Explain quantum computing in simple terms:",
    max_tokens=100
)
print(response)
```
2.3 GGUF ↔ safetensors Round-Trip Conversion
```bash
# GGUF → safetensors (back to the HuggingFace format)
# Note: llama.cpp does not bundle an official GGUF→HF converter; this assumes a
# community conversion script with these flags is available.
python convert-gguf-to-hf.py \
    --model llama-2-7b-q4_k_m.gguf \
    --out llama-2-7b-hf-restored \
    --dtype f16

# safetensors → GGUF (model directory is a positional argument)
# convert-hf-to-gguf.py does not emit Q4_K_M directly: convert to f16 here,
# then run ./quantize as in section 2.1.
python convert-hf-to-gguf.py llama-2-7b-hf-restored \
    --outfile llama-2-7b-converted.gguf \
    --outtype f16
```
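An alternative for the GGUF → HF direction that avoids external scripts: recent transformers releases can load a GGUF file directly and dequantize it to standard torch weights, which you can then re-save as safetensors. A minimal sketch, assuming a transformers version with GGUF support (plus the `gguf` Python package) and the GGUF file in the current directory:

```python
# Sketch: dequantize a GGUF file back into a HuggingFace checkpoint via transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

gguf_file = "llama-2-7b-q4_k_m.gguf"
model = AutoModelForCausalLM.from_pretrained(".", gguf_file=gguf_file)
tokenizer = AutoTokenizer.from_pretrained(".", gguf_file=gguf_file)

# Re-save as a regular safetensors checkpoint
model.save_pretrained("llama-2-7b-hf-restored", safe_serialization=True)
tokenizer.save_pretrained("llama-2-7b-hf-restored")
```

Note that this dequantizes the already-quantized weights; it does not recover the original FP16 values.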
3. Inference Verification
3.1 Inference with a Quantized Model (GPTQ)
```python
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

# Load the GPTQ-quantized model
model_path = "llama-2-7b-gptq-4bit"
model = AutoGPTQForCausalLM.from_quantized(
    model_path,
    use_safetensors=True,
    device_map="cuda",
    use_triton=False
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Inference
prompt = "Write a Python function to sort a list:"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    temperature=0.7,
    top_p=0.95,
    do_sample=True
)
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)
```
3.2 Inference with vLLM (AWQ/GPTQ optimized)
```python
# Install vLLM
# pip install vllm
from vllm import LLM, SamplingParams

# Load the quantized model (format is auto-detected)
llm = LLM(
    model="llama-2-7b-awq-4bit",   # supports AWQ, GPTQ, and safetensors checkpoints
    quantization="awq",            # quantization method (optional)
    gpu_memory_utilization=0.9,    # fraction of GPU memory to use
    max_model_len=2048
)

# Sampling parameters
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.95,
    max_tokens=100
)

# Batched inference
prompts = [
    "Explain machine learning:",
    "What is quantum computing?",
    "How does the internet work?"
]
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(f"Prompt: {output.prompt}")
    print(f"Output: {output.outputs[0].text}\n")
```
3.3 Inference with llama.cpp (GGUF)
```bash
# Interactive inference (-t sets the number of CPU threads)
./main -m llama-2-7b-q4_k_m.gguf \
    -p "The future of AI is:" \
    -n 128 \
    -t 8 \
    --color

# REST API mode (for integrations)
./server -m llama-2-7b-q4_k_m.gguf \
    --port 8080 \
    --host 0.0.0.0 \
    --ctx-size 2048

# Call the API
curl http://localhost:8080/completion \
    -d '{"prompt": "Hello, world!", "n_predict": 50}'
```
4. Performance Benchmarking
4.1 Comparing Inference Speed Across Formats
python
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
# 测试不同格式
models_to_test = [
("FP16", "meta-llama/Llama-2-7b-hf", "transformers"),
("INT4 GPTQ", "llama-2-7b-gptq-4bit", "auto_gptq"),
("INT4 AWQ", "llama-2-7b-awq-4bit", "auto_awq"),
]
prompt = "Explain the concept of machine learning in simple terms:"
iterations = 10
results = []
for label, path, framework in models_to_test:
# 加载模型
if framework == "transformers":
model = AutoModelForCausalLM.from_pretrained(
path, torch_dtype=torch.float16, device_map="cuda"
)
elif framework == "auto_gptq":
from auto_gptq import AutoGPTQForCausalLM
model = AutoGPTQForCausalLM.from_quantized(path, device_map="cuda")
elif framework == "auto_awq":
from awq import AutoAWQForCausalLM
model = AutoAWQForCausalLM.from_quantized(path, device_map="cuda")
tokenizer = AutoTokenizer.from_pretrained(path)
# 预热
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
_ = model.generate(**inputs, max_new_tokens=10)
# 基准测试
times = []
for _ in range(iterations):
start = time.time()
outputs = model.generate(**inputs, max_new_tokens=50)
times.append(time.time() - start)
avg_time = sum(times) / iterations
tokens_per_sec = 50 / avg_time
results.append({
"format": label,
"avg_time_ms": avg_time * 1000,
"tokens_per_sec": tokens_per_sec,
"memory_gb": torch.cuda.max_memory_allocated() / 1e9
})
print(f"{label}: {avg_time*1000:.2f}ms ({tokens_per_sec:.2f} tok/s)")
# 输出对比表格
import pandas as pd
df = pd.DataFrame(results)
print("\n性能对比:")
print(df.to_string(index=False))4.2 精度评估(对比 FP16 基准)
```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
from awq import AutoAWQForCausalLM

# Load the FP16 baseline model
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    torch_dtype=torch.float16,
    device_map="cuda"
)

# Load the quantized models
gptq_model = AutoGPTQForCausalLM.from_quantized(
    "llama-2-7b-gptq-4bit", device_map="cuda"
)
awq_model = AutoAWQForCausalLM.from_quantized(
    "llama-2-7b-awq-4bit", device_map="cuda"
)

def compute_similarity(model1, model2, prompt):
    """Cosine similarity between the two models' last-layer hidden states."""
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        outputs1 = model1(**inputs, output_hidden_states=True)
        outputs2 = model2(**inputs, output_hidden_states=True)
    # Last-layer hidden state of the final token
    hidden1 = outputs1.hidden_states[-1][:, -1, :]  # [batch, hidden_size]
    hidden2 = outputs2.hidden_states[-1][:, -1, :]
    # Cosine similarity
    similarity = torch.nn.functional.cosine_similarity(hidden1, hidden2)
    return similarity.item()

# Test prompts
test_prompts = [
    "What is the capital of France?",
    "Explain how photosynthesis works.",
    "Write a poem about the ocean."
]

print("Accuracy evaluation (cosine similarity vs FP16):")
for prompt in test_prompts:
    sim_gptq = compute_similarity(base_model, gptq_model, prompt)
    sim_awq = compute_similarity(base_model, awq_model, prompt)
    print(f"\nPrompt: {prompt[:40]}...")
    print(f"GPTQ (INT4): {sim_gptq:.4f}")
    print(f"AWQ  (INT4): {sim_awq:.4f}")
```
5. Key Configuration Parameters
5.1 Impact of Quantization Parameters
| Parameter | Range | Recommended | Effect |
|---|---|---|---|
| bits | 2, 3, 4, 8 | 4 (balanced) | Fewer bits shrink the model but increase accuracy loss |
| group_size | 32, 64, 128, 256 | 128 | Smaller groups give finer scaling and higher accuracy, at the cost of slightly larger models and slower quantization |
| damp_percent | 0.0 - 1.0 | 0.01 | Dampens quantization instability (too large degrades accuracy) |
| sym | True/False | True | Symmetric quantization is faster; asymmetric is slightly more accurate |
| desc_act | True/False | False | Act-order quantization: True usually improves accuracy but can slow inference |
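To see how these knobs combine, here are two illustrative `BaseQuantizeConfig` presets derived from the table; the preset names and exact values are suggestions, not tuned settings:

```python
from auto_gptq import BaseQuantizeConfig

# Accuracy-leaning preset (hypothetical): finer groups plus act-order
accurate_cfg = BaseQuantizeConfig(
    bits=4,
    group_size=64,      # finer grouping -> better accuracy, slightly larger model
    damp_percent=0.01,
    desc_act=True,      # act-order quantization improves accuracy
    sym=True,
)

# Size/speed-leaning preset (hypothetical): coarser groups, no act-order
compact_cfg = BaseQuantizeConfig(
    bits=4,
    group_size=128,
    damp_percent=0.01,
    desc_act=False,
    sym=True,
)
```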
5.2 Tuning Inference Parameters
```python
sampling_params = {
    "temperature": 0.7,         # 0.0 (deterministic) to 1.0+ (more random)
    "top_p": 0.95,              # nucleus sampling threshold (0.0-1.0)
    "top_k": 40,                # top-k sampling (0-100)
    "repetition_penalty": 1.1,  # discourages repetition (1.0-2.0)
    "max_tokens": 256,          # maximum generation length
    "presence_penalty": 0.0,    # penalizes tokens that already appeared, encouraging new topics (-2.0 to 2.0)
    "frequency_penalty": 0.0    # penalizes tokens in proportion to how often they appeared (-2.0 to 2.0)
}
```