#!/usr/bin/env python3
"""
Experiment script for analyzing the gap between training loss and inference loss.

Systematically checks the plausible causes one by one.
"""

import json
import random
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
import os

from model.LMConfig import LMConfig
from model.model_original import MiniMindLM


def create_eval_data_from_training_data():
    """
    Re-sample records from the training data to build eval_data.json,
    so the evaluation set comes from exactly the same source as training.
    """
    print("=== 1. Building an evaluation set drawn from the training data ===")

    train_data_path = "/home/pci/ycz/Code/Minimind/dataset/stable/merged_pretrain.jsonl"
    eval_data_path = "dataset/stable/eval_data_from_train.json"

    # Make sure the output directory exists
    os.makedirs("dataset/stable", exist_ok=True)

    # Randomly pick 20 records from the training data
    samples = []
    with open(train_data_path, 'r', encoding='utf-8') as f:
        all_lines = f.readlines()

    selected_lines = random.sample(all_lines, min(20, len(all_lines)))

    # Each JSONL line is expected to parse to an object with a "text" field,
    # which is how the samples are consumed by the functions below.
    for line in selected_lines:
        try:
            data = json.loads(line.strip())
            samples.append(data)
        except json.JSONDecodeError:
            continue

    # Write the selected records to the new evaluation file (JSONL format)
    with open(eval_data_path, 'w', encoding='utf-8') as f:
        for sample in samples:
            f.write(json.dumps(sample, ensure_ascii=False) + '\n')

    print(f"✅ Created an evaluation set with {len(samples)} samples")
    print(f"   Saved to: {eval_data_path}")

    return eval_data_path, samples


def load_model_and_tokenizer(model_path, device='cuda'):
    """
    Load the model and tokenizer with exactly the same configuration used for training.
    """
    print("=== 2. Loading model and tokenizer ===")

    # Use the exact configuration used during training
    config = LMConfig(
        dim=512,
        n_layers=8,
        n_heads=32,
        vocab_size=6400,
        max_seq_len=512,
        dropout=0.0,
        norm_eps=1e-5,
        rope_theta=1e6,
        use_moe=False
    )

    model = MiniMindLM(config)
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

    # Load the checkpoint weights
    if os.path.exists(model_path):
        print(f"Loading weights: {model_path}")
        state_dict = torch.load(model_path, map_location=device)

        # Check how well the checkpoint matches the model definition
        model_keys = set(model.state_dict().keys())
        checkpoint_keys = set(state_dict.keys())
        matched_keys = model_keys & checkpoint_keys
        missing_keys = model_keys - checkpoint_keys
        unexpected_keys = checkpoint_keys - model_keys

        print(f"  Model parameters:      {len(model_keys)}")
        print(f"  Checkpoint parameters: {len(checkpoint_keys)}")
        print(f"  Matched parameters:    {len(matched_keys)}")
        print(f"  Missing parameters:    {len(missing_keys)}")
        print(f"  Unexpected parameters: {len(unexpected_keys)}")

        if missing_keys:
            print(f"  ❌ Missing: {list(missing_keys)[:5]}...")
        if unexpected_keys:
            print(f"  ⚠️ Unexpected: {list(unexpected_keys)[:5]}...")

        model.load_state_dict(state_dict, strict=False)
        model.to(device)
        model.eval()

        print("✅ Model loaded")
    else:
        raise FileNotFoundError(f"Model file not found: {model_path}")

    return model, tokenizer, config


def test_inference_modes(model, tokenizer, samples, device='cuda'):
    """
    Measure the loss under different inference modes.

    Note: `use_cache` is only used as a label here; it is never passed to the
    model's forward call, so both modes currently run the same full-sequence
    forward pass.
    """
    print("=== 3. Testing different inference modes ===")

    results = {}

    for mode_name, use_cache in [("no cache", False), ("KV cache", True)]:
        print(f"\n--- Mode: {mode_name} ---")

        total_loss = 0
        valid_samples = 0

        for i, sample in enumerate(samples[:5]):  # only the first 5 samples
            text = sample['text']

            # Make sure the text is long enough
            tokens = tokenizer.encode(text, add_special_tokens=False)
            if len(tokens) < 130:  # 100 input tokens + 30 target tokens
                continue

            input_tokens = tokens[:100]
            target_tokens = tokens[100:130]  # 30 tokens to predict

            # input_ids is not used below; the full 130-token sequence is fed to the model
            input_ids = torch.tensor([input_tokens], dtype=torch.long).to(device)
            target_ids = torch.tensor([target_tokens], dtype=torch.long).to(device)

            with torch.no_grad():
                # Method 1: a single forward pass over the full sequence (as in training)
                full_input = torch.tensor([tokens[:130]], dtype=torch.long).to(device)
                outputs = model(full_input)
                logits = outputs.logits

                # Compute the loss on the prediction span only
                shift_logits = logits[0, 99:129, :].contiguous()  # logits that predict tokens 100..129
                shift_labels = target_ids[0].contiguous()

                loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')

                total_loss += loss.item()
                valid_samples += 1

                print(f"  Sample {i+1}: loss = {loss.item():.4f}")

        avg_loss = total_loss / valid_samples if valid_samples > 0 else 0
        results[mode_name] = avg_loss
        print(f"  Average loss ({mode_name}): {avg_loss:.4f}")

    return results
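

# Illustrative helper (a minimal sketch, not called from main()): the average
# loss printed above is a mean negative log-likelihood in nats, because
# F.cross_entropy(..., reduction='mean') works in natural log. Perplexity is
# therefore simply exp(loss), which is often easier to compare across runs.
def loss_to_perplexity(avg_loss):
    """Convert a mean cross-entropy loss in nats to perplexity."""
    import math  # local import keeps this optional helper self-contained
    return math.exp(avg_loss)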


def test_autoregressive_vs_teacher_forcing(model, tokenizer, samples, device='cuda'):
    """
    Compare the loss under autoregressive generation vs. teacher forcing.
    """
    print("=== 4. Autoregressive generation vs. teacher forcing ===")

    for i, sample in enumerate(samples[:3]):  # only the first 3 samples
        text = sample['text']
        tokens = tokenizer.encode(text, add_special_tokens=False)

        if len(tokens) < 130:
            continue

        input_tokens = tokens[:100]
        target_tokens = tokens[100:130]

        print(f"\n--- Sample {i+1} ---")

        # Method 1: teacher forcing over the full sequence (as in training)
        with torch.no_grad():
            full_input = torch.tensor([tokens[:130]], dtype=torch.long).to(device)
            outputs = model(full_input)
            logits = outputs.logits

            shift_logits = logits[0, 99:129, :].contiguous()
            shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)

            teacher_forcing_loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
            print(f"  Teacher-forcing loss: {teacher_forcing_loss.item():.4f}")

        # Method 2: step-by-step prediction, still feeding the ground-truth token back
        # at every step. Each position sees the same context as in Method 1, so the two
        # losses should agree up to numerical precision.
        with torch.no_grad():
            current_sequence = torch.tensor([input_tokens], dtype=torch.long).to(device)
            autoregressive_losses = []

            for step in range(len(target_tokens)):
                outputs = model(current_sequence)
                logits = outputs.logits[0, -1, :]  # logits at the last position only

                # Loss for the current step
                true_next_token = target_tokens[step]
                step_loss = F.cross_entropy(logits.unsqueeze(0),
                                            torch.tensor([true_next_token], device=device))
                autoregressive_losses.append(step_loss.item())

                # Append the ground-truth token to the context (teacher forcing)
                current_sequence = torch.cat([
                    current_sequence,
                    torch.tensor([[true_next_token]], device=device)
                ], dim=1)

            autoregressive_loss = sum(autoregressive_losses) / len(autoregressive_losses)
            print(f"  Autoregressive loss (ground truth fed back): {autoregressive_loss:.4f}")
            print(f"  Loss gap: {abs(autoregressive_loss - teacher_forcing_loss.item()):.4f}")

        # Method 3: true autoregressive generation, conditioning on the model's own
        # greedy predictions instead of the ground truth (this is where exposure bias shows up)
        with torch.no_grad():
            current_sequence = torch.tensor([input_tokens], dtype=torch.long).to(device)
            real_autoregressive_losses = []

            for step in range(len(target_tokens)):
                outputs = model(current_sequence)
                logits = outputs.logits[0, -1, :]

                # Greedily predict the next token
                predicted_token = torch.argmax(logits, dim=-1).item()

                # The loss is still measured against the ground-truth token
                true_next_token = target_tokens[step]
                step_loss = F.cross_entropy(logits.unsqueeze(0),
                                            torch.tensor([true_next_token], device=device))
                real_autoregressive_losses.append(step_loss.item())

                # Continue generating from the predicted token
                current_sequence = torch.cat([
                    current_sequence,
                    torch.tensor([[predicted_token]], device=device)
                ], dim=1)

            real_autoregressive_loss = sum(real_autoregressive_losses) / len(real_autoregressive_losses)
            print(f"  True autoregressive loss: {real_autoregressive_loss:.4f}")
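

# Illustrative helper (a minimal sketch, not wired into main()): Methods 2 and 3
# above both produce one loss per generated step, so comparing them step by step
# shows how the gap evolves across the 30-token window. Exposure bias typically
# appears as a gap that widens at later steps, because Method 3 conditions on the
# model's own (possibly wrong) predictions while Method 2 always sees the ground truth.
def summarize_step_loss_gap(teacher_fed_losses, free_running_losses):
    """Print how much larger the free-running per-step losses are than the
    teacher-fed ones, overall and at the start/end of the generated span."""
    gaps = [f - t for t, f in zip(teacher_fed_losses, free_running_losses)]
    mean_gap = sum(gaps) / len(gaps)
    print(f"  Mean per-step gap: {mean_gap:.4f}")
    print(f"  First 5 steps: {[round(g, 4) for g in gaps[:5]]}")
    print(f"  Last 5 steps:  {[round(g, 4) for g in gaps[-5:]]}")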


def analyze_data_distribution(samples, tokenizer):
    """
    Analyze basic distribution statistics of the evaluation data.
    """
    print("=== 5. Analyzing the data distribution ===")

    lengths = []
    vocab_coverage = set()

    for sample in samples:
        text = sample['text']
        tokens = tokenizer.encode(text, add_special_tokens=False)
        lengths.append(len(tokens))
        vocab_coverage.update(tokens)

    print("Text length statistics:")
    print(f"  Mean length: {sum(lengths)/len(lengths):.1f} tokens")
    print(f"  Shortest: {min(lengths)} tokens")
    print(f"  Longest: {max(lengths)} tokens")
    print(f"  Vocabulary coverage: {len(vocab_coverage)} distinct tokens")
    print(f"  Coverage rate: {len(vocab_coverage)/6400*100:.1f}% of the 6400-token vocabulary")


def compare_training_vs_inference_computation(model, tokenizer, samples, device='cuda'):
    """
    Compare the exact computation performed at training time vs. inference time.
    """
    print("=== 6. Comparing the training-time and inference-time computation ===")

    sample = samples[0]
    text = sample['text']
    tokens = tokenizer.encode(text, add_special_tokens=False)

    if len(tokens) < 130:
        print("Sample too short, skipping")
        return

    input_tokens = tokens[:100]
    target_tokens = tokens[100:130]

    print(f"Sample length: {len(tokens)} tokens")
    print(f"Input part: {len(input_tokens)} tokens")
    print(f"Target part: {len(target_tokens)} tokens")

    # Reproduce the training-time computation
    print("\n--- Training-style computation ---")
    with torch.no_grad():
        # Training feeds the full sequence in one forward pass
        full_sequence = torch.tensor([tokens[:130]], dtype=torch.long).to(device)
        outputs = model(full_sequence)
        logits = outputs.logits

        print(f"Input shape: {full_sequence.shape}")
        print(f"Output logits shape: {logits.shape}")

        # Shift logits and labels exactly as the training loop does
        shift_logits = logits[0, :-1, :].contiguous()     # drop the last position
        shift_labels = full_sequence[0, 1:].contiguous()  # drop the first position

        # Restrict the loss to the prediction span
        predict_start = 99  # predictions start at the 100th token
        predict_logits = shift_logits[predict_start:predict_start+30, :]
        predict_labels = shift_labels[predict_start:predict_start+30]

        training_loss = F.cross_entropy(predict_logits, predict_labels, reduction='mean')
        print(f"Training-style loss: {training_loss.item():.4f}")

    # Reproduce the inference-time computation
    print("\n--- Inference-style computation ---")
    with torch.no_grad():
        # Inference-style evaluation handles the input and target separately
        input_ids = torch.tensor([input_tokens], dtype=torch.long).to(device)

        # Same approach as eval_model.py: ask the model to keep only the last 30
        # positions' logits. Note: if logits_to_keep follows the usual "keep the
        # logits of the last N input positions" convention, those logits sit at
        # positions 100..129 and predict the tokens one step *after* target_tokens
        # (tokens 100..129), whereas the training-style slice above uses positions
        # 99..128. Worth verifying against the model's forward implementation, as
        # this alone could account for part of the loss gap.
        full_input_for_loss = torch.tensor([tokens[:130]], dtype=torch.long).to(device)
        outputs = model(full_input_for_loss, logits_to_keep=30)

        if outputs.logits is not None:
            shift_logits = outputs.logits[0, -30:, :].contiguous()
            shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)

            inference_loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
            print(f"Inference-style loss: {inference_loss.item():.4f}")
        else:
            print("Could not obtain logits")
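

# Standalone sanity check (illustrative sketch, not called from main()): with a
# 130-token sequence, the logits that predict target tokens 100..129 sit at
# positions 99..128, so slicing the shifted logits at [99:129] equals
# logits[0, 99:129, :], while the "last 30 positions" slice (logits[0, -30:, :])
# starts one position later. Pure tensor indexing, no model involved.
def _check_prediction_slice_alignment(seq_len=130, vocab_size=6400):
    logits = torch.randn(1, seq_len, vocab_size)
    labels = torch.randint(0, vocab_size, (1, seq_len))

    shift_logits = logits[0, :-1, :]   # position i predicts token i + 1
    shift_labels = labels[0, 1:]

    # Training-style slice: predictions for tokens 100..129
    train_logits = shift_logits[99:129]
    train_labels = shift_labels[99:129]
    assert torch.equal(train_logits, logits[0, 99:129, :])
    assert torch.equal(train_labels, labels[0, 100:130])

    # The last-30-positions slice starts at position 100, one later than above
    last30 = logits[0, -30:, :]
    assert torch.equal(last30, logits[0, 100:130, :])
    assert not torch.equal(last30, train_logits)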


def main():
    """
    Entry point: systematically analyze the gap between training and inference loss.
    """
    print("🔍 Analyzing the training vs. inference loss gap")
    print("=" * 60)

    # Fix the random seeds for reproducibility
    random.seed(42)
    torch.manual_seed(42)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model_path = 'out/experiment_1_4_0/pretrain_512.pth'

    try:
        # 1. Build the evaluation set from the training data
        eval_data_path, samples = create_eval_data_from_training_data()

        # 2. Load the model
        model, tokenizer, config = load_model_and_tokenizer(model_path, device)

        # 3. Analyze the data distribution
        analyze_data_distribution(samples, tokenizer)

        # 4. Test the different inference modes
        mode_results = test_inference_modes(model, tokenizer, samples, device)

        # 5. Compare autoregressive generation vs. teacher forcing
        test_autoregressive_vs_teacher_forcing(model, tokenizer, samples, device)

        # 6. Compare the training-time and inference-time computation
        compare_training_vs_inference_computation(model, tokenizer, samples, device)

        print("\n" + "=" * 60)
        print("🎯 Analysis complete")

    except Exception as e:
        print(f"❌ Error during analysis: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()