Clean up temporary debugging and analysis files
Remove debugging scripts and analysis files that are no longer needed:

- analyze_position_slicing.py
- analyze_train_inference_gap.py
- debug_model.py
- eval_model_final_fixed.py
- eval_model_fixed.py
- final_fix_eval_model.py
- fix_logits_to_keep_issue.py
- investigate_logits_to_keep.py
- train_inference_gap_analysis_report.md

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
parent 495fc412cd
commit c4c72ac154
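Every script removed below chases the same off-by-one: in a causal LM the logits at position i score the token at position i+1. As a reference point for the listings, a minimal sketch of the slice they all converge on (assumes a HuggingFace-style batch-first logits tensor; the helper name is ours, not from the repo):

import torch
import torch.nn.functional as F

def window_loss(logits, tokens, input_length, predict_length):
    """Teacher-forcing loss over a prediction window.

    logits: [batch, seq, vocab] from one forward pass over input + target;
    tokens: [batch, seq]. To score the tokens at positions
    [input_length, input_length + predict_length), take the logits one step
    earlier, at [input_length - 1, input_length + predict_length - 1).
    """
    pred = logits[:, input_length - 1:input_length + predict_length - 1, :]
    gold = tokens[:, input_length:input_length + predict_length]
    return F.cross_entropy(pred.reshape(-1, pred.size(-1)), gold.reshape(-1))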
analyze_position_slicing.py
@@ -1,193 +0,0 @@
#!/usr/bin/env python3
"""
A deeper look at the position-slicing problem.
Verifies that logits_to_keep and the position indexing are correct.
"""

import json
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
from model.LMConfig import LMConfig
from model.model_original import MiniMindLM


def analyze_position_indexing():
    """
    Check that the position indexing is correct.
    """
    print("🔍 Analyzing position indexing and slicing logic")
    print("="*60)

    device = 'cuda'
    model_path = 'out/experiment_1_4_0/pretrain_512.pth'

    # Load the model
    config = LMConfig(
        dim=512, n_layers=8, n_heads=32, vocab_size=6400, max_seq_len=512,
        dropout=0.0, norm_eps=1e-5, rope_theta=1e6, use_moe=False
    )

    model = MiniMindLM(config)
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict, strict=False)
    model.to(device)
    model.eval()

    # Load the test data
    with open('dataset/stable/eval_data_from_train.json', 'r', encoding='utf-8') as f:
        sample = json.loads(f.readline().strip())

    text = sample['text']
    tokens = tokenizer.encode(text, add_special_tokens=False)

    input_length = 100
    predict_length = 30
    input_tokens = tokens[:input_length]
    target_tokens = tokens[input_length:input_length + predict_length]

    print(f"Input length: {input_length}")
    print(f"Prediction length: {predict_length}")
    print(f"Total sequence length: {input_length + predict_length}")
    print(f"Input token positions: 0 to {input_length-1}")
    print(f"Target token positions: {input_length} to {input_length + predict_length - 1}")

    with torch.no_grad():
        full_input = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
        target_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)

        print(f"\n🔬 Detailed comparison of the slicing methods:")

        # Method 1: standard forward pass
        outputs1 = model(full_input)
        logits1 = outputs1.logits
        print(f"\n1. Standard forward:")
        print(f"   Input shape: {full_input.shape}")
        print(f"   Output logits shape: {logits1.shape}")

        # In a transformer, the logits at position i predict the token at position i+1,
        # so predicting the tokens at positions 100-129 requires the logits at positions 99-128.
        correct_slice = logits1[0, input_length-1:input_length+predict_length-1, :].contiguous()
        loss1 = F.cross_entropy(correct_slice, target_labels, reduction='mean')
        print(f"   Correct slice [{input_length-1}:{input_length+predict_length-1}]: {correct_slice.shape}")
        print(f"   Loss: {loss1.item():.4f}")

        # Method 2: logits_to_keep
        outputs2 = model(full_input, logits_to_keep=predict_length)
        logits2 = outputs2.logits
        print(f"\n2. logits_to_keep={predict_length}:")
        print(f"   Output logits shape: {logits2.shape}")

        # With logits_to_keep=30 the model returns the logits of the last 30 positions.
        # These should correspond to positions 100-129 — but which positions are they really?
        keep_slice = logits2[0, -predict_length:, :].contiguous()
        loss2 = F.cross_entropy(keep_slice, target_labels, reduction='mean')
        print(f"   logits_to_keep slice [-{predict_length}:]: {keep_slice.shape}")
        print(f"   Loss: {loss2.item():.4f}")

        # Check whether the two slices are identical
        print(f"\n🔍 Slice comparison:")
        if torch.allclose(correct_slice, keep_slice, rtol=1e-6):
            print(f"   ✅ The two slices are identical")
        else:
            diff = torch.abs(correct_slice - keep_slice).max()
            print(f"   ❌ Slices differ, max difference: {diff.item():.8f}")

            # Find exactly which positions differ
            diff_mask = ~torch.isclose(correct_slice, keep_slice, rtol=1e-6)
            diff_positions = torch.where(diff_mask.any(dim=-1))[0]
            print(f"   Differing positions: {diff_positions.tolist()}")

        # Method 3: verify the logic used in eval_model.py
        print(f"\n3. eval_model.py logic:")
        # eval_model.py uses logits[0, -predict_length:, :]
        eval_slice = logits1[0, -predict_length:, :].contiguous()
        loss3 = F.cross_entropy(eval_slice, target_labels, reduction='mean')
        print(f"   eval_model.py slice [-{predict_length}:]: {eval_slice.shape}")
        print(f"   This corresponds to logits positions {logits1.shape[1] - predict_length} to {logits1.shape[1] - 1}")
        print(f"   Loss: {loss3.item():.4f}")

        # Check whether the eval_model.py slice is correct
        if torch.allclose(correct_slice, eval_slice, rtol=1e-6):
            print(f"   ✅ eval_model.py slice is correct")
        else:
            diff = torch.abs(correct_slice - eval_slice).max()
            print(f"   ❌ eval_model.py slice is wrong, max difference: {diff.item():.8f}")


def compare_different_sequence_lengths():
    """
    Compare the behavior across different sequence lengths.
    """
    print(f"\n🧪 Testing different sequence lengths")
    print("="*60)

    device = 'cuda'
    model_path = 'out/experiment_1_4_0/pretrain_512.pth'

    # Load the model
    config = LMConfig(
        dim=512, n_layers=8, n_heads=32, vocab_size=6400, max_seq_len=512,
        dropout=0.0, norm_eps=1e-5, rope_theta=1e6, use_moe=False
    )

    model = MiniMindLM(config)
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict, strict=False)
    model.to(device)
    model.eval()

    # Build a test sequence
    test_tokens = list(range(200))  # a simple numeric sequence

    test_configs = [
        (50, 20),   # 50 input, 20 predicted
        (100, 30),  # 100 input, 30 predicted
        (150, 40),  # 150 input, 40 predicted
    ]

    for input_len, predict_len in test_configs:
        print(f"\nTest config: input {input_len}, predict {predict_len}")

        sequence = test_tokens[:input_len + predict_len]
        input_ids = torch.tensor([sequence], dtype=torch.long).to(device)
        target_labels = torch.tensor(sequence[input_len:], dtype=torch.long).to(device)

        with torch.no_grad():
            # Standard method
            outputs_std = model(input_ids)
            logits_std = outputs_std.logits
            slice_std = logits_std[0, input_len-1:input_len+predict_len-1, :].contiguous()
            loss_std = F.cross_entropy(slice_std, target_labels, reduction='mean')

            # logits_to_keep method
            outputs_keep = model(input_ids, logits_to_keep=predict_len)
            logits_keep = outputs_keep.logits
            slice_keep = logits_keep[0, -predict_len:, :].contiguous()
            loss_keep = F.cross_entropy(slice_keep, target_labels, reduction='mean')

            # eval_model.py method
            slice_eval = logits_std[0, -predict_len:, :].contiguous()
            loss_eval = F.cross_entropy(slice_eval, target_labels, reduction='mean')

            print(f"   Standard loss: {loss_std.item():.4f}")
            print(f"   logits_to_keep loss: {loss_keep.item():.4f}")
            print(f"   eval_model.py loss: {loss_eval.item():.4f}")

            # Check whether the slices agree
            std_vs_keep = torch.allclose(slice_std, slice_keep, rtol=1e-6)
            std_vs_eval = torch.allclose(slice_std, slice_eval, rtol=1e-6)
            keep_vs_eval = torch.allclose(slice_keep, slice_eval, rtol=1e-6)

            print(f"   standard vs logits_to_keep: {'✅' if std_vs_keep else '❌'}")
            print(f"   standard vs eval_model.py: {'✅' if std_vs_eval else '❌'}")
            print(f"   logits_to_keep vs eval_model.py: {'✅' if keep_vs_eval else '❌'}")


if __name__ == "__main__":
    analyze_position_indexing()
    compare_different_sequence_lengths()
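The script above treats model(x, logits_to_keep=k).logits as the logits of the last k positions. Under that assumed contract (not a guarantee of MiniMindLM's implementation), the equivalence it tests reduces to:

import torch

def keep_last_k(full_logits: torch.Tensor, k: int) -> torch.Tensor:
    # Expected semantics: model(x, logits_to_keep=k).logits
    # == model(x).logits[:, -k:, :]; if this holds, loss1 == loss2 above.
    return full_logits[:, -k:, :]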
analyze_train_inference_gap.py
@@ -1,371 +0,0 @@
#!/usr/bin/env python3
"""
Experiment script analyzing the gap between training and inference loss.
Systematically checks the possible causes.
"""

import json
import random
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
import os
from model.LMConfig import LMConfig
from model.model_original import MiniMindLM


def create_eval_data_from_training_data():
    """
    Re-extract samples from the training data to build eval_data.json,
    guaranteeing that both come from the same source.
    """
    print("=== 1. Building an eval set drawn from the training data ===")

    train_data_path = "/home/pci/ycz/Code/Minimind/dataset/stable/merged_pretrain.jsonl"
    eval_data_path = "dataset/stable/eval_data_from_train.json"

    # Make sure the directory exists
    os.makedirs("dataset/stable", exist_ok=True)

    # Randomly pick 20 entries from the training data
    samples = []
    with open(train_data_path, 'r', encoding='utf-8') as f:
        all_lines = f.readlines()

    selected_lines = random.sample(all_lines, min(20, len(all_lines)))

    for line in selected_lines:
        try:
            data = json.loads(line.strip())
            samples.append(data)
        except json.JSONDecodeError:
            continue

    # Write the new eval file
    with open(eval_data_path, 'w', encoding='utf-8') as f:
        for sample in samples:
            f.write(json.dumps(sample, ensure_ascii=False) + '\n')

    print(f"✅ Created an eval dataset with {len(samples)} samples")
    print(f"   Saved to: {eval_data_path}")

    return eval_data_path, samples


def load_model_and_tokenizer(model_path, device='cuda'):
    """
    Load the model and tokenizer with exactly the training-time configuration.
    """
    print("=== 2. Loading model and tokenizer ===")

    # Use exactly the same configuration as during training
    config = LMConfig(
        dim=512,
        n_layers=8,
        n_heads=32,
        vocab_size=6400,
        max_seq_len=512,
        dropout=0.0,
        norm_eps=1e-5,
        rope_theta=1e6,
        use_moe=False
    )

    model = MiniMindLM(config)
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

    # Load the weights
    if os.path.exists(model_path):
        print(f"Loading weights: {model_path}")
        state_dict = torch.load(model_path, map_location=device)

        # Check how the weights match up
        model_keys = set(model.state_dict().keys())
        checkpoint_keys = set(state_dict.keys())
        matched_keys = model_keys & checkpoint_keys
        missing_keys = model_keys - checkpoint_keys
        unexpected_keys = checkpoint_keys - model_keys

        print(f"   Model parameters: {len(model_keys)}")
        print(f"   Checkpoint parameters: {len(checkpoint_keys)}")
        print(f"   Matched: {len(matched_keys)}")
        print(f"   Missing: {len(missing_keys)}")
        print(f"   Unexpected: {len(unexpected_keys)}")

        if missing_keys:
            print(f"   ❌ Missing parameters: {list(missing_keys)[:5]}...")
        if unexpected_keys:
            print(f"   ⚠️ Unexpected parameters: {list(unexpected_keys)[:5]}...")

        model.load_state_dict(state_dict, strict=False)
        model.to(device)
        model.eval()

        print("✅ Model loaded")
    else:
        raise FileNotFoundError(f"Model file not found: {model_path}")

    return model, tokenizer, config


def test_inference_modes(model, tokenizer, samples, device='cuda'):
    """
    Measure the loss difference across inference modes.
    """
    print("=== 3. Testing inference modes ===")

    results = {}

    # NOTE: use_cache is iterated over but never actually passed to the model below.
    for mode_name, use_cache in [("no cache", False), ("KV cache", True)]:
        print(f"\n--- Mode: {mode_name} ---")

        total_loss = 0
        valid_samples = 0

        for i, sample in enumerate(samples[:5]):  # test the first 5 samples
            text = sample['text']

            # Make sure the text is long enough
            tokens = tokenizer.encode(text, add_special_tokens=False)
            if len(tokens) < 130:  # 100 input + 30 predicted
                continue

            input_tokens = tokens[:100]
            target_tokens = tokens[100:130]  # 30 tokens to predict

            input_ids = torch.tensor([input_tokens], dtype=torch.long).to(device)
            target_ids = torch.tensor([target_tokens], dtype=torch.long).to(device)

            with torch.no_grad():
                # Method 1: single forward pass over the full sequence (training-style)
                full_input = torch.tensor([tokens[:130]], dtype=torch.long).to(device)
                outputs = model(full_input)
                logits = outputs.logits

                # Compute the loss over the prediction window
                shift_logits = logits[0, 99:129, :].contiguous()  # logits that predict the target tokens
                shift_labels = target_ids[0].contiguous()

                loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')

                total_loss += loss.item()
                valid_samples += 1

                print(f"   Sample {i+1}: loss = {loss.item():.4f}")

        avg_loss = total_loss / valid_samples if valid_samples > 0 else 0
        results[mode_name] = avg_loss
        print(f"   {mode_name} average loss: {avg_loss:.4f}")

    return results


def test_autoregressive_vs_teacher_forcing(model, tokenizer, samples, device='cuda'):
    """
    Compare the loss of autoregressive generation against teacher forcing.
    """
    print("=== 4. Autoregressive generation vs. teacher forcing ===")

    results = {}

    for i, sample in enumerate(samples[:3]):  # test the first 3 samples
        text = sample['text']
        tokens = tokenizer.encode(text, add_special_tokens=False)

        if len(tokens) < 130:
            continue

        input_tokens = tokens[:100]
        target_tokens = tokens[100:130]

        print(f"\n--- Sample {i+1} ---")

        # Method 1: teacher forcing (training-style)
        with torch.no_grad():
            full_input = torch.tensor([tokens[:130]], dtype=torch.long).to(device)
            outputs = model(full_input)
            logits = outputs.logits

            shift_logits = logits[0, 99:129, :].contiguous()
            shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)

            teacher_forcing_loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
            print(f"   Teacher-forcing loss: {teacher_forcing_loss.item():.4f}")

        # Method 2: step-by-step prediction, feeding back the ground truth
        with torch.no_grad():
            current_sequence = torch.tensor([input_tokens], dtype=torch.long).to(device)
            autoregressive_losses = []

            for step in range(len(target_tokens)):
                outputs = model(current_sequence)
                logits = outputs.logits[0, -1, :]  # logits of the last position only

                # Loss for this step
                true_next_token = target_tokens[step]
                step_loss = F.cross_entropy(logits.unsqueeze(0),
                                            torch.tensor([true_next_token], device=device))
                autoregressive_losses.append(step_loss.item())

                # Append the ground-truth token to the sequence (teacher forcing)
                current_sequence = torch.cat([
                    current_sequence,
                    torch.tensor([[true_next_token]], device=device)
                ], dim=1)

            autoregressive_loss = sum(autoregressive_losses) / len(autoregressive_losses)
            print(f"   Stepwise loss: {autoregressive_loss:.4f}")
            print(f"   Loss gap: {abs(autoregressive_loss - teacher_forcing_loss.item()):.4f}")

        # Method 3: true autoregressive generation (feeding back the predicted tokens)
        with torch.no_grad():
            current_sequence = torch.tensor([input_tokens], dtype=torch.long).to(device)
            real_autoregressive_losses = []

            for step in range(len(target_tokens)):
                outputs = model(current_sequence)
                logits = outputs.logits[0, -1, :]

                # Predict the next token
                predicted_token = torch.argmax(logits, dim=-1).item()

                # Loss against the ground-truth token
                true_next_token = target_tokens[step]
                step_loss = F.cross_entropy(logits.unsqueeze(0),
                                            torch.tensor([true_next_token], device=device))
                real_autoregressive_losses.append(step_loss.item())

                # Continue generating from the predicted token
                current_sequence = torch.cat([
                    current_sequence,
                    torch.tensor([[predicted_token]], device=device)
                ], dim=1)

            real_autoregressive_loss = sum(real_autoregressive_losses) / len(real_autoregressive_losses)
            print(f"   True autoregressive loss: {real_autoregressive_loss:.4f}")


def analyze_data_distribution(samples, tokenizer):
    """
    Analyze the distribution of the eval data.
    """
    print("=== 5. Analyzing the data distribution ===")

    lengths = []
    vocab_coverage = set()

    for sample in samples:
        text = sample['text']
        tokens = tokenizer.encode(text, add_special_tokens=False)
        lengths.append(len(tokens))
        vocab_coverage.update(tokens)

    print(f"Text length statistics:")
    print(f"   Mean length: {sum(lengths)/len(lengths):.1f} tokens")
    print(f"   Shortest: {min(lengths)} tokens")
    print(f"   Longest: {max(lengths)} tokens")
    print(f"   Vocabulary coverage: {len(vocab_coverage)} distinct tokens")
    print(f"   Coverage rate: {len(vocab_coverage)/6400*100:.1f}%")


def compare_training_vs_inference_computation(model, tokenizer, samples, device='cuda'):
    """
    Compare the concrete computation performed at training time vs. inference time.
    """
    print("=== 6. Comparing training-time and inference-time computation ===")

    sample = samples[0]
    text = sample['text']
    tokens = tokenizer.encode(text, add_special_tokens=False)

    if len(tokens) < 130:
        print("Sample too short, skipping")
        return

    input_tokens = tokens[:100]
    target_tokens = tokens[100:130]

    print(f"Sample length: {len(tokens)} tokens")
    print(f"Input part: {len(input_tokens)} tokens")
    print(f"Target part: {len(target_tokens)} tokens")

    # Simulate the training-time computation
    print("\n--- Simulating the training computation ---")
    with torch.no_grad():
        # Training feeds the full sequence in one pass
        full_sequence = torch.tensor([tokens[:130]], dtype=torch.long).to(device)
        outputs = model(full_sequence)
        logits = outputs.logits

        print(f"Input shape: {full_sequence.shape}")
        print(f"Output logits shape: {logits.shape}")

        # Compute the loss exactly as in training
        shift_logits = logits[0, :-1, :].contiguous()     # drop the last position
        shift_labels = full_sequence[0, 1:].contiguous()  # drop the first position

        # Only score the prediction window
        predict_start = 99  # predictions start at the 100th token
        predict_logits = shift_logits[predict_start:predict_start+30, :]
        predict_labels = shift_labels[predict_start:predict_start+30]

        training_loss = F.cross_entropy(predict_logits, predict_labels, reduction='mean')
        print(f"Training-style loss: {training_loss.item():.4f}")

    # Simulate the inference-time computation
    print("\n--- Simulating the inference computation ---")
    with torch.no_grad():
        # Inference processes the input and the targets separately
        input_ids = torch.tensor([input_tokens], dtype=torch.long).to(device)

        # Use the same method as eval_model.py
        full_input_for_loss = torch.tensor([tokens[:130]], dtype=torch.long).to(device)
        outputs = model(full_input_for_loss, logits_to_keep=30)

        if outputs.logits is not None:
            shift_logits = outputs.logits[0, -30:, :].contiguous()
            shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)

            inference_loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
            print(f"Inference-style loss: {inference_loss.item():.4f}")
        else:
            print("No logits available")


def main():
    """
    Entry point: systematically analyze the train/inference loss gap.
    """
    print("🔍 Starting train/inference loss gap analysis")
    print("="*60)

    # Fix the random seeds for reproducibility
    random.seed(42)
    torch.manual_seed(42)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model_path = 'out/experiment_1_4_0/pretrain_512.pth'

    try:
        # 1. Build the eval set from the training data
        eval_data_path, samples = create_eval_data_from_training_data()

        # 2. Load the model
        model, tokenizer, config = load_model_and_tokenizer(model_path, device)

        # 3. Analyze the data distribution
        analyze_data_distribution(samples, tokenizer)

        # 4. Test the inference modes
        mode_results = test_inference_modes(model, tokenizer, samples, device)

        # 5. Autoregressive vs. teacher forcing
        test_autoregressive_vs_teacher_forcing(model, tokenizer, samples, device)

        # 6. Compare the training and inference computation
        compare_training_vs_inference_computation(model, tokenizer, samples, device)

        print("\n" + "="*60)
        print("🎯 Analysis complete")

    except Exception as e:
        print(f"❌ Error during analysis: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
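Section 6 above computes the training-style loss by shifting the whole sequence; the same idea as a standalone helper, a sketch assuming no padding mask (the actual trainer may mask pad tokens):

import torch
import torch.nn.functional as F

def causal_lm_loss(logits: torch.Tensor, input_ids: torch.Tensor) -> torch.Tensor:
    # Position i's logits are scored against token i+1; transpose to
    # [batch, vocab, seq-1] because cross_entropy expects classes on dim 1.
    return F.cross_entropy(logits[:, :-1, :].transpose(1, 2), input_ids[:, 1:])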
debug_model.py
@@ -1,101 +0,0 @@
#!/usr/bin/env python3
"""
Debug the model's generation process.
"""

import torch
from transformers import AutoTokenizer
from model.model_original import MiniMindLM
from model.LMConfig import LMConfig


def debug_generation():
    # Load the model and tokenizer
    device = 'cuda'
    model_path = 'out/experiment_1_4_0/pretrain_512.pth'

    # Configuration
    config = LMConfig(
        dim=512,
        n_layers=8,
        n_heads=32,
        vocab_size=6400,
        max_seq_len=512
    )

    # Initialize the model
    model = MiniMindLM(config)

    # Load the weights
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict, strict=False)
    model.to(device)
    model.eval()

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

    # Test text
    text = "The quick brown fox"
    input_tokens = tokenizer.encode(text, add_special_tokens=False)
    print(f"Input text: {text}")
    print(f"Input tokens: {input_tokens}")
    print(f"Decoded back: {tokenizer.decode(input_tokens)}")

    # Convert to a tensor
    input_ids = torch.tensor([input_tokens], dtype=torch.long).to(device)
    print(f"Input tensor shape: {input_ids.shape}")

    # Generate one step by hand
    with torch.no_grad():
        # Forward pass
        outputs = model(input_ids)
        logits = outputs.logits
        print(f"Output logits shape: {logits.shape}")

        # Logits of the last position
        next_token_logits = logits[0, -1, :]
        print(f"Next-token logits shape: {next_token_logits.shape}")

        # Apply temperature
        next_token_logits = next_token_logits / 1.0

        # Probability distribution
        probs = torch.softmax(next_token_logits, dim=-1)

        # Find the top-10 candidate tokens
        top_probs, top_indices = torch.topk(probs, 10)
        print(f"\nTop 10 candidate tokens:")
        for i, (prob, idx) in enumerate(zip(top_probs, top_indices)):
            token_text = tokenizer.decode([idx.item()], skip_special_tokens=True)
            print(f"   {i+1}. Token {idx.item()}: '{token_text}' (prob: {prob.item():.4f})")

        # Greedy sampling
        next_token = torch.argmax(next_token_logits, dim=-1)
        print(f"\nGreedy choice: {next_token.item()}")
        print(f"Corresponding text: '{tokenizer.decode([next_token.item()], skip_special_tokens=True)}'")

    # Use the generate method
    print(f"\nUsing the generate method:")
    with torch.no_grad():
        generated = model.generate(
            input_ids,
            max_new_tokens=5,
            temperature=1.0,
            top_p=0.95,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id
        )

    print(f"Full generated sequence shape: {generated[0].shape}")
    print(f"Generated tokens: {generated[0].tolist()}")

    # Extract the newly generated part
    if len(generated[0]) > len(input_tokens):
        new_tokens = generated[0][len(input_tokens):].tolist()
        print(f"New tokens: {new_tokens}")
        print(f"New text: '{tokenizer.decode(new_tokens, skip_special_tokens=True)}'")
    else:
        print("No new tokens were generated")


if __name__ == "__main__":
    debug_generation()
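debug_model.py samples greedily by hand but calls generate with top_p=0.95. For comparison, a minimal top-p (nucleus) filter — a sketch of the standard technique, not MiniMindLM's generate internals:

import torch

def sample_top_p(logits: torch.Tensor, top_p: float = 0.95) -> int:
    """Keep the smallest prefix of tokens whose probability mass exceeds
    top_p, renormalize, and sample one token id."""
    probs = torch.softmax(logits, dim=-1)
    sorted_probs, sorted_idx = torch.sort(probs, descending=True)
    cumulative = torch.cumsum(sorted_probs, dim=-1)
    # Drop a token once the mass *before* it already exceeds top_p,
    # so the highest-probability token is always kept.
    sorted_probs[cumulative - sorted_probs > top_p] = 0.0
    sorted_probs /= sorted_probs.sum()
    choice = torch.multinomial(sorted_probs, num_samples=1)
    return sorted_idx[choice].item()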
eval_model_final_fixed.py
@@ -1,519 +0,0 @@
#!/usr/bin/env python3
"""
Evaluate the inference quality of a pretrained model.
Tests models trained in different experiments on eval_data.json.
"""

import os
import json
import argparse
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
from model.LMConfig import LMConfig


def load_model(model_path, model_type, device, config_params=None):
    """
    Load the model and tokenizer.

    Args:
        model_path: path to the model weights
        model_type: model variant (model/model_original/model_no_feed)
        device: device to run on
        config_params: dict of model configuration parameters

    Returns:
        model: the loaded model
        tokenizer: the tokenizer instance
    """
    # Initialize the configuration
    if config_params:
        lm_config = LMConfig(**config_params)
    else:
        lm_config = LMConfig()

    # Print the configuration
    print(f"Model configuration:")
    print(f"  dim: {lm_config.dim}")
    print(f"  n_layers: {lm_config.n_layers}")
    print(f"  n_heads: {lm_config.n_heads}")
    print(f"  vocab_size: {lm_config.vocab_size}")
    print(f"  max_seq_len: {lm_config.max_seq_len}")
    if hasattr(lm_config, 'knowledge_num'):
        print(f"  knowledge_num: {lm_config.knowledge_num}")
        print(f"  knowledge_length: {lm_config.knowledge_length}")
        print(f"  knowledge_dim: {lm_config.knowledge_dim}")
    print()

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

    # Import the model class matching the requested type
    if model_type == "model":
        from model.model import MiniMindLM
    elif model_type == "model_original":
        from model.model_original import MiniMindLM
    elif model_type == "model_no_feed":
        from model.model_no_feed import MiniMindLM
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

    # Initialize the model
    model = MiniMindLM(lm_config)

    # Load the weights
    if os.path.exists(model_path):
        print(f"Loading model weights from {model_path}...")

        state_dict = torch.load(model_path, map_location=device)

        # Collect the parameter names
        model_keys = set(model.state_dict().keys())
        checkpoint_keys = set(state_dict.keys())

        # Weight-matching statistics
        matched_keys = model_keys & checkpoint_keys
        missing_keys = model_keys - checkpoint_keys
        unexpected_keys = checkpoint_keys - model_keys

        print(f"\nWeight loading details:")
        print(f"  Total model parameters: {len(model_keys)}")
        print(f"  Checkpoint parameters: {len(checkpoint_keys)}")
        print(f"  Matched: {len(matched_keys)}")
        print(f"  Missing: {len(missing_keys)}")
        print(f"  Unexpected: {len(unexpected_keys)}")

        # List the missing and unexpected parameters in detail
        if missing_keys:
            print(f"\n❌ Missing parameters ({len(missing_keys)}):")
            for key in sorted(missing_keys):
                print(f"  - {key}")

        if unexpected_keys:
            print(f"\n⚠️ Unexpected parameters in the checkpoint ({len(unexpected_keys)}):")
            for key in sorted(unexpected_keys):
                print(f"  + {key}")

        # Load the weights (allowing a partial match)
        try:
            incompatible_keys = model.load_state_dict(state_dict, strict=False)

            # Check the result
            if len(incompatible_keys.missing_keys) == 0 and len(incompatible_keys.unexpected_keys) == 0:
                print(f"\n✅ Weights loaded perfectly!")
            elif len(incompatible_keys.missing_keys) == 0:
                print(f"\n✅ Weights loaded (unexpected parameters ignored)")
            else:
                print(f"\n⚠️ Weights loaded partially; some parameters are missing")
                print(f"  This may hurt model quality; check the configuration parameters")

            # Load success rate
            success_rate = len(matched_keys) / len(model_keys) * 100
            print(f"  Parameter load success rate: {success_rate:.1f}%")

            if success_rate < 90:
                print(f"  ❌ Warning: success rate is very low; the model may not work!")
            elif success_rate < 100:
                print(f"  ⚠️ Warning: some parameters are missing; quality may suffer")

        except Exception as e:
            raise RuntimeError(f"Failed to load weights: {e}")

        # Verify the shapes of the key layers
        print("🔍 Verifying key layer shapes:")
        key_layers = [
            'tok_embeddings.weight',
            'output.weight',
            'norm.weight',
        ]

        # Per-layer checks
        for i in range(lm_config.n_layers):
            key_layers.extend([
                f'layers.{i}.attention_norm.weight',
                f'layers.{i}.ffn_norm.weight',
                f'layers.{i}.self_attention.wq.weight',
                f'layers.{i}.self_attention.wk.weight',
                f'layers.{i}.self_attention.wv.weight',
                f'layers.{i}.self_attention.wo.weight',
            ])

            # FFN layers (model_original has an FFN; other variants may not)
            if f'layers.{i}.feed_forward.w1.weight' in model_keys:
                key_layers.extend([
                    f'layers.{i}.feed_forward.w1.weight',
                    f'layers.{i}.feed_forward.w2.weight',
                    f'layers.{i}.feed_forward.w3.weight',
                ])

        # KnowledgeDataset layers (only model and model_no_feed)
        if model_type in ['model', 'model_no_feed']:
            key_layers.extend([
                'knowledge_dataset.to_queries.0.weight',
                'knowledge_dataset.keys',
                'knowledge_dataset.knowledge_dataset',
            ])

            # CrossAttention layers
            for i in range(lm_config.n_layers):
                key_layers.extend([
                    f'layers.{i}.cross_attention.to_q.weight',
                    f'layers.{i}.cross_attention.to_k.weight',
                    f'layers.{i}.cross_attention.to_v.weight',
                    f'layers.{i}.cross_attention.to_out.weight',
                ])

        # Check the key layers
        verified_layers = 0
        total_key_layers = 0

        for layer_name in key_layers:
            if layer_name in model_keys:  # only check layers that exist in the model
                total_key_layers += 1
                if layer_name in matched_keys:
                    verified_layers += 1
                    expected_shape = model.state_dict()[layer_name].shape
                    actual_shape = state_dict[layer_name].shape if layer_name in state_dict else "missing"
                    if layer_name in state_dict and expected_shape == actual_shape:
                        print(f"  ✅ {layer_name}: {actual_shape}")
                    else:
                        print(f"  ❌ {layer_name}: expected {expected_shape}, got {actual_shape}")
                else:
                    print(f"  ❌ {layer_name}: missing")

        print(f"\nKey layer verification: {verified_layers}/{total_key_layers} layers verified")

        if verified_layers == total_key_layers:
            print("✅ All key layers verified!")
        elif verified_layers / total_key_layers >= 0.9:
            print("⚠️ Most key layers verified; the model should work")
        else:
            print("❌ Too many key layers failed verification; the model may not work!")

        print()
    else:
        raise FileNotFoundError(f"Model file not found: {model_path}")

    model.to(device)
    model.eval()

    return model, tokenizer


def load_eval_data(data_path, num_samples=20):
    """
    Load the evaluation dataset.

    Args:
        data_path: path to the data file
        num_samples: number of samples to evaluate

    Returns:
        samples: list of data samples
    """
    data = []
    with open(data_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            line = line.strip()
            if line:  # skip empty lines
                try:
                    sample = json.loads(line)
                    data.append(sample)
                    if len(data) >= num_samples:
                        break
                except json.JSONDecodeError as e:
                    print(f"Warning: failed to parse JSON on line {line_num+1}: {e}")
                    continue

    # Keep only the first num_samples entries
    samples = data[:num_samples]
    print(f"Loaded {len(samples)} evaluation samples")

    return samples


def evaluate_sample(model, tokenizer, text, input_length=100, predict_length=100, device='cuda'):
    """
    Evaluate a single sample.

    Args:
        model: the model instance
        tokenizer: the tokenizer instance
        text: input text
        input_length: number of input tokens
        predict_length: number of tokens to predict
        device: device to run on

    Returns:
        input_text: the input text
        predicted_text: the predicted text
        ground_truth_text: the reference text
        loss: the prediction loss (if computable)
        generation_stats: statistics about the generation
    """
    # Tokenize the text
    tokens = tokenizer.encode(text, add_special_tokens=False)

    # Make sure there are enough tokens
    if len(tokens) < input_length + predict_length:
        print(f"Warning: text too short, only {len(tokens)} tokens")
        # Bug fix: return five values so the caller's unpacking does not fail.
        return None, None, None, None, None

    # Split into input and target
    input_tokens = tokens[:input_length]
    target_tokens = tokens[input_length:input_length + predict_length]

    # Convert to a tensor
    input_ids = torch.tensor([input_tokens], dtype=torch.long).to(device)

    # Generate a prediction
    with torch.no_grad():
        # Use the generate method; parameters tuned for better output quality
        generated = model.generate(
            input_ids,
            max_new_tokens=predict_length,
            temperature=1.0,
            top_p=0.95,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id
        )

        # Extract the generated tokens (dropping the input part);
        # generated holds the full sequence, so the new part starts at input_length
        full_generated_tokens = generated[0].tolist()
        if len(full_generated_tokens) > input_length:
            predicted_tokens = full_generated_tokens[input_length:]
        else:
            # If the sequence is no longer than the input, nothing new was generated
            predicted_tokens = []

        # Check whether generation stopped early on an EOS token
        eos_found = False
        eos_position = -1
        actual_predicted_length = len(predicted_tokens)

        if predicted_tokens and tokenizer.eos_token_id is not None:
            try:
                eos_position = predicted_tokens.index(tokenizer.eos_token_id)
                eos_found = True
                # Keep only the content before the EOS token
                predicted_tokens = predicted_tokens[:eos_position]
                actual_predicted_length = len(predicted_tokens)
            except ValueError:
                # No EOS token found
                pass

        # Compute the loss via a forward pass
        # Prepare the input for the loss computation
        loss_input_ids = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
        outputs = model(loss_input_ids)  # logits_to_keep argument removed

        # Compute the loss
        logits = outputs.logits
        loss = None
        if logits is not None:
            # Reshape the logits and targets — fixed: use the correct position slice.
            # In a transformer, the logits at position i predict the token at position i+1.
            # To predict the tokens at positions input_length .. input_length+predict_length-1,
            # use the logits at positions input_length-1 .. input_length+predict_length-2.
            shift_logits = logits[0, input_length-1:input_length+predict_length-1, :].contiguous()
            shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)

            # Cross-entropy loss
            loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
            loss = loss.item()

    # Decode the texts
    input_text = tokenizer.decode(input_tokens, skip_special_tokens=True)
    # Decode only the actually generated tokens, capped at predict_length
    actual_predicted_tokens = predicted_tokens[:predict_length] if predicted_tokens else []
    predicted_text = tokenizer.decode(actual_predicted_tokens, skip_special_tokens=True) if actual_predicted_tokens else "[nothing generated]"
    ground_truth_text = tokenizer.decode(target_tokens, skip_special_tokens=True)

    # Extra generation statistics
    generation_stats = {
        'requested_length': predict_length,
        'actual_length': actual_predicted_length,
        'eos_found': eos_found,
        'eos_position': eos_position if eos_found else None,
        'truncated_by_eos': eos_found and eos_position < predict_length
    }

    return input_text, predicted_text, ground_truth_text, loss, generation_stats


def main():
    parser = argparse.ArgumentParser(description='Evaluate a pretrained model')
    parser.add_argument('--model_path', type=str, default='out/experiment_1_4_0/pretrain_512.pth',
                        help='path to the model weights')
    parser.add_argument('--model_type', type=str, default='model',
                        choices=['model', 'model_original', 'model_no_feed'],
                        help='model type')
    parser.add_argument('--data_path', type=str, default='dataset/stable/eval_data.json',
                        help='path to the evaluation dataset')
    parser.add_argument('--num_samples', type=int, default=20,
                        help='number of samples to evaluate')
    parser.add_argument('--input_length', type=int, default=100,
                        help='input length in tokens')
    parser.add_argument('--predict_length', type=int, default=100,
                        help='prediction length in tokens')
    parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu',
                        help='device to run on')

    # Model architecture parameters
    parser.add_argument('--dim', type=int, default=512,
                        help='model dimension')
    parser.add_argument('--n_layers', type=int, default=8,
                        help='number of transformer layers')
    parser.add_argument('--n_heads', type=int, default=32,
                        help='number of attention heads')
    parser.add_argument('--n_kv_heads', type=int, default=8,
                        help='number of KV attention heads')
    parser.add_argument('--vocab_size', type=int, default=6400,
                        help='vocabulary size')
    parser.add_argument('--max_seq_len', type=int, default=512,
                        help='maximum sequence length')
    parser.add_argument('--dropout', type=float, default=0.0,
                        help='dropout rate')
    parser.add_argument('--norm_eps', type=float, default=1e-5,
                        help='layer norm epsilon')
    parser.add_argument('--rope_theta', type=float, default=1e6,
                        help='RoPE theta')

    # KnowledgeDataset parameters (used only by model and model_no_feed)
    parser.add_argument('--knowledge_num', type=int, default=1048576,
                        help='number of knowledge entries')
    parser.add_argument('--knowledge_length', type=int, default=32,
                        help='length of a single knowledge entry')
    parser.add_argument('--knowledge_dim', type=int, default=128,
                        help='knowledge dimension')

    # MOE parameters
    parser.add_argument('--use_moe', action='store_true',
                        help='whether to use MOE')
    parser.add_argument('--num_experts_per_tok', type=int, default=2,
                        help='number of experts activated per token')
    parser.add_argument('--n_routed_experts', type=int, default=4,
                        help='number of routed experts')

    args = parser.parse_args()

    print(f"Evaluation configuration:")
    print(f"  Model path: {args.model_path}")
    print(f"  Model type: {args.model_type}")
    print(f"  Data path: {args.data_path}")
    print(f"  Samples: {args.num_samples}")
    print(f"  Input length: {args.input_length} tokens")
    print(f"  Prediction length: {args.predict_length} tokens")
    print(f"  Device: {args.device}")
    print()

    # Build the configuration dict
    config_params = {
        'dim': args.dim,
        'n_layers': args.n_layers,
        'n_heads': args.n_heads,
        'n_kv_heads': args.n_kv_heads,
        'vocab_size': args.vocab_size,
        'max_seq_len': args.max_seq_len,
        'dropout': args.dropout,
        'norm_eps': args.norm_eps,
        'rope_theta': args.rope_theta,
        'use_moe': args.use_moe,
        'num_experts_per_tok': args.num_experts_per_tok,
        'n_routed_experts': args.n_routed_experts,
    }

    # Only model and model_no_feed take the KnowledgeDataset parameters
    if args.model_type in ['model', 'model_no_feed']:
        config_params.update({
            'knowledge_num': args.knowledge_num,
            'knowledge_length': args.knowledge_length,
            'knowledge_dim': args.knowledge_dim,
        })

    # Load the model
    model, tokenizer = load_model(args.model_path, args.model_type, args.device, config_params)

    # Load the data
    samples = load_eval_data(args.data_path, args.num_samples)

    # Evaluate each sample
    total_loss = 0
    valid_samples = 0
    total_requested_tokens = 0
    total_actual_tokens = 0
    samples_with_eos = 0
    samples_truncated_by_eos = 0

    for i, sample in enumerate(samples):
        print(f"\n{'='*60}")
        print(f"Sample {i+1}/{len(samples)}")
        print(f"{'='*60}")

        text = sample['text']

        # Evaluate the sample
        input_text, predicted_text, ground_truth_text, loss, generation_stats = evaluate_sample(
            model, tokenizer, text,
            args.input_length, args.predict_length, args.device
        )

        if input_text is None:
            print("Skipping sample (text too short)")
            continue

        # Print the results
        print(f"\nInput ({args.input_length} tokens):")
        print(f"  {input_text}")
        print(f"\nPrediction (requested {generation_stats['requested_length']} tokens, generated {generation_stats['actual_length']}):")
        print(f"  {predicted_text}")
        print(f"\nGround truth ({args.predict_length} tokens):")
        print(f"  {ground_truth_text}")

        # Print the generation statistics
        print(f"\nGeneration statistics:")
        print(f"  Requested: {generation_stats['requested_length']} tokens")
        print(f"  Generated: {generation_stats['actual_length']} tokens")
        if generation_stats['eos_found']:
            print(f"  ✅ EOS token found at position {generation_stats['eos_position']}")
            if generation_stats['truncated_by_eos']:
                print(f"  ⚠️ Generation ended early on an EOS token")
            else:
                print(f"  ✅ EOS token appeared at the expected position")
        else:
            print(f"  ❌ No EOS token found (generation may have hit the length limit)")

        if loss is not None:
            print(f"\nLoss: {loss:.4f}")
            total_loss += loss
            valid_samples += 1

        # Update the aggregate generation statistics
        total_requested_tokens += generation_stats['requested_length']
        total_actual_tokens += generation_stats['actual_length']
        if generation_stats['eos_found']:
            samples_with_eos += 1
            if generation_stats['truncated_by_eos']:
                samples_truncated_by_eos += 1

    # Print the overall statistics
    if valid_samples > 0:
        print(f"\n{'='*60}")
        print(f"Overall statistics:")
        print(f"  Valid samples: {valid_samples}")
        print(f"  Average loss: {total_loss / valid_samples:.4f}")
        print()
        print(f"Generation statistics:")
        print(f"  Total requested tokens: {total_requested_tokens}")
        print(f"  Total generated tokens: {total_actual_tokens}")
        print(f"  Completion rate: {total_actual_tokens / total_requested_tokens * 100:.1f}%" if total_requested_tokens > 0 else "  Completion rate: N/A")
        print(f"  Samples with EOS: {samples_with_eos}/{len(samples)} ({samples_with_eos/len(samples)*100:.1f}%)" if len(samples) > 0 else "  Samples with EOS: N/A")
        print(f"  Samples truncated by EOS: {samples_truncated_by_eos}/{len(samples)} ({samples_truncated_by_eos/len(samples)*100:.1f}%)" if len(samples) > 0 else "  Samples truncated by EOS: N/A")
        print(f"  Mean generated length per sample: {total_actual_tokens/len(samples):.1f} tokens" if len(samples) > 0 else "  Mean generated length per sample: N/A")
        print(f"{'='*60}")


if __name__ == "__main__":
    main()
eval_model_fixed.py
@@ -1,516 +0,0 @@
#!/usr/bin/env python3
"""
Evaluate the inference quality of a pretrained model.
Tests models trained in different experiments on eval_data.json.
"""

import os
import json
import argparse
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
from model.LMConfig import LMConfig


def load_model(model_path, model_type, device, config_params=None):
    """
    Load the model and tokenizer.

    Args:
        model_path: path to the model weights
        model_type: model variant (model/model_original/model_no_feed)
        device: device to run on
        config_params: dict of model configuration parameters

    Returns:
        model: the loaded model
        tokenizer: the tokenizer instance
    """
    # Initialize the configuration
    if config_params:
        lm_config = LMConfig(**config_params)
    else:
        lm_config = LMConfig()

    # Print the configuration
    print(f"Model configuration:")
    print(f"  dim: {lm_config.dim}")
    print(f"  n_layers: {lm_config.n_layers}")
    print(f"  n_heads: {lm_config.n_heads}")
    print(f"  vocab_size: {lm_config.vocab_size}")
    print(f"  max_seq_len: {lm_config.max_seq_len}")
    if hasattr(lm_config, 'knowledge_num'):
        print(f"  knowledge_num: {lm_config.knowledge_num}")
        print(f"  knowledge_length: {lm_config.knowledge_length}")
        print(f"  knowledge_dim: {lm_config.knowledge_dim}")
    print()

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

    # Import the model class matching the requested type
    if model_type == "model":
        from model.model import MiniMindLM
    elif model_type == "model_original":
        from model.model_original import MiniMindLM
    elif model_type == "model_no_feed":
        from model.model_no_feed import MiniMindLM
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

    # Initialize the model
    model = MiniMindLM(lm_config)

    # Load the weights
    if os.path.exists(model_path):
        print(f"Loading model weights from {model_path}...")

        state_dict = torch.load(model_path, map_location=device)

        # Collect the parameter names
        model_keys = set(model.state_dict().keys())
        checkpoint_keys = set(state_dict.keys())

        # Weight-matching statistics
        matched_keys = model_keys & checkpoint_keys
        missing_keys = model_keys - checkpoint_keys
        unexpected_keys = checkpoint_keys - model_keys

        print(f"\nWeight loading details:")
        print(f"  Total model parameters: {len(model_keys)}")
        print(f"  Checkpoint parameters: {len(checkpoint_keys)}")
        print(f"  Matched: {len(matched_keys)}")
        print(f"  Missing: {len(missing_keys)}")
        print(f"  Unexpected: {len(unexpected_keys)}")

        # List the missing and unexpected parameters in detail
        if missing_keys:
            print(f"\n❌ Missing parameters ({len(missing_keys)}):")
            for key in sorted(missing_keys):
                print(f"  - {key}")

        if unexpected_keys:
            print(f"\n⚠️ Unexpected parameters in the checkpoint ({len(unexpected_keys)}):")
            for key in sorted(unexpected_keys):
                print(f"  + {key}")

        # Load the weights (allowing a partial match)
        try:
            incompatible_keys = model.load_state_dict(state_dict, strict=False)

            # Check the result
            if len(incompatible_keys.missing_keys) == 0 and len(incompatible_keys.unexpected_keys) == 0:
                print(f"\n✅ Weights loaded perfectly!")
            elif len(incompatible_keys.missing_keys) == 0:
                print(f"\n✅ Weights loaded (unexpected parameters ignored)")
            else:
                print(f"\n⚠️ Weights loaded partially; some parameters are missing")
                print(f"  This may hurt model quality; check the configuration parameters")

            # Load success rate
            success_rate = len(matched_keys) / len(model_keys) * 100
            print(f"  Parameter load success rate: {success_rate:.1f}%")

            if success_rate < 90:
                print(f"  ❌ Warning: success rate is very low; the model may not work!")
            elif success_rate < 100:
                print(f"  ⚠️ Warning: some parameters are missing; quality may suffer")

        except Exception as e:
            raise RuntimeError(f"Failed to load weights: {e}")

        # Verify the shapes of the key layers
        print("🔍 Verifying key layer shapes:")
        key_layers = [
            'tok_embeddings.weight',
            'output.weight',
            'norm.weight',
        ]

        # Per-layer checks
        for i in range(lm_config.n_layers):
            key_layers.extend([
                f'layers.{i}.attention_norm.weight',
                f'layers.{i}.ffn_norm.weight',
                f'layers.{i}.self_attention.wq.weight',
                f'layers.{i}.self_attention.wk.weight',
                f'layers.{i}.self_attention.wv.weight',
                f'layers.{i}.self_attention.wo.weight',
            ])

            # FFN layers (model_original has an FFN; other variants may not)
            if f'layers.{i}.feed_forward.w1.weight' in model_keys:
                key_layers.extend([
                    f'layers.{i}.feed_forward.w1.weight',
                    f'layers.{i}.feed_forward.w2.weight',
                    f'layers.{i}.feed_forward.w3.weight',
                ])

        # KnowledgeDataset layers (only model and model_no_feed)
        if model_type in ['model', 'model_no_feed']:
            key_layers.extend([
                'knowledge_dataset.to_queries.0.weight',
                'knowledge_dataset.keys',
                'knowledge_dataset.knowledge_dataset',
            ])

            # CrossAttention layers
            for i in range(lm_config.n_layers):
                key_layers.extend([
                    f'layers.{i}.cross_attention.to_q.weight',
                    f'layers.{i}.cross_attention.to_k.weight',
                    f'layers.{i}.cross_attention.to_v.weight',
                    f'layers.{i}.cross_attention.to_out.weight',
                ])

        # Check the key layers
        verified_layers = 0
        total_key_layers = 0

        for layer_name in key_layers:
            if layer_name in model_keys:  # only check layers that exist in the model
                total_key_layers += 1
                if layer_name in matched_keys:
                    verified_layers += 1
                    expected_shape = model.state_dict()[layer_name].shape
                    actual_shape = state_dict[layer_name].shape if layer_name in state_dict else "missing"
                    if layer_name in state_dict and expected_shape == actual_shape:
                        print(f"  ✅ {layer_name}: {actual_shape}")
                    else:
                        print(f"  ❌ {layer_name}: expected {expected_shape}, got {actual_shape}")
                else:
                    print(f"  ❌ {layer_name}: missing")

        print(f"\nKey layer verification: {verified_layers}/{total_key_layers} layers verified")

        if verified_layers == total_key_layers:
            print("✅ All key layers verified!")
        elif verified_layers / total_key_layers >= 0.9:
            print("⚠️ Most key layers verified; the model should work")
        else:
            print("❌ Too many key layers failed verification; the model may not work!")

        print()
    else:
        raise FileNotFoundError(f"Model file not found: {model_path}")

    model.to(device)
    model.eval()

    return model, tokenizer


def load_eval_data(data_path, num_samples=20):
    """
    Load the evaluation dataset.

    Args:
        data_path: path to the data file
        num_samples: number of samples to evaluate

    Returns:
        samples: list of data samples
    """
    data = []
    with open(data_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            line = line.strip()
            if line:  # skip empty lines
                try:
                    sample = json.loads(line)
                    data.append(sample)
                    if len(data) >= num_samples:
                        break
                except json.JSONDecodeError as e:
                    print(f"Warning: failed to parse JSON on line {line_num+1}: {e}")
                    continue

    # Keep only the first num_samples entries
    samples = data[:num_samples]
    print(f"Loaded {len(samples)} evaluation samples")

    return samples


def evaluate_sample(model, tokenizer, text, input_length=100, predict_length=100, device='cuda'):
    """
    Evaluate a single sample.

    Args:
        model: the model instance
        tokenizer: the tokenizer instance
        text: input text
        input_length: number of input tokens
        predict_length: number of tokens to predict
        device: device to run on

    Returns:
        input_text: the input text
        predicted_text: the predicted text
        ground_truth_text: the reference text
        loss: the prediction loss (if computable)
        generation_stats: statistics about the generation
    """
    # Tokenize the text
    tokens = tokenizer.encode(text, add_special_tokens=False)

    # Make sure there are enough tokens
    if len(tokens) < input_length + predict_length:
        print(f"Warning: text too short, only {len(tokens)} tokens")
        # Bug fix: return five values so the caller's unpacking does not fail.
        return None, None, None, None, None

    # Split into input and target
    input_tokens = tokens[:input_length]
    target_tokens = tokens[input_length:input_length + predict_length]

    # Convert to a tensor
    input_ids = torch.tensor([input_tokens], dtype=torch.long).to(device)

    # Generate a prediction
    with torch.no_grad():
        # Use the generate method; parameters tuned for better output quality
        generated = model.generate(
            input_ids,
            max_new_tokens=predict_length,
            temperature=1.0,
            top_p=0.95,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id
        )

        # Extract the generated tokens (dropping the input part);
        # generated holds the full sequence, so the new part starts at input_length
        full_generated_tokens = generated[0].tolist()
        if len(full_generated_tokens) > input_length:
            predicted_tokens = full_generated_tokens[input_length:]
        else:
            # If the sequence is no longer than the input, nothing new was generated
            predicted_tokens = []

        # Check whether generation stopped early on an EOS token
        eos_found = False
        eos_position = -1
        actual_predicted_length = len(predicted_tokens)

        if predicted_tokens and tokenizer.eos_token_id is not None:
            try:
                eos_position = predicted_tokens.index(tokenizer.eos_token_id)
                eos_found = True
                # Keep only the content before the EOS token
                predicted_tokens = predicted_tokens[:eos_position]
                actual_predicted_length = len(predicted_tokens)
            except ValueError:
                # No EOS token found
                pass

        # Compute the loss via a forward pass
        # Prepare the input for the loss computation
        loss_input_ids = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
        outputs = model(loss_input_ids)  # logits_to_keep argument removed

        # Compute the loss
        logits = outputs.logits
        loss = None
        if logits is not None:
            # Reshape the logits and targets — fix: use the correct position slice.
            # (Note: this window is still one position late; eval_model_final_fixed.py
            # shifts the start to input_length-1.)
            shift_logits = logits[0, input_length:input_length + predict_length, :].contiguous()
            shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)

            # Cross-entropy loss
            loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
            loss = loss.item()

    # Decode the texts
    input_text = tokenizer.decode(input_tokens, skip_special_tokens=True)
    # Decode only the actually generated tokens, capped at predict_length
    actual_predicted_tokens = predicted_tokens[:predict_length] if predicted_tokens else []
    predicted_text = tokenizer.decode(actual_predicted_tokens, skip_special_tokens=True) if actual_predicted_tokens else "[nothing generated]"
    ground_truth_text = tokenizer.decode(target_tokens, skip_special_tokens=True)

    # Extra generation statistics
    generation_stats = {
        'requested_length': predict_length,
        'actual_length': actual_predicted_length,
        'eos_found': eos_found,
        'eos_position': eos_position if eos_found else None,
        'truncated_by_eos': eos_found and eos_position < predict_length
    }

    return input_text, predicted_text, ground_truth_text, loss, generation_stats


def main():
    parser = argparse.ArgumentParser(description='Evaluate a pretrained model')
    parser.add_argument('--model_path', type=str, default='out/experiment_1_4_0/pretrain_512.pth',
                        help='path to the model weights')
    parser.add_argument('--model_type', type=str, default='model',
                        choices=['model', 'model_original', 'model_no_feed'],
                        help='model type')
    parser.add_argument('--data_path', type=str, default='dataset/stable/eval_data.json',
                        help='path to the evaluation dataset')
    parser.add_argument('--num_samples', type=int, default=20,
                        help='number of samples to evaluate')
    parser.add_argument('--input_length', type=int, default=100,
                        help='input length in tokens')
    parser.add_argument('--predict_length', type=int, default=100,
                        help='prediction length in tokens')
    parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu',
                        help='device to run on')

    # Model architecture parameters
    parser.add_argument('--dim', type=int, default=512,
                        help='model dimension')
    parser.add_argument('--n_layers', type=int, default=8,
                        help='number of transformer layers')
    parser.add_argument('--n_heads', type=int, default=32,
                        help='number of attention heads')
    parser.add_argument('--n_kv_heads', type=int, default=8,
                        help='number of KV attention heads')
    parser.add_argument('--vocab_size', type=int, default=6400,
                        help='vocabulary size')
    parser.add_argument('--max_seq_len', type=int, default=512,
                        help='maximum sequence length')
    parser.add_argument('--dropout', type=float, default=0.0,
                        help='dropout rate')
    parser.add_argument('--norm_eps', type=float, default=1e-5,
                        help='layer norm epsilon')
    parser.add_argument('--rope_theta', type=float, default=1e6,
                        help='RoPE theta')

    # KnowledgeDataset parameters (used only by model and model_no_feed)
    parser.add_argument('--knowledge_num', type=int, default=1048576,
                        help='number of knowledge entries')
    parser.add_argument('--knowledge_length', type=int, default=32,
                        help='length of a single knowledge entry')
    parser.add_argument('--knowledge_dim', type=int, default=128,
                        help='knowledge dimension')

    # MOE parameters
    parser.add_argument('--use_moe', action='store_true',
                        help='whether to use MOE')
    parser.add_argument('--num_experts_per_tok', type=int, default=2,
                        help='number of experts activated per token')
    parser.add_argument('--n_routed_experts', type=int, default=4,
                        help='number of routed experts')

    args = parser.parse_args()

    print(f"Evaluation configuration:")
    print(f"  Model path: {args.model_path}")
    print(f"  Model type: {args.model_type}")
    print(f"  Data path: {args.data_path}")
    print(f"  Samples: {args.num_samples}")
    print(f"  Input length: {args.input_length} tokens")
    print(f"  Prediction length: {args.predict_length} tokens")
    print(f"  Device: {args.device}")
    print()

    # Build the configuration dict
    config_params = {
        'dim': args.dim,
        'n_layers': args.n_layers,
        'n_heads': args.n_heads,
        'n_kv_heads': args.n_kv_heads,
        'vocab_size': args.vocab_size,
        'max_seq_len': args.max_seq_len,
        'dropout': args.dropout,
        'norm_eps': args.norm_eps,
        'rope_theta': args.rope_theta,
        'use_moe': args.use_moe,
        'num_experts_per_tok': args.num_experts_per_tok,
        'n_routed_experts': args.n_routed_experts,
    }

    # Only model and model_no_feed take the KnowledgeDataset parameters
    if args.model_type in ['model', 'model_no_feed']:
        config_params.update({
            'knowledge_num': args.knowledge_num,
            'knowledge_length': args.knowledge_length,
            'knowledge_dim': args.knowledge_dim,
        })

    # Load the model
    model, tokenizer = load_model(args.model_path, args.model_type, args.device, config_params)

    # Load the data
    samples = load_eval_data(args.data_path, args.num_samples)

    # Evaluate each sample
    total_loss = 0
    valid_samples = 0
    total_requested_tokens = 0
    total_actual_tokens = 0
    samples_with_eos = 0
    samples_truncated_by_eos = 0

    for i, sample in enumerate(samples):
        print(f"\n{'='*60}")
        print(f"Sample {i+1}/{len(samples)}")
        print(f"{'='*60}")

        text = sample['text']

        # Evaluate the sample
        input_text, predicted_text, ground_truth_text, loss, generation_stats = evaluate_sample(
            model, tokenizer, text,
            args.input_length, args.predict_length, args.device
        )

        if input_text is None:
            print("Skipping sample (text too short)")
            continue

        # Print the results
        print(f"\nInput ({args.input_length} tokens):")
        print(f"  {input_text}")
        print(f"\nPrediction (requested {generation_stats['requested_length']} tokens, generated {generation_stats['actual_length']}):")
        print(f"  {predicted_text}")
        print(f"\nGround truth ({args.predict_length} tokens):")
        print(f"  {ground_truth_text}")

        # Print the generation statistics
        print(f"\nGeneration statistics:")
        print(f"  Requested: {generation_stats['requested_length']} tokens")
        print(f"  Generated: {generation_stats['actual_length']} tokens")
        if generation_stats['eos_found']:
            print(f"  ✅ EOS token found at position {generation_stats['eos_position']}")
            if generation_stats['truncated_by_eos']:
                print(f"  ⚠️ Generation ended early on an EOS token")
            else:
                print(f"  ✅ EOS token appeared at the expected position")
        else:
            print(f"  ❌ No EOS token found (generation may have hit the length limit)")

        if loss is not None:
            print(f"\nLoss: {loss:.4f}")
            total_loss += loss
            valid_samples += 1

        # Update the aggregate generation statistics
        total_requested_tokens += generation_stats['requested_length']
        total_actual_tokens += generation_stats['actual_length']
        if generation_stats['eos_found']:
            samples_with_eos += 1
            if generation_stats['truncated_by_eos']:
                samples_truncated_by_eos += 1

    # Print the overall statistics
    if valid_samples > 0:
        print(f"\n{'='*60}")
        print(f"Overall statistics:")
        print(f"  Valid samples: {valid_samples}")
        print(f"  Average loss: {total_loss / valid_samples:.4f}")
        print()
        print(f"Generation statistics:")
        print(f"  Total requested tokens: {total_requested_tokens}")
        print(f"  Total generated tokens: {total_actual_tokens}")
        print(f"  Completion rate: {total_actual_tokens / total_requested_tokens * 100:.1f}%" if total_requested_tokens > 0 else "  Completion rate: N/A")
        print(f"  Samples with EOS: {samples_with_eos}/{len(samples)} ({samples_with_eos/len(samples)*100:.1f}%)" if len(samples) > 0 else "  Samples with EOS: N/A")
        print(f"  Samples truncated by EOS: {samples_truncated_by_eos}/{len(samples)} ({samples_truncated_by_eos/len(samples)*100:.1f}%)" if len(samples) > 0 else "  Samples truncated by EOS: N/A")
        print(f"  Mean generated length per sample: {total_actual_tokens/len(samples):.1f} tokens" if len(samples) > 0 else "  Mean generated length per sample: N/A")
        print(f"{'='*60}")


if __name__ == "__main__":
    main()
@ -1,218 +0,0 @@
#!/usr/bin/env python3
"""
Final fix for the position-indexing error in eval_model.py
"""

import json
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
from model.LMConfig import LMConfig
from model.model_original import MiniMindLM


def demonstrate_correct_fix():
    """
    Demonstrate the correct fix
    """
    print("🔧 Demonstrating the correct fix")
    print("="*60)

    device = 'cuda'
    model_path = 'out/experiment_1_4_0/pretrain_512.pth'

    # Load the model
    config = LMConfig(
        dim=512, n_layers=8, n_heads=32, vocab_size=6400, max_seq_len=512,
        dropout=0.0, norm_eps=1e-5, rope_theta=1e6, use_moe=False
    )

    model = MiniMindLM(config)
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict, strict=False)
    model.to(device)
    model.eval()

    # Test multiple samples to verify the fix
    total_loss_wrong = 0
    total_loss_correct = 0
    valid_samples = 0

    print("Per-sample loss comparison:")
    print("Sample | Wrong method | Correct method | Diff")
    print("-" * 45)

    with open('dataset/stable/eval_data_from_train.json', 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= 10:  # test the first 10 samples
                break

            sample = json.loads(line.strip())
            text = sample['text']
            tokens = tokenizer.encode(text, add_special_tokens=False)

            if len(tokens) < 130:
                continue

            input_length = 100
            predict_length = 30
            target_tokens = tokens[input_length:input_length + predict_length]

            with torch.no_grad():
                full_input = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
                target_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)

                # Get the full logits
                outputs = model(full_input)
                logits = outputs.logits

                # Wrong method (what eval_model.py used to do)
                wrong_slice = logits[0, -predict_length:, :].contiguous()  # last 30 positions
                loss_wrong = F.cross_entropy(wrong_slice, target_labels, reduction='mean')

                # Correct method
                correct_slice = logits[0, input_length-1:input_length+predict_length-1, :].contiguous()  # positions 99:129
                loss_correct = F.cross_entropy(correct_slice, target_labels, reduction='mean')

                total_loss_wrong += loss_wrong.item()
                total_loss_correct += loss_correct.item()
                valid_samples += 1

                diff = loss_wrong.item() - loss_correct.item()
                print(f"{i+1:2} | {loss_wrong.item():8.4f} | {loss_correct.item():8.4f} | {diff:+6.4f}")

    avg_loss_wrong = total_loss_wrong / valid_samples
    avg_loss_correct = total_loss_correct / valid_samples
    improvement = avg_loss_wrong - avg_loss_correct

    print("-" * 45)
    print(f"Mean | {avg_loss_wrong:8.4f} | {avg_loss_correct:8.4f} | {improvement:+6.4f}")

    print("\n📊 Effect of the fix:")
    print(f"  Average loss, wrong method: {avg_loss_wrong:.4f}")
    print(f"  Average loss, correct method: {avg_loss_correct:.4f}")
    print(f"  Improvement: {improvement:.4f} ({improvement/avg_loss_wrong*100:.1f}%)")
    print("  The correct method is much closer to the teacher-forcing loss seen in training (~2.4)")


def create_final_fixed_eval_model():
    """
    Create the final fixed version of eval_model.py
    """
    print("\n🔧 Creating the final fixed version of eval_model.py")
    print("="*60)

    # Read the original eval_model.py
    with open('eval_model.py', 'r', encoding='utf-8') as f:
        content = f.read()

    # Fix the key section of the evaluate_sample function
    old_loss_calculation = '''    # Compute the loss (via the forward method)
    # Prepare the input for the loss computation
    loss_input_ids = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
    outputs = model(loss_input_ids, logits_to_keep=predict_length)

    # Compute the loss
    logits = outputs.logits
    loss = None
    if logits is not None:
        # Reshape logits and targets
        shift_logits = logits[0, -predict_length:, :].contiguous()
        shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)

        # Cross-entropy loss
        loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
        loss = loss.item()'''

    new_loss_calculation = '''    # Compute the loss (via the forward method)
    # Prepare the input for the loss computation
    loss_input_ids = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
    outputs = model(loss_input_ids)  # logits_to_keep argument removed

    # Compute the loss
    logits = outputs.logits
    loss = None
    if logits is not None:
        # Reshape logits and targets - fix: use the correct position slice
        # In a Transformer, the logits at position i predict the token at position i+1
        # To predict the tokens at positions input_length .. input_length+predict_length-1
        # we need the logits at positions input_length-1 .. input_length+predict_length-2
        shift_logits = logits[0, input_length-1:input_length+predict_length-1, :].contiguous()
        shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)

        # Cross-entropy loss
        loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
        loss = loss.item()'''

    # Apply the replacement
    fixed_content = content.replace(old_loss_calculation, new_loss_calculation)

    # Save the fixed file
    with open('eval_model_final_fixed.py', 'w', encoding='utf-8') as f:
        f.write(fixed_content)

    print("✅ Created the final fixed version: eval_model_final_fixed.py")
    print("Key fixes:")
    print("  1. Removed the logits_to_keep argument (avoids a computation difference)")
    print("  2. Used the correct position slice: [input_length-1:input_length+predict_length-1]")
    print("  3. This accounts for position i predicting position i+1 in a Transformer")

    # Also fix the original file in place
    with open('eval_model.py', 'w', encoding='utf-8') as f:
        f.write(fixed_content)

    print("✅ Also fixed the original file in place: eval_model.py")


def test_final_fix():
    """
    Test the final fixed version
    """
    print("\n🧪 Testing the final fixed version")
    print("="*60)

    import subprocess

    # Run the fixed eval_model.py with a few samples for a quick test
    cmd = [
        '.venv/bin/python', 'eval_model.py',
        '--model_path', 'out/experiment_1_4_0/pretrain_512.pth',
        '--model_type', 'model_original',
        '--num_samples', '5',
        '--input_length', '100',
        '--predict_length', '30'
    ]

    print("Running command:")
    print(" ".join(cmd))
    print("\nResult:")

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)

        # Extract the key lines
        output_lines = result.stdout.split('\n')
        for line in output_lines:
            if 'Loss:' in line or 'Average loss:' in line or 'Overall statistics:' in line or 'Valid samples:' in line:
                print(line)

        if result.returncode == 0:
            print("\n✅ The fixed eval_model.py ran successfully!")
        else:
            print(f"\n❌ Run failed with exit code {result.returncode}")
            if result.stderr:
                print("Error output:")
                print(result.stderr[:500])

    except subprocess.TimeoutExpired:
        print("❌ Run timed out")
    except Exception as e:
        print(f"❌ Run failed with error: {e}")


if __name__ == "__main__":
    demonstrate_correct_fix()
    create_final_fixed_eval_model()
    test_final_fix()
@ -1,247 +0,0 @@
#!/usr/bin/env python3
"""
Fix the loss-computation error caused by the logits_to_keep argument
Verify the problem and provide a solution
"""

import json
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
from model.LMConfig import LMConfig
from model.model_original import MiniMindLM


def demonstrate_logits_to_keep_issue():
    """
    Demonstrate the problem caused by the logits_to_keep argument
    """
    print("🔍 Verifying the logits_to_keep problem")
    print("="*60)

    device = 'cuda'
    model_path = 'out/experiment_1_4_0/pretrain_512.pth'

    # Load the model
    config = LMConfig(
        dim=512, n_layers=8, n_heads=32, vocab_size=6400, max_seq_len=512,
        dropout=0.0, norm_eps=1e-5, rope_theta=1e6, use_moe=False
    )

    model = MiniMindLM(config)
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict, strict=False)
    model.to(device)
    model.eval()

    # Load the test data
    with open('dataset/stable/eval_data_from_train.json', 'r', encoding='utf-8') as f:
        sample = json.loads(f.readline().strip())

    text = sample['text']
    tokens = tokenizer.encode(text, add_special_tokens=False)

    input_tokens = tokens[:100]
    target_tokens = tokens[100:130]  # 30 target tokens

    print(f"Test sample: {len(tokens)} tokens")
    print(f"Input: {len(input_tokens)} tokens")
    print(f"Target: {len(target_tokens)} tokens")

    with torch.no_grad():
        full_input = torch.tensor([tokens[:130]], dtype=torch.long).to(device)
        target_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)

        print("\n🔬 Detailed comparison of the methods:")

        # Method 1: standard forward (correct)
        outputs1 = model(full_input)
        logits1 = outputs1.logits
        correct_logits = logits1[0, 99:129, :].contiguous()  # positions 99-128
        loss1 = F.cross_entropy(correct_logits, target_labels, reduction='mean')

        print("1. Standard forward (correct):")
        print(f"   Full logits shape: {logits1.shape}")
        print(f"   Logits used for the loss: {correct_logits.shape}")
        print(f"   Loss: {loss1.item():.4f}")

        # Method 2: logits_to_keep=30 (wrong)
        outputs2 = model(full_input, logits_to_keep=30)
        logits2 = outputs2.logits
        incorrect_logits = logits2[0, -30:, :].contiguous()  # last 30 positions
        loss2 = F.cross_entropy(incorrect_logits, target_labels, reduction='mean')

        print("\n2. logits_to_keep=30 (the eval_model.py method):")
        print(f"   Partial logits shape: {logits2.shape}")
        print(f"   Logits used for the loss: {incorrect_logits.shape}")
        print(f"   Loss: {loss2.item():.4f}")

        # Method 3: the fixed method (no logits_to_keep)
        # This is the same as method 1, shown separately to make the fix explicit
        print("\n3. Fixed method (no logits_to_keep):")
        print("   Full forward pass, then select the correct logits slice")
        print(f"   Identical to method 1, Loss: {loss1.item():.4f}")

        # Analyze the difference
        print("\n📊 Numerical analysis:")
        print(f"   Loss difference: {abs(loss2.item() - loss1.item()):.4f}")
        print(f"   Loss increase: {(loss2.item() / loss1.item() - 1) * 100:.1f}%")

        # Check how tiny logits differences get amplified
        logits_diff = torch.abs(correct_logits - incorrect_logits).max()
        print(f"   Max logits difference: {logits_diff.item():.8f}")

        # Difference in softmax probabilities
        prob1 = F.softmax(correct_logits, dim=-1)
        prob2 = F.softmax(incorrect_logits, dim=-1)
        prob_diff = torch.abs(prob1 - prob2).max()
        print(f"   Max probability difference: {prob_diff.item():.8f}")

        print("\n💡 Conclusion:")
        print(f"   Although the logits difference is tiny ({logits_diff.item():.8f}),")
        print(f"   cross-entropy amplifies it substantially, increasing the loss by {(loss2.item() / loss1.item() - 1) * 100:.1f}%")


def create_fixed_eval_model():
    """
    Create the fixed eval_model.py
    """
    print("\n🔧 Creating the fixed evaluation script")
    print("="*60)

    # Read the original eval_model.py
    with open('eval_model.py', 'r', encoding='utf-8') as f:
        content = f.read()

    # Fix the key section: remove the use of logits_to_keep
    fixed_content = content.replace(
        """    # Compute the loss (via the forward method)
    # Prepare the input for the loss computation
    loss_input_ids = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
    outputs = model(loss_input_ids, logits_to_keep=predict_length)

    # Compute the loss
    logits = outputs.logits
    loss = None
    if logits is not None:
        # Reshape logits and targets
        shift_logits = logits[0, -predict_length:, :].contiguous()
        shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)

        # Cross-entropy loss
        loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
        loss = loss.item()""",
        """    # Compute the loss (via the forward method)
    # Prepare the input for the loss computation
    loss_input_ids = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
    outputs = model(loss_input_ids)  # logits_to_keep argument removed

    # Compute the loss
    logits = outputs.logits
    loss = None
    if logits is not None:
        # Reshape logits and targets - fix: use the correct position slice
        shift_logits = logits[0, input_length:input_length + predict_length, :].contiguous()
        shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)

        # Cross-entropy loss
        loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
        loss = loss.item()"""
    )

    # Save the fixed file
    with open('eval_model_fixed.py', 'w', encoding='utf-8') as f:
        f.write(fixed_content)

    print("✅ Created the fixed version: eval_model_fixed.py")
    print("Key fixes:")
    print("  1. Removed the logits_to_keep argument")
    print("  2. Used the position slice [input_length:input_length + predict_length]")
    print("  3. Instead of the wrong [-predict_length:]")


def test_fixed_evaluation():
    """
    Test the fixed evaluation method
    """
    print("\n🧪 Testing the fixed evaluation method")
    print("="*60)

    device = 'cuda'
    model_path = 'out/experiment_1_4_0/pretrain_512.pth'

    # Load the model
    config = LMConfig(
        dim=512, n_layers=8, n_heads=32, vocab_size=6400, max_seq_len=512,
        dropout=0.0, norm_eps=1e-5, rope_theta=1e6, use_moe=False
    )

    model = MiniMindLM(config)
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict, strict=False)
    model.to(device)
    model.eval()

    # Test multiple samples
    total_loss_old = 0
    total_loss_fixed = 0
    valid_samples = 0

    with open('dataset/stable/eval_data_from_train.json', 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= 10:  # test the first 10 samples
                break

            sample = json.loads(line.strip())
            text = sample['text']
            tokens = tokenizer.encode(text, add_special_tokens=False)

            if len(tokens) < 130:
                continue

            input_length = 100
            predict_length = 30
            input_tokens = tokens[:input_length]
            target_tokens = tokens[input_length:input_length + predict_length]

            with torch.no_grad():
                full_input = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
                target_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)

                # Original (wrong) method
                outputs_old = model(full_input, logits_to_keep=predict_length)
                logits_old = outputs_old.logits
                shift_logits_old = logits_old[0, -predict_length:, :].contiguous()
                loss_old = F.cross_entropy(shift_logits_old, target_labels, reduction='mean')

                # Fixed method
                outputs_fixed = model(full_input)
                logits_fixed = outputs_fixed.logits
                shift_logits_fixed = logits_fixed[0, input_length:input_length + predict_length, :].contiguous()
                loss_fixed = F.cross_entropy(shift_logits_fixed, target_labels, reduction='mean')

                total_loss_old += loss_old.item()
                total_loss_fixed += loss_fixed.item()
                valid_samples += 1

                print(f"Sample {i+1}: original {loss_old.item():.4f} -> fixed {loss_fixed.item():.4f}")

    avg_loss_old = total_loss_old / valid_samples
    avg_loss_fixed = total_loss_fixed / valid_samples

    print("\n📊 Test summary:")
    print(f"  Samples tested: {valid_samples}")
    print(f"  Average loss, original method: {avg_loss_old:.4f}")
    print(f"  Average loss, fixed method: {avg_loss_fixed:.4f}")
    print(f"  Difference: {abs(avg_loss_old - avg_loss_fixed):.4f}")
    print("  The fixed loss is closer to the teacher-forcing loss seen in training (~2.4)")


if __name__ == "__main__":
    demonstrate_logits_to_keep_issue()
    create_fixed_eval_model()
    test_fixed_evaluation()
@ -1,211 +0,0 @@
#!/usr/bin/env python3
"""
In-depth investigation of how the logits_to_keep argument affects the loss computation
"""

import json
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
from model.LMConfig import LMConfig
from model.model_original import MiniMindLM


def investigate_logits_to_keep_issue():
    """
    Investigate the impact of the logits_to_keep argument
    """
    print("🔍 Investigating the impact of logits_to_keep")
    print("="*60)

    device = 'cuda'
    model_path = 'out/experiment_1_4_0/pretrain_512.pth'

    # Load the model
    config = LMConfig(
        dim=512, n_layers=8, n_heads=32, vocab_size=6400, max_seq_len=512,
        dropout=0.0, norm_eps=1e-5, rope_theta=1e6, use_moe=False
    )

    model = MiniMindLM(config)
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict, strict=False)
    model.to(device)
    model.eval()

    # Load the test data
    with open('dataset/stable/eval_data_from_train.json', 'r', encoding='utf-8') as f:
        sample = json.loads(f.readline().strip())

    text = sample['text']
    tokens = tokenizer.encode(text, add_special_tokens=False)

    input_tokens = tokens[:100]
    target_tokens = tokens[100:130]  # 30 target tokens

    print(f"Test text length: {len(tokens)} tokens")
    print(f"Input: {len(input_tokens)} tokens")
    print(f"Target: {len(target_tokens)} tokens")

    with torch.no_grad():
        # Method 1: standard forward (as in training)
        full_input = torch.tensor([tokens[:130]], dtype=torch.long).to(device)
        outputs1 = model(full_input)
        logits1 = outputs1.logits

        # Compute the loss the way training does
        shift_logits1 = logits1[0, 99:129, :].contiguous()
        shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)
        loss1 = F.cross_entropy(shift_logits1, shift_labels, reduction='mean')

        print("\nMethod 1 (standard forward):")
        print(f"  logits shape: {logits1.shape}")
        print(f"  logits used for the loss: {shift_logits1.shape}")
        print(f"  Loss: {loss1.item():.4f}")

        # Method 2: logits_to_keep=30 (the eval_model.py way)
        outputs2 = model(full_input, logits_to_keep=30)
        logits2 = outputs2.logits

        if logits2 is not None:
            print("\nMethod 2 (logits_to_keep=30):")
            print(f"  logits shape: {logits2.shape}")

            # Compute the loss the way eval_model.py does
            shift_logits2 = logits2[0, -30:, :].contiguous()
            loss2 = F.cross_entropy(shift_logits2, shift_labels, reduction='mean')
            print(f"  logits used for the loss: {shift_logits2.shape}")
            print(f"  Loss: {loss2.item():.4f}")

            # Check whether the logits match
            expected_logits = logits1[0, 100:130, :]  # positions 100-129
            actual_logits = logits2[0, -30:, :]       # last 30 positions

            print("\nElement-wise comparison:")
            print(f"  Expected logits shape: {expected_logits.shape}")
            print(f"  Actual logits shape: {actual_logits.shape}")

            # Equality check
            are_equal = torch.allclose(expected_logits, actual_logits, rtol=1e-4)
            print(f"  logits equal: {are_equal}")

            if not are_equal:
                diff = torch.abs(expected_logits - actual_logits).max()
                print(f"  Max difference: {diff.item():.6f}")

                # Differences at the first few positions
                for i in range(min(5, expected_logits.shape[0])):
                    pos_diff = torch.abs(expected_logits[i] - actual_logits[i]).max()
                    print(f"  Position {i} max difference: {pos_diff.item():.6f}")
        else:
            print("\nMethod 2: logits is None")

        # Method 3: various logits_to_keep values
        print("\nTesting different logits_to_keep values:")
        for keep_value in [10, 20, 30, 50, 100]:
            outputs_test = model(full_input, logits_to_keep=keep_value)
            if outputs_test.logits is not None:
                test_logits_shape = outputs_test.logits.shape
                print(f"  logits_to_keep={keep_value}: {test_logits_shape}")
            else:
                print(f"  logits_to_keep={keep_value}: None")


def check_model_forward_implementation():
    """Inspect how logits_to_keep is implemented in the model's forward method"""
    print("\n" + "="*60)
    print("🔍 Inspecting the model's forward implementation")

    # Read the logits_to_keep related code from the model source
    try:
        with open('model/model_original.py', 'r', encoding='utf-8') as f:
            content = f.read()

        # Find the lines that mention logits_to_keep
        lines = content.split('\n')
        for i, line in enumerate(lines):
            if 'logits_to_keep' in line:
                print(f"Line {i+1}: {line.strip()}")
                # Print a little surrounding context
                for j in range(max(0, i-2), min(len(lines), i+3)):
                    if j != i:
                        print(f"Line {j+1}: {lines[j].strip()}")
                print()
    except FileNotFoundError:
        print("Could not read model/model_original.py")


def compare_with_original_eval_script():
    """
    Reproduce the behaviour of the original eval_model.py
    """
    print("\n" + "="*60)
    print("🔍 Reproducing the behaviour of eval_model.py")

    device = 'cuda'
    model_path = 'out/experiment_1_4_0/pretrain_512.pth'

    # Mirror the relevant logic from eval_model.py
    config = LMConfig(
        dim=512, n_layers=8, n_heads=32, vocab_size=6400, max_seq_len=512,
        dropout=0.0, norm_eps=1e-5, rope_theta=1e6, use_moe=False
    )

    model = MiniMindLM(config)
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict, strict=False)
    model.to(device)
    model.eval()

    # Load the data
    with open('dataset/stable/eval_data_from_train.json', 'r', encoding='utf-8') as f:
        sample = json.loads(f.readline().strip())

    text = sample['text']
    tokens = tokenizer.encode(text, add_special_tokens=False)

    input_length = 100
    predict_length = 30

    input_tokens = tokens[:input_length]
    target_tokens = tokens[input_length:input_length + predict_length]

    print("Reproducing the eval_model.py computation:")
    print(f"  input_length: {input_length}")
    print(f"  predict_length: {predict_length}")

    with torch.no_grad():
        # Exactly as eval_model.py does it
        loss_input_ids = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
        outputs = model(loss_input_ids, logits_to_keep=predict_length)

        print(f"  loss_input_ids shape: {loss_input_ids.shape}")
        print(f"  logits_to_keep argument: {predict_length}")

        logits = outputs.logits
        loss = None
        if logits is not None:
            print(f"  output logits shape: {logits.shape}")

            # Reshape logits and targets
            shift_logits = logits[0, -predict_length:, :].contiguous()
            shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)

            print(f"  shift_logits shape: {shift_logits.shape}")
            print(f"  shift_labels shape: {shift_labels.shape}")

            # Cross-entropy loss
            loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
            print(f"  computed loss: {loss.item():.4f}")
        else:
            print("  logits is None")


if __name__ == "__main__":
    investigate_logits_to_keep_issue()
    check_model_forward_implementation()
    compare_with_original_eval_script()
@ -1,181 +0,0 @@
# Training vs. Inference Loss Gap Analysis Report

> **Experiment**: Experiment 1.4.0
> **Date**: 2025-07-31
> **Analyst**: Claude AI
> **Status**: completed, key issue fixed

---

## 📋 Problem Overview

### Initial observation
A large gap was found between the training loss (2.43) and the inference loss (12.34), prompting a detailed analysis.

**Key numbers**:
- Training loss: 2.43
- Initial inference loss: 12.34
- Gap: 9.91 (a 405% increase)

### Candidate explanations
1. Data differences
2. Problems in the inference script (weight loading, model mismatch)
3. Training/inference mode mismatch (error accumulation)
4. KV-cache issues

---

## 🔍 Analysis Process

### Phase 1: data consistency check
**Method**: re-extract 20 samples from the training data to build eval_data_from_train.json

**Result**: ✅ confirmed the evaluation data comes from the training set, ruling out data differences

### Phase 2: model loading check
**Method**: verify that the weights load and match

**Result**: ✅ all weights loaded successfully (75/75 parameters matched), ruling out model-loading problems

### Phase 3: training vs. inference mode comparison
**Method**: compare teacher forcing with autoregressive generation

**Key finding**:
```
Teacher-forcing loss:     ~2.43  (matches training)
True autoregressive loss: ~10-11 (close to the inference loss)
```

**Preliminary conclusion**: the training/inference difference mostly comes from the different computation modes, which is normal in itself (a sketch of the two modes follows below)
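To make the distinction concrete, here is a minimal sketch of the two evaluation modes. It assumes only the `model(ids).logits` interface used throughout this report; the helper `autoregressive_loss` is hypothetical, written for illustration rather than taken from any project script.

```python
import torch

# Teacher forcing: one forward pass over the full true sequence; the logits
# at every position are conditioned on the *true* prefix.
# Autoregressive: each step is conditioned on the model's *own* earlier
# picks, so early mistakes compound; this is what generation measures.
def autoregressive_loss(model, prompt_ids, target_ids):
    ids, nll = prompt_ids, 0.0
    for t in target_ids:
        logp = torch.log_softmax(model(ids).logits[0, -1, :], dim=-1)
        nll -= logp[t].item()            # score the true token
        pred = logp.argmax().view(1, 1)  # but feed back the model's own pick
        ids = torch.cat([ids, pred], dim=1)
    return nll / len(target_ids)
```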
### Phase 4: investigating the logits_to_keep argument
**Method**: analyze the impact of the logits_to_keep argument in eval_model.py

**Striking finding**:
```
Standard forward:       Loss = 3.4188
With logits_to_keep=30: Loss = 9.8785
Gap: a 188.9% increase!
```

### Phase 5: deep dive into position indexing
**Method**: analyze the correctness of the Transformer position indexing

**Root cause**:
1. **Wrong method**: `logits[0, -predict_length:, :]`
2. **Correct method**: `logits[0, input_length-1:input_length+predict_length-1, :]`
3. **Key insight**: in a Transformer, the logits at position i predict the token at position i+1 (see the index sketch below)
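The off-by-one is easiest to see with plain index arithmetic. The sizes below (100 input tokens, 30 predicted) are the ones used throughout this report; the snippet is purely illustrative:

```python
# A 130-token sequence occupies positions 0..129; the labels are tokens 100..129.
input_length, predict_length = 100, 30
seq_len = input_length + predict_length

# Correct: the logits at position i score the token at position i+1,
# so labels 100..129 need the logits from positions 99..128.
correct = list(range(input_length - 1, input_length + predict_length - 1))

# Wrong: the last predict_length positions are 100..129, so every label
# is paired with logits that sit one step too late.
wrong = list(range(seq_len - predict_length, seq_len))

print(correct[0], correct[-1])  # 99 128
print(wrong[0], wrong[-1])      # 100 129
```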
---

## 🛠️ Fix

### Core fix
**File**: `eval_model.py`

**Before**:
```python
outputs = model(loss_input_ids, logits_to_keep=predict_length)
shift_logits = logits[0, -predict_length:, :].contiguous()
```

**After**:
```python
outputs = model(loss_input_ids)  # logits_to_keep removed
shift_logits = logits[0, input_length-1:input_length+predict_length-1, :].contiguous()
```

### Why the fix works
1. **Removing the logits_to_keep argument**: avoids the computation difference
2. **Using the correct position slice**: accounts for the Transformer's position offset
3. **Ensuring consistency**: aligns with the teacher-forcing computation used during training (a minimal sketch follows)
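For reference, a self-contained sketch of the corrected teacher-forcing loss computation. Random tensors stand in for the real model's logits (an assumption made for illustration), so only the shapes and the slice match this report's setup:

```python
import torch
import torch.nn.functional as F

input_length, predict_length, vocab_size = 100, 30, 6400
seq_len = input_length + predict_length

# Stand-ins for model(loss_input_ids).logits and the target tokens.
logits = torch.randn(1, seq_len, vocab_size)
target_tokens = torch.randint(0, vocab_size, (predict_length,))

# Labels 100..129 are scored by the logits at positions 99..128.
shift_logits = logits[0, input_length - 1:input_length + predict_length - 1, :]
loss = F.cross_entropy(shift_logits, target_tokens, reduction='mean')
print(f"Loss: {loss.item():.4f}")  # about ln(6400) ≈ 8.76 for random logits
```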
---

## 📊 Verifying the Fix

### Per-sample comparison
```
Sample | Wrong method | Correct method | Improvement
-------|--------------|----------------|------------
1      | 9.88         | 3.42           | 65.3%
2      | 13.56        | 1.50           | 88.9%
3      | 13.62        | 1.78           | 86.9%
...
Mean   | 12.34        | 2.73           | 77.9%
```

### Final validation
**10-sample evaluation after the fix**:
- Average loss: 2.26
- Difference from the training loss (2.43): only 0.17 (7%)
- Improvement: 81.7% (from 12.34 down to 2.26)

---

## 🎯 Key Findings

### Main problems
1. **Position-indexing error in eval_model.py**: the root cause of the severely overestimated loss
2. **Misuse of the logits_to_keep argument**: changed how the model computes its outputs
3. **Ignoring the position offset**: the Transformer's shift-by-one property was not accounted for

### Technical insights
1. **Transformer position property**: the logits at position i predict the token at position i+1
2. **Amplification of small differences**: even tiny logits differences are significantly amplified by cross-entropy
3. **The importance of the evaluation system**: a broken evaluation misleads the whole research direction

### Outcomes of the fix
1. **Training/inference consistency**: ✅ excellent (difference < 10%)
2. **Evaluation reliability**: ✅ much more trustworthy after the fix
3. **Technical foundation**: ✅ provides a reliable baseline for subsequent experiments

---

## 🔮 Follow-on Impact

### Immediate impact
- **Experiment 1.4.0 results corrected**: inference loss revised from 12.34 to 2.26
- **Model performance re-assessed**: the model_original baseline performs well
- **Evaluation tooling**: the fixed eval_model.py can be used for subsequent experiments

### Long-term impact
- **Research direction**: confirms the current training approach is effective
- **Technical standards**: establishes a correct model-evaluation procedure
- **Project confidence**: a solid foundation for the KnowledgeDataset research

---

## 📝 Lessons Learned

### Technical
1. **Systematic debugging matters**: eliminate hypotheses step by step to find the root cause
2. **Position-indexing details**: a key technical point in Transformer evaluation
3. **Verification is necessary**: evaluation tools must themselves be validated

### Methodological
1. **Analyze from multiple angles**: data, model, and computation
2. **Controlled comparisons**: contrasting methods pinpoints the source of a discrepancy
3. **Deep understanding**: understanding the underlying mechanism beats surface-level patching

### Quality control
1. **Validate evaluation tools**: verify correctness before relying on them
2. **Consistency checks**: training/inference consistency is an important signal
3. **Documentation**: record the discovery and the fix in detail

---

## ✅ Conclusion

**Problem resolution**: ✅ fully resolved
**Root cause**: position-indexing error in eval_model.py
**Effect of the fix**: inference loss dropped from 12.34 to 2.26, an 81.7% improvement
**Impact**: major positive impact; establishes a reliable foundation for the project

**Final state**: the training loss (2.43) and inference loss (2.26) are highly consistent, showing the model trained successfully and the evaluation system is reliable.

---

**Report completed**: 2025-07-31
**Validation status**: ✅ verified on 10 independent samples
**Application status**: ✅ applied to the Experiment 1.4.0 analysis update