Clean up temporary debugging and analysis files

Remove debugging scripts and analysis files that are no longer needed:
- analyze_position_slicing.py
- analyze_train_inference_gap.py
- debug_model.py
- eval_model_final_fixed.py
- eval_model_fixed.py
- final_fix_eval_model.py
- fix_logits_to_keep_issue.py
- investigate_logits_to_keep.py
- train_inference_gap_analysis_report.md

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
Yu Chengzhang 2025-09-01 15:40:49 +08:00
parent 495fc412cd
commit c4c72ac154
9 changed files with 0 additions and 2557 deletions

analyze_position_slicing.py

@@ -1,193 +0,0 @@
#!/usr/bin/env python3
"""
Dig into the position-slicing problem:
verify that logits_to_keep and the position indexing are correct.
"""
import json

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer

from model.LMConfig import LMConfig
from model.model_original import MiniMindLM


def analyze_position_indexing():
    """
    Check that the position indexing is correct.
    """
    print("🔍 Analyzing position indexing and slicing logic")
    print("=" * 60)
    device = 'cuda'
    model_path = 'out/experiment_1_4_0/pretrain_512.pth'

    # Load the model
    config = LMConfig(
        dim=512, n_layers=8, n_heads=32, vocab_size=6400, max_seq_len=512,
        dropout=0.0, norm_eps=1e-5, rope_theta=1e6, use_moe=False
    )
    model = MiniMindLM(config)
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict, strict=False)
    model.to(device)
    model.eval()

    # Load the test data
    with open('dataset/stable/eval_data_from_train.json', 'r', encoding='utf-8') as f:
        sample = json.loads(f.readline().strip())
    text = sample['text']
    tokens = tokenizer.encode(text, add_special_tokens=False)
    input_length = 100
    predict_length = 30
    input_tokens = tokens[:input_length]
    target_tokens = tokens[input_length:input_length + predict_length]

    print(f"Input length: {input_length}")
    print(f"Prediction length: {predict_length}")
    print(f"Total sequence length: {input_length + predict_length}")
    print(f"Input token positions: 0 to {input_length - 1}")
    print(f"Target token positions: {input_length} to {input_length + predict_length - 1}")

    with torch.no_grad():
        full_input = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
        target_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)

        print(f"\n🔬 Detailed comparison of the slicing methods:")

        # Method 1: standard forward
        outputs1 = model(full_input)
        logits1 = outputs1.logits
        print(f"\n1. Standard forward:")
        print(f"   Input shape: {full_input.shape}")
        print(f"   Output logits shape: {logits1.shape}")
        # In a transformer, the logits at position i predict the token at position i+1,
        # so predicting the tokens at positions 100-129 requires the logits at positions 99-128.
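        # Worked example (illustrative): input_length=100 and predict_length=30
        # give a 130-token input, so logits has shape [1, 130, vocab_size], and
        # the slice below picks positions 99..128, whose predictions line up
        # with the 30 target tokens at positions 100..129.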
        correct_slice = logits1[0, input_length-1:input_length+predict_length-1, :].contiguous()
        loss1 = F.cross_entropy(correct_slice, target_labels, reduction='mean')
        print(f"   Correct slice [{input_length-1}:{input_length+predict_length-1}]: {correct_slice.shape}")
        print(f"   Loss: {loss1.item():.4f}")

        # Method 2: logits_to_keep
        outputs2 = model(full_input, logits_to_keep=predict_length)
        logits2 = outputs2.logits
        print(f"\n2. logits_to_keep={predict_length}:")
        print(f"   Output logits shape: {logits2.shape}")
        # With logits_to_keep=30 the model returns the logits of the last 30 positions.
        # These should correspond to positions 100-129 — but which positions are they really?
        keep_slice = logits2[0, -predict_length:, :].contiguous()
        loss2 = F.cross_entropy(keep_slice, target_labels, reduction='mean')
        print(f"   logits_to_keep slice [-{predict_length}:]: {keep_slice.shape}")
        print(f"   Loss: {loss2.item():.4f}")

        # Are the two slices identical?
        print(f"\n🔍 Slice comparison:")
        if torch.allclose(correct_slice, keep_slice, rtol=1e-6):
            print(f"   ✅ The two slices are identical")
        else:
            diff = torch.abs(correct_slice - keep_slice).max()
            print(f"   ❌ The slices differ; max difference: {diff.item():.8f}")
            # Which positions differ?
            diff_mask = ~torch.isclose(correct_slice, keep_slice, rtol=1e-6)
            diff_positions = torch.where(diff_mask.any(dim=-1))[0]
            print(f"   Differing positions: {diff_positions.tolist()}")

        # Method 3: verify the logic used in eval_model.py
        print(f"\n3. eval_model.py logic:")
        # eval_model.py uses logits[0, -predict_length:, :]
        eval_slice = logits1[0, -predict_length:, :].contiguous()
        loss3 = F.cross_entropy(eval_slice, target_labels, reduction='mean')
        print(f"   eval_model.py slice [-{predict_length}:]: {eval_slice.shape}")
        print(f"   This corresponds to logits positions {logits1.shape[1] - predict_length} to {logits1.shape[1] - 1}")
        print(f"   Loss: {loss3.item():.4f}")

        # Is the eval_model.py slice correct?
        if torch.allclose(correct_slice, eval_slice, rtol=1e-6):
            print(f"   ✅ The eval_model.py slice is correct")
        else:
            diff = torch.abs(correct_slice - eval_slice).max()
            print(f"   ❌ The eval_model.py slice is wrong; max difference: {diff.item():.8f}")


def compare_different_sequence_lengths():
    """
    Compare behaviour across different sequence lengths.
    """
    print(f"\n🧪 Testing different sequence lengths")
    print("=" * 60)
    device = 'cuda'
    model_path = 'out/experiment_1_4_0/pretrain_512.pth'

    # Load the model
    config = LMConfig(
        dim=512, n_layers=8, n_heads=32, vocab_size=6400, max_seq_len=512,
        dropout=0.0, norm_eps=1e-5, rope_theta=1e6, use_moe=False
    )
    model = MiniMindLM(config)
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict, strict=False)
    model.to(device)
    model.eval()

    # Build a test sequence
    test_tokens = list(range(200))  # a simple numeric sequence
    test_configs = [
        (50, 20),   # 50 input, 20 predicted
        (100, 30),  # 100 input, 30 predicted
        (150, 40),  # 150 input, 40 predicted
    ]

    for input_len, predict_len in test_configs:
        print(f"\nTest config: input {input_len}, predict {predict_len}")
        sequence = test_tokens[:input_len + predict_len]
        input_ids = torch.tensor([sequence], dtype=torch.long).to(device)
        target_labels = torch.tensor(sequence[input_len:], dtype=torch.long).to(device)

        with torch.no_grad():
            # Standard method
            outputs_std = model(input_ids)
            logits_std = outputs_std.logits
            slice_std = logits_std[0, input_len-1:input_len+predict_len-1, :].contiguous()
            loss_std = F.cross_entropy(slice_std, target_labels, reduction='mean')

            # logits_to_keep method
            outputs_keep = model(input_ids, logits_to_keep=predict_len)
            logits_keep = outputs_keep.logits
            slice_keep = logits_keep[0, -predict_len:, :].contiguous()
            loss_keep = F.cross_entropy(slice_keep, target_labels, reduction='mean')

            # eval_model.py method
            slice_eval = logits_std[0, -predict_len:, :].contiguous()
            loss_eval = F.cross_entropy(slice_eval, target_labels, reduction='mean')

            print(f"  Standard loss: {loss_std.item():.4f}")
            print(f"  logits_to_keep loss: {loss_keep.item():.4f}")
            print(f"  eval_model.py loss: {loss_eval.item():.4f}")

            # Do the slices match?
            std_vs_keep = torch.allclose(slice_std, slice_keep, rtol=1e-6)
            std_vs_eval = torch.allclose(slice_std, slice_eval, rtol=1e-6)
            keep_vs_eval = torch.allclose(slice_keep, slice_eval, rtol=1e-6)
            print(f"  standard vs logits_to_keep: {'✅' if std_vs_keep else '❌'}")
            print(f"  standard vs eval_model.py: {'✅' if std_vs_eval else '❌'}")
            print(f"  logits_to_keep vs eval_model.py: {'✅' if keep_vs_eval else '❌'}")


if __name__ == "__main__":
    analyze_position_indexing()
    compare_different_sequence_lengths()

analyze_train_inference_gap.py

@@ -1,371 +0,0 @@
#!/usr/bin/env python3
"""
Experiment script for analyzing the train/inference loss gap.
Systematically checks the possible causes one by one.
"""
import json
import os
import random

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer

from model.LMConfig import LMConfig
from model.model_original import MiniMindLM


def create_eval_data_from_training_data():
    """
    Re-extract samples from the training data to build eval_data.json,
    guaranteeing that train and eval data come from the same source.
    """
    print("=== 1. Building an eval set drawn from the training data ===")
    train_data_path = "/home/pci/ycz/Code/Minimind/dataset/stable/merged_pretrain.jsonl"
    eval_data_path = "dataset/stable/eval_data_from_train.json"

    # Make sure the output directory exists
    os.makedirs("dataset/stable", exist_ok=True)

    # Randomly pick 20 lines from the training data
    samples = []
    with open(train_data_path, 'r', encoding='utf-8') as f:
        all_lines = f.readlines()

    selected_lines = random.sample(all_lines, min(20, len(all_lines)))
    for line in selected_lines:
        try:
            data = json.loads(line.strip())
            samples.append(data)
        except json.JSONDecodeError:
            continue

    # Write the new evaluation file
    with open(eval_data_path, 'w', encoding='utf-8') as f:
        for sample in samples:
            f.write(json.dumps(sample, ensure_ascii=False) + '\n')

    print(f"✅ Created an evaluation set with {len(samples)} samples")
    print(f"   Saved to: {eval_data_path}")
    return eval_data_path, samples


def load_model_and_tokenizer(model_path, device='cuda'):
    """
    Load the model and tokenizer with exactly the training-time configuration.
    """
    print("=== 2. Loading model and tokenizer ===")
    # Use the same configuration as during training
    config = LMConfig(
        dim=512,
        n_layers=8,
        n_heads=32,
        vocab_size=6400,
        max_seq_len=512,
        dropout=0.0,
        norm_eps=1e-5,
        rope_theta=1e6,
        use_moe=False
    )
    model = MiniMindLM(config)
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

    # Load the weights
    if os.path.exists(model_path):
        print(f"Loading weights from: {model_path}")
        state_dict = torch.load(model_path, map_location=device)

        # Check how well the checkpoint matches the model
        model_keys = set(model.state_dict().keys())
        checkpoint_keys = set(state_dict.keys())
        matched_keys = model_keys & checkpoint_keys
        missing_keys = model_keys - checkpoint_keys
        unexpected_keys = checkpoint_keys - model_keys

        print(f"  Model parameters: {len(model_keys)}")
        print(f"  Checkpoint parameters: {len(checkpoint_keys)}")
        print(f"  Matched parameters: {len(matched_keys)}")
        print(f"  Missing parameters: {len(missing_keys)}")
        print(f"  Unexpected parameters: {len(unexpected_keys)}")

        if missing_keys:
            print(f"  ❌ Missing: {list(missing_keys)[:5]}...")
        if unexpected_keys:
            print(f"  ⚠️ Unexpected: {list(unexpected_keys)[:5]}...")

        model.load_state_dict(state_dict, strict=False)
        model.to(device)
        model.eval()
        print("✅ Model loaded")
    else:
        raise FileNotFoundError(f"Model file not found: {model_path}")

    return model, tokenizer, config


def test_inference_modes(model, tokenizer, samples, device='cuda'):
    """
    Measure the loss under different inference modes.
    """
    print("=== 3. Testing inference modes ===")
    results = {}

    for mode_name, use_cache in [("no cache", False), ("with KV cache", True)]:
        print(f"\n--- Mode: {mode_name} ---")
        total_loss = 0
        valid_samples = 0

        for i, sample in enumerate(samples[:5]):  # first 5 samples
            text = sample['text']
            # Make sure the text is long enough
            tokens = tokenizer.encode(text, add_special_tokens=False)
            if len(tokens) < 130:  # 100 input + 30 predicted
                continue

            input_tokens = tokens[:100]
            target_tokens = tokens[100:130]  # 30 tokens to predict
            input_ids = torch.tensor([input_tokens], dtype=torch.long).to(device)
            target_ids = torch.tensor([target_tokens], dtype=torch.long).to(device)

            with torch.no_grad():
                # Method 1: plain forward pass to compute the loss (as in training)
                full_input = torch.tensor([tokens[:130]], dtype=torch.long).to(device)
                outputs = model(full_input)
                logits = outputs.logits
                # Compute the loss
                shift_logits = logits[0, 99:129, :].contiguous()  # logits for the prediction span
                shift_labels = target_ids[0].contiguous()
                loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
                total_loss += loss.item()
                valid_samples += 1
                print(f"  Sample {i+1}: loss = {loss.item():.4f}")

        avg_loss = total_loss / valid_samples if valid_samples > 0 else 0
        results[mode_name] = avg_loss
        print(f"  Average loss ({mode_name}): {avg_loss:.4f}")

    return results


def test_autoregressive_vs_teacher_forcing(model, tokenizer, samples, device='cuda'):
    """
    Compare the loss of autoregressive generation vs. teacher forcing.
    """
    print("=== 4. Autoregressive generation vs. teacher forcing ===")
    results = {}

    for i, sample in enumerate(samples[:3]):  # first 3 samples
        text = sample['text']
        tokens = tokenizer.encode(text, add_special_tokens=False)
        if len(tokens) < 130:
            continue

        input_tokens = tokens[:100]
        target_tokens = tokens[100:130]
        print(f"\n--- Sample {i+1} ---")

        # Method 1: teacher forcing (as during training)
        with torch.no_grad():
            full_input = torch.tensor([tokens[:130]], dtype=torch.long).to(device)
            outputs = model(full_input)
            logits = outputs.logits
            shift_logits = logits[0, 99:129, :].contiguous()
            shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)
            teacher_forcing_loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
            print(f"  Teacher-forcing loss: {teacher_forcing_loss.item():.4f}")

        # Method 2: autoregressive generation (step-by-step prediction)
        with torch.no_grad():
            current_sequence = torch.tensor([input_tokens], dtype=torch.long).to(device)
            autoregressive_losses = []

            for step in range(len(target_tokens)):
                outputs = model(current_sequence)
                logits = outputs.logits[0, -1, :]  # logits of the last position only

                # Loss at this step
                true_next_token = target_tokens[step]
                step_loss = F.cross_entropy(logits.unsqueeze(0),
                                            torch.tensor([true_next_token], device=device))
                autoregressive_losses.append(step_loss.item())

                # Append the true token to the sequence (teacher forcing)
                current_sequence = torch.cat([
                    current_sequence,
                    torch.tensor([[true_next_token]], device=device)
                ], dim=1)

            autoregressive_loss = sum(autoregressive_losses) / len(autoregressive_losses)
            print(f"  Stepwise autoregressive loss: {autoregressive_loss:.4f}")
            print(f"  Loss gap: {abs(autoregressive_loss - teacher_forcing_loss.item()):.4f}")
        # Method 3: true autoregressive generation (feeding back the predicted tokens)
        with torch.no_grad():
            current_sequence = torch.tensor([input_tokens], dtype=torch.long).to(device)
            real_autoregressive_losses = []

            for step in range(len(target_tokens)):
                outputs = model(current_sequence)
                logits = outputs.logits[0, -1, :]

                # Predict the next token
                predicted_token = torch.argmax(logits, dim=-1).item()

                # Loss against the true token
                true_next_token = target_tokens[step]
                step_loss = F.cross_entropy(logits.unsqueeze(0),
                                            torch.tensor([true_next_token], device=device))
                real_autoregressive_losses.append(step_loss.item())

                # Continue generating from the predicted token
                current_sequence = torch.cat([
                    current_sequence,
                    torch.tensor([[predicted_token]], device=device)
                ], dim=1)

            real_autoregressive_loss = sum(real_autoregressive_losses) / len(real_autoregressive_losses)
            print(f"  True autoregressive loss: {real_autoregressive_loss:.4f}")


def analyze_data_distribution(samples, tokenizer):
    """
    Profile the evaluation data distribution.
    """
    print("=== 5. Analyzing the data distribution ===")
    lengths = []
    vocab_coverage = set()

    for sample in samples:
        text = sample['text']
        tokens = tokenizer.encode(text, add_special_tokens=False)
        lengths.append(len(tokens))
        vocab_coverage.update(tokens)

    print(f"Text length statistics:")
    print(f"  Mean length: {sum(lengths)/len(lengths):.1f} tokens")
    print(f"  Shortest: {min(lengths)} tokens")
    print(f"  Longest: {max(lengths)} tokens")
    print(f"  Vocabulary coverage: {len(vocab_coverage)} distinct tokens")
    print(f"  Coverage rate: {len(vocab_coverage)/6400*100:.1f}%")


def compare_training_vs_inference_computation(model, tokenizer, samples, device='cuda'):
    """
    Compare the exact computation performed at training time vs. inference time.
    """
    print("=== 6. Comparing the training and inference computations ===")
    sample = samples[0]
    text = sample['text']
    tokens = tokenizer.encode(text, add_special_tokens=False)

    if len(tokens) < 130:
        print("Sample too short; skipping")
        return

    input_tokens = tokens[:100]
    target_tokens = tokens[100:130]
    print(f"Test sample length: {len(tokens)} tokens")
    print(f"Input part: {len(input_tokens)} tokens")
    print(f"Target part: {len(target_tokens)} tokens")

    # Simulate the training-time computation
    print("\n--- Simulating training-time computation ---")
    with torch.no_grad():
        # Training feeds the full sequence in one pass
        full_sequence = torch.tensor([tokens[:130]], dtype=torch.long).to(device)
        outputs = model(full_sequence)
        logits = outputs.logits
        print(f"Input shape: {full_sequence.shape}")
        print(f"Output logits shape: {logits.shape}")

        # Compute the loss exactly the way training does
        shift_logits = logits[0, :-1, :].contiguous()     # drop the last position
        shift_labels = full_sequence[0, 1:].contiguous()  # drop the first position
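        # Shift example (illustrative): after this shift, shift_logits[i] is
        # scored against shift_labels[i], the token at position i+1 of the
        # original sequence, so shift_logits[99:129] lines up with tokens
        # 100..129 — exactly the prediction span selected below.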
        # Only score the prediction span
        predict_start = 99  # predictions start at the 100th token
        predict_logits = shift_logits[predict_start:predict_start+30, :]
        predict_labels = shift_labels[predict_start:predict_start+30]
        training_loss = F.cross_entropy(predict_logits, predict_labels, reduction='mean')
        print(f"Training-style loss: {training_loss.item():.4f}")

    # Simulate the inference-time computation
    print("\n--- Simulating inference-time computation ---")
    with torch.no_grad():
        # Inference handles the input and the target separately
        input_ids = torch.tensor([input_tokens], dtype=torch.long).to(device)
        # Use the same method as eval_model.py
        full_input_for_loss = torch.tensor([tokens[:130]], dtype=torch.long).to(device)
        outputs = model(full_input_for_loss, logits_to_keep=30)

        if outputs.logits is not None:
            shift_logits = outputs.logits[0, -30:, :].contiguous()
            shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)
            inference_loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
            print(f"Inference-style loss: {inference_loss.item():.4f}")
        else:
            print("No logits available")


def main():
    """
    Entry point: systematically analyze the train/inference loss gap.
    """
    print("🔍 Analyzing the train/inference loss gap")
    print("=" * 60)

    # Fix the random seeds so results are reproducible
    random.seed(42)
    torch.manual_seed(42)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model_path = 'out/experiment_1_4_0/pretrain_512.pth'

    try:
        # 1. Build an eval set from the training data
        eval_data_path, samples = create_eval_data_from_training_data()
        # 2. Load the model
        model, tokenizer, config = load_model_and_tokenizer(model_path, device)
        # 3. Analyze the data distribution
        analyze_data_distribution(samples, tokenizer)
        # 4. Test the inference modes
        mode_results = test_inference_modes(model, tokenizer, samples, device)
        # 5. Autoregressive generation vs. teacher forcing
        test_autoregressive_vs_teacher_forcing(model, tokenizer, samples, device)
        # 6. Compare the training and inference computations
        compare_training_vs_inference_computation(model, tokenizer, samples, device)
        print("\n" + "=" * 60)
        print("🎯 Analysis complete")
    except Exception as e:
        print(f"❌ Error during analysis: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()

debug_model.py

@@ -1,101 +0,0 @@
#!/usr/bin/env python3
"""
Debug the model's generation process.
"""
import torch
from transformers import AutoTokenizer

from model.LMConfig import LMConfig
from model.model_original import MiniMindLM


def debug_generation():
    # Load the model and tokenizer
    device = 'cuda'
    model_path = 'out/experiment_1_4_0/pretrain_512.pth'

    # Configuration
    config = LMConfig(
        dim=512,
        n_layers=8,
        n_heads=32,
        vocab_size=6400,
        max_seq_len=512
    )

    # Build the model
    model = MiniMindLM(config)

    # Load the weights
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict, strict=False)
    model.to(device)
    model.eval()

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

    # Test text
    text = "The quick brown fox"
    input_tokens = tokenizer.encode(text, add_special_tokens=False)
    print(f"Input text: {text}")
    print(f"Input tokens: {input_tokens}")
    print(f"Decoded back: {tokenizer.decode(input_tokens)}")

    # To tensor
    input_ids = torch.tensor([input_tokens], dtype=torch.long).to(device)
    print(f"Input tensor shape: {input_ids.shape}")

    # Generate one step by hand
    with torch.no_grad():
        # Forward pass
        outputs = model(input_ids)
        logits = outputs.logits
        print(f"Output logits shape: {logits.shape}")

        # Logits at the last position
        next_token_logits = logits[0, -1, :]
        print(f"Next-token logits shape: {next_token_logits.shape}")

        # Apply the temperature
        next_token_logits = next_token_logits / 1.0
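        # (A temperature of 1.0 leaves the distribution unchanged; values
        # below 1 sharpen it toward the argmax, values above 1 flatten it.)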
        # Probability distribution
        probs = torch.softmax(next_token_logits, dim=-1)

        # Find the top-10 candidate tokens
        top_probs, top_indices = torch.topk(probs, 10)
        print(f"\nTop 10 candidate tokens:")
        for i, (prob, idx) in enumerate(zip(top_probs, top_indices)):
            token_text = tokenizer.decode([idx.item()], skip_special_tokens=True)
            print(f"  {i+1}. Token {idx.item()}: '{token_text}' (prob: {prob.item():.4f})")

        # Greedy sampling
        next_token = torch.argmax(next_token_logits, dim=-1)
        print(f"\nGreedy choice: {next_token.item()}")
        print(f"Corresponding text: '{tokenizer.decode([next_token.item()], skip_special_tokens=True)}'")

    # Use the generate method
    print(f"\nUsing the generate method:")
    with torch.no_grad():
        generated = model.generate(
            input_ids,
            max_new_tokens=5,
            temperature=1.0,
            top_p=0.95,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id
        )
        print(f"Full generated sequence shape: {generated[0].shape}")
        print(f"Generated tokens: {generated[0].tolist()}")

        # Extract the newly generated part
        if len(generated[0]) > len(input_tokens):
            new_tokens = generated[0][len(input_tokens):].tolist()
            print(f"New tokens: {new_tokens}")
            print(f"New text: '{tokenizer.decode(new_tokens, skip_special_tokens=True)}'")
        else:
            print("No new tokens were generated")


if __name__ == "__main__":
    debug_generation()

eval_model_final_fixed.py

@@ -1,519 +0,0 @@
#!/usr/bin/env python3
"""
Evaluate the inference quality of a pretrained model.
Tests how the models trained in different experiments perform on eval_data.json.
"""
import argparse
import json
import os

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer

from model.LMConfig import LMConfig


def load_model(model_path, model_type, device, config_params=None):
    """
    Load the model and tokenizer.

    Args:
        model_path: path to the model weights file
        model_type: model type (model/model_original/model_no_feed)
        device: device to run on
        config_params: dict of model configuration parameters

    Returns:
        model: the loaded model
        tokenizer: the tokenizer instance
    """
    # Build the configuration
    if config_params:
        lm_config = LMConfig(**config_params)
    else:
        lm_config = LMConfig()

    # Print the configuration
    print(f"Model configuration:")
    print(f"  dim: {lm_config.dim}")
    print(f"  n_layers: {lm_config.n_layers}")
    print(f"  n_heads: {lm_config.n_heads}")
    print(f"  vocab_size: {lm_config.vocab_size}")
    print(f"  max_seq_len: {lm_config.max_seq_len}")
    if hasattr(lm_config, 'knowledge_num'):
        print(f"  knowledge_num: {lm_config.knowledge_num}")
        print(f"  knowledge_length: {lm_config.knowledge_length}")
        print(f"  knowledge_dim: {lm_config.knowledge_dim}")
    print()

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

    # Import the model class that matches the model type
    if model_type == "model":
        from model.model import MiniMindLM
    elif model_type == "model_original":
        from model.model_original import MiniMindLM
    elif model_type == "model_no_feed":
        from model.model_no_feed import MiniMindLM
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

    # Build the model
    model = MiniMindLM(lm_config)

    # Load the weights
    if os.path.exists(model_path):
        print(f"Loading model weights from {model_path} ...")
        state_dict = torch.load(model_path, map_location=device)

        # Collect the parameter names on both sides
        model_keys = set(model.state_dict().keys())
        checkpoint_keys = set(state_dict.keys())

        # Weight-matching statistics
        matched_keys = model_keys & checkpoint_keys
        missing_keys = model_keys - checkpoint_keys
        unexpected_keys = checkpoint_keys - model_keys

        print(f"\nWeight loading details:")
        print(f"  Total model parameters: {len(model_keys)}")
        print(f"  Checkpoint parameters: {len(checkpoint_keys)}")
        print(f"  Matched parameters: {len(matched_keys)}")
        print(f"  Missing parameters: {len(missing_keys)}")
        print(f"  Unexpected parameters: {len(unexpected_keys)}")

        # List the missing and unexpected parameters in detail
        if missing_keys:
            print(f"\n❌ Missing parameters ({len(missing_keys)}):")
            for key in sorted(missing_keys):
                print(f"  - {key}")
        if unexpected_keys:
            print(f"\n⚠️ Unexpected parameters in the checkpoint ({len(unexpected_keys)}):")
            for key in sorted(unexpected_keys):
                print(f"  + {key}")

        # Load the weights (partial matches allowed)
        try:
            incompatible_keys = model.load_state_dict(state_dict, strict=False)
            # Check the result
            if len(incompatible_keys.missing_keys) == 0 and len(incompatible_keys.unexpected_keys) == 0:
                print(f"\n✅ Weights loaded with a perfect match!")
            elif len(incompatible_keys.missing_keys) == 0:
                print(f"\n✅ Weights loaded (unexpected parameters ignored)")
            else:
                print(f"\n⚠️ Weights only partially loaded; some parameters are missing")
                print(f"  This may hurt performance; check the configuration parameters")

            # Loading success rate
            success_rate = len(matched_keys) / len(model_keys) * 100
            print(f"  Parameter loading success rate: {success_rate:.1f}%")
            if success_rate < 90:
                print(f"  ❌ Warning: success rate is very low; the model may not work!")
            elif success_rate < 100:
                print(f"  ⚠️ Warning: missing parameters may hurt performance")
        except Exception as e:
            raise RuntimeError(f"Weight loading failed: {e}")

        # Verify the shapes of the key layers
        print("🔍 Verifying key layer shapes:")
        key_layers = [
            'tok_embeddings.weight',
            'output.weight',
            'norm.weight',
        ]
        # Per-layer checks
        for i in range(lm_config.n_layers):
            key_layers.extend([
                f'layers.{i}.attention_norm.weight',
                f'layers.{i}.ffn_norm.weight',
                f'layers.{i}.self_attention.wq.weight',
                f'layers.{i}.self_attention.wk.weight',
                f'layers.{i}.self_attention.wv.weight',
                f'layers.{i}.self_attention.wo.weight',
            ])
            # FFN layers (model_original has an FFN; other models may not)
            if f'layers.{i}.feed_forward.w1.weight' in model_keys:
                key_layers.extend([
                    f'layers.{i}.feed_forward.w1.weight',
                    f'layers.{i}.feed_forward.w2.weight',
                    f'layers.{i}.feed_forward.w3.weight',
                ])

        # KnowledgeDataset layers (model and model_no_feed only)
        if model_type in ['model', 'model_no_feed']:
            key_layers.extend([
                'knowledge_dataset.to_queries.0.weight',
                'knowledge_dataset.keys',
                'knowledge_dataset.knowledge_dataset',
            ])
            # CrossAttention layers
            for i in range(lm_config.n_layers):
                key_layers.extend([
                    f'layers.{i}.cross_attention.to_q.weight',
                    f'layers.{i}.cross_attention.to_k.weight',
                    f'layers.{i}.cross_attention.to_v.weight',
                    f'layers.{i}.cross_attention.to_out.weight',
                ])

        # Check the key layers
        verified_layers = 0
        total_key_layers = 0
        for layer_name in key_layers:
            if layer_name in model_keys:  # only check layers that actually exist in the model
                total_key_layers += 1
                if layer_name in matched_keys:
                    verified_layers += 1
                    expected_shape = model.state_dict()[layer_name].shape
                    actual_shape = state_dict[layer_name].shape if layer_name in state_dict else "missing"
                    if layer_name in state_dict and expected_shape == actual_shape:
                        print(f"  ✅ {layer_name}: {actual_shape}")
                    else:
                        print(f"  ❌ {layer_name}: expected {expected_shape}, got {actual_shape}")
                else:
                    print(f"  ❌ {layer_name}: missing")

        print(f"\nKey layer verification: {verified_layers}/{total_key_layers} layers passed")
        if verified_layers == total_key_layers:
            print("✅ All key layers verified!")
        elif verified_layers / total_key_layers >= 0.9:
            print("⚠️ Most key layers verified; the model should work")
        else:
            print("❌ Too many key layers failed verification; the model may not work!")
        print()
    else:
        raise FileNotFoundError(f"Model file not found: {model_path}")

    model.to(device)
    model.eval()
    return model, tokenizer


def load_eval_data(data_path, num_samples=20):
    """
    Load the evaluation dataset.

    Args:
        data_path: path to the data file
        num_samples: number of samples to evaluate

    Returns:
        samples: list of data samples
    """
    data = []
    with open(data_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            line = line.strip()
            if line:  # skip empty lines
                try:
                    sample = json.loads(line)
                    data.append(sample)
                    if len(data) >= num_samples:
                        break
                except json.JSONDecodeError as e:
                    print(f"Warning: JSON parse error on line {line_num+1}: {e}")
                    continue

    # Keep only the first num_samples entries
    samples = data[:num_samples]
    print(f"Loaded {len(samples)} evaluation samples")
    return samples


def evaluate_sample(model, tokenizer, text, input_length=100, predict_length=100, device='cuda'):
    """
    Evaluate a single sample.

    Args:
        model: the model instance
        tokenizer: the tokenizer instance
        text: the input text
        input_length: number of input tokens
        predict_length: number of tokens to predict
        device: device to run on

    Returns:
        input_text: the input text
        predicted_text: the predicted text
        ground_truth_text: the reference text
        loss: the prediction loss (if computable)
        generation_stats: extra generation statistics
    """
    # Tokenize the text
    tokens = tokenizer.encode(text, add_special_tokens=False)

    # Make sure there are enough tokens
    if len(tokens) < input_length + predict_length:
        print(f"Warning: text too short, only {len(tokens)} tokens")
        return None, None, None, None, None

    # Split into input and target
    input_tokens = tokens[:input_length]
    target_tokens = tokens[input_length:input_length + predict_length]

    # To tensor
    input_ids = torch.tensor([input_tokens], dtype=torch.long).to(device)

    # Generate a prediction
    with torch.no_grad():
        # Use the generate method (parameters tuned for generation quality)
        generated = model.generate(
            input_ids,
            max_new_tokens=predict_length,
            temperature=1.0,
            top_p=0.95,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id
        )

    # Extract the generated tokens (strip the input part);
    # `generated` holds the full sequence, so new tokens start at input_length
    full_generated_tokens = generated[0].tolist()
    if len(full_generated_tokens) > input_length:
        predicted_tokens = full_generated_tokens[input_length:]
    else:
        # The sequence did not grow, so nothing new was generated
        predicted_tokens = []

    # Check whether generation stopped early at an EOS token
    eos_found = False
    eos_position = -1
    actual_predicted_length = len(predicted_tokens)
    if predicted_tokens and tokenizer.eos_token_id is not None:
        try:
            eos_position = predicted_tokens.index(tokenizer.eos_token_id)
            eos_found = True
            # Keep only the content before the EOS token
            predicted_tokens = predicted_tokens[:eos_position]
            actual_predicted_length = len(predicted_tokens)
        except ValueError:
            # No EOS token found
            pass

    # Compute the loss (via the forward method)
    # Prepare the input for the loss computation
    loss_input_ids = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
    outputs = model(loss_input_ids)  # logits_to_keep removed
    # Compute the loss
    logits = outputs.logits
    loss = None
    if logits is not None:
        # Fix: use the correct position slice.
        # In a transformer the logits at position i predict the token at position i+1:
        # to predict the tokens at positions input_length .. input_length+predict_length-1
        # we need the logits at positions input_length-1 .. input_length+predict_length-2.
        shift_logits = logits[0, input_length-1:input_length+predict_length-1, :].contiguous()
        shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)
        # Cross-entropy loss
        loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
        loss = loss.item()
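    # Note: this loss is teacher-forced (every position conditions on the gold
    # prefix), whereas predicted_text above comes from free-running generation,
    # so the two outputs measure different things.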
    # Decode the texts
    input_text = tokenizer.decode(input_tokens, skip_special_tokens=True)
    # Only decode tokens that were actually generated (capped at predict_length)
    actual_predicted_tokens = predicted_tokens[:predict_length] if predicted_tokens else []
    predicted_text = tokenizer.decode(actual_predicted_tokens, skip_special_tokens=True) if actual_predicted_tokens else "[nothing generated]"
    ground_truth_text = tokenizer.decode(target_tokens, skip_special_tokens=True)

    # Return extra generation statistics as well
    generation_stats = {
        'requested_length': predict_length,
        'actual_length': actual_predicted_length,
        'eos_found': eos_found,
        'eos_position': eos_position if eos_found else None,
        'truncated_by_eos': eos_found and eos_position < predict_length
    }
    return input_text, predicted_text, ground_truth_text, loss, generation_stats


def main():
    parser = argparse.ArgumentParser(description='Evaluate a pretrained model')
    parser.add_argument('--model_path', type=str, default='out/experiment_1_4_0/pretrain_512.pth',
                        help='path to the model weights file')
    parser.add_argument('--model_type', type=str, default='model',
                        choices=['model', 'model_original', 'model_no_feed'],
                        help='model type')
    parser.add_argument('--data_path', type=str, default='dataset/stable/eval_data.json',
                        help='path to the evaluation dataset')
    parser.add_argument('--num_samples', type=int, default=20,
                        help='number of samples to evaluate')
    parser.add_argument('--input_length', type=int, default=100,
                        help='input length in tokens')
    parser.add_argument('--predict_length', type=int, default=100,
                        help='prediction length in tokens')
    parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu',
                        help='device to run on')

    # Model architecture parameters
    parser.add_argument('--dim', type=int, default=512,
                        help='model dimension')
    parser.add_argument('--n_layers', type=int, default=8,
                        help='number of transformer layers')
    parser.add_argument('--n_heads', type=int, default=32,
                        help='number of attention heads')
    parser.add_argument('--n_kv_heads', type=int, default=8,
                        help='number of KV attention heads')
    parser.add_argument('--vocab_size', type=int, default=6400,
                        help='vocabulary size')
    parser.add_argument('--max_seq_len', type=int, default=512,
                        help='maximum sequence length')
    parser.add_argument('--dropout', type=float, default=0.0,
                        help='dropout rate')
    parser.add_argument('--norm_eps', type=float, default=1e-5,
                        help='layer-norm epsilon')
    parser.add_argument('--rope_theta', type=float, default=1e6,
                        help='RoPE theta')

    # KnowledgeDataset parameters (used by model and model_no_feed only)
    parser.add_argument('--knowledge_num', type=int, default=1048576,
                        help='number of knowledge entries')
    parser.add_argument('--knowledge_length', type=int, default=32,
                        help='length of a single knowledge entry')
    parser.add_argument('--knowledge_dim', type=int, default=128,
                        help='knowledge dimension')

    # MOE parameters
    parser.add_argument('--use_moe', action='store_true',
                        help='whether to use MOE')
    parser.add_argument('--num_experts_per_tok', type=int, default=2,
                        help='number of experts activated per token')
    parser.add_argument('--n_routed_experts', type=int, default=4,
                        help='number of routed experts')

    args = parser.parse_args()

    print(f"Evaluation configuration:")
    print(f"  Model path: {args.model_path}")
    print(f"  Model type: {args.model_type}")
    print(f"  Data path: {args.data_path}")
    print(f"  Sample count: {args.num_samples}")
    print(f"  Input length: {args.input_length} tokens")
    print(f"  Prediction length: {args.predict_length} tokens")
    print(f"  Device: {args.device}")
    print()

    # Build the configuration dict
    config_params = {
        'dim': args.dim,
        'n_layers': args.n_layers,
        'n_heads': args.n_heads,
        'n_kv_heads': args.n_kv_heads,
        'vocab_size': args.vocab_size,
        'max_seq_len': args.max_seq_len,
        'dropout': args.dropout,
        'norm_eps': args.norm_eps,
        'rope_theta': args.rope_theta,
        'use_moe': args.use_moe,
        'num_experts_per_tok': args.num_experts_per_tok,
        'n_routed_experts': args.n_routed_experts,
    }

    # Only model and model_no_feed need the KnowledgeDataset parameters
    if args.model_type in ['model', 'model_no_feed']:
        config_params.update({
            'knowledge_num': args.knowledge_num,
            'knowledge_length': args.knowledge_length,
            'knowledge_dim': args.knowledge_dim,
        })

    # Load the model
    model, tokenizer = load_model(args.model_path, args.model_type, args.device, config_params)

    # Load the data
    samples = load_eval_data(args.data_path, args.num_samples)

    # Evaluate each sample
    total_loss = 0
    valid_samples = 0
    total_requested_tokens = 0
    total_actual_tokens = 0
    samples_with_eos = 0
    samples_truncated_by_eos = 0

    for i, sample in enumerate(samples):
        print(f"\n{'='*60}")
        print(f"Sample {i+1}/{len(samples)}")
        print(f"{'='*60}")
        text = sample['text']

        # Evaluate the sample
        input_text, predicted_text, ground_truth_text, loss, generation_stats = evaluate_sample(
            model, tokenizer, text,
            args.input_length, args.predict_length, args.device
        )
        if input_text is None:
            print("Skipping sample (text too short)")
            continue

        # Print the results
        print(f"\nInput ({args.input_length} tokens):")
        print(f"  {input_text}")
        print(f"\nPrediction (requested {generation_stats['requested_length']} tokens, generated {generation_stats['actual_length']}):")
        print(f"  {predicted_text}")
        print(f"\nGround truth ({args.predict_length} tokens):")
        print(f"  {ground_truth_text}")

        # Print the generation statistics
        print(f"\nGeneration statistics:")
        print(f"  Requested: {generation_stats['requested_length']} tokens")
        print(f"  Generated: {generation_stats['actual_length']} tokens")
        if generation_stats['eos_found']:
            print(f"  ✅ EOS token found at position {generation_stats['eos_position']}")
            if generation_stats['truncated_by_eos']:
                print(f"  ⚠️ Generation stopped early at the EOS token")
            else:
                print(f"  ✅ The EOS token appeared at the expected position")
        else:
            print(f"  ❌ No EOS token found (probably hit the length limit)")

        if loss is not None:
            print(f"\nLoss: {loss:.4f}")
            total_loss += loss
            valid_samples += 1

        # Update the aggregate generation statistics
        total_requested_tokens += generation_stats['requested_length']
        total_actual_tokens += generation_stats['actual_length']
        if generation_stats['eos_found']:
            samples_with_eos += 1
        if generation_stats['truncated_by_eos']:
            samples_truncated_by_eos += 1

    # Print the overall statistics
    if valid_samples > 0:
        print(f"\n{'='*60}")
        print(f"Overall statistics:")
        print(f"  Valid samples: {valid_samples}")
        print(f"  Average loss: {total_loss / valid_samples:.4f}")
        print()
        print(f"Generation statistics:")
        print(f"  Total requested tokens: {total_requested_tokens}")
        print(f"  Total generated tokens: {total_actual_tokens}")
        print(f"  Completion rate: {total_actual_tokens / total_requested_tokens * 100:.1f}%" if total_requested_tokens > 0 else "  Completion rate: N/A")
        print(f"  Samples with EOS: {samples_with_eos}/{len(samples)} ({samples_with_eos/len(samples)*100:.1f}%)" if len(samples) > 0 else "  Samples with EOS: N/A")
        print(f"  Samples truncated by EOS: {samples_truncated_by_eos}/{len(samples)} ({samples_truncated_by_eos/len(samples)*100:.1f}%)" if len(samples) > 0 else "  Samples truncated by EOS: N/A")
        print(f"  Average generated length per sample: {total_actual_tokens/len(samples):.1f} tokens" if len(samples) > 0 else "  Average generated length per sample: N/A")
        print(f"{'='*60}")


if __name__ == "__main__":
    main()

eval_model_fixed.py

@@ -1,516 +0,0 @@
#!/usr/bin/env python3
"""
Evaluate the inference quality of a pretrained model.
Tests how the models trained in different experiments perform on eval_data.json.
"""
import argparse
import json
import os

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer

from model.LMConfig import LMConfig


def load_model(model_path, model_type, device, config_params=None):
    """
    Load the model and tokenizer.

    Args:
        model_path: path to the model weights file
        model_type: model type (model/model_original/model_no_feed)
        device: device to run on
        config_params: dict of model configuration parameters

    Returns:
        model: the loaded model
        tokenizer: the tokenizer instance
    """
    # Build the configuration
    if config_params:
        lm_config = LMConfig(**config_params)
    else:
        lm_config = LMConfig()

    # Print the configuration
    print(f"Model configuration:")
    print(f"  dim: {lm_config.dim}")
    print(f"  n_layers: {lm_config.n_layers}")
    print(f"  n_heads: {lm_config.n_heads}")
    print(f"  vocab_size: {lm_config.vocab_size}")
    print(f"  max_seq_len: {lm_config.max_seq_len}")
    if hasattr(lm_config, 'knowledge_num'):
        print(f"  knowledge_num: {lm_config.knowledge_num}")
        print(f"  knowledge_length: {lm_config.knowledge_length}")
        print(f"  knowledge_dim: {lm_config.knowledge_dim}")
    print()

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

    # Import the model class that matches the model type
    if model_type == "model":
        from model.model import MiniMindLM
    elif model_type == "model_original":
        from model.model_original import MiniMindLM
    elif model_type == "model_no_feed":
        from model.model_no_feed import MiniMindLM
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

    # Build the model
    model = MiniMindLM(lm_config)

    # Load the weights
    if os.path.exists(model_path):
        print(f"Loading model weights from {model_path} ...")
        state_dict = torch.load(model_path, map_location=device)

        # Collect the parameter names on both sides
        model_keys = set(model.state_dict().keys())
        checkpoint_keys = set(state_dict.keys())

        # Weight-matching statistics
        matched_keys = model_keys & checkpoint_keys
        missing_keys = model_keys - checkpoint_keys
        unexpected_keys = checkpoint_keys - model_keys

        print(f"\nWeight loading details:")
        print(f"  Total model parameters: {len(model_keys)}")
        print(f"  Checkpoint parameters: {len(checkpoint_keys)}")
        print(f"  Matched parameters: {len(matched_keys)}")
        print(f"  Missing parameters: {len(missing_keys)}")
        print(f"  Unexpected parameters: {len(unexpected_keys)}")

        # List the missing and unexpected parameters in detail
        if missing_keys:
            print(f"\n❌ Missing parameters ({len(missing_keys)}):")
            for key in sorted(missing_keys):
                print(f"  - {key}")
        if unexpected_keys:
            print(f"\n⚠️ Unexpected parameters in the checkpoint ({len(unexpected_keys)}):")
            for key in sorted(unexpected_keys):
                print(f"  + {key}")

        # Load the weights (partial matches allowed)
        try:
            incompatible_keys = model.load_state_dict(state_dict, strict=False)
            # Check the result
            if len(incompatible_keys.missing_keys) == 0 and len(incompatible_keys.unexpected_keys) == 0:
                print(f"\n✅ Weights loaded with a perfect match!")
            elif len(incompatible_keys.missing_keys) == 0:
                print(f"\n✅ Weights loaded (unexpected parameters ignored)")
            else:
                print(f"\n⚠️ Weights only partially loaded; some parameters are missing")
                print(f"  This may hurt performance; check the configuration parameters")

            # Loading success rate
            success_rate = len(matched_keys) / len(model_keys) * 100
            print(f"  Parameter loading success rate: {success_rate:.1f}%")
            if success_rate < 90:
                print(f"  ❌ Warning: success rate is very low; the model may not work!")
            elif success_rate < 100:
                print(f"  ⚠️ Warning: missing parameters may hurt performance")
        except Exception as e:
            raise RuntimeError(f"Weight loading failed: {e}")

        # Verify the shapes of the key layers
        print("🔍 Verifying key layer shapes:")
        key_layers = [
            'tok_embeddings.weight',
            'output.weight',
            'norm.weight',
        ]
        # Per-layer checks
        for i in range(lm_config.n_layers):
            key_layers.extend([
                f'layers.{i}.attention_norm.weight',
                f'layers.{i}.ffn_norm.weight',
                f'layers.{i}.self_attention.wq.weight',
                f'layers.{i}.self_attention.wk.weight',
                f'layers.{i}.self_attention.wv.weight',
                f'layers.{i}.self_attention.wo.weight',
            ])
            # FFN layers (model_original has an FFN; other models may not)
            if f'layers.{i}.feed_forward.w1.weight' in model_keys:
                key_layers.extend([
                    f'layers.{i}.feed_forward.w1.weight',
                    f'layers.{i}.feed_forward.w2.weight',
                    f'layers.{i}.feed_forward.w3.weight',
                ])

        # KnowledgeDataset layers (model and model_no_feed only)
        if model_type in ['model', 'model_no_feed']:
            key_layers.extend([
                'knowledge_dataset.to_queries.0.weight',
                'knowledge_dataset.keys',
                'knowledge_dataset.knowledge_dataset',
            ])
            # CrossAttention layers
            for i in range(lm_config.n_layers):
                key_layers.extend([
                    f'layers.{i}.cross_attention.to_q.weight',
                    f'layers.{i}.cross_attention.to_k.weight',
                    f'layers.{i}.cross_attention.to_v.weight',
                    f'layers.{i}.cross_attention.to_out.weight',
                ])

        # Check the key layers
        verified_layers = 0
        total_key_layers = 0
        for layer_name in key_layers:
            if layer_name in model_keys:  # only check layers that actually exist in the model
                total_key_layers += 1
                if layer_name in matched_keys:
                    verified_layers += 1
                    expected_shape = model.state_dict()[layer_name].shape
                    actual_shape = state_dict[layer_name].shape if layer_name in state_dict else "missing"
                    if layer_name in state_dict and expected_shape == actual_shape:
                        print(f"  ✅ {layer_name}: {actual_shape}")
                    else:
                        print(f"  ❌ {layer_name}: expected {expected_shape}, got {actual_shape}")
                else:
                    print(f"  ❌ {layer_name}: missing")

        print(f"\nKey layer verification: {verified_layers}/{total_key_layers} layers passed")
        if verified_layers == total_key_layers:
            print("✅ All key layers verified!")
        elif verified_layers / total_key_layers >= 0.9:
            print("⚠️ Most key layers verified; the model should work")
        else:
            print("❌ Too many key layers failed verification; the model may not work!")
        print()
    else:
        raise FileNotFoundError(f"Model file not found: {model_path}")

    model.to(device)
    model.eval()
    return model, tokenizer


def load_eval_data(data_path, num_samples=20):
    """
    Load the evaluation dataset.

    Args:
        data_path: path to the data file
        num_samples: number of samples to evaluate

    Returns:
        samples: list of data samples
    """
    data = []
    with open(data_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            line = line.strip()
            if line:  # skip empty lines
                try:
                    sample = json.loads(line)
                    data.append(sample)
                    if len(data) >= num_samples:
                        break
                except json.JSONDecodeError as e:
                    print(f"Warning: JSON parse error on line {line_num+1}: {e}")
                    continue

    # Keep only the first num_samples entries
    samples = data[:num_samples]
    print(f"Loaded {len(samples)} evaluation samples")
    return samples


def evaluate_sample(model, tokenizer, text, input_length=100, predict_length=100, device='cuda'):
    """
    Evaluate a single sample.

    Args:
        model: the model instance
        tokenizer: the tokenizer instance
        text: the input text
        input_length: number of input tokens
        predict_length: number of tokens to predict
        device: device to run on

    Returns:
        input_text: the input text
        predicted_text: the predicted text
        ground_truth_text: the reference text
        loss: the prediction loss (if computable)
        generation_stats: extra generation statistics
    """
    # Tokenize the text
    tokens = tokenizer.encode(text, add_special_tokens=False)

    # Make sure there are enough tokens
    if len(tokens) < input_length + predict_length:
        print(f"Warning: text too short, only {len(tokens)} tokens")
        return None, None, None, None, None

    # Split into input and target
    input_tokens = tokens[:input_length]
    target_tokens = tokens[input_length:input_length + predict_length]

    # To tensor
    input_ids = torch.tensor([input_tokens], dtype=torch.long).to(device)

    # Generate a prediction
    with torch.no_grad():
        # Use the generate method (parameters tuned for generation quality)
        generated = model.generate(
            input_ids,
            max_new_tokens=predict_length,
            temperature=1.0,
            top_p=0.95,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id
        )

    # Extract the generated tokens (strip the input part);
    # `generated` holds the full sequence, so new tokens start at input_length
    full_generated_tokens = generated[0].tolist()
    if len(full_generated_tokens) > input_length:
        predicted_tokens = full_generated_tokens[input_length:]
    else:
        # The sequence did not grow, so nothing new was generated
        predicted_tokens = []

    # Check whether generation stopped early at an EOS token
    eos_found = False
    eos_position = -1
    actual_predicted_length = len(predicted_tokens)
    if predicted_tokens and tokenizer.eos_token_id is not None:
        try:
            eos_position = predicted_tokens.index(tokenizer.eos_token_id)
            eos_found = True
            # Keep only the content before the EOS token
            predicted_tokens = predicted_tokens[:eos_position]
            actual_predicted_length = len(predicted_tokens)
        except ValueError:
            # No EOS token found
            pass

    # Compute the loss (via the forward method)
    # Prepare the input for the loss computation
    loss_input_ids = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
    outputs = model(loss_input_ids)  # logits_to_keep removed
    # Compute the loss
    logits = outputs.logits
    loss = None
    if logits is not None:
        # Fix: use the correct position slice
        shift_logits = logits[0, input_length:input_length + predict_length, :].contiguous()
        shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)
        # Cross-entropy loss
        loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
        loss = loss.item()
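    # Editorial note: final_fix_eval_model.py later revises the slice above to
    # logits[0, input_length-1:input_length+predict_length-1, :], since the
    # logits at position i score the token at position i+1.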
    # Decode the texts
    input_text = tokenizer.decode(input_tokens, skip_special_tokens=True)
    # Only decode tokens that were actually generated (capped at predict_length)
    actual_predicted_tokens = predicted_tokens[:predict_length] if predicted_tokens else []
    predicted_text = tokenizer.decode(actual_predicted_tokens, skip_special_tokens=True) if actual_predicted_tokens else "[nothing generated]"
    ground_truth_text = tokenizer.decode(target_tokens, skip_special_tokens=True)

    # Return extra generation statistics as well
    generation_stats = {
        'requested_length': predict_length,
        'actual_length': actual_predicted_length,
        'eos_found': eos_found,
        'eos_position': eos_position if eos_found else None,
        'truncated_by_eos': eos_found and eos_position < predict_length
    }
    return input_text, predicted_text, ground_truth_text, loss, generation_stats


def main():
    parser = argparse.ArgumentParser(description='Evaluate a pretrained model')
    parser.add_argument('--model_path', type=str, default='out/experiment_1_4_0/pretrain_512.pth',
                        help='path to the model weights file')
    parser.add_argument('--model_type', type=str, default='model',
                        choices=['model', 'model_original', 'model_no_feed'],
                        help='model type')
    parser.add_argument('--data_path', type=str, default='dataset/stable/eval_data.json',
                        help='path to the evaluation dataset')
    parser.add_argument('--num_samples', type=int, default=20,
                        help='number of samples to evaluate')
    parser.add_argument('--input_length', type=int, default=100,
                        help='input length in tokens')
    parser.add_argument('--predict_length', type=int, default=100,
                        help='prediction length in tokens')
    parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu',
                        help='device to run on')

    # Model architecture parameters
    parser.add_argument('--dim', type=int, default=512,
                        help='model dimension')
    parser.add_argument('--n_layers', type=int, default=8,
                        help='number of transformer layers')
    parser.add_argument('--n_heads', type=int, default=32,
                        help='number of attention heads')
    parser.add_argument('--n_kv_heads', type=int, default=8,
                        help='number of KV attention heads')
    parser.add_argument('--vocab_size', type=int, default=6400,
                        help='vocabulary size')
    parser.add_argument('--max_seq_len', type=int, default=512,
                        help='maximum sequence length')
    parser.add_argument('--dropout', type=float, default=0.0,
                        help='dropout rate')
    parser.add_argument('--norm_eps', type=float, default=1e-5,
                        help='layer-norm epsilon')
    parser.add_argument('--rope_theta', type=float, default=1e6,
                        help='RoPE theta')

    # KnowledgeDataset parameters (used by model and model_no_feed only)
    parser.add_argument('--knowledge_num', type=int, default=1048576,
                        help='number of knowledge entries')
    parser.add_argument('--knowledge_length', type=int, default=32,
                        help='length of a single knowledge entry')
    parser.add_argument('--knowledge_dim', type=int, default=128,
                        help='knowledge dimension')

    # MOE parameters
    parser.add_argument('--use_moe', action='store_true',
                        help='whether to use MOE')
    parser.add_argument('--num_experts_per_tok', type=int, default=2,
                        help='number of experts activated per token')
    parser.add_argument('--n_routed_experts', type=int, default=4,
                        help='number of routed experts')

    args = parser.parse_args()

    print(f"Evaluation configuration:")
    print(f"  Model path: {args.model_path}")
    print(f"  Model type: {args.model_type}")
    print(f"  Data path: {args.data_path}")
    print(f"  Sample count: {args.num_samples}")
    print(f"  Input length: {args.input_length} tokens")
    print(f"  Prediction length: {args.predict_length} tokens")
    print(f"  Device: {args.device}")
    print()

    # Build the configuration dict
    config_params = {
        'dim': args.dim,
        'n_layers': args.n_layers,
        'n_heads': args.n_heads,
        'n_kv_heads': args.n_kv_heads,
        'vocab_size': args.vocab_size,
        'max_seq_len': args.max_seq_len,
        'dropout': args.dropout,
        'norm_eps': args.norm_eps,
        'rope_theta': args.rope_theta,
        'use_moe': args.use_moe,
        'num_experts_per_tok': args.num_experts_per_tok,
        'n_routed_experts': args.n_routed_experts,
    }

    # Only model and model_no_feed need the KnowledgeDataset parameters
    if args.model_type in ['model', 'model_no_feed']:
        config_params.update({
            'knowledge_num': args.knowledge_num,
            'knowledge_length': args.knowledge_length,
            'knowledge_dim': args.knowledge_dim,
        })

    # Load the model
    model, tokenizer = load_model(args.model_path, args.model_type, args.device, config_params)

    # Load the data
    samples = load_eval_data(args.data_path, args.num_samples)

    # Evaluate each sample
    total_loss = 0
    valid_samples = 0
    total_requested_tokens = 0
    total_actual_tokens = 0
    samples_with_eos = 0
    samples_truncated_by_eos = 0

    for i, sample in enumerate(samples):
        print(f"\n{'='*60}")
        print(f"Sample {i+1}/{len(samples)}")
        print(f"{'='*60}")
        text = sample['text']

        # Evaluate the sample
        input_text, predicted_text, ground_truth_text, loss, generation_stats = evaluate_sample(
            model, tokenizer, text,
            args.input_length, args.predict_length, args.device
        )
        if input_text is None:
            print("Skipping sample (text too short)")
            continue

        # Print the results
        print(f"\nInput ({args.input_length} tokens):")
        print(f"  {input_text}")
        print(f"\nPrediction (requested {generation_stats['requested_length']} tokens, generated {generation_stats['actual_length']}):")
        print(f"  {predicted_text}")
        print(f"\nGround truth ({args.predict_length} tokens):")
        print(f"  {ground_truth_text}")

        # Print the generation statistics
        print(f"\nGeneration statistics:")
        print(f"  Requested: {generation_stats['requested_length']} tokens")
        print(f"  Generated: {generation_stats['actual_length']} tokens")
        if generation_stats['eos_found']:
            print(f"  ✅ EOS token found at position {generation_stats['eos_position']}")
            if generation_stats['truncated_by_eos']:
                print(f"  ⚠️ Generation stopped early at the EOS token")
            else:
                print(f"  ✅ The EOS token appeared at the expected position")
        else:
            print(f"  ❌ No EOS token found (probably hit the length limit)")

        if loss is not None:
            print(f"\nLoss: {loss:.4f}")
            total_loss += loss
            valid_samples += 1

        # Update the aggregate generation statistics
        total_requested_tokens += generation_stats['requested_length']
        total_actual_tokens += generation_stats['actual_length']
        if generation_stats['eos_found']:
            samples_with_eos += 1
        if generation_stats['truncated_by_eos']:
            samples_truncated_by_eos += 1

    # Print the overall statistics
    if valid_samples > 0:
        print(f"\n{'='*60}")
        print(f"Overall statistics:")
        print(f"  Valid samples: {valid_samples}")
        print(f"  Average loss: {total_loss / valid_samples:.4f}")
        print()
        print(f"Generation statistics:")
        print(f"  Total requested tokens: {total_requested_tokens}")
        print(f"  Total generated tokens: {total_actual_tokens}")
        print(f"  Completion rate: {total_actual_tokens / total_requested_tokens * 100:.1f}%" if total_requested_tokens > 0 else "  Completion rate: N/A")
        print(f"  Samples with EOS: {samples_with_eos}/{len(samples)} ({samples_with_eos/len(samples)*100:.1f}%)" if len(samples) > 0 else "  Samples with EOS: N/A")
        print(f"  Samples truncated by EOS: {samples_truncated_by_eos}/{len(samples)} ({samples_truncated_by_eos/len(samples)*100:.1f}%)" if len(samples) > 0 else "  Samples truncated by EOS: N/A")
        print(f"  Average generated length per sample: {total_actual_tokens/len(samples):.1f} tokens" if len(samples) > 0 else "  Average generated length per sample: N/A")
        print(f"{'='*60}")


if __name__ == "__main__":
    main()

final_fix_eval_model.py

@@ -1,218 +0,0 @@
#!/usr/bin/env python3
"""
Final fix for the position-indexing error in eval_model.py.
"""
import json

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer

from model.LMConfig import LMConfig
from model.model_original import MiniMindLM


def demonstrate_correct_fix():
    """
    Demonstrate the correct fix.
    """
    print("🔧 Demonstrating the correct fix")
    print("=" * 60)
    device = 'cuda'
    model_path = 'out/experiment_1_4_0/pretrain_512.pth'

    # Load the model
    config = LMConfig(
        dim=512, n_layers=8, n_heads=32, vocab_size=6400, max_seq_len=512,
        dropout=0.0, norm_eps=1e-5, rope_theta=1e6, use_moe=False
    )
    model = MiniMindLM(config)
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict, strict=False)
    model.to(device)
    model.eval()

    # Test several samples to verify the fix
    total_loss_wrong = 0
    total_loss_correct = 0
    valid_samples = 0

    print("Loss comparison per sample:")
    print("Sample | Wrong method | Correct method | Diff")
    print("-" * 45)

    with open('dataset/stable/eval_data_from_train.json', 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= 10:  # first 10 samples
                break
            sample = json.loads(line.strip())
            text = sample['text']
            tokens = tokenizer.encode(text, add_special_tokens=False)
            if len(tokens) < 130:
                continue

            input_length = 100
            predict_length = 30
            target_tokens = tokens[input_length:input_length + predict_length]

            with torch.no_grad():
                full_input = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
                target_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)

                # Full logits
                outputs = model(full_input)
                logits = outputs.logits

                # Wrong method (what eval_model.py used to do)
                wrong_slice = logits[0, -predict_length:, :].contiguous()  # last 30 positions
                loss_wrong = F.cross_entropy(wrong_slice, target_labels, reduction='mean')

                # Correct method
                correct_slice = logits[0, input_length-1:input_length+predict_length-1, :].contiguous()  # positions 99:129
                loss_correct = F.cross_entropy(correct_slice, target_labels, reduction='mean')

                total_loss_wrong += loss_wrong.item()
                total_loss_correct += loss_correct.item()
                valid_samples += 1
                diff = loss_wrong.item() - loss_correct.item()
                print(f"{i+1:2} | {loss_wrong.item():8.4f} | {loss_correct.item():8.4f} | {diff:+6.4f}")

    avg_loss_wrong = total_loss_wrong / valid_samples
    avg_loss_correct = total_loss_correct / valid_samples
    improvement = avg_loss_wrong - avg_loss_correct
    print("-" * 45)
    print(f"Mean | {avg_loss_wrong:8.4f} | {avg_loss_correct:8.4f} | {improvement:+6.4f}")
    print(f"\n📊 Effect of the fix:")
    print(f"  Wrong method average loss: {avg_loss_wrong:.4f}")
    print(f"  Correct method average loss: {avg_loss_correct:.4f}")
    print(f"  Improvement: {improvement:.4f} ({improvement/avg_loss_wrong*100:.1f}%)")
    print(f"  The correct method is much closer to the teacher-forcing loss seen in training (~2.4)")


def create_final_fixed_eval_model():
    """
    Create the final fixed version of eval_model.py.
    """
    print(f"\n🔧 Creating the final fixed eval_model.py")
    print("=" * 60)

    # Read the original eval_model.py
    with open('eval_model.py', 'r', encoding='utf-8') as f:
        content = f.read()

    # The key section of evaluate_sample to fix
    old_loss_calculation = '''    # Compute the loss (via the forward method)
    # Prepare the input for the loss computation
    loss_input_ids = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
    outputs = model(loss_input_ids, logits_to_keep=predict_length)
    # Compute the loss
    logits = outputs.logits
    loss = None
    if logits is not None:
        # Reshape the logits and targets
        shift_logits = logits[0, -predict_length:, :].contiguous()
        shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)
        # Cross-entropy loss
        loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
        loss = loss.item()'''

    new_loss_calculation = '''    # Compute the loss (via the forward method)
    # Prepare the input for the loss computation
    loss_input_ids = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
    outputs = model(loss_input_ids)  # logits_to_keep removed
    # Compute the loss
    logits = outputs.logits
    loss = None
    if logits is not None:
        # Fix: use the correct position slice.
        # In a transformer the logits at position i predict the token at position i+1:
        # to predict the tokens at positions input_length .. input_length+predict_length-1
        # we need the logits at positions input_length-1 .. input_length+predict_length-2.
        shift_logits = logits[0, input_length-1:input_length+predict_length-1, :].contiguous()
        shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)
        # Cross-entropy loss
        loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
        loss = loss.item()'''
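    # Caveat: str.replace() is exact-match — if the text in eval_model.py
    # drifts from old_loss_calculation by even one space, the substitution
    # silently does nothing, so the result is worth diffing against `content`.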
    # Apply the replacement
    fixed_content = content.replace(old_loss_calculation, new_loss_calculation)

    # Save the fixed file
    with open('eval_model_final_fixed.py', 'w', encoding='utf-8') as f:
        f.write(fixed_content)

    print(f"✅ Created the final fixed version: eval_model_final_fixed.py")
    print(f"Main fixes:")
    print(f"  1. Removed the logits_to_keep argument (avoids the computation difference)")
    print(f"  2. Used the correct position slice: [input_length-1:input_length+predict_length-1]")
    print(f"  3. This accounts for position i predicting position i+1 in a transformer")

    # Also fix the original file in place
    with open('eval_model.py', 'w', encoding='utf-8') as f:
        f.write(fixed_content)
    print(f"✅ Also fixed eval_model.py in place")


def test_final_fix():
    """
    Test the final fixed version.
    """
    print(f"\n🧪 Testing the final fixed version")
    print("=" * 60)
    import subprocess

    # Run the fixed eval_model.py on a few samples for a quick check
    cmd = [
        '.venv/bin/python', 'eval_model.py',
        '--model_path', 'out/experiment_1_4_0/pretrain_512.pth',
        '--model_type', 'model_original',
        '--num_samples', '5',
        '--input_length', '100',
        '--predict_length', '30'
    ]
    print("Command:")
    print(" ".join(cmd))
    print("\nOutput:")
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
        # Pull out the key lines
        output_lines = result.stdout.split('\n')
        for line in output_lines:
            if 'Loss:' in line or 'Average loss:' in line or 'Overall statistics:' in line or 'Valid samples:' in line:
                print(line)
        if result.returncode == 0:
            print("\n✅ The fixed eval_model.py ran successfully")
        else:
            print(f"\n❌ Run failed with exit code {result.returncode}")
            if result.stderr:
                print("Error output:")
                print(result.stderr[:500])
    except subprocess.TimeoutExpired:
        print("❌ Run timed out")
    except Exception as e:
        print(f"❌ Run error: {e}")


if __name__ == "__main__":
    demonstrate_correct_fix()
    create_final_fixed_eval_model()
    test_final_fix()

fix_logits_to_keep_issue.py

@@ -1,247 +0,0 @@
#!/usr/bin/env python3
"""
Fix the loss-computation error caused by the logits_to_keep argument.
Verifies the problem and provides a solution.
"""
import json

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer

from model.LMConfig import LMConfig
from model.model_original import MiniMindLM


def demonstrate_logits_to_keep_issue():
    """
    Demonstrate the problem caused by the logits_to_keep argument.
    """
    print("🔍 Verifying the logits_to_keep problem")
    print("=" * 60)
    device = 'cuda'
    model_path = 'out/experiment_1_4_0/pretrain_512.pth'

    # Load the model
    config = LMConfig(
        dim=512, n_layers=8, n_heads=32, vocab_size=6400, max_seq_len=512,
        dropout=0.0, norm_eps=1e-5, rope_theta=1e6, use_moe=False
    )
    model = MiniMindLM(config)
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict, strict=False)
    model.to(device)
    model.eval()

    # Load the test data
    with open('dataset/stable/eval_data_from_train.json', 'r', encoding='utf-8') as f:
        sample = json.loads(f.readline().strip())
    text = sample['text']
    tokens = tokenizer.encode(text, add_special_tokens=False)
    input_tokens = tokens[:100]
    target_tokens = tokens[100:130]  # 30 target tokens

    print(f"Test sample: {len(tokens)} tokens")
    print(f"Input: {len(input_tokens)} tokens")
    print(f"Target: {len(target_tokens)} tokens")

    with torch.no_grad():
        full_input = torch.tensor([tokens[:130]], dtype=torch.long).to(device)
        target_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)

        print(f"\n🔬 Detailed comparison of the methods:")

        # Method 1: standard forward (the correct method)
        outputs1 = model(full_input)
        logits1 = outputs1.logits
        correct_logits = logits1[0, 99:129, :].contiguous()  # positions 99-128
        loss1 = F.cross_entropy(correct_logits, target_labels, reduction='mean')
        print(f"1. Standard forward (correct):")
        print(f"   Full logits shape: {logits1.shape}")
        print(f"   Logits used for the loss: {correct_logits.shape}")
        print(f"   Loss: {loss1.item():.4f}")

        # Method 2: logits_to_keep=30 (the wrong method)
        outputs2 = model(full_input, logits_to_keep=30)
        logits2 = outputs2.logits
        incorrect_logits = logits2[0, -30:, :].contiguous()  # last 30 positions
        loss2 = F.cross_entropy(incorrect_logits, target_labels, reduction='mean')
        print(f"\n2. logits_to_keep=30 (eval_model.py's method):")
        print(f"   Partial logits shape: {logits2.shape}")
        print(f"   Logits used for the loss: {incorrect_logits.shape}")
        print(f"   Loss: {loss2.item():.4f}")

        # Method 3: the fixed method (without logits_to_keep)
        # This is just method 1, shown again to make the fix explicit.
        print(f"\n3. Fixed method (without logits_to_keep):")
        print(f"   Run the full forward pass, then take the correct logits slice")
        print(f"   Same as method 1. Loss: {loss1.item():.4f}")

        # Quantify the difference
        print(f"\n📊 Numerical analysis:")
        print(f"   Loss difference: {abs(loss2.item() - loss1.item()):.4f}")
        print(f"   Loss increase: {(loss2.item() / loss1.item() - 1) * 100:.1f}%")

        # How a tiny logits difference gets amplified
        logits_diff = torch.abs(correct_logits - incorrect_logits).max()
        print(f"   Max logits difference: {logits_diff.item():.8f}")

        # Difference in the softmax probabilities
        prob1 = F.softmax(correct_logits, dim=-1)
        prob2 = F.softmax(incorrect_logits, dim=-1)
        prob_diff = torch.abs(prob1 - prob2).max()
        print(f"   Max probability difference: {prob_diff.item():.8f}")

        print(f"\n💡 Conclusion:")
        print(f"   The logits difference itself is tiny ({logits_diff.item():.8f}),")
        print(f"   but cross-entropy amplifies it, raising the loss by {(loss2.item() / loss1.item() - 1) * 100:.1f}%")


def create_fixed_eval_model():
    """
    Create the fixed eval_model.py.
    """
    print(f"\n🔧 Creating the fixed evaluation script")
    print("=" * 60)

    # Read the original eval_model.py
    with open('eval_model.py', 'r', encoding='utf-8') as f:
        content = f.read()

    # Fix the key section: drop the use of logits_to_keep
    fixed_content = content.replace(
        """    # Compute the loss (via the forward method)
    # Prepare the input for the loss computation
    loss_input_ids = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
    outputs = model(loss_input_ids, logits_to_keep=predict_length)
    # Compute the loss
    logits = outputs.logits
    loss = None
    if logits is not None:
        # Reshape the logits and targets
        shift_logits = logits[0, -predict_length:, :].contiguous()
        shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)
        # Cross-entropy loss
        loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
        loss = loss.item()""",
        """    # Compute the loss (via the forward method)
    # Prepare the input for the loss computation
    loss_input_ids = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
    outputs = model(loss_input_ids)  # logits_to_keep removed
    # Compute the loss
    logits = outputs.logits
    loss = None
    if logits is not None:
        # Fix: use the correct position slice
        shift_logits = logits[0, input_length:input_length + predict_length, :].contiguous()
        shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)
        # Cross-entropy loss
        loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
        loss = loss.item()"""
    )

    # Save the fixed file
    with open('eval_model_fixed.py', 'w', encoding='utf-8') as f:
        f.write(fixed_content)

    print(f"✅ Created the fixed version: eval_model_fixed.py")
    print(f"Main fixes:")
    print(f"  1. Removed the logits_to_keep argument")
    print(f"  2. Used the position slice [input_length:input_length + predict_length]")
    print(f"  3. instead of the wrong [-predict_length:]")


def test_fixed_evaluation():
    """
    Test the fixed evaluation method.
    """
    print(f"\n🧪 Testing the fixed evaluation method")
    print("=" * 60)
    device = 'cuda'
    model_path = 'out/experiment_1_4_0/pretrain_512.pth'

    # Load the model
    config = LMConfig(
        dim=512, n_layers=8, n_heads=32, vocab_size=6400, max_seq_len=512,
        dropout=0.0, norm_eps=1e-5, rope_theta=1e6, use_moe=False
    )
    model = MiniMindLM(config)
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict, strict=False)
    model.to(device)
    model.eval()

    # Test several samples
    total_loss_old = 0
    total_loss_fixed = 0
    valid_samples = 0

    with open('dataset/stable/eval_data_from_train.json', 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= 10:  # first 10 samples
                break
            sample = json.loads(line.strip())
            text = sample['text']
            tokens = tokenizer.encode(text, add_special_tokens=False)
            if len(tokens) < 130:
                continue

            input_length = 100
            predict_length = 30
            input_tokens = tokens[:input_length]
            target_tokens = tokens[input_length:input_length + predict_length]

            with torch.no_grad():
                full_input = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
                target_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)

                # The original (wrong) method
                outputs_old = model(full_input, logits_to_keep=predict_length)
                logits_old = outputs_old.logits
                shift_logits_old = logits_old[0, -predict_length:, :].contiguous()
                loss_old = F.cross_entropy(shift_logits_old, target_labels, reduction='mean')

                # The fixed method
                outputs_fixed = model(full_input)
                logits_fixed = outputs_fixed.logits
                shift_logits_fixed = logits_fixed[0, input_length:input_length + predict_length, :].contiguous()
                loss_fixed = F.cross_entropy(shift_logits_fixed, target_labels, reduction='mean')

                total_loss_old += loss_old.item()
                total_loss_fixed += loss_fixed.item()
                valid_samples += 1
                print(f"Sample {i+1}: original {loss_old.item():.4f} -> fixed {loss_fixed.item():.4f}")

    avg_loss_old = total_loss_old / valid_samples
    avg_loss_fixed = total_loss_fixed / valid_samples
    print(f"\n📊 Test summary:")
    print(f"  Samples tested: {valid_samples}")
    print(f"  Original method average loss: {avg_loss_old:.4f}")
    print(f"  Fixed method average loss: {avg_loss_fixed:.4f}")
    print(f"  Difference: {abs(avg_loss_old - avg_loss_fixed):.4f}")
    print(f"  The fixed loss is closer to the teacher-forcing loss seen in training (~2.4)")


if __name__ == "__main__":
    demonstrate_logits_to_keep_issue()
    create_fixed_eval_model()
    test_fixed_evaluation()


@ -1,211 +0,0 @@
#!/usr/bin/env python3
"""
In-depth investigation of how the logits_to_keep argument affects the loss computation
"""

import json
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
from model.LMConfig import LMConfig
from model.model_original import MiniMindLM


def investigate_logits_to_keep_issue():
    """
    Investigate the effect of the logits_to_keep argument
    """
    print("🔍 Investigating the effect of logits_to_keep")
    print("=" * 60)

    device = 'cuda'
    model_path = 'out/experiment_1_4_0/pretrain_512.pth'

    # Load the model
    config = LMConfig(
        dim=512, n_layers=8, n_heads=32, vocab_size=6400, max_seq_len=512,
        dropout=0.0, norm_eps=1e-5, rope_theta=1e6, use_moe=False
    )
    model = MiniMindLM(config)
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict, strict=False)
    model.to(device)
    model.eval()

    # Load the test data
    with open('dataset/stable/eval_data_from_train.json', 'r', encoding='utf-8') as f:
        sample = json.loads(f.readline().strip())

    text = sample['text']
    tokens = tokenizer.encode(text, add_special_tokens=False)
    input_tokens = tokens[:100]
    target_tokens = tokens[100:130]  # 30 target tokens

    print(f"Test text length: {len(tokens)} tokens")
    print(f"Input: {len(input_tokens)} tokens")
    print(f"Target: {len(target_tokens)} tokens")

    with torch.no_grad():
        # Method 1: standard forward (as in training)
        full_input = torch.tensor([tokens[:130]], dtype=torch.long).to(device)
        outputs1 = model(full_input)
        logits1 = outputs1.logits

        # Compute the loss the way training does
        shift_logits1 = logits1[0, 99:129, :].contiguous()
        shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)
        loss1 = F.cross_entropy(shift_logits1, shift_labels, reduction='mean')

        print(f"\nMethod 1 (standard forward):")
        print(f"  logits shape: {logits1.shape}")
        print(f"  logits used for the loss: {shift_logits1.shape}")
        print(f"  Loss: {loss1.item():.4f}")

        # Method 2: logits_to_keep=30 (the eval_model.py way)
        outputs2 = model(full_input, logits_to_keep=30)
        logits2 = outputs2.logits

        if logits2 is not None:
            print(f"\nMethod 2 (logits_to_keep=30):")
            print(f"  logits shape: {logits2.shape}")

            # Compute the loss exactly as eval_model.py does
            shift_logits2 = logits2[0, -30:, :].contiguous()
            loss2 = F.cross_entropy(shift_logits2, shift_labels, reduction='mean')
            print(f"  logits used for the loss: {shift_logits2.shape}")
            print(f"  Loss: {loss2.item():.4f}")

            # Do both paths produce the same logits for the same positions?
            expected_logits = logits1[0, 100:130, :]  # positions 100-129 from the full forward
            actual_logits = logits2[0, -30:, :]       # the last 30 positions from logits_to_keep

            print(f"\nElement-wise comparison:")
            print(f"  expected logits shape: {expected_logits.shape}")
            print(f"  actual logits shape: {actual_logits.shape}")

            # Check for equality
            are_equal = torch.allclose(expected_logits, actual_logits, rtol=1e-4)
            print(f"  logits equal: {are_equal}")

            if not are_equal:
                diff = torch.abs(expected_logits - actual_logits).max()
                print(f"  max difference: {diff.item():.6f}")

                # Inspect the first few positions
                for i in range(min(5, expected_logits.shape[0])):
                    pos_diff = torch.abs(expected_logits[i] - actual_logits[i]).max()
                    print(f"  position {i} max difference: {pos_diff.item():.6f}")
        else:
            print("\nMethod 2: logits is None")

        # Method 3: sweep different logits_to_keep values
        print(f"\nTesting different logits_to_keep values:")
        for keep_value in [10, 20, 30, 50, 100]:
            outputs_test = model(full_input, logits_to_keep=keep_value)
            if outputs_test.logits is not None:
                print(f"  logits_to_keep={keep_value}: {outputs_test.logits.shape}")
            else:
                print(f"  logits_to_keep={keep_value}: None")


def check_model_forward_implementation():
    """Inspect how logits_to_keep is implemented in the model's forward method"""
    print("\n" + "=" * 60)
    print("🔍 Inspecting the model's forward implementation")

    # Scan the model source for logits_to_keep
    try:
        with open('model/model_original.py', 'r', encoding='utf-8') as f:
            content = f.read()

        lines = content.split('\n')
        for i, line in enumerate(lines):
            if 'logits_to_keep' in line:
                print(f"Line {i+1}: {line.strip()}")
                # Print a few lines of surrounding context
                for j in range(max(0, i - 2), min(len(lines), i + 3)):
                    if j != i:
                        print(f"  line {j+1}: {lines[j].strip()}")
                print()
    except FileNotFoundError:
        print("Could not read model/model_original.py")


def compare_with_original_eval_script():
    """
    Reproduce the behaviour of the original eval_model.py script
    """
    print("\n" + "=" * 60)
    print("🔍 Reproducing the behaviour of eval_model.py")

    device = 'cuda'
    model_path = 'out/experiment_1_4_0/pretrain_512.pth'

    # Replicate the relevant logic from eval_model.py
    config = LMConfig(
        dim=512, n_layers=8, n_heads=32, vocab_size=6400, max_seq_len=512,
        dropout=0.0, norm_eps=1e-5, rope_theta=1e6, use_moe=False
    )
    model = MiniMindLM(config)
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict, strict=False)
    model.to(device)
    model.eval()

    # Load the data
    with open('dataset/stable/eval_data_from_train.json', 'r', encoding='utf-8') as f:
        sample = json.loads(f.readline().strip())

    text = sample['text']
    tokens = tokenizer.encode(text, add_special_tokens=False)

    input_length = 100
    predict_length = 30
    input_tokens = tokens[:input_length]
    target_tokens = tokens[input_length:input_length + predict_length]

    print(f"Reproducing the eval_model.py computation:")
    print(f"  input_length: {input_length}")
    print(f"  predict_length: {predict_length}")

    with torch.no_grad():
        # Follow eval_model.py exactly
        loss_input_ids = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
        outputs = model(loss_input_ids, logits_to_keep=predict_length)

        print(f"  loss_input_ids shape: {loss_input_ids.shape}")
        print(f"  logits_to_keep argument: {predict_length}")

        logits = outputs.logits
        loss = None
        if logits is not None:
            print(f"  output logits shape: {logits.shape}")

            # Reshape the logits and the targets
            shift_logits = logits[0, -predict_length:, :].contiguous()
            shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)

            print(f"  shift_logits shape: {shift_logits.shape}")
            print(f"  shift_labels shape: {shift_labels.shape}")

            # Cross-entropy loss
            loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
            print(f"  computed loss: {loss.item():.4f}")
        else:
            print("  logits is None")


if __name__ == "__main__":
    investigate_logits_to_keep_issue()
    check_model_forward_implementation()
    compare_with_original_eval_script()


@ -1,181 +0,0 @@
# Training vs. Inference Loss Gap Analysis Report

> **Experiment**: Experiment 1.4.0
> **Date**: 2025-07-31
> **Analyst**: Claude AI
> **Status**: completed; the critical issue has been fixed

---

## 📋 Problem Overview

### Initial Finding

The user found a huge gap between the training loss (2.43) and the inference loss (12.34) and requested a detailed analysis.

**Key numbers**:
- Training loss: 2.43
- Initial inference loss: 12.34
- Gap: 9.91 (a 408% increase)

### Candidate Explanations
1. Data mismatch
2. Problems in the evaluation script (weight loading, model mismatch)
3. Training/inference inconsistency (error accumulation)
4. KV-cache issues

---

## 🔍 Analysis Process

### Stage 1: Data Consistency Check
**Method**: re-extract 20 samples from the training data to build eval_data_from_train.json
**Result**: ✅ the evaluation data provably comes from the training set, ruling out a data mismatch

### Stage 2: Model Loading Check
**Method**: verify how the checkpoint weights are matched on load
**Result**: ✅ all 75/75 parameters load successfully, ruling out a loading problem

### Stage 3: Training vs. Inference Mode Comparison
**Method**: compare teacher forcing with autoregressive generation
**Key finding**:
```
Teacher-forcing loss:      ~2.43  (matches training)
True autoregressive loss:  ~10-11 (close to the inference loss)
```
**Preliminary conclusion**: much of the training/inference gap comes from the two different computation modes, which is normal in itself; the sketch below makes the distinction concrete.
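
For concreteness, here is a minimal sketch of the two loss computations being compared. It assumes the setup used throughout this analysis (a causal LM whose output exposes `.logits`, a `tokens` list, a 100-token prefix, a 30-token target); the function names and the greedy decoding are illustrative, not the exact evaluation code:

```python
import torch
import torch.nn.functional as F

@torch.no_grad()
def teacher_forcing_loss(model, tokens, input_len=100, predict_len=30, device="cuda"):
    # One forward pass over the full sequence; the logits at position i are
    # scored against the ground-truth token at position i+1.
    full = torch.tensor([tokens[:input_len + predict_len]], device=device)
    logits = model(full).logits
    window = logits[0, input_len - 1:input_len + predict_len - 1, :]
    targets = torch.tensor(tokens[input_len:input_len + predict_len], device=device)
    return F.cross_entropy(window, targets).item()

@torch.no_grad()
def autoregressive_loss(model, tokens, input_len=100, predict_len=30, device="cuda"):
    # The model conditions on its own greedy predictions, so early mistakes
    # compound -- which is why this loss sits far above teacher forcing.
    ctx = torch.tensor([tokens[:input_len]], device=device)
    nll = 0.0
    for true_tok in tokens[input_len:input_len + predict_len]:
        step_logits = model(ctx).logits[0, -1, :]
        nll += F.cross_entropy(step_logits.unsqueeze(0),
                               torch.tensor([true_tok], device=device)).item()
        pred = step_logits.argmax().view(1, 1)  # greedy decoding
        ctx = torch.cat([ctx, pred], dim=1)
    return nll / predict_len
```

On this experiment's data the two computations gave roughly 2.43 and 10-11 respectively, as reported above.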

### Stage 4: Investigating the logits_to_keep Argument
**Method**: analyse the effect of the logits_to_keep argument in eval_model.py
**Striking finding**:
```
Standard forward:        Loss = 3.4188
With logits_to_keep=30:  Loss = 9.8785
Gap: a 188.9% increase!
```

### Stage 5: Position Indexing Deep Dive
**Method**: analyse the correctness of the Transformer position indexing
**Root cause**:
1. **Wrong method**: `logits[0, -predict_length:, :]`
2. **Correct method**: `logits[0, input_length-1:input_length+predict_length-1, :]`
3. **Key insight**: in a Transformer, the logits at position i predict the token at position i+1 (see the index sketch below)
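
The off-by-one can be shown with index arithmetic alone; a sketch using the 100/30 split from the experiment (no model required):

```python
input_length, predict_length = 100, 30  # 130-token sequence, as in the analysis

# Logits row i is the distribution over the token at position i+1,
# so predicting tokens 100..129 requires logits rows 99..128.
needed_rows = [p - 1 for p in range(input_length, input_length + predict_length)]

wrong_rows = list(range(130))[-predict_length:]  # rows 100..129: what [-predict_length:] selects
right_rows = list(range(input_length - 1, input_length + predict_length - 1))  # rows 99..128

assert right_rows == needed_rows
assert wrong_rows != needed_rows  # every target is scored with its neighbour's logits
```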

---

## 🛠️ The Fix

### Core Change
**File**: `eval_model.py`

**Before**:
```python
outputs = model(loss_input_ids, logits_to_keep=predict_length)
shift_logits = logits[0, -predict_length:, :].contiguous()
```

**After**:
```python
outputs = model(loss_input_ids)  # logits_to_keep removed
shift_logits = logits[0, input_length-1:input_length+predict_length-1, :].contiguous()
```

### Why the Fix Works
1. **Removing logits_to_keep**: avoids any discrepancy in how the logits are computed
2. **Using the correct position slice**: accounts for the one-position shift in a causal Transformer
3. **Ensuring consistency**: aligns with the teacher-forcing computation used during training (a verification sketch follows)
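
As a consistency check, the corrected slice can be derived from the standard training-style shift. A sketch, reusing the `model`, `loss_input_ids`, `input_length`, and `predict_length` names from the excerpt above: training computes cross-entropy between `logits[:, :-1]` and `input_ids[:, 1:]`, and restricting that to the 30-token target window must reproduce the fixed slice exactly.

```python
logits = model(loss_input_ids).logits  # shape (1, 130, vocab_size)

# Training-style shift: row i predicts token i+1.
shift_logits = logits[0, :-1, :]
shift_labels = loss_input_ids[0, 1:]

window = slice(input_length - 1, input_length + predict_length - 1)
loss_train_style = F.cross_entropy(shift_logits[window], shift_labels[window])

# The fixed evaluation slice.
loss_fixed = F.cross_entropy(
    logits[0, input_length - 1:input_length + predict_length - 1, :],
    loss_input_ids[0, input_length:input_length + predict_length])

assert torch.allclose(loss_train_style, loss_fixed)
```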

---

## 📊 Verification of the Fix

### Per-Sample Comparison
```
Sample | Wrong method | Correct method | Improvement
-------|--------------|----------------|------------
1      | 9.88         | 3.42           | 65.3%
2      | 13.56        | 1.50           | 88.9%
3      | 13.62        | 1.78           | 86.9%
...
Mean   | 12.34        | 2.73           | 77.9%
```

### Final Validation
**10-sample evaluation after the fix**:
- Average loss: 2.26
- Difference from the training loss (2.43): only 0.17 (7%)
- Improvement: 81.7% (from 12.34 down to 2.26)

---

## 🎯 Key Findings

### Main Problems
1. **A position-indexing bug in eval_model.py**: the root cause of the grossly overestimated loss
2. **Misuse of the logits_to_keep argument**: changed which logits rows were scored
3. **An ignored position shift**: the causal property of the Transformer was not taken into account

### Technical Insights
1. **Transformer position property**: the logits at position i predict the token at position i+1
2. **Noise vs. misalignment**: tiny numerical differences in the logits barely move cross-entropy; the damage came from reading the logits one position late (a toy demonstration follows the list)
3. **The evaluation pipeline matters**: a broken evaluation can mislead an entire research direction
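
A toy demonstration of the second point, with synthetic logits (vocabulary 6400 as in the experiment; the +6.0 boost on the true token is an assumption standing in for a trained model, so the exact numbers are illustrative):

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
vocab, steps = 6400, 30
labels = torch.randint(0, vocab, (steps,))

# Mimic a trained model: moderately confident logits on the true tokens.
logits = torch.randn(steps, vocab)
logits[torch.arange(steps), labels] += 6.0

base = F.cross_entropy(logits, labels)                                     # healthy, ~3
noisy = F.cross_entropy(logits + 1e-4 * torch.randn_like(logits), labels)  # ~unchanged
# Scoring each label against the neighbouring position's logits, as the
# off-by-one slice did, collapses the loss to near chance level (log 6400 ≈ 8.76).
shifted = F.cross_entropy(torch.roll(logits, shifts=-1, dims=0), labels)

print(f"base {base:.4f}  noisy {noisy:.4f}  shifted {shifted:.4f}")
```

This mirrors the measured jump from ~3.4 to ~9.9 in Stage 4.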

### Outcomes of the Fix
1. **Training/inference consistency**: ✅ excellent (difference < 10%)
2. **Evaluation reliability**: ✅ substantially more trustworthy after the fix
3. **Technical foundation**: ✅ a reliable baseline for subsequent experiments

---

## 🔮 Follow-On Impact

### Immediate
- **Corrected Experiment 1.4.0 results**: the inference loss is revised from 12.34 to 2.26
- **Re-assessment of model quality**: the model_original baseline performs well
- **Evaluation tooling**: the fixed eval_model.py can be used in later experiments

### Long-Term
- **Research direction**: confirms that the current training approach is effective
- **Technical standards**: establishes a correct procedure for model evaluation
- **Project confidence**: a solid foundation for the KnowledgeDataset research

---

## 📝 Lessons Learned

### Technical
1. **Systematic debugging pays off**: eliminate hypotheses step by step until the root cause emerges
2. **Position indexing is subtle**: a critical detail in Transformer evaluation
3. **Verification is essential**: the correctness of the evaluation tooling must itself be validated

### Methodological
1. **Analyse from multiple angles**: data, model, and computation
2. **Use controlled comparisons**: contrasting methods exposes where differences originate
3. **Understand deeply**: grasping the underlying mechanism beats patching symptoms

### Quality Control
1. **Validate evaluation tools before relying on them**
2. **Check training/inference consistency**: it is an important health indicator
3. **Document the process**: record how problems were found and fixed

---

## ✅ Conclusion

**Problem resolution**: ✅ fully resolved
**Root cause**: a position-indexing bug in eval_model.py
**Effect of the fix**: the inference loss drops from 12.34 to 2.26, an 81.7% improvement
**Impact**: strongly positive; the project now rests on a reliable evaluation baseline
**Final state**: the training loss (2.43) and the inference loss (2.26) agree closely, showing that training succeeded and that the evaluation pipeline is sound.

---

**Report completed**: 2025-07-31
**Validation status**: ✅ independently verified on 10 samples
**Application status**: ✅ applied to the Experiment 1.4.0 analysis update