diff --git a/analyze_position_slicing.py b/analyze_position_slicing.py
deleted file mode 100644
index c2288e8..0000000
--- a/analyze_position_slicing.py
+++ /dev/null
@@ -1,193 +0,0 @@
-#!/usr/bin/env python3
-"""
-In-depth analysis of the position-slicing problem.
-Verifies the correctness of logits_to_keep and of the position indexing.
-"""
-
-import json
-import torch
-import torch.nn.functional as F
-from transformers import AutoTokenizer
-from model.LMConfig import LMConfig
-from model.model_original import MiniMindLM
-
-
-def analyze_position_indexing():
-    """
-    Analyze the correctness of the position indexing.
-    """
-    print("🔍 Analyzing position indexing and slicing logic")
-    print("="*60)
-
-    device = 'cuda'
-    model_path = 'out/experiment_1_4_0/pretrain_512.pth'
-
-    # Load the model
-    config = LMConfig(
-        dim=512, n_layers=8, n_heads=32, vocab_size=6400, max_seq_len=512,
-        dropout=0.0, norm_eps=1e-5, rope_theta=1e6, use_moe=False
-    )
-
-    model = MiniMindLM(config)
-    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
-
-    state_dict = torch.load(model_path, map_location=device)
-    model.load_state_dict(state_dict, strict=False)
-    model.to(device)
-    model.eval()
-
-    # Load the test data
-    with open('dataset/stable/eval_data_from_train.json', 'r', encoding='utf-8') as f:
-        sample = json.loads(f.readline().strip())
-
-    text = sample['text']
-    tokens = tokenizer.encode(text, add_special_tokens=False)
-
-    input_length = 100
-    predict_length = 30
-    input_tokens = tokens[:input_length]
-    target_tokens = tokens[input_length:input_length + predict_length]
-
-    print(f"Input length: {input_length}")
-    print(f"Predict length: {predict_length}")
-    print(f"Total sequence length: {input_length + predict_length}")
-    print(f"Input token positions: 0 to {input_length-1}")
-    print(f"Target token positions: {input_length} to {input_length + predict_length - 1}")
-
-    with torch.no_grad():
-        full_input = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
-        target_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)
-
-        print(f"\n🔬 Detailed comparison of the slicing methods:")
-
-        # Method 1: standard forward
-        outputs1 = model(full_input)
-        logits1 = outputs1.logits
-        print(f"\n1. Standard forward:")
-        print(f" Input shape: {full_input.shape}")
-        print(f" Output logits shape: {logits1.shape}")
-
-        # In a transformer, the logits at position i predict the token at position i+1,
-        # so predicting the tokens at positions 100-129 requires the logits at positions 99-128.
-        correct_slice = logits1[0, input_length-1:input_length+predict_length-1, :].contiguous()
-        loss1 = F.cross_entropy(correct_slice, target_labels, reduction='mean')
-        print(f" Correct slice [{input_length-1}:{input_length+predict_length-1}]: {correct_slice.shape}")
-        print(f" Loss: {loss1.item():.4f}")
-
-        # Method 2: logits_to_keep
-        outputs2 = model(full_input, logits_to_keep=predict_length)
-        logits2 = outputs2.logits
-        print(f"\n2. logits_to_keep={predict_length}:")
-        print(f" Output logits shape: {logits2.shape}")
-
-        # With logits_to_keep=30, the logits of the last 30 positions are returned.
-        # These should correspond to positions 100-129, but which positions are they really?
-        keep_slice = logits2[0, -predict_length:, :].contiguous()
-        loss2 = F.cross_entropy(keep_slice, target_labels, reduction='mean')
-        print(f" logits_to_keep slice [-{predict_length}:]: {keep_slice.shape}")
-        print(f" Loss: {loss2.item():.4f}")
-
-        # Check whether the two slices are identical
-        print(f"\n🔍 Slice comparison:")
-        if torch.allclose(correct_slice, keep_slice, rtol=1e-6):
-            print(f" ✅ The two slices are identical")
-        else:
-            diff = torch.abs(correct_slice - keep_slice).max()
-            print(f" ❌ Slices differ, max difference: {diff.item():.8f}")
-
-            # Find out exactly which positions differ
-            diff_mask = ~torch.isclose(correct_slice, keep_slice, rtol=1e-6)
-            diff_positions = torch.where(diff_mask.any(dim=-1))[0]
-            print(f" Differing positions: {diff_positions.tolist()}")
-
-        # Method 3: verify the logic used in eval_model.py
-        print(f"\n3. The logic in eval_model.py:")
-        # eval_model.py uses logits[0, -predict_length:, :]
-        eval_slice = logits1[0, -predict_length:, :].contiguous()
-        loss3 = F.cross_entropy(eval_slice, target_labels, reduction='mean')
-        print(f" eval_model.py slice [-{predict_length}:]: {eval_slice.shape}")
-        print(f" This corresponds to logits positions {logits1.shape[1] - predict_length} to {logits1.shape[1] - 1}")
-        print(f" Loss: {loss3.item():.4f}")
-
-        # Check whether eval_model.py's slice is correct
-        if torch.allclose(correct_slice, eval_slice, rtol=1e-6):
-            print(f" ✅ eval_model.py's slice is correct")
-        else:
-            diff = torch.abs(correct_slice - eval_slice).max()
-            print(f" ❌ eval_model.py's slice is wrong, max difference: {diff.item():.8f}")
-
-
-def compare_different_sequence_lengths():
-    """
-    Compare behaviour under different sequence lengths.
-    """
-    print(f"\n🧪 Testing different sequence lengths")
-    print("="*60)
-
-    device = 'cuda'
-    model_path = 'out/experiment_1_4_0/pretrain_512.pth'
-
-    # Load the model
-    config = LMConfig(
-        dim=512, n_layers=8, n_heads=32, vocab_size=6400, max_seq_len=512,
-        dropout=0.0, norm_eps=1e-5, rope_theta=1e6, use_moe=False
-    )
-
-    model = MiniMindLM(config)
-    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
-
-    state_dict = torch.load(model_path, map_location=device)
-    model.load_state_dict(state_dict, strict=False)
-    model.to(device)
-    model.eval()
-
-    # Build a test sequence
-    test_tokens = list(range(200))  # a simple numeric sequence
-
-    test_configs = [
-        (50, 20),   # 50 input, 20 predicted
-        (100, 30),  # 100 input, 30 predicted
-        (150, 40),  # 150 input, 40 predicted
-    ]
-
-    for input_len, predict_len in test_configs:
-        print(f"\nTest config: {input_len} input, {predict_len} predicted")
-
-        sequence = test_tokens[:input_len + predict_len]
-        input_ids = torch.tensor([sequence], dtype=torch.long).to(device)
-        target_labels = torch.tensor(sequence[input_len:], dtype=torch.long).to(device)
-
-        with torch.no_grad():
-            # Standard method
-            outputs_std = model(input_ids)
-            logits_std = outputs_std.logits
-            slice_std = logits_std[0, input_len-1:input_len+predict_len-1, :].contiguous()
-            loss_std = F.cross_entropy(slice_std, target_labels, reduction='mean')
-
-            # logits_to_keep method
-            outputs_keep = model(input_ids, logits_to_keep=predict_len)
-            logits_keep = outputs_keep.logits
-            slice_keep = logits_keep[0, -predict_len:, :].contiguous()
-            loss_keep = F.cross_entropy(slice_keep, target_labels, reduction='mean')
-
-            # eval_model.py method
-            slice_eval = logits_std[0, -predict_len:, :].contiguous()
-            loss_eval = F.cross_entropy(slice_eval, target_labels, reduction='mean')
-
-            print(f" Standard method loss: {loss_std.item():.4f}")
-            print(f" logits_to_keep loss: {loss_keep.item():.4f}")
-            print(f" eval_model.py loss: {loss_eval.item():.4f}")
-
-            # Check whether they match
-            std_vs_keep = torch.allclose(slice_std, slice_keep, rtol=1e-6)
-            std_vs_eval = torch.allclose(slice_std, slice_eval, rtol=1e-6)
-            keep_vs_eval = torch.allclose(slice_keep, slice_eval, rtol=1e-6)
-
-            print(f" Standard vs logits_to_keep: {'✅' if std_vs_keep else '❌'}")
-            print(f" Standard vs eval_model.py: {'✅' if std_vs_eval else '❌'}")
-            print(f" logits_to_keep vs eval_model.py: {'✅' if keep_vs_eval else '❌'}")
-
-
-if __name__ == "__main__":
-    analyze_position_indexing()
-    compare_different_sequence_lengths()
\ No newline at end of file
diff --git a/analyze_train_inference_gap.py b/analyze_train_inference_gap.py
deleted file mode 100644
index 51b11b4..0000000
--- a/analyze_train_inference_gap.py
+++ /dev/null
@@ -1,371 +0,0 @@
-#!/usr/bin/env python3
-"""
-Experiment script analyzing the gap between training and inference loss.
-Systematically checks the possible causes.
-"""
-
-import json
-import random
-import torch
-import torch.nn.functional as F
-from transformers import AutoTokenizer
-import os
-from model.LMConfig import LMConfig
-from model.model_original import MiniMindLM
-
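# --- Illustrative sketch (a minimal demo, not part of the deleted scripts) ---
# Every file in this diff re-derives the same rule: in a decoder-only transformer,
# the logits at position i score the token at position i+1, so the logits that
# grade tokens [L, L+P) live at positions [L-1, L+P-1). The toy shapes below
# (vocab 11, sequence length 8) are assumptions for the demo only.
import torch
import torch.nn.functional as F

def next_token_loss(logits, tokens, input_len, predict_len):
    # Slice the logits that actually predict tokens [input_len, input_len+predict_len).
    scores = logits[0, input_len - 1:input_len + predict_len - 1, :]
    targets = tokens[0, input_len:input_len + predict_len]
    return F.cross_entropy(scores, targets)

toy_tokens = torch.randint(0, 11, (1, 8))   # stand-in token ids
toy_logits = torch.randn(1, 8, 11)          # stand-in for a model's output
print(next_token_loss(toy_logits, toy_tokens, input_len=5, predict_len=3).item())
# --- end sketch ---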
-def create_eval_data_from_training_data(): - """ - 从训练数据中重新提取样本创建eval_data.json - 确保数据来源一致性 - """ - print("=== 1. 创建来自训练数据的评估集 ===") - - train_data_path = "/home/pci/ycz/Code/Minimind/dataset/stable/merged_pretrain.jsonl" - eval_data_path = "dataset/stable/eval_data_from_train.json" - - # 确保目录存在 - os.makedirs("dataset/stable", exist_ok=True) - - # 从训练数据中随机选择20条 - samples = [] - with open(train_data_path, 'r', encoding='utf-8') as f: - all_lines = f.readlines() - - # 随机选择20条数据 - selected_lines = random.sample(all_lines, min(20, len(all_lines))) - - for line in selected_lines: - try: - data = json.loads(line.strip()) - samples.append(data) - except json.JSONDecodeError: - continue - - # 保存到新的评估文件 - with open(eval_data_path, 'w', encoding='utf-8') as f: - for sample in samples: - f.write(json.dumps(sample, ensure_ascii=False) + '\n') - - print(f"✅ 创建了包含{len(samples)}个样本的评估数据集") - print(f" 保存路径: {eval_data_path}") - - return eval_data_path, samples - -def load_model_and_tokenizer(model_path, device='cuda'): - """ - 加载模型和tokenizer,确保与训练时配置一致 - """ - print("=== 2. 加载模型和tokenizer ===") - - # 使用与训练时完全相同的配置 - config = LMConfig( - dim=512, - n_layers=8, - n_heads=32, - vocab_size=6400, - max_seq_len=512, - dropout=0.0, - norm_eps=1e-5, - rope_theta=1e6, - use_moe=False - ) - - model = MiniMindLM(config) - tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer') - - # 加载权重 - if os.path.exists(model_path): - print(f"正在加载权重: {model_path}") - state_dict = torch.load(model_path, map_location=device) - - # 检查权重匹配情况 - model_keys = set(model.state_dict().keys()) - checkpoint_keys = set(state_dict.keys()) - matched_keys = model_keys & checkpoint_keys - missing_keys = model_keys - checkpoint_keys - unexpected_keys = checkpoint_keys - model_keys - - print(f" 模型参数: {len(model_keys)}") - print(f" 权重文件参数: {len(checkpoint_keys)}") - print(f" 匹配参数: {len(matched_keys)}") - print(f" 缺失参数: {len(missing_keys)}") - print(f" 多余参数: {len(unexpected_keys)}") - - if missing_keys: - print(f" ❌ 缺失参数: {list(missing_keys)[:5]}...") - if unexpected_keys: - print(f" ⚠️ 多余参数: {list(unexpected_keys)[:5]}...") - - model.load_state_dict(state_dict, strict=False) - model.to(device) - model.eval() - - print("✅ 模型加载完成") - else: - raise FileNotFoundError(f"模型文件不存在: {model_path}") - - return model, tokenizer, config - -def test_inference_modes(model, tokenizer, samples, device='cuda'): - """ - 测试不同推理模式的loss差异 - """ - print("=== 3. 
测试不同推理模式 ===") - - results = {} - - for mode_name, use_cache in [("无缓存", False), ("有KV缓存", True)]: - print(f"\n--- 测试模式: {mode_name} ---") - - total_loss = 0 - valid_samples = 0 - - for i, sample in enumerate(samples[:5]): # 测试前5个样本 - text = sample['text'] - - # 确保文本长度足够 - tokens = tokenizer.encode(text, add_special_tokens=False) - if len(tokens) < 130: # 100输入 + 30预测 - continue - - input_tokens = tokens[:100] - target_tokens = tokens[100:130] # 30个预测token - - input_ids = torch.tensor([input_tokens], dtype=torch.long).to(device) - target_ids = torch.tensor([target_tokens], dtype=torch.long).to(device) - - with torch.no_grad(): - # 方法1: 直接forward计算loss(类似训练) - full_input = torch.tensor([tokens[:130]], dtype=torch.long).to(device) - outputs = model(full_input) - logits = outputs.logits - - # 计算loss - shift_logits = logits[0, 99:129, :].contiguous() # 取预测部分的logits - shift_labels = target_ids[0].contiguous() - - loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean') - - total_loss += loss.item() - valid_samples += 1 - - print(f" 样本{i+1}: loss = {loss.item():.4f}") - - avg_loss = total_loss / valid_samples if valid_samples > 0 else 0 - results[mode_name] = avg_loss - print(f" {mode_name}平均loss: {avg_loss:.4f}") - - return results - -def test_autoregressive_vs_teacher_forcing(model, tokenizer, samples, device='cuda'): - """ - 对比自回归生成vs教师强制的loss差异 - """ - print("=== 4. 对比自回归生成 vs 教师强制 ===") - - results = {} - - for i, sample in enumerate(samples[:3]): # 测试前3个样本 - text = sample['text'] - tokens = tokenizer.encode(text, add_special_tokens=False) - - if len(tokens) < 130: - continue - - input_tokens = tokens[:100] - target_tokens = tokens[100:130] - - print(f"\n--- 样本 {i+1} ---") - - # 方法1: 教师强制(类似训练时) - with torch.no_grad(): - full_input = torch.tensor([tokens[:130]], dtype=torch.long).to(device) - outputs = model(full_input) - logits = outputs.logits - - shift_logits = logits[0, 99:129, :].contiguous() - shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device) - - teacher_forcing_loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean') - print(f" 教师强制loss: {teacher_forcing_loss.item():.4f}") - - # 方法2: 自回归生成(逐步预测) - with torch.no_grad(): - current_sequence = torch.tensor([input_tokens], dtype=torch.long).to(device) - autoregressive_losses = [] - - for step in range(len(target_tokens)): - outputs = model(current_sequence) - logits = outputs.logits[0, -1, :] # 只取最后一个位置的logits - - # 计算当前步骤的loss - true_next_token = target_tokens[step] - step_loss = F.cross_entropy(logits.unsqueeze(0), - torch.tensor([true_next_token], device=device)) - autoregressive_losses.append(step_loss.item()) - - # 添加真实token到序列中(教师强制) - current_sequence = torch.cat([ - current_sequence, - torch.tensor([[true_next_token]], device=device) - ], dim=1) - - autoregressive_loss = sum(autoregressive_losses) / len(autoregressive_losses) - print(f" 自回归loss: {autoregressive_loss:.4f}") - print(f" loss差距: {abs(autoregressive_loss - teacher_forcing_loss.item()):.4f}") - - # 方法3: 真实自回归生成(使用预测token) - with torch.no_grad(): - current_sequence = torch.tensor([input_tokens], dtype=torch.long).to(device) - real_autoregressive_losses = [] - - for step in range(len(target_tokens)): - outputs = model(current_sequence) - logits = outputs.logits[0, -1, :] - - # 预测下一个token - predicted_token = torch.argmax(logits, dim=-1).item() - - # 计算与真实token的loss - true_next_token = target_tokens[step] - step_loss = F.cross_entropy(logits.unsqueeze(0), - torch.tensor([true_next_token], device=device)) - 
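                # At this point step_loss scores the ground-truth next token, but the
                # conditioning context contains the model's own greedy predictions, so
                # averaging these per-step losses isolates exposure bias: the gap between
                # teacher-forced scoring (Methods 1 and 2 above) and free-running generation.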
real_autoregressive_losses.append(step_loss.item()) - - # 使用预测的token继续生成 - current_sequence = torch.cat([ - current_sequence, - torch.tensor([[predicted_token]], device=device) - ], dim=1) - - real_autoregressive_loss = sum(real_autoregressive_losses) / len(real_autoregressive_losses) - print(f" 真实自回归loss: {real_autoregressive_loss:.4f}") - -def analyze_data_distribution(samples, tokenizer): - """ - 分析评估数据的分布特征 - """ - print("=== 5. 分析数据分布 ===") - - lengths = [] - vocab_coverage = set() - - for sample in samples: - text = sample['text'] - tokens = tokenizer.encode(text, add_special_tokens=False) - lengths.append(len(tokens)) - vocab_coverage.update(tokens) - - print(f"文本长度统计:") - print(f" 平均长度: {sum(lengths)/len(lengths):.1f} tokens") - print(f" 最短: {min(lengths)} tokens") - print(f" 最长: {max(lengths)} tokens") - print(f" 词汇覆盖: {len(vocab_coverage)} 个不同token") - print(f" 词汇覆盖率: {len(vocab_coverage)/6400*100:.1f}%") - -def compare_training_vs_inference_computation(model, tokenizer, samples, device='cuda'): - """ - 对比训练时和推理时的具体计算过程 - """ - print("=== 6. 对比训练与推理的计算过程 ===") - - sample = samples[0] - text = sample['text'] - tokens = tokenizer.encode(text, add_special_tokens=False) - - if len(tokens) < 130: - print("样本长度不足,跳过") - return - - input_tokens = tokens[:100] - target_tokens = tokens[100:130] - - print(f"测试样本长度: {len(tokens)} tokens") - print(f"输入部分: {len(input_tokens)} tokens") - print(f"目标部分: {len(target_tokens)} tokens") - - # 模拟训练时的计算 - print("\n--- 模拟训练时计算 ---") - with torch.no_grad(): - # 训练时:一次性输入完整序列 - full_sequence = torch.tensor([tokens[:130]], dtype=torch.long).to(device) - outputs = model(full_sequence) - logits = outputs.logits - - print(f"输入形状: {full_sequence.shape}") - print(f"输出logits形状: {logits.shape}") - - # 计算loss的方式和训练时一致 - shift_logits = logits[0, :-1, :].contiguous() # 去掉最后一个position - shift_labels = full_sequence[0, 1:].contiguous() # 去掉第一个position - - # 只计算预测部分的loss - predict_start = 99 # 从第100个token开始预测 - predict_logits = shift_logits[predict_start:predict_start+30, :] - predict_labels = shift_labels[predict_start:predict_start+30] - - training_loss = F.cross_entropy(predict_logits, predict_labels, reduction='mean') - print(f"训练方式loss: {training_loss.item():.4f}") - - # 模拟推理时的计算 - print("\n--- 模拟推理时计算 ---") - with torch.no_grad(): - # 推理时:分别处理输入和目标 - input_ids = torch.tensor([input_tokens], dtype=torch.long).to(device) - - # 使用和eval_model.py相同的方法 - full_input_for_loss = torch.tensor([tokens[:130]], dtype=torch.long).to(device) - outputs = model(full_input_for_loss, logits_to_keep=30) - - if outputs.logits is not None: - shift_logits = outputs.logits[0, -30:, :].contiguous() - shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device) - - inference_loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean') - print(f"推理方式loss: {inference_loss.item():.4f}") - else: - print("无法获取logits") - -def main(): - """ - 主函数:系统性分析训练与推理loss差距 - """ - print("🔍 开始分析训练与推理Loss差距") - print("="*60) - - # 设置随机种子确保结果可重现 - random.seed(42) - torch.manual_seed(42) - - device = 'cuda' if torch.cuda.is_available() else 'cpu' - model_path = 'out/experiment_1_4_0/pretrain_512.pth' - - try: - # 1. 创建来自训练数据的评估集 - eval_data_path, samples = create_eval_data_from_training_data() - - # 2. 加载模型 - model, tokenizer, config = load_model_and_tokenizer(model_path, device) - - # 3. 分析数据分布 - analyze_data_distribution(samples, tokenizer) - - # 4. 测试不同推理模式 - mode_results = test_inference_modes(model, tokenizer, samples, device) - - # 5. 
对比自回归vs教师强制 - test_autoregressive_vs_teacher_forcing(model, tokenizer, samples, device) - - # 6. 对比训练与推理的具体计算过程 - compare_training_vs_inference_computation(model, tokenizer, samples, device) - - print("\n" + "="*60) - print("🎯 分析完成") - - except Exception as e: - print(f"❌ 分析过程中出现错误: {e}") - import traceback - traceback.print_exc() - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/debug_model.py b/debug_model.py deleted file mode 100644 index 9426e2f..0000000 --- a/debug_model.py +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/env python3 -""" -调试模型生成过程 -""" - -import torch -from transformers import AutoTokenizer -from model.model_original import MiniMindLM -from model.LMConfig import LMConfig - -def debug_generation(): - # 加载模型和tokenizer - device = 'cuda' - model_path = 'out/experiment_1_4_0/pretrain_512.pth' - - # 配置 - config = LMConfig( - dim=512, - n_layers=8, - n_heads=32, - vocab_size=6400, - max_seq_len=512 - ) - - # 初始化模型 - model = MiniMindLM(config) - - # 加载权重 - state_dict = torch.load(model_path, map_location=device) - model.load_state_dict(state_dict, strict=False) - model.to(device) - model.eval() - - # 加载tokenizer - tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer') - - # 测试文本 - text = "The quick brown fox" - input_tokens = tokenizer.encode(text, add_special_tokens=False) - print(f"输入文本: {text}") - print(f"输入tokens: {input_tokens}") - print(f"解码回来: {tokenizer.decode(input_tokens)}") - - # 转为tensor - input_ids = torch.tensor([input_tokens], dtype=torch.long).to(device) - print(f"输入张量形状: {input_ids.shape}") - - # 手动生成一步 - with torch.no_grad(): - # 前向传播 - outputs = model(input_ids) - logits = outputs.logits - print(f"输出logits形状: {logits.shape}") - - # 获取最后一个位置的logits - next_token_logits = logits[0, -1, :] - print(f"下一个token的logits形状: {next_token_logits.shape}") - - # 应用温度 - next_token_logits = next_token_logits / 1.0 - - # 获取概率分布 - probs = torch.softmax(next_token_logits, dim=-1) - - # 找出top-5的token - top_probs, top_indices = torch.topk(probs, 10) - print(f"\nTop 10 候选tokens:") - for i, (prob, idx) in enumerate(zip(top_probs, top_indices)): - token_text = tokenizer.decode([idx.item()], skip_special_tokens=True) - print(f" {i+1}. 
Token {idx.item()}: '{token_text}' (prob: {prob.item():.4f})") - - # 贪婪采样 - next_token = torch.argmax(next_token_logits, dim=-1) - print(f"\n贪婪采样选择的token: {next_token.item()}") - print(f"对应文本: '{tokenizer.decode([next_token.item()], skip_special_tokens=True)}'") - - # 使用generate方法 - print(f"\n使用generate方法:") - with torch.no_grad(): - generated = model.generate( - input_ids, - max_new_tokens=5, - temperature=1.0, - top_p=0.95, - eos_token_id=tokenizer.eos_token_id, - pad_token_id=tokenizer.pad_token_id - ) - - print(f"生成的完整序列长度: {generated[0].shape}") - print(f"生成的tokens: {generated[0].tolist()}") - - # 提取新生成的部分 - if len(generated[0]) > len(input_tokens): - new_tokens = generated[0][len(input_tokens):].tolist() - print(f"新生成的tokens: {new_tokens}") - print(f"新生成的文本: '{tokenizer.decode(new_tokens, skip_special_tokens=True)}'") - else: - print("没有生成新的tokens") - -if __name__ == "__main__": - debug_generation() \ No newline at end of file diff --git a/eval_model_final_fixed.py b/eval_model_final_fixed.py deleted file mode 100644 index 85af033..0000000 --- a/eval_model_final_fixed.py +++ /dev/null @@ -1,519 +0,0 @@ -#!/usr/bin/env python3 -""" -评估预训练模型的推理效果 -用于测试不同实验中训练出来的模型在eval_data.json上的表现 -""" - -import os -import json -import argparse -import torch -import torch.nn.functional as F -from transformers import AutoTokenizer -from model.LMConfig import LMConfig - - -def load_model(model_path, model_type, device, config_params=None): - """ - 加载模型和tokenizer - - Args: - model_path: 模型权重文件路径 - model_type: 模型类型 (model/model_original/model_no_feed) - device: 运行设备 - config_params: 模型配置参数字典 - - Returns: - model: 加载好的模型 - tokenizer: tokenizer实例 - """ - # 初始化配置 - if config_params: - lm_config = LMConfig(**config_params) - else: - lm_config = LMConfig() - - # 打印配置信息 - print(f"模型配置:") - print(f" dim: {lm_config.dim}") - print(f" n_layers: {lm_config.n_layers}") - print(f" n_heads: {lm_config.n_heads}") - print(f" vocab_size: {lm_config.vocab_size}") - print(f" max_seq_len: {lm_config.max_seq_len}") - if hasattr(lm_config, 'knowledge_num'): - print(f" knowledge_num: {lm_config.knowledge_num}") - print(f" knowledge_length: {lm_config.knowledge_length}") - print(f" knowledge_dim: {lm_config.knowledge_dim}") - print() - - # 加载tokenizer - tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer') - - # 根据模型类型导入对应的模型类 - if model_type == "model": - from model.model import MiniMindLM - elif model_type == "model_original": - from model.model_original import MiniMindLM - elif model_type == "model_no_feed": - from model.model_no_feed import MiniMindLM - else: - raise ValueError(f"不支持的模型类型: {model_type}") - - # 初始化模型 - model = MiniMindLM(lm_config) - - # 加载权重 - if os.path.exists(model_path): - print(f"正在从 {model_path} 加载模型权重...") - - # 加载权重文件 - state_dict = torch.load(model_path, map_location=device) - - # 获取模型的参数名称 - model_keys = set(model.state_dict().keys()) - checkpoint_keys = set(state_dict.keys()) - - # 统计权重匹配情况 - matched_keys = model_keys & checkpoint_keys - missing_keys = model_keys - checkpoint_keys - unexpected_keys = checkpoint_keys - model_keys - - print(f"\n权重加载详情:") - print(f" 模型总参数数量: {len(model_keys)}") - print(f" 权重文件参数数量: {len(checkpoint_keys)}") - print(f" 成功匹配参数: {len(matched_keys)}") - print(f" 缺失参数: {len(missing_keys)}") - print(f" 多余参数: {len(unexpected_keys)}") - - # 详细列出缺失和多余的参数 - if missing_keys: - print(f"\n❌ 缺失的参数 ({len(missing_keys)}):") - for key in sorted(missing_keys): - print(f" - {key}") - - if unexpected_keys: - print(f"\n⚠️ 权重文件中多余的参数 ({len(unexpected_keys)}):") - for key 
in sorted(unexpected_keys): - print(f" + {key}") - - # 加载权重(允许部分匹配) - try: - incompatible_keys = model.load_state_dict(state_dict, strict=False) - - # 检查加载结果 - if len(incompatible_keys.missing_keys) == 0 and len(incompatible_keys.unexpected_keys) == 0: - print(f"\n✅ 权重加载完全成功!") - elif len(incompatible_keys.missing_keys) == 0: - print(f"\n✅ 权重加载成功(忽略多余参数)") - else: - print(f"\n⚠️ 权重加载部分成功,存在缺失参数") - print(f" 这可能影响模型性能,请检查模型配置参数是否正确") - - # 计算加载成功率 - success_rate = len(matched_keys) / len(model_keys) * 100 - print(f" 参数加载成功率: {success_rate:.1f}%") - - if success_rate < 90: - print(f" ❌ 警告:加载成功率过低,模型可能无法正常工作!") - elif success_rate < 100: - print(f" ⚠️ 警告:存在缺失参数,可能影响模型性能") - - except Exception as e: - raise RuntimeError(f"权重加载失败: {e}") - - # 验证关键层的形状 - print("🔍 验证关键层形状:") - key_layers = [ - 'tok_embeddings.weight', - 'output.weight', - 'norm.weight', - ] - - # 添加每一层的验证 - for i in range(lm_config.n_layers): - key_layers.extend([ - f'layers.{i}.attention_norm.weight', - f'layers.{i}.ffn_norm.weight', - f'layers.{i}.self_attention.wq.weight', - f'layers.{i}.self_attention.wk.weight', - f'layers.{i}.self_attention.wv.weight', - f'layers.{i}.self_attention.wo.weight', - ]) - - # FFN层的验证(model_original有FFN,其他模型可能没有) - if f'layers.{i}.feed_forward.w1.weight' in model_keys: - key_layers.extend([ - f'layers.{i}.feed_forward.w1.weight', - f'layers.{i}.feed_forward.w2.weight', - f'layers.{i}.feed_forward.w3.weight', - ]) - - # 验证KnowledgeDataset相关层(仅model和model_no_feed) - if model_type in ['model', 'model_no_feed']: - key_layers.extend([ - 'knowledge_dataset.to_queries.0.weight', - 'knowledge_dataset.keys', - 'knowledge_dataset.knowledge_dataset', - ]) - - # 添加CrossAttention层 - for i in range(lm_config.n_layers): - key_layers.extend([ - f'layers.{i}.cross_attention.to_q.weight', - f'layers.{i}.cross_attention.to_k.weight', - f'layers.{i}.cross_attention.to_v.weight', - f'layers.{i}.cross_attention.to_out.weight', - ]) - - # 检查关键层 - verified_layers = 0 - total_key_layers = 0 - - for layer_name in key_layers: - if layer_name in model_keys: # 只检查模型中实际存在的层 - total_key_layers += 1 - if layer_name in matched_keys: - verified_layers += 1 - expected_shape = model.state_dict()[layer_name].shape - actual_shape = state_dict[layer_name].shape if layer_name in state_dict else "缺失" - if layer_name in state_dict and expected_shape == actual_shape: - print(f" ✅ {layer_name}: {actual_shape}") - else: - print(f" ❌ {layer_name}: 期望 {expected_shape}, 实际 {actual_shape}") - else: - print(f" ❌ {layer_name}: 缺失") - - print(f"\n关键层验证结果: {verified_layers}/{total_key_layers} 层验证成功") - - if verified_layers == total_key_layers: - print("✅ 所有关键层验证通过!") - elif verified_layers / total_key_layers >= 0.9: - print("⚠️ 大部分关键层验证通过,模型应该可以正常工作") - else: - print("❌ 关键层验证失败过多,模型可能无法正常工作!") - - print() - else: - raise FileNotFoundError(f"模型文件不存在: {model_path}") - - model.to(device) - model.eval() - - return model, tokenizer - - -def load_eval_data(data_path, num_samples=20): - """ - 加载评估数据集 - - Args: - data_path: 数据文件路径 - num_samples: 要评估的样本数量 - - Returns: - samples: 数据样本列表 - """ - data = [] - with open(data_path, 'r', encoding='utf-8') as f: - for line_num, line in enumerate(f): - line = line.strip() - if line: # 跳过空行 - try: - sample = json.loads(line) - data.append(sample) - if len(data) >= num_samples: - break - except json.JSONDecodeError as e: - print(f"警告:第{line_num+1}行JSON解析失败: {e}") - continue - - # 只取前num_samples条数据 - samples = data[:num_samples] - print(f"加载了 {len(samples)} 条评估数据") - - return samples - - -def evaluate_sample(model, 
tokenizer, text, input_length=100, predict_length=100, device='cuda'): - """ - 评估单个样本 - - Args: - model: 模型实例 - tokenizer: tokenizer实例 - text: 输入文本 - input_length: 输入token数量 - predict_length: 预测token数量 - device: 运行设备 - - Returns: - input_text: 输入文本 - predicted_text: 预测文本 - ground_truth_text: 真实文本 - loss: 预测损失(如果可计算) - """ - # 对文本进行分词 - tokens = tokenizer.encode(text, add_special_tokens=False) - - # 确保有足够的token - if len(tokens) < input_length + predict_length: - print(f"警告:文本长度不足,只有 {len(tokens)} 个token") - return None, None, None, None - - # 分割输入和目标 - input_tokens = tokens[:input_length] - target_tokens = tokens[input_length:input_length + predict_length] - - # 转换为张量 - input_ids = torch.tensor([input_tokens], dtype=torch.long).to(device) - - # 生成预测 - with torch.no_grad(): - # 使用generate方法生成,调整参数改善生成质量 - generated = model.generate( - input_ids, - max_new_tokens=predict_length, - temperature=1.0, - top_p=0.95, - eos_token_id=tokenizer.eos_token_id, - pad_token_id=tokenizer.pad_token_id - ) - - # 提取生成的token(去掉输入部分) - # generated包含完整序列,需要从input_length位置开始提取新生成的部分 - full_generated_tokens = generated[0].tolist() - if len(full_generated_tokens) > input_length: - predicted_tokens = full_generated_tokens[input_length:] - else: - # 如果生成序列长度不够,说明没有新生成内容 - predicted_tokens = [] - - # 检查是否因EOS token提前结束生成 - eos_found = False - eos_position = -1 - actual_predicted_length = len(predicted_tokens) - - if predicted_tokens and tokenizer.eos_token_id is not None: - try: - eos_position = predicted_tokens.index(tokenizer.eos_token_id) - eos_found = True - # 只保留EOS token之前的内容 - predicted_tokens = predicted_tokens[:eos_position] - actual_predicted_length = len(predicted_tokens) - except ValueError: - # 没有找到EOS token - pass - - # 计算loss(使用forward方法) - # 准备用于loss计算的输入 - loss_input_ids = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device) - outputs = model(loss_input_ids) # 移除logits_to_keep参数 - - # 计算loss - logits = outputs.logits - loss = None - if logits is not None: - # 重塑logits和目标 - 修复:使用正确的位置切片 - # 在Transformer中,position i的logits预测position i+1的token - # 要预测position input_length到input_length+predict_length-1的token - # 需要使用position input_length-1到input_length+predict_length-2的logits - shift_logits = logits[0, input_length-1:input_length+predict_length-1, :].contiguous() - shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device) - - # 计算交叉熵损失 - loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean') - loss = loss.item() - - # 解码文本 - input_text = tokenizer.decode(input_tokens, skip_special_tokens=True) - # 只解码实际生成的token,限制在predict_length内 - actual_predicted_tokens = predicted_tokens[:predict_length] if predicted_tokens else [] - predicted_text = tokenizer.decode(actual_predicted_tokens, skip_special_tokens=True) if actual_predicted_tokens else "[未生成内容]" - ground_truth_text = tokenizer.decode(target_tokens, skip_special_tokens=True) - - # 返回额外的生成统计信息 - generation_stats = { - 'requested_length': predict_length, - 'actual_length': actual_predicted_length, - 'eos_found': eos_found, - 'eos_position': eos_position if eos_found else None, - 'truncated_by_eos': eos_found and eos_position < predict_length - } - - return input_text, predicted_text, ground_truth_text, loss, generation_stats - - -def main(): - parser = argparse.ArgumentParser(description='评估预训练模型') - parser.add_argument('--model_path', type=str, default='out/experiment_1_4_0/pretrain_512.pth', - help='模型权重文件路径') - parser.add_argument('--model_type', type=str, default='model', - choices=['model', 
'model_original', 'model_no_feed'], - help='模型类型') - parser.add_argument('--data_path', type=str, default='dataset/stable/eval_data.json', - help='评估数据集路径') - parser.add_argument('--num_samples', type=int, default=20, - help='评估样本数量') - parser.add_argument('--input_length', type=int, default=100, - help='输入token长度') - parser.add_argument('--predict_length', type=int, default=100, - help='预测token长度') - parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu', - help='运行设备') - - # 模型架构参数 - parser.add_argument('--dim', type=int, default=512, - help='模型维度') - parser.add_argument('--n_layers', type=int, default=8, - help='Transformer层数') - parser.add_argument('--n_heads', type=int, default=32, - help='注意力头数') - parser.add_argument('--n_kv_heads', type=int, default=8, - help='KV注意力头数') - parser.add_argument('--vocab_size', type=int, default=6400, - help='词汇表大小') - parser.add_argument('--max_seq_len', type=int, default=512, - help='最大序列长度') - parser.add_argument('--dropout', type=float, default=0.0, - help='Dropout率') - parser.add_argument('--norm_eps', type=float, default=1e-5, - help='层归一化epsilon') - parser.add_argument('--rope_theta', type=float, default=1e6, - help='RoPE theta参数') - - # KnowledgeDataset相关参数(仅model和model_no_feed使用) - parser.add_argument('--knowledge_num', type=int, default=1048576, - help='知识条目数量') - parser.add_argument('--knowledge_length', type=int, default=32, - help='单条知识长度') - parser.add_argument('--knowledge_dim', type=int, default=128, - help='知识维度') - - # MOE相关参数 - parser.add_argument('--use_moe', action='store_true', - help='是否使用MOE') - parser.add_argument('--num_experts_per_tok', type=int, default=2, - help='每个token激活的专家数') - parser.add_argument('--n_routed_experts', type=int, default=4, - help='路由专家数量') - - args = parser.parse_args() - - print(f"评估配置:") - print(f" 模型路径: {args.model_path}") - print(f" 模型类型: {args.model_type}") - print(f" 数据路径: {args.data_path}") - print(f" 样本数量: {args.num_samples}") - print(f" 输入长度: {args.input_length} tokens") - print(f" 预测长度: {args.predict_length} tokens") - print(f" 运行设备: {args.device}") - print() - - # 构建配置参数字典 - config_params = { - 'dim': args.dim, - 'n_layers': args.n_layers, - 'n_heads': args.n_heads, - 'n_kv_heads': args.n_kv_heads, - 'vocab_size': args.vocab_size, - 'max_seq_len': args.max_seq_len, - 'dropout': args.dropout, - 'norm_eps': args.norm_eps, - 'rope_theta': args.rope_theta, - 'use_moe': args.use_moe, - 'num_experts_per_tok': args.num_experts_per_tok, - 'n_routed_experts': args.n_routed_experts, - } - - # 只有model和model_no_feed需要KnowledgeDataset参数 - if args.model_type in ['model', 'model_no_feed']: - config_params.update({ - 'knowledge_num': args.knowledge_num, - 'knowledge_length': args.knowledge_length, - 'knowledge_dim': args.knowledge_dim, - }) - - # 加载模型 - model, tokenizer = load_model(args.model_path, args.model_type, args.device, config_params) - - # 加载数据 - samples = load_eval_data(args.data_path, args.num_samples) - - # 评估每个样本 - total_loss = 0 - valid_samples = 0 - total_requested_tokens = 0 - total_actual_tokens = 0 - samples_with_eos = 0 - samples_truncated_by_eos = 0 - - for i, sample in enumerate(samples): - print(f"\n{'='*60}") - print(f"样本 {i+1}/{len(samples)}") - print(f"{'='*60}") - - text = sample['text'] - - # 评估样本 - input_text, predicted_text, ground_truth_text, loss, generation_stats = evaluate_sample( - model, tokenizer, text, - args.input_length, args.predict_length, args.device - ) - - if input_text is None: - print("跳过该样本(文本长度不足)") - continue - - # 
打印结果 - print(f"\n输入 ({args.input_length} tokens):") - print(f" {input_text}") - print(f"\n预测输出 (请求{generation_stats['requested_length']}个token, 实际生成{generation_stats['actual_length']}个):") - print(f" {predicted_text}") - print(f"\n真实值 ({args.predict_length} tokens):") - print(f" {ground_truth_text}") - - # 打印生成统计信息 - print(f"\n生成统计:") - print(f" 请求生成: {generation_stats['requested_length']} tokens") - print(f" 实际生成: {generation_stats['actual_length']} tokens") - if generation_stats['eos_found']: - print(f" ✅ 发现EOS token在位置 {generation_stats['eos_position']}") - if generation_stats['truncated_by_eos']: - print(f" ⚠️ 因EOS token提前结束生成") - else: - print(f" ✅ EOS token出现在预期位置") - else: - print(f" ❌ 未发现EOS token (可能达到最大长度限制)") - - if loss is not None: - print(f"\nLoss: {loss:.4f}") - total_loss += loss - valid_samples += 1 - - # 更新生成统计 - total_requested_tokens += generation_stats['requested_length'] - total_actual_tokens += generation_stats['actual_length'] - if generation_stats['eos_found']: - samples_with_eos += 1 - if generation_stats['truncated_by_eos']: - samples_truncated_by_eos += 1 - - # 打印总体统计 - if valid_samples > 0: - print(f"\n{'='*60}") - print(f"总体统计:") - print(f" 有效样本数: {valid_samples}") - print(f" 平均Loss: {total_loss / valid_samples:.4f}") - print() - print(f"生成统计:") - print(f" 请求生成总tokens: {total_requested_tokens}") - print(f" 实际生成总tokens: {total_actual_tokens}") - print(f" 生成完成率: {total_actual_tokens / total_requested_tokens * 100:.1f}%" if total_requested_tokens > 0 else " 生成完成率: N/A") - print(f" 发现EOS的样本: {samples_with_eos}/{len(samples)} ({samples_with_eos/len(samples)*100:.1f}%)" if len(samples) > 0 else " 发现EOS的样本: N/A") - print(f" 被EOS截断的样本: {samples_truncated_by_eos}/{len(samples)} ({samples_truncated_by_eos/len(samples)*100:.1f}%)" if len(samples) > 0 else " 被EOS截断的样本: N/A") - print(f" 平均每样本生成长度: {total_actual_tokens/len(samples):.1f} tokens" if len(samples) > 0 else " 平均每样本生成长度: N/A") - print(f"{'='*60}") - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/eval_model_fixed.py b/eval_model_fixed.py deleted file mode 100644 index c2adfeb..0000000 --- a/eval_model_fixed.py +++ /dev/null @@ -1,516 +0,0 @@ -#!/usr/bin/env python3 -""" -评估预训练模型的推理效果 -用于测试不同实验中训练出来的模型在eval_data.json上的表现 -""" - -import os -import json -import argparse -import torch -import torch.nn.functional as F -from transformers import AutoTokenizer -from model.LMConfig import LMConfig - - -def load_model(model_path, model_type, device, config_params=None): - """ - 加载模型和tokenizer - - Args: - model_path: 模型权重文件路径 - model_type: 模型类型 (model/model_original/model_no_feed) - device: 运行设备 - config_params: 模型配置参数字典 - - Returns: - model: 加载好的模型 - tokenizer: tokenizer实例 - """ - # 初始化配置 - if config_params: - lm_config = LMConfig(**config_params) - else: - lm_config = LMConfig() - - # 打印配置信息 - print(f"模型配置:") - print(f" dim: {lm_config.dim}") - print(f" n_layers: {lm_config.n_layers}") - print(f" n_heads: {lm_config.n_heads}") - print(f" vocab_size: {lm_config.vocab_size}") - print(f" max_seq_len: {lm_config.max_seq_len}") - if hasattr(lm_config, 'knowledge_num'): - print(f" knowledge_num: {lm_config.knowledge_num}") - print(f" knowledge_length: {lm_config.knowledge_length}") - print(f" knowledge_dim: {lm_config.knowledge_dim}") - print() - - # 加载tokenizer - tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer') - - # 根据模型类型导入对应的模型类 - if model_type == "model": - from model.model import MiniMindLM - elif model_type == "model_original": - from model.model_original import 
MiniMindLM - elif model_type == "model_no_feed": - from model.model_no_feed import MiniMindLM - else: - raise ValueError(f"不支持的模型类型: {model_type}") - - # 初始化模型 - model = MiniMindLM(lm_config) - - # 加载权重 - if os.path.exists(model_path): - print(f"正在从 {model_path} 加载模型权重...") - - # 加载权重文件 - state_dict = torch.load(model_path, map_location=device) - - # 获取模型的参数名称 - model_keys = set(model.state_dict().keys()) - checkpoint_keys = set(state_dict.keys()) - - # 统计权重匹配情况 - matched_keys = model_keys & checkpoint_keys - missing_keys = model_keys - checkpoint_keys - unexpected_keys = checkpoint_keys - model_keys - - print(f"\n权重加载详情:") - print(f" 模型总参数数量: {len(model_keys)}") - print(f" 权重文件参数数量: {len(checkpoint_keys)}") - print(f" 成功匹配参数: {len(matched_keys)}") - print(f" 缺失参数: {len(missing_keys)}") - print(f" 多余参数: {len(unexpected_keys)}") - - # 详细列出缺失和多余的参数 - if missing_keys: - print(f"\n❌ 缺失的参数 ({len(missing_keys)}):") - for key in sorted(missing_keys): - print(f" - {key}") - - if unexpected_keys: - print(f"\n⚠️ 权重文件中多余的参数 ({len(unexpected_keys)}):") - for key in sorted(unexpected_keys): - print(f" + {key}") - - # 加载权重(允许部分匹配) - try: - incompatible_keys = model.load_state_dict(state_dict, strict=False) - - # 检查加载结果 - if len(incompatible_keys.missing_keys) == 0 and len(incompatible_keys.unexpected_keys) == 0: - print(f"\n✅ 权重加载完全成功!") - elif len(incompatible_keys.missing_keys) == 0: - print(f"\n✅ 权重加载成功(忽略多余参数)") - else: - print(f"\n⚠️ 权重加载部分成功,存在缺失参数") - print(f" 这可能影响模型性能,请检查模型配置参数是否正确") - - # 计算加载成功率 - success_rate = len(matched_keys) / len(model_keys) * 100 - print(f" 参数加载成功率: {success_rate:.1f}%") - - if success_rate < 90: - print(f" ❌ 警告:加载成功率过低,模型可能无法正常工作!") - elif success_rate < 100: - print(f" ⚠️ 警告:存在缺失参数,可能影响模型性能") - - except Exception as e: - raise RuntimeError(f"权重加载失败: {e}") - - # 验证关键层的形状 - print("🔍 验证关键层形状:") - key_layers = [ - 'tok_embeddings.weight', - 'output.weight', - 'norm.weight', - ] - - # 添加每一层的验证 - for i in range(lm_config.n_layers): - key_layers.extend([ - f'layers.{i}.attention_norm.weight', - f'layers.{i}.ffn_norm.weight', - f'layers.{i}.self_attention.wq.weight', - f'layers.{i}.self_attention.wk.weight', - f'layers.{i}.self_attention.wv.weight', - f'layers.{i}.self_attention.wo.weight', - ]) - - # FFN层的验证(model_original有FFN,其他模型可能没有) - if f'layers.{i}.feed_forward.w1.weight' in model_keys: - key_layers.extend([ - f'layers.{i}.feed_forward.w1.weight', - f'layers.{i}.feed_forward.w2.weight', - f'layers.{i}.feed_forward.w3.weight', - ]) - - # 验证KnowledgeDataset相关层(仅model和model_no_feed) - if model_type in ['model', 'model_no_feed']: - key_layers.extend([ - 'knowledge_dataset.to_queries.0.weight', - 'knowledge_dataset.keys', - 'knowledge_dataset.knowledge_dataset', - ]) - - # 添加CrossAttention层 - for i in range(lm_config.n_layers): - key_layers.extend([ - f'layers.{i}.cross_attention.to_q.weight', - f'layers.{i}.cross_attention.to_k.weight', - f'layers.{i}.cross_attention.to_v.weight', - f'layers.{i}.cross_attention.to_out.weight', - ]) - - # 检查关键层 - verified_layers = 0 - total_key_layers = 0 - - for layer_name in key_layers: - if layer_name in model_keys: # 只检查模型中实际存在的层 - total_key_layers += 1 - if layer_name in matched_keys: - verified_layers += 1 - expected_shape = model.state_dict()[layer_name].shape - actual_shape = state_dict[layer_name].shape if layer_name in state_dict else "缺失" - if layer_name in state_dict and expected_shape == actual_shape: - print(f" ✅ {layer_name}: {actual_shape}") - else: - print(f" ❌ {layer_name}: 期望 {expected_shape}, 实际 {actual_shape}") - else: 
- print(f" ❌ {layer_name}: 缺失") - - print(f"\n关键层验证结果: {verified_layers}/{total_key_layers} 层验证成功") - - if verified_layers == total_key_layers: - print("✅ 所有关键层验证通过!") - elif verified_layers / total_key_layers >= 0.9: - print("⚠️ 大部分关键层验证通过,模型应该可以正常工作") - else: - print("❌ 关键层验证失败过多,模型可能无法正常工作!") - - print() - else: - raise FileNotFoundError(f"模型文件不存在: {model_path}") - - model.to(device) - model.eval() - - return model, tokenizer - - -def load_eval_data(data_path, num_samples=20): - """ - 加载评估数据集 - - Args: - data_path: 数据文件路径 - num_samples: 要评估的样本数量 - - Returns: - samples: 数据样本列表 - """ - data = [] - with open(data_path, 'r', encoding='utf-8') as f: - for line_num, line in enumerate(f): - line = line.strip() - if line: # 跳过空行 - try: - sample = json.loads(line) - data.append(sample) - if len(data) >= num_samples: - break - except json.JSONDecodeError as e: - print(f"警告:第{line_num+1}行JSON解析失败: {e}") - continue - - # 只取前num_samples条数据 - samples = data[:num_samples] - print(f"加载了 {len(samples)} 条评估数据") - - return samples - - -def evaluate_sample(model, tokenizer, text, input_length=100, predict_length=100, device='cuda'): - """ - 评估单个样本 - - Args: - model: 模型实例 - tokenizer: tokenizer实例 - text: 输入文本 - input_length: 输入token数量 - predict_length: 预测token数量 - device: 运行设备 - - Returns: - input_text: 输入文本 - predicted_text: 预测文本 - ground_truth_text: 真实文本 - loss: 预测损失(如果可计算) - """ - # 对文本进行分词 - tokens = tokenizer.encode(text, add_special_tokens=False) - - # 确保有足够的token - if len(tokens) < input_length + predict_length: - print(f"警告:文本长度不足,只有 {len(tokens)} 个token") - return None, None, None, None - - # 分割输入和目标 - input_tokens = tokens[:input_length] - target_tokens = tokens[input_length:input_length + predict_length] - - # 转换为张量 - input_ids = torch.tensor([input_tokens], dtype=torch.long).to(device) - - # 生成预测 - with torch.no_grad(): - # 使用generate方法生成,调整参数改善生成质量 - generated = model.generate( - input_ids, - max_new_tokens=predict_length, - temperature=1.0, - top_p=0.95, - eos_token_id=tokenizer.eos_token_id, - pad_token_id=tokenizer.pad_token_id - ) - - # 提取生成的token(去掉输入部分) - # generated包含完整序列,需要从input_length位置开始提取新生成的部分 - full_generated_tokens = generated[0].tolist() - if len(full_generated_tokens) > input_length: - predicted_tokens = full_generated_tokens[input_length:] - else: - # 如果生成序列长度不够,说明没有新生成内容 - predicted_tokens = [] - - # 检查是否因EOS token提前结束生成 - eos_found = False - eos_position = -1 - actual_predicted_length = len(predicted_tokens) - - if predicted_tokens and tokenizer.eos_token_id is not None: - try: - eos_position = predicted_tokens.index(tokenizer.eos_token_id) - eos_found = True - # 只保留EOS token之前的内容 - predicted_tokens = predicted_tokens[:eos_position] - actual_predicted_length = len(predicted_tokens) - except ValueError: - # 没有找到EOS token - pass - - # 计算loss(使用forward方法) - # 准备用于loss计算的输入 - loss_input_ids = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device) - outputs = model(loss_input_ids) # 移除logits_to_keep参数 - - # 计算loss - logits = outputs.logits - loss = None - if logits is not None: - # 重塑logits和目标 - 修复:使用正确的位置切片 - shift_logits = logits[0, input_length:input_length + predict_length, :].contiguous() - shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device) - - # 计算交叉熵损失 - loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean') - loss = loss.item() - - # 解码文本 - input_text = tokenizer.decode(input_tokens, skip_special_tokens=True) - # 只解码实际生成的token,限制在predict_length内 - actual_predicted_tokens = predicted_tokens[:predict_length] if 
predicted_tokens else [] - predicted_text = tokenizer.decode(actual_predicted_tokens, skip_special_tokens=True) if actual_predicted_tokens else "[未生成内容]" - ground_truth_text = tokenizer.decode(target_tokens, skip_special_tokens=True) - - # 返回额外的生成统计信息 - generation_stats = { - 'requested_length': predict_length, - 'actual_length': actual_predicted_length, - 'eos_found': eos_found, - 'eos_position': eos_position if eos_found else None, - 'truncated_by_eos': eos_found and eos_position < predict_length - } - - return input_text, predicted_text, ground_truth_text, loss, generation_stats - - -def main(): - parser = argparse.ArgumentParser(description='评估预训练模型') - parser.add_argument('--model_path', type=str, default='out/experiment_1_4_0/pretrain_512.pth', - help='模型权重文件路径') - parser.add_argument('--model_type', type=str, default='model', - choices=['model', 'model_original', 'model_no_feed'], - help='模型类型') - parser.add_argument('--data_path', type=str, default='dataset/stable/eval_data.json', - help='评估数据集路径') - parser.add_argument('--num_samples', type=int, default=20, - help='评估样本数量') - parser.add_argument('--input_length', type=int, default=100, - help='输入token长度') - parser.add_argument('--predict_length', type=int, default=100, - help='预测token长度') - parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu', - help='运行设备') - - # 模型架构参数 - parser.add_argument('--dim', type=int, default=512, - help='模型维度') - parser.add_argument('--n_layers', type=int, default=8, - help='Transformer层数') - parser.add_argument('--n_heads', type=int, default=32, - help='注意力头数') - parser.add_argument('--n_kv_heads', type=int, default=8, - help='KV注意力头数') - parser.add_argument('--vocab_size', type=int, default=6400, - help='词汇表大小') - parser.add_argument('--max_seq_len', type=int, default=512, - help='最大序列长度') - parser.add_argument('--dropout', type=float, default=0.0, - help='Dropout率') - parser.add_argument('--norm_eps', type=float, default=1e-5, - help='层归一化epsilon') - parser.add_argument('--rope_theta', type=float, default=1e6, - help='RoPE theta参数') - - # KnowledgeDataset相关参数(仅model和model_no_feed使用) - parser.add_argument('--knowledge_num', type=int, default=1048576, - help='知识条目数量') - parser.add_argument('--knowledge_length', type=int, default=32, - help='单条知识长度') - parser.add_argument('--knowledge_dim', type=int, default=128, - help='知识维度') - - # MOE相关参数 - parser.add_argument('--use_moe', action='store_true', - help='是否使用MOE') - parser.add_argument('--num_experts_per_tok', type=int, default=2, - help='每个token激活的专家数') - parser.add_argument('--n_routed_experts', type=int, default=4, - help='路由专家数量') - - args = parser.parse_args() - - print(f"评估配置:") - print(f" 模型路径: {args.model_path}") - print(f" 模型类型: {args.model_type}") - print(f" 数据路径: {args.data_path}") - print(f" 样本数量: {args.num_samples}") - print(f" 输入长度: {args.input_length} tokens") - print(f" 预测长度: {args.predict_length} tokens") - print(f" 运行设备: {args.device}") - print() - - # 构建配置参数字典 - config_params = { - 'dim': args.dim, - 'n_layers': args.n_layers, - 'n_heads': args.n_heads, - 'n_kv_heads': args.n_kv_heads, - 'vocab_size': args.vocab_size, - 'max_seq_len': args.max_seq_len, - 'dropout': args.dropout, - 'norm_eps': args.norm_eps, - 'rope_theta': args.rope_theta, - 'use_moe': args.use_moe, - 'num_experts_per_tok': args.num_experts_per_tok, - 'n_routed_experts': args.n_routed_experts, - } - - # 只有model和model_no_feed需要KnowledgeDataset参数 - if args.model_type in ['model', 'model_no_feed']: - config_params.update({ - 
'knowledge_num': args.knowledge_num, - 'knowledge_length': args.knowledge_length, - 'knowledge_dim': args.knowledge_dim, - }) - - # 加载模型 - model, tokenizer = load_model(args.model_path, args.model_type, args.device, config_params) - - # 加载数据 - samples = load_eval_data(args.data_path, args.num_samples) - - # 评估每个样本 - total_loss = 0 - valid_samples = 0 - total_requested_tokens = 0 - total_actual_tokens = 0 - samples_with_eos = 0 - samples_truncated_by_eos = 0 - - for i, sample in enumerate(samples): - print(f"\n{'='*60}") - print(f"样本 {i+1}/{len(samples)}") - print(f"{'='*60}") - - text = sample['text'] - - # 评估样本 - input_text, predicted_text, ground_truth_text, loss, generation_stats = evaluate_sample( - model, tokenizer, text, - args.input_length, args.predict_length, args.device - ) - - if input_text is None: - print("跳过该样本(文本长度不足)") - continue - - # 打印结果 - print(f"\n输入 ({args.input_length} tokens):") - print(f" {input_text}") - print(f"\n预测输出 (请求{generation_stats['requested_length']}个token, 实际生成{generation_stats['actual_length']}个):") - print(f" {predicted_text}") - print(f"\n真实值 ({args.predict_length} tokens):") - print(f" {ground_truth_text}") - - # 打印生成统计信息 - print(f"\n生成统计:") - print(f" 请求生成: {generation_stats['requested_length']} tokens") - print(f" 实际生成: {generation_stats['actual_length']} tokens") - if generation_stats['eos_found']: - print(f" ✅ 发现EOS token在位置 {generation_stats['eos_position']}") - if generation_stats['truncated_by_eos']: - print(f" ⚠️ 因EOS token提前结束生成") - else: - print(f" ✅ EOS token出现在预期位置") - else: - print(f" ❌ 未发现EOS token (可能达到最大长度限制)") - - if loss is not None: - print(f"\nLoss: {loss:.4f}") - total_loss += loss - valid_samples += 1 - - # 更新生成统计 - total_requested_tokens += generation_stats['requested_length'] - total_actual_tokens += generation_stats['actual_length'] - if generation_stats['eos_found']: - samples_with_eos += 1 - if generation_stats['truncated_by_eos']: - samples_truncated_by_eos += 1 - - # 打印总体统计 - if valid_samples > 0: - print(f"\n{'='*60}") - print(f"总体统计:") - print(f" 有效样本数: {valid_samples}") - print(f" 平均Loss: {total_loss / valid_samples:.4f}") - print() - print(f"生成统计:") - print(f" 请求生成总tokens: {total_requested_tokens}") - print(f" 实际生成总tokens: {total_actual_tokens}") - print(f" 生成完成率: {total_actual_tokens / total_requested_tokens * 100:.1f}%" if total_requested_tokens > 0 else " 生成完成率: N/A") - print(f" 发现EOS的样本: {samples_with_eos}/{len(samples)} ({samples_with_eos/len(samples)*100:.1f}%)" if len(samples) > 0 else " 发现EOS的样本: N/A") - print(f" 被EOS截断的样本: {samples_truncated_by_eos}/{len(samples)} ({samples_truncated_by_eos/len(samples)*100:.1f}%)" if len(samples) > 0 else " 被EOS截断的样本: N/A") - print(f" 平均每样本生成长度: {total_actual_tokens/len(samples):.1f} tokens" if len(samples) > 0 else " 平均每样本生成长度: N/A") - print(f"{'='*60}") - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/final_fix_eval_model.py b/final_fix_eval_model.py deleted file mode 100644 index bd43be0..0000000 --- a/final_fix_eval_model.py +++ /dev/null @@ -1,218 +0,0 @@ -#!/usr/bin/env python3 -""" -最终修复eval_model.py中的位置索引错误 -""" - -import json -import torch -import torch.nn.functional as F -from transformers import AutoTokenizer -from model.LMConfig import LMConfig -from model.model_original import MiniMindLM - - -def demonstrate_correct_fix(): - """ - 演示正确的修复方法 - """ - print("🔧 演示正确的修复方法") - print("="*60) - - device = 'cuda' - model_path = 'out/experiment_1_4_0/pretrain_512.pth' - - # 加载模型 - config = LMConfig( - dim=512, n_layers=8, n_heads=32, 
vocab_size=6400, max_seq_len=512, - dropout=0.0, norm_eps=1e-5, rope_theta=1e6, use_moe=False - ) - - model = MiniMindLM(config) - tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer') - - state_dict = torch.load(model_path, map_location=device) - model.load_state_dict(state_dict, strict=False) - model.to(device) - model.eval() - - # 测试多个样本以验证修复效果 - total_loss_wrong = 0 - total_loss_correct = 0 - valid_samples = 0 - - print("测试样本的loss对比:") - print("样本 | 错误方法 | 正确方法 | 差异") - print("-" * 45) - - with open('dataset/stable/eval_data_from_train.json', 'r', encoding='utf-8') as f: - for i, line in enumerate(f): - if i >= 10: # 测试前10个样本 - break - - sample = json.loads(line.strip()) - text = sample['text'] - tokens = tokenizer.encode(text, add_special_tokens=False) - - if len(tokens) < 130: - continue - - input_length = 100 - predict_length = 30 - target_tokens = tokens[input_length:input_length + predict_length] - - with torch.no_grad(): - full_input = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device) - target_labels = torch.tensor(target_tokens, dtype=torch.long).to(device) - - # 获取完整logits - outputs = model(full_input) - logits = outputs.logits - - # 错误方法 (eval_model.py原来的方法) - wrong_slice = logits[0, -predict_length:, :].contiguous() # 取最后30个 - loss_wrong = F.cross_entropy(wrong_slice, target_labels, reduction='mean') - - # 正确方法 - correct_slice = logits[0, input_length-1:input_length+predict_length-1, :].contiguous() # 取99:129 - loss_correct = F.cross_entropy(correct_slice, target_labels, reduction='mean') - - total_loss_wrong += loss_wrong.item() - total_loss_correct += loss_correct.item() - valid_samples += 1 - - diff = loss_wrong.item() - loss_correct.item() - print(f"{i+1:2} | {loss_wrong.item():8.4f} | {loss_correct.item():8.4f} | {diff:+6.4f}") - - avg_loss_wrong = total_loss_wrong / valid_samples - avg_loss_correct = total_loss_correct / valid_samples - improvement = avg_loss_wrong - avg_loss_correct - - print("-" * 45) - print(f"平均 | {avg_loss_wrong:8.4f} | {avg_loss_correct:8.4f} | {improvement:+6.4f}") - - print(f"\n📊 修复效果:") - print(f" 错误方法平均loss: {avg_loss_wrong:.4f}") - print(f" 正确方法平均loss: {avg_loss_correct:.4f}") - print(f" 改进幅度: {improvement:.4f} ({improvement/avg_loss_wrong*100:.1f}%)") - print(f" 正确方法更接近训练时的教师强制loss (~2.4)") - - -def create_final_fixed_eval_model(): - """ - 创建最终修复版的eval_model.py - """ - print(f"\n🔧 创建最终修复版的eval_model.py") - print("="*60) - - # 读取原始eval_model.py - with open('eval_model.py', 'r', encoding='utf-8') as f: - content = f.read() - - # 修复evaluate_sample函数中的关键部分 - old_loss_calculation = ''' # 计算loss(使用forward方法) - # 准备用于loss计算的输入 - loss_input_ids = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device) - outputs = model(loss_input_ids, logits_to_keep=predict_length) - - # 计算loss - logits = outputs.logits - loss = None - if logits is not None: - # 重塑logits和目标 - shift_logits = logits[0, -predict_length:, :].contiguous() - shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device) - - # 计算交叉熵损失 - loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean') - loss = loss.item()''' - - new_loss_calculation = ''' # 计算loss(使用forward方法) - # 准备用于loss计算的输入 - loss_input_ids = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device) - outputs = model(loss_input_ids) # 移除logits_to_keep参数 - - # 计算loss - logits = outputs.logits - loss = None - if logits is not None: - # 重塑logits和目标 - 修复:使用正确的位置切片 - # 在Transformer中,position 
i的logits预测position i+1的token - # 要预测position input_length到input_length+predict_length-1的token - # 需要使用position input_length-1到input_length+predict_length-2的logits - shift_logits = logits[0, input_length-1:input_length+predict_length-1, :].contiguous() - shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device) - - # 计算交叉熵损失 - loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean') - loss = loss.item()''' - - # 替换内容 - fixed_content = content.replace(old_loss_calculation, new_loss_calculation) - - # 保存修复后的文件 - with open('eval_model_final_fixed.py', 'w', encoding='utf-8') as f: - f.write(fixed_content) - - print(f"✅ 创建了最终修复版本:eval_model_final_fixed.py") - print(f"主要修复:") - print(f" 1. 移除 logits_to_keep 参数(避免计算差异)") - print(f" 2. 使用正确的位置切片: [input_length-1:input_length+predict_length-1]") - print(f" 3. 这考虑了Transformer中position i预测position i+1的特性") - - # 直接修复原文件 - with open('eval_model.py', 'w', encoding='utf-8') as f: - f.write(fixed_content) - - print(f"✅ 同时直接修复了原文件:eval_model.py") - - -def test_final_fix(): - """ - 测试最终修复版本 - """ - print(f"\n🧪 测试最终修复版本") - print("="*60) - - import subprocess - - # 运行修复后的eval_model.py,使用较少样本快速测试 - cmd = [ - '.venv/bin/python', 'eval_model.py', - '--model_path', 'out/experiment_1_4_0/pretrain_512.pth', - '--model_type', 'model_original', - '--num_samples', '5', - '--input_length', '100', - '--predict_length', '30' - ] - - print("运行命令:") - print(" ".join(cmd)) - print("\n运行结果:") - - try: - result = subprocess.run(cmd, capture_output=True, text=True, timeout=120) - - # 提取关键信息 - output_lines = result.stdout.split('\n') - for line in output_lines: - if 'Loss:' in line or '平均Loss:' in line or '总体统计:' in line or '有效样本数:' in line: - print(line) - - if result.returncode == 0: - print("\n✅ 修复后的eval_model.py运行成功!") - else: - print(f"\n❌ 运行失败,错误码: {result.returncode}") - if result.stderr: - print("错误信息:") - print(result.stderr[:500]) - - except subprocess.TimeoutExpired: - print("❌ 运行超时") - except Exception as e: - print(f"❌ 运行出错: {e}") - - -if __name__ == "__main__": - demonstrate_correct_fix() - create_final_fixed_eval_model() - test_final_fix() \ No newline at end of file diff --git a/fix_logits_to_keep_issue.py b/fix_logits_to_keep_issue.py deleted file mode 100644 index 3b1d2e6..0000000 --- a/fix_logits_to_keep_issue.py +++ /dev/null @@ -1,247 +0,0 @@ -#!/usr/bin/env python3 -""" -修复logits_to_keep参数导致的loss计算错误 -验证问题并提供解决方案 -""" - -import json -import torch -import torch.nn.functional as F -from transformers import AutoTokenizer -from model.LMConfig import LMConfig -from model.model_original import MiniMindLM - - -def demonstrate_logits_to_keep_issue(): - """ - 演示logits_to_keep参数导致的问题 - """ - print("🔍 验证logits_to_keep参数问题") - print("="*60) - - device = 'cuda' - model_path = 'out/experiment_1_4_0/pretrain_512.pth' - - # 加载模型 - config = LMConfig( - dim=512, n_layers=8, n_heads=32, vocab_size=6400, max_seq_len=512, - dropout=0.0, norm_eps=1e-5, rope_theta=1e6, use_moe=False - ) - - model = MiniMindLM(config) - tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer') - - state_dict = torch.load(model_path, map_location=device) - model.load_state_dict(state_dict, strict=False) - model.to(device) - model.eval() - - # 加载测试数据 - with open('dataset/stable/eval_data_from_train.json', 'r', encoding='utf-8') as f: - sample = json.loads(f.readline().strip()) - - text = sample['text'] - tokens = tokenizer.encode(text, add_special_tokens=False) - - input_tokens = tokens[:100] - target_tokens = tokens[100:130] # 30个目标token - - print(f"测试样本: 
diff --git a/fix_logits_to_keep_issue.py b/fix_logits_to_keep_issue.py
deleted file mode 100644
index 3b1d2e6..0000000
--- a/fix_logits_to_keep_issue.py
+++ /dev/null
@@ -1,247 +0,0 @@
-#!/usr/bin/env python3
-"""
-Fix the loss-computation error caused by the logits_to_keep argument.
-Verifies the problem and provides a solution.
-"""
-
-import json
-import torch
-import torch.nn.functional as F
-from transformers import AutoTokenizer
-from model.LMConfig import LMConfig
-from model.model_original import MiniMindLM
-
-
-def demonstrate_logits_to_keep_issue():
-    """
-    Demonstrate the problem caused by the logits_to_keep argument
-    """
-    print("🔍 Verifying the logits_to_keep problem")
-    print("="*60)
-
-    device = 'cuda'
-    model_path = 'out/experiment_1_4_0/pretrain_512.pth'
-
-    # Load the model
-    config = LMConfig(
-        dim=512, n_layers=8, n_heads=32, vocab_size=6400, max_seq_len=512,
-        dropout=0.0, norm_eps=1e-5, rope_theta=1e6, use_moe=False
-    )
-
-    model = MiniMindLM(config)
-    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
-
-    state_dict = torch.load(model_path, map_location=device)
-    model.load_state_dict(state_dict, strict=False)
-    model.to(device)
-    model.eval()
-
-    # Load the test data
-    with open('dataset/stable/eval_data_from_train.json', 'r', encoding='utf-8') as f:
-        sample = json.loads(f.readline().strip())
-
-    text = sample['text']
-    tokens = tokenizer.encode(text, add_special_tokens=False)
-
-    input_tokens = tokens[:100]
-    target_tokens = tokens[100:130]  # 30 target tokens
-
-    print(f"Test sample: {len(tokens)} tokens")
-    print(f"Input: {len(input_tokens)} tokens")
-    print(f"Target: {len(target_tokens)} tokens")
-
-    with torch.no_grad():
-        full_input = torch.tensor([tokens[:130]], dtype=torch.long).to(device)
-        target_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)
-
-        print(f"\n🔬 Detailed comparison of the methods:")
-
-        # Method 1: plain forward (the correct approach)
-        outputs1 = model(full_input)
-        logits1 = outputs1.logits
-        correct_logits = logits1[0, 99:129, :].contiguous()  # positions 99-128
-        loss1 = F.cross_entropy(correct_logits, target_labels, reduction='mean')
-
-        print(f"1. Plain forward (correct):")
-        print(f"   Full logits shape: {logits1.shape}")
-        print(f"   Logits used for the loss: {correct_logits.shape}")
-        print(f"   Loss: {loss1.item():.4f}")
-
-        # Method 2: logits_to_keep=30 (the incorrect approach)
-        outputs2 = model(full_input, logits_to_keep=30)
-        logits2 = outputs2.logits
-        incorrect_logits = logits2[0, -30:, :].contiguous()  # last 30 positions
-        loss2 = F.cross_entropy(incorrect_logits, target_labels, reduction='mean')
-
-        print(f"\n2. logits_to_keep=30 (eval_model.py's approach):")
-        print(f"   Partial logits shape: {logits2.shape}")
-        print(f"   Logits used for the loss: {incorrect_logits.shape}")
-        print(f"   Loss: {loss2.item():.4f}")
-
-        # Method 3: the fixed approach (no logits_to_keep)
-        # Identical to method 1; listed separately to make the fix explicit
-        print(f"\n3. Fixed method (without logits_to_keep):")
-        print(f"   Full forward pass, then select the correct logits slice")
-        print(f"   Same as method 1, Loss: {loss1.item():.4f}")
-
-        # Analyze the gap
-        print(f"\n📊 Numerical analysis:")
-        print(f"   Loss difference: {abs(loss2.item() - loss1.item()):.4f}")
-        print(f"   Loss increase: {(loss2.item() / loss1.item() - 1) * 100:.1f}%")
-
-        # How a tiny logits difference gets amplified
-        logits_diff = torch.abs(correct_logits - incorrect_logits).max()
-        print(f"   Max logits difference: {logits_diff.item():.8f}")
-
-        # Difference in the softmax probabilities
-        prob1 = F.softmax(correct_logits, dim=-1)
-        prob2 = F.softmax(incorrect_logits, dim=-1)
-        prob_diff = torch.abs(prob1 - prob2).max()
-        print(f"   Max probability difference: {prob_diff.item():.8f}")
-
-        print(f"\n💡 Conclusion:")
-        print(f"   Although the logits difference is tiny ({logits_diff.item():.8f}),")
-        print(f"   cross-entropy amplifies it, increasing the loss by {(loss2.item() / loss1.item() - 1) * 100:.1f}%")
-
-
-def create_fixed_eval_model():
-    """
-    Create the fixed eval_model.py
-    """
-    print(f"\n🔧 Creating the fixed evaluation script")
-    print("="*60)
-
-    # Read the original eval_model.py
-    with open('eval_model.py', 'r', encoding='utf-8') as f:
-        content = f.read()
-
-    # Key fix: drop logits_to_keep and slice at the right positions. The first
-    # (old) block must match eval_model.py byte-for-byte, so it stays verbatim.
-    fixed_content = content.replace(
-        """    # 计算loss(使用forward方法)
-    # 准备用于loss计算的输入
-    loss_input_ids = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
-    outputs = model(loss_input_ids, logits_to_keep=predict_length)
-
-    # 计算loss
-    logits = outputs.logits
-    loss = None
-    if logits is not None:
-        # 重塑logits和目标
-        shift_logits = logits[0, -predict_length:, :].contiguous()
-        shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)
-
-        # 计算交叉熵损失
-        loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
-        loss = loss.item()""",
-        """    # Compute the loss via a plain forward pass
-    loss_input_ids = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
-    outputs = model(loss_input_ids)  # logits_to_keep removed
-
-    # Compute the loss
-    logits = outputs.logits
-    loss = None
-    if logits is not None:
-        # Fix: position i's logits predict token i+1, so shift the slice by one
-        shift_logits = logits[0, input_length-1:input_length+predict_length-1, :].contiguous()
-        shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)
-
-        # Cross-entropy loss
-        loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
-        loss = loss.item()"""
-    )
-
-    # Save the fixed file
-    with open('eval_model_fixed.py', 'w', encoding='utf-8') as f:
-        f.write(fixed_content)
-
-    print(f"✅ Created the fixed version: eval_model_fixed.py")
-    print(f"Main fixes:")
-    print(f"  1. Removed the logits_to_keep argument")
-    print(f"  2. Used the correct position slice: [input_length-1:input_length+predict_length-1]")
-    print(f"  3. Instead of the incorrect [-predict_length:]")
-
-
-def test_fixed_evaluation():
-    """
-    Test the fixed evaluation method
-    """
-    print(f"\n🧪 Testing the fixed evaluation method")
-    print("="*60)
-
-    device = 'cuda'
-    model_path = 'out/experiment_1_4_0/pretrain_512.pth'
-
-    # Load the model
-    config = LMConfig(
-        dim=512, n_layers=8, n_heads=32, vocab_size=6400, max_seq_len=512,
-        dropout=0.0, norm_eps=1e-5, rope_theta=1e6, use_moe=False
-    )
-
-    model = MiniMindLM(config)
-    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
-
-    state_dict = torch.load(model_path, map_location=device)
-    model.load_state_dict(state_dict, strict=False)
-    model.to(device)
-    model.eval()
-
-    # Evaluate several samples
-    total_loss_old = 0
-    total_loss_fixed = 0
-    valid_samples = 0
-
-    with open('dataset/stable/eval_data_from_train.json', 'r', encoding='utf-8') as f:
-        for i, line in enumerate(f):
-            if i >= 10:  # only the first 10 samples
-                break
-
-            sample = json.loads(line.strip())
-            text = sample['text']
-            tokens = tokenizer.encode(text, add_special_tokens=False)
-
-            if len(tokens) < 130:
-                continue
-
-            input_length = 100
-            predict_length = 30
-            input_tokens = tokens[:input_length]
-            target_tokens = tokens[input_length:input_length + predict_length]
-
-            with torch.no_grad():
-                full_input = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
-                target_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)
-
-                # Original (incorrect) method
-                outputs_old = model(full_input, logits_to_keep=predict_length)
-                logits_old = outputs_old.logits
-                shift_logits_old = logits_old[0, -predict_length:, :].contiguous()
-                loss_old = F.cross_entropy(shift_logits_old, target_labels, reduction='mean')
-
-                # Fixed method
-                outputs_fixed = model(full_input)
-                logits_fixed = outputs_fixed.logits
-                shift_logits_fixed = logits_fixed[0, input_length-1:input_length+predict_length-1, :].contiguous()
-                loss_fixed = F.cross_entropy(shift_logits_fixed, target_labels, reduction='mean')
-
-                total_loss_old += loss_old.item()
-                total_loss_fixed += loss_fixed.item()
-                valid_samples += 1
-
-                print(f"Sample {i+1}: original {loss_old.item():.4f} -> fixed {loss_fixed.item():.4f}")
-
-    avg_loss_old = total_loss_old / valid_samples
-    avg_loss_fixed = total_loss_fixed / valid_samples
-
-    print(f"\n📊 Summary:")
-    print(f"  Samples tested: {valid_samples}")
-    print(f"  Mean loss, original method: {avg_loss_old:.4f}")
-    print(f"  Mean loss, fixed method: {avg_loss_fixed:.4f}")
-    print(f"  Difference: {abs(avg_loss_old - avg_loss_fixed):.4f}")
-    print(f"  The fixed loss is much closer to the teacher-forcing loss seen in training (~2.4)")
-
-
-if __name__ == "__main__":
-    demonstrate_logits_to_keep_issue()
-    create_fixed_eval_model()
-    test_fixed_evaluation()
\ No newline at end of file
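
The corrected loss computation that these scripts converge on can be packaged as a small reusable helper. A minimal sketch, assuming a MiniMindLM-style model whose output object exposes a `.logits` tensor of shape `(batch, seq_len, vocab)`; the function name and signature are illustrative, not taken from the deleted files:

```python
# Hedged reference implementation of the corrected teacher-forcing loss.
import torch
import torch.nn.functional as F

def teacher_forcing_loss(model, tokens, input_length, predict_length, device='cuda'):
    """Cross-entropy over the predicted window, with the off-by-one shift applied."""
    seq = torch.tensor([tokens[:input_length + predict_length]],
                       dtype=torch.long, device=device)
    targets = torch.tensor(tokens[input_length:input_length + predict_length],
                           dtype=torch.long, device=device)
    with torch.no_grad():
        logits = model(seq).logits                       # (1, T, vocab)
    # Logits at position i predict token i+1, hence the -1 shift on both ends
    window = logits[0, input_length - 1:input_length + predict_length - 1, :]
    return F.cross_entropy(window, targets, reduction='mean').item()
```
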
diff --git a/investigate_logits_to_keep.py b/investigate_logits_to_keep.py
deleted file mode 100644
index 24ada6c..0000000
--- a/investigate_logits_to_keep.py
+++ /dev/null
@@ -1,211 +0,0 @@
-#!/usr/bin/env python3
-"""
-Investigate in depth how the logits_to_keep argument affects the loss computation
-"""
-
-import json
-import torch
-import torch.nn.functional as F
-from transformers import AutoTokenizer
-from model.LMConfig import LMConfig
-from model.model_original import MiniMindLM
-
-
-def investigate_logits_to_keep_issue():
-    """
-    Investigate the effect of the logits_to_keep argument
-    """
-    print("🔍 Investigating the effect of logits_to_keep")
-    print("="*60)
-
-    device = 'cuda'
-    model_path = 'out/experiment_1_4_0/pretrain_512.pth'
-
-    # Load the model
-    config = LMConfig(
-        dim=512, n_layers=8, n_heads=32, vocab_size=6400, max_seq_len=512,
-        dropout=0.0, norm_eps=1e-5, rope_theta=1e6, use_moe=False
-    )
-
-    model = MiniMindLM(config)
-    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
-
-    state_dict = torch.load(model_path, map_location=device)
-    model.load_state_dict(state_dict, strict=False)
-    model.to(device)
-    model.eval()
-
-    # Load the test data
-    with open('dataset/stable/eval_data_from_train.json', 'r', encoding='utf-8') as f:
-        sample = json.loads(f.readline().strip())
-
-    text = sample['text']
-    tokens = tokenizer.encode(text, add_special_tokens=False)
-
-    input_tokens = tokens[:100]
-    target_tokens = tokens[100:130]  # 30 target tokens
-
-    print(f"Test text length: {len(tokens)} tokens")
-    print(f"Input: {len(input_tokens)} tokens")
-    print(f"Target: {len(target_tokens)} tokens")
-
-    with torch.no_grad():
-        # Method 1: plain forward (as in training)
-        full_input = torch.tensor([tokens[:130]], dtype=torch.long).to(device)
-        outputs1 = model(full_input)
-        logits1 = outputs1.logits
-
-        # Loss computed the way training does
-        shift_logits1 = logits1[0, 99:129, :].contiguous()
-        shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)
-        loss1 = F.cross_entropy(shift_logits1, shift_labels, reduction='mean')
-
-        print(f"\nMethod 1 (plain forward):")
-        print(f"  Logits shape: {logits1.shape}")
-        print(f"  Logits used for the loss: {shift_logits1.shape}")
-        print(f"  Loss: {loss1.item():.4f}")
-
-        # Method 2: logits_to_keep=30 (what eval_model.py does)
-        outputs2 = model(full_input, logits_to_keep=30)
-        logits2 = outputs2.logits
-
-        if logits2 is not None:
-            print(f"\nMethod 2 (logits_to_keep=30):")
-            print(f"  Logits shape: {logits2.shape}")
-
-            # Compute the loss the way eval_model.py does
-            shift_logits2 = logits2[0, -30:, :].contiguous()
-            loss2 = F.cross_entropy(shift_logits2, shift_labels, reduction='mean')
-            print(f"  Logits used for the loss: {shift_logits2.shape}")
-            print(f"  Loss: {loss2.item():.4f}")
-
-            # Are the logits actually identical?
-            expected_logits = logits1[0, 100:130, :]  # positions 100-129
-            actual_logits = logits2[0, -30:, :]  # last 30 positions
-
-            print(f"\nElement-wise comparison:")
-            print(f"  Expected logits shape: {expected_logits.shape}")
-            print(f"  Actual logits shape: {actual_logits.shape}")
-
-            # Equality check
-            are_equal = torch.allclose(expected_logits, actual_logits, rtol=1e-4)
-            print(f"  Logits equal: {are_equal}")
-
-            if not are_equal:
-                diff = torch.abs(expected_logits - actual_logits).max()
-                print(f"  Max difference: {diff.item():.6f}")
-
-                # Differences at the first few positions
-                for i in range(min(5, expected_logits.shape[0])):
-                    pos_diff = torch.abs(expected_logits[i] - actual_logits[i]).max()
-                    print(f"  Position {i} max difference: {pos_diff.item():.6f}")
-        else:
-            print("\nMethod 2: logits is None")
-
-        # Method 3: various logits_to_keep values
-        print(f"\nTesting different logits_to_keep values:")
-        for keep_value in [10, 20, 30, 50, 100]:
-            outputs_test = model(full_input, logits_to_keep=keep_value)
-            if outputs_test.logits is not None:
-                test_logits_shape = outputs_test.logits.shape
-                print(f"  logits_to_keep={keep_value}: {test_logits_shape}")
-            else:
-                print(f"  logits_to_keep={keep_value}: None")
-
-
-def check_model_forward_implementation():
-    """Inspect how logits_to_keep is implemented in the model's forward method"""
-    print("\n" + "="*60)
-    print("🔍 Inspecting the model's forward implementation")
-
-    # Scan the model source for logits_to_keep
-    try:
-        with open('model/model_original.py', 'r', encoding='utf-8') as f:
-            content = f.read()
-
-        # Find the lines that mention logits_to_keep
-        lines = content.split('\n')
-        for i, line in enumerate(lines):
-            if 'logits_to_keep' in line:
-                print(f"Line {i+1}: {line.strip()}")
-                # Print a couple of lines of context around each hit
-                for j in range(max(0, i-2), min(len(lines), i+3)):
-                    if j != i:
-                        print(f"Line {j+1}: {lines[j].strip()}")
-                print()
-    except FileNotFoundError:
-        print("Could not read model/model_original.py")
-
-
-def compare_with_original_eval_script():
-    """
-    Reproduce the behaviour of the original eval_model.py
-    """
-    print("\n" + "="*60)
-    print("🔍 Comparing against the original eval_model.py behaviour")
-
-    device = 'cuda'
-    model_path = 'out/experiment_1_4_0/pretrain_512.pth'
-
-    # Replicate the relevant logic from eval_model.py
-    config = LMConfig(
-        dim=512, n_layers=8, n_heads=32, vocab_size=6400, max_seq_len=512,
-        dropout=0.0, norm_eps=1e-5, rope_theta=1e6, use_moe=False
-    )
-
-    model = MiniMindLM(config)
-    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
-
-    state_dict = torch.load(model_path, map_location=device)
-    model.load_state_dict(state_dict, strict=False)
-    model.to(device)
-    model.eval()
-
-    # Load the data
-    with open('dataset/stable/eval_data_from_train.json', 'r', encoding='utf-8') as f:
-        sample = json.loads(f.readline().strip())
-
-    text = sample['text']
-    tokens = tokenizer.encode(text, add_special_tokens=False)
-
-    input_length = 100
-    predict_length = 30
-
-    input_tokens = tokens[:input_length]
-    target_tokens = tokens[input_length:input_length + predict_length]
-
-    print(f"Reproducing eval_model.py's computation:")
-    print(f"  input_length: {input_length}")
-    print(f"  predict_length: {predict_length}")
-
-    with torch.no_grad():
-        # Exactly as eval_model.py does it
-        loss_input_ids = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
-        outputs = model(loss_input_ids, logits_to_keep=predict_length)
-
-        print(f"  loss_input_ids shape: {loss_input_ids.shape}")
-        print(f"  logits_to_keep argument: {predict_length}")
-
-        logits = outputs.logits
-        loss = None
-        if logits is not None:
-            print(f"  Output logits shape: {logits.shape}")
-
-            # Reshape logits and targets
-            shift_logits = logits[0, -predict_length:, :].contiguous()
-            shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)
-
-            print(f"  shift_logits shape: {shift_logits.shape}")
-            print(f"  shift_labels shape: {shift_labels.shape}")
-
-            # Cross-entropy loss
-            loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
-            print(f"  Computed loss: {loss.item():.4f}")
-        else:
-            print("  logits is None")
-
-
-if __name__ == "__main__":
-    investigate_logits_to_keep_issue()
-    check_model_forward_implementation()
-    compare_with_original_eval_script()
\ No newline at end of file
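
The consistency check that `investigate_logits_to_keep_issue` performs by hand can also be expressed as a reusable probe. A minimal sketch, assuming the model accepts a `logits_to_keep` keyword and returns an object with a `.logits` field; the tolerances and function name are illustrative:

```python
# If the model's logits_to_keep path is numerically consistent, the last k
# positions of a full forward pass should match the k positions returned by
# the truncated call, up to floating-point noise.
import torch

def check_logits_to_keep(model, input_ids, k, rtol=1e-5, atol=1e-6):
    with torch.no_grad():
        full = model(input_ids).logits                      # (1, T, vocab)
        kept = model(input_ids, logits_to_keep=k).logits    # (1, k, vocab)
    same = torch.allclose(full[:, -k:, :], kept, rtol=rtol, atol=atol)
    max_diff = (full[:, -k:, :] - kept).abs().max().item()
    return same, max_diff
```
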
-3. **Key insight**: in a Transformer, the logits at position i predict the token at position i+1
-
----
-
-## 🛠️ The Fix
-
-### Core change
-**File**: `eval_model.py`
-
-**Before**:
-```python
-outputs = model(loss_input_ids, logits_to_keep=predict_length)
-shift_logits = logits[0, -predict_length:, :].contiguous()
-```
-
-**After**:
-```python
-outputs = model(loss_input_ids)  # logits_to_keep removed
-shift_logits = logits[0, input_length-1:input_length+predict_length-1, :].contiguous()
-```
-
-### Why it works
-1. **Removing logits_to_keep**: avoids the computation discrepancy
-2. **Correct position slice**: accounts for the one-position offset
-3. **Consistency**: aligns with the teacher-forcing computation used in training
-
----
-
-## 📊 Verification
-
-### Per-sample comparison
-```
-Sample | Wrong method | Correct method | Improvement
--------|--------------|----------------|------------
-1      | 9.88         | 3.42           | 65.3%
-2      | 13.56        | 1.50           | 88.9%
-3      | 13.62        | 1.78           | 86.9%
-...
-Mean   | 12.34        | 2.73           | 77.9%
-```
-
-### Final check
-**10-sample evaluation after the fix**:
-- Mean loss: 2.26
-- Difference from the training loss (2.43): only 0.17 (7%)
-- Improvement: 81.7% (from 12.34 down to 2.26)
-
----
-
-## 🎯 Key Findings
-
-### Main problems
-1. **Position-indexing error in eval_model.py**: the root cause of the grossly overestimated loss
-2. **Misuse of logits_to_keep**: changed how the model computes its outputs
-3. **Ignored position offset**: the shift-by-one property of Transformers was overlooked
-
-### Technical insights
-1. **Transformer position property**: position i's logits predict position i+1
-2. **Amplification of small differences**: even tiny logits differences are sharply amplified by cross-entropy
-3. **Importance of the evaluation harness**: a broken evaluation misleads the entire research direction
-
-### Outcome of the fix
-1. **Training/inference consistency**: ✅ excellent (difference < 10%)
-2. **Evaluation reliability**: ✅ greatly improved after the fix
-3. **Technical foundation**: ✅ a reliable baseline for subsequent experiments
-
----
-
-## 🔮 Follow-on Impact
-
-### Immediate
-- **Corrected results for experiment 1.4.0**: inference loss revised from 12.34 to 2.26
-- **Re-assessment of model quality**: the model_original baseline performs well
-- **Tooling reliability**: the fixed eval_model.py can be used in later experiments
-
-### Longer term
-- **Research direction**: confirms the current training approach is effective
-- **Engineering practice**: establishes a correct standard for model evaluation
-- **Project confidence**: a solid foundation for the KnowledgeDataset research
-
----
-
-## 📝 Lessons Learned
-
-### Technical
-1. **Systematic debugging pays off**: eliminating hypotheses one by one leads to the root cause
-2. **Position-indexing details matter**: a key technical point in Transformer evaluation
-3. **Verify the tools**: evaluation tooling must itself be validated before use
-
-### Methodological
-1. **Analyze from multiple angles**: data, model, and computation
-2. **Controlled comparisons**: contrasting methods isolates the source of a difference
-3. **Understand the underlying principle**: deeper than a surface-level patch
-
-### Quality control
-1. **Validate evaluation tools before relying on them**
-2. **Check training/inference consistency**: it is an important health indicator
-3. **Document the process**: record the discovery and the fix in detail
-
----
-
-## ✅ Conclusion
-
-**Problem**: ✅ fully resolved
-**Root cause**: the position-indexing error in eval_model.py
-**Effect of the fix**: inference loss dropped from 12.34 to 2.26, an 81.7% improvement
-**Impact**: strongly positive; establishes a reliable foundation for the project
-
-**Final state**: the training loss (2.43) and inference loss (2.26) are highly consistent, indicating that training succeeded and the evaluation pipeline is reliable.
-
----
-
-**Report completed**: 2025-07-31
-**Verification**: ✅ independently validated on 10 samples
-**Applied**: ✅ incorporated into the experiment 1.4.0 analysis update
\ No newline at end of file
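
As a closing reference, the ~2.43 training loss cited throughout the report corresponds to a full-sequence teacher-forcing objective. A generic sketch of that computation (an assumption about the training loop, not code from this repository) makes the one-position shift explicit:

```python
# Standard full-sequence teacher-forcing loss: logits at positions 0..T-2
# are scored against the tokens at positions 1..T-1.
import torch
import torch.nn.functional as F

def full_sequence_loss(logits: torch.Tensor, input_ids: torch.Tensor) -> torch.Tensor:
    # logits: (batch, T, vocab); input_ids: (batch, T)
    shift_logits = logits[:, :-1, :].contiguous()   # positions 0..T-2
    shift_labels = input_ids[:, 1:].contiguous()    # tokens    1..T-1
    return F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)),
                           shift_labels.view(-1), reduction='mean')
```

The windowed evaluation loss fixed above is simply this objective restricted to the last `predict_length` positions, which is why the two figures agree once the slice is correct.
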