#!/usr/bin/env python3
"""
In-depth investigation of how the logits_to_keep argument affects loss computation.
"""

import json

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer

from model.LMConfig import LMConfig
from model.model_original import MiniMindLM


def investigate_logits_to_keep_issue():
    """Investigate the effect of the logits_to_keep argument."""
    print("🔍 Investigating the effect of logits_to_keep")
    print("=" * 60)

    device = 'cuda'
    model_path = 'out/experiment_1_4_0/pretrain_512.pth'

    # Load the model.
    config = LMConfig(
        dim=512,
        n_layers=8,
        n_heads=32,
        vocab_size=6400,
        max_seq_len=512,
        dropout=0.0,
        norm_eps=1e-5,
        rope_theta=1e6,
        use_moe=False
    )

    model = MiniMindLM(config)
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict, strict=False)
    model.to(device)
    model.eval()

    # Load one test sample.
    with open('dataset/stable/eval_data_from_train.json', 'r', encoding='utf-8') as f:
        sample = json.loads(f.readline().strip())

    text = sample['text']
    tokens = tokenizer.encode(text, add_special_tokens=False)
    input_tokens = tokens[:100]
    target_tokens = tokens[100:130]  # 30 target tokens

    print(f"Test text length: {len(tokens)} tokens")
    print(f"Input: {len(input_tokens)} tokens")
    print(f"Target: {len(target_tokens)} tokens")

    with torch.no_grad():
        # Method 1: standard forward pass (as during training).
        full_input = torch.tensor([tokens[:130]], dtype=torch.long).to(device)
        outputs1 = model(full_input)
        logits1 = outputs1.logits

        # Compute the loss the way training does: logits at positions
        # 99..128 predict tokens 100..129.
        shift_logits1 = logits1[0, 99:129, :].contiguous()
        shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)
        loss1 = F.cross_entropy(shift_logits1, shift_labels, reduction='mean')

        print(f"\nMethod 1 (standard forward):")
        print(f"  logits shape: {logits1.shape}")
        print(f"  logits used for the loss: {shift_logits1.shape}")
        print(f"  Loss: {loss1.item():.4f}")

        # Method 2: logits_to_keep=30 (the way eval_model.py does it).
        outputs2 = model(full_input, logits_to_keep=30)
        logits2 = outputs2.logits

        if logits2 is not None:
            print(f"\nMethod 2 (logits_to_keep=30):")
            print(f"  logits shape: {logits2.shape}")

            # Compute the loss exactly as eval_model.py does. Note: the last
            # 30 logits cover positions 100..129, which predict tokens
            # 101..130, so pairing them with tokens 100..129 is shifted by
            # one position relative to method 1.
            shift_logits2 = logits2[0, -30:, :].contiguous()
            loss2 = F.cross_entropy(shift_logits2, shift_labels, reduction='mean')

            print(f"  logits used for the loss: {shift_logits2.shape}")
            print(f"  Loss: {loss2.item():.4f}")

            # Check whether the kept logits match the tail of the full logits.
            expected_logits = logits1[0, 100:130, :]  # positions 100..129
            actual_logits = logits2[0, -30:, :]       # last 30 positions

            print(f"\nElement-wise comparison:")
            print(f"  Expected logits shape: {expected_logits.shape}")
            print(f"  Actual logits shape: {actual_logits.shape}")

            are_equal = torch.allclose(expected_logits, actual_logits, rtol=1e-4)
            print(f"  Logits equal: {are_equal}")

            if not are_equal:
                diff = torch.abs(expected_logits - actual_logits).max()
                print(f"  Max difference: {diff.item():.6f}")

                # Inspect the difference at the first few positions.
                for i in range(min(5, expected_logits.shape[0])):
                    pos_diff = torch.abs(expected_logits[i] - actual_logits[i]).max()
                    print(f"  Position {i} max difference: {pos_diff.item():.6f}")
        else:
            print("\nMethod 2: logits is None")

        # Method 3: sweep over different logits_to_keep values.
        print(f"\nTesting different logits_to_keep values:")
        for keep_value in [10, 20, 30, 50, 100]:
            outputs_test = model(full_input, logits_to_keep=keep_value)
            if outputs_test.logits is not None:
                print(f"  logits_to_keep={keep_value}: {outputs_test.logits.shape}")
            else:
                print(f"  logits_to_keep={keep_value}: None")

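
# --- Reference sketch (an assumption, not taken from model_original.py) ---
# Many implementations, including the HuggingFace logits_to_keep convention,
# apply the LM head only to the last k hidden states. The self-contained demo
# below illustrates that convention with a toy linear head, so the shapes and
# positions in the comparisons above are easy to verify; whether
# model_original.py follows the same convention is exactly what
# check_model_forward_implementation() below prints.
def _sketch_logits_to_keep_semantics():
    """Self-contained demo of the 'keep the last k positions' convention."""
    batch, seq_len, dim, vocab = 1, 8, 16, 32
    hidden = torch.randn(batch, seq_len, dim)          # stand-in hidden states
    lm_head = torch.nn.Linear(dim, vocab, bias=False)  # stand-in output head

    logits_to_keep = 3
    # An int k is usually interpreted as "keep the last k positions".
    indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
    kept_logits = lm_head(hidden[:, indices, :])

    # Under this convention, the kept logits equal the tail of the full logits.
    full_logits = lm_head(hidden)
    assert torch.allclose(kept_logits, full_logits[:, -logits_to_keep:, :])
    print(f"sketch: kept logits {kept_logits.shape} == tail of {full_logits.shape}")
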

def check_model_forward_implementation():
    """Inspect how logits_to_keep is implemented in the model's forward method."""
    print("\n" + "=" * 60)
    print("🔍 Inspecting the model's forward implementation")

    # Scan the model source for code that mentions logits_to_keep.
    try:
        with open('model/model_original.py', 'r', encoding='utf-8') as f:
            content = f.read()

        lines = content.split('\n')
        for i, line in enumerate(lines):
            if 'logits_to_keep' in line:
                # Print the match with two lines of context on each side.
                for j in range(max(0, i - 2), min(len(lines), i + 3)):
                    marker = '>>' if j == i else '  '
                    print(f"{marker} line {j + 1}: {lines[j].strip()}")
                print()
    except FileNotFoundError:
        print("Could not read model/model_original.py")


def compare_with_original_eval_script():
    """Reproduce the behavior of the original eval_model.py script."""
    print("\n" + "=" * 60)
    print("🔍 Reproducing the behavior of eval_model.py")

    device = 'cuda'
    model_path = 'out/experiment_1_4_0/pretrain_512.pth'

    # Replicate the relevant setup from eval_model.py.
    config = LMConfig(
        dim=512,
        n_layers=8,
        n_heads=32,
        vocab_size=6400,
        max_seq_len=512,
        dropout=0.0,
        norm_eps=1e-5,
        rope_theta=1e6,
        use_moe=False
    )

    model = MiniMindLM(config)
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict, strict=False)
    model.to(device)
    model.eval()

    # Load the data.
    with open('dataset/stable/eval_data_from_train.json', 'r', encoding='utf-8') as f:
        sample = json.loads(f.readline().strip())

    text = sample['text']
    tokens = tokenizer.encode(text, add_special_tokens=False)

    input_length = 100
    predict_length = 30
    input_tokens = tokens[:input_length]
    target_tokens = tokens[input_length:input_length + predict_length]

    print(f"Reproducing the eval_model.py computation:")
    print(f"  input_length: {input_length}")
    print(f"  predict_length: {predict_length}")

    with torch.no_grad():
        # Follow eval_model.py exactly.
        loss_input_ids = torch.tensor(
            [tokens[:input_length + predict_length]], dtype=torch.long
        ).to(device)
        outputs = model(loss_input_ids, logits_to_keep=predict_length)

        print(f"  loss_input_ids shape: {loss_input_ids.shape}")
        print(f"  logits_to_keep argument: {predict_length}")

        logits = outputs.logits
        loss = None
        if logits is not None:
            print(f"  Output logits shape: {logits.shape}")

            # Reshape the logits and the targets.
            shift_logits = logits[0, -predict_length:, :].contiguous()
            shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)

            print(f"  shift_logits shape: {shift_logits.shape}")
            print(f"  shift_labels shape: {shift_labels.shape}")

            # Cross-entropy loss.
            loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
            print(f"  Computed loss: {loss.item():.4f}")
        else:
            print("  logits is None")


if __name__ == "__main__":
    investigate_logits_to_keep_issue()
    check_model_forward_implementation()
    compare_with_original_eval_script()