Minimind/investigate_logits_to_keep.py

#!/usr/bin/env python3
"""
Investigate in depth how the logits_to_keep argument affects loss computation.
"""
import json

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer

from model.LMConfig import LMConfig
from model.model_original import MiniMindLM


def investigate_logits_to_keep_issue():
    """
    Investigate the effect of the logits_to_keep argument.
    """
    print("🔍 Investigating the effect of the logits_to_keep argument")
    print("=" * 60)

    device = 'cuda'
    model_path = 'out/experiment_1_4_0/pretrain_512.pth'

    # Load the model
    config = LMConfig(
        dim=512, n_layers=8, n_heads=32, vocab_size=6400, max_seq_len=512,
        dropout=0.0, norm_eps=1e-5, rope_theta=1e6, use_moe=False
    )
    model = MiniMindLM(config)
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict, strict=False)
    model.to(device)
    model.eval()

    # Load the test data
    with open('dataset/stable/eval_data_from_train.json', 'r', encoding='utf-8') as f:
        sample = json.loads(f.readline().strip())

    text = sample['text']
    tokens = tokenizer.encode(text, add_special_tokens=False)
    input_tokens = tokens[:100]
    target_tokens = tokens[100:130]  # 30 target tokens

    print(f"Test text length: {len(tokens)} tokens")
    print(f"Input: {len(input_tokens)} tokens")
    print(f"Target: {len(target_tokens)} tokens")
    with torch.no_grad():
        # Method 1: standard forward pass (as during training)
        full_input = torch.tensor([tokens[:130]], dtype=torch.long).to(device)
        outputs1 = model(full_input)
        logits1 = outputs1.logits

        # Compute the loss the way training does
        shift_logits1 = logits1[0, 99:129, :].contiguous()
        shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)
        loss1 = F.cross_entropy(shift_logits1, shift_labels, reduction='mean')

        print(f"\nMethod 1 (standard forward):")
        print(f"  logits shape: {logits1.shape}")
        print(f"  logits used for loss: {shift_logits1.shape}")
        print(f"  Loss: {loss1.item():.4f}")

        # Method 2: logits_to_keep=30 (the approach used by eval_model.py)
        outputs2 = model(full_input, logits_to_keep=30)
        logits2 = outputs2.logits

        if logits2 is not None:
            print(f"\nMethod 2 (logits_to_keep=30):")
            print(f"  logits shape: {logits2.shape}")

            # Compute the loss the way eval_model.py does
            shift_logits2 = logits2[0, -30:, :].contiguous()
            loss2 = F.cross_entropy(shift_logits2, shift_labels, reduction='mean')
            print(f"  logits used for loss: {shift_logits2.shape}")
            print(f"  Loss: {loss2.item():.4f}")

            # Check whether the logits themselves match
            expected_logits = logits1[0, 100:130, :]  # positions 100..129
            actual_logits = logits2[0, -30:, :]  # the last 30 positions

            print(f"\nElement-wise comparison:")
            print(f"  expected logits shape: {expected_logits.shape}")
            print(f"  actual logits shape: {actual_logits.shape}")

            # Check for (approximate) equality
            are_equal = torch.allclose(expected_logits, actual_logits, rtol=1e-4)
            print(f"  logits equal: {are_equal}")

            if not are_equal:
                diff = torch.abs(expected_logits - actual_logits).max()
                print(f"  max difference: {diff.item():.6f}")

                # Inspect the first few positions individually
                for i in range(min(5, expected_logits.shape[0])):
                    pos_diff = torch.abs(expected_logits[i] - actual_logits[i]).max()
                    print(f"  position {i} max difference: {pos_diff.item():.6f}")
        else:
            print("\nMethod 2: logits is None")

        # Method 3: different logits_to_keep values
        print(f"\nTesting different logits_to_keep values:")
        for keep_value in [10, 20, 30, 50, 100]:
            outputs_test = model(full_input, logits_to_keep=keep_value)
            if outputs_test.logits is not None:
                test_logits_shape = outputs_test.logits.shape
                print(f"  logits_to_keep={keep_value}: {test_logits_shape}")
            else:
                print(f"  logits_to_keep={keep_value}: None")


def check_model_forward_implementation():
    """Check how logits_to_keep is implemented in the model's forward method."""
    print("\n" + "=" * 60)
    print("🔍 Checking the model's forward implementation")

    # Read the logits_to_keep-related code from the model source
    try:
        with open('model/model_original.py', 'r', encoding='utf-8') as f:
            content = f.read()

        # Find every line that mentions logits_to_keep
        lines = content.split('\n')
        for i, line in enumerate(lines):
            if 'logits_to_keep' in line:
                print(f"Line {i+1}: {line.strip()}")
                # Print a few lines of context around the match
                for j in range(max(0, i-2), min(len(lines), i+3)):
                    if j != i:
                        print(f"Line {j+1}: {lines[j].strip()}")
                print()
    except FileNotFoundError:
        print("Could not read model/model_original.py")


def compare_with_original_eval_script():
    """
    Compare against the behaviour of the original eval_model.py script.
    """
    print("\n" + "=" * 60)
    print("🔍 Comparing against the behaviour of the original eval_model.py")

    device = 'cuda'
    model_path = 'out/experiment_1_4_0/pretrain_512.pth'

    # Replicate the relevant logic from eval_model.py
    config = LMConfig(
        dim=512, n_layers=8, n_heads=32, vocab_size=6400, max_seq_len=512,
        dropout=0.0, norm_eps=1e-5, rope_theta=1e6, use_moe=False
    )
    model = MiniMindLM(config)
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict, strict=False)
    model.to(device)
    model.eval()

    # Load the data
    with open('dataset/stable/eval_data_from_train.json', 'r', encoding='utf-8') as f:
        sample = json.loads(f.readline().strip())

    text = sample['text']
    tokens = tokenizer.encode(text, add_special_tokens=False)

    input_length = 100
    predict_length = 30
    input_tokens = tokens[:input_length]
    target_tokens = tokens[input_length:input_length + predict_length]

    print(f"Reproducing the eval_model.py computation:")
    print(f"  input_length: {input_length}")
    print(f"  predict_length: {predict_length}")
    with torch.no_grad():
        # Follow eval_model.py exactly
        loss_input_ids = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
        outputs = model(loss_input_ids, logits_to_keep=predict_length)

        print(f"  loss_input_ids shape: {loss_input_ids.shape}")
        print(f"  logits_to_keep argument: {predict_length}")

        logits = outputs.logits
        loss = None
        if logits is not None:
            print(f"  output logits shape: {logits.shape}")

            # Reshape the logits and the targets
            shift_logits = logits[0, -predict_length:, :].contiguous()
            shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)

            print(f"  shift_logits shape: {shift_logits.shape}")
            print(f"  shift_labels shape: {shift_labels.shape}")

            # Compute the cross-entropy loss
            loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
            print(f"  computed loss: {loss.item():.4f}")
        else:
            print("  logits is None")


if __name__ == "__main__":
    investigate_logits_to_keep_issue()
    check_model_forward_implementation()
    compare_with_original_eval_script()