#!/usr/bin/env python3
"""
In-depth investigation of how the logits_to_keep argument affects loss computation.
"""

import json
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
from model.LMConfig import LMConfig
from model.model_original import MiniMindLM

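# Background (assumption, modeled on HuggingFace's `num_logits_to_keep`):
# with logits_to_keep=k the LM head is applied only to the last k hidden
# states, i.e. roughly
#     logits = lm_head(hidden_states[:, -k:, :])
# so outputs.logits has shape [batch, k, vocab] instead of
# [batch, seq_len, vocab]. If that holds, logits_to_keep=30 on a 130-token
# input should reproduce exactly the last 30 rows of the full forward's logits.
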
def investigate_logits_to_keep_issue():
    """
    Investigate the effect of the logits_to_keep argument.
    """
    print("🔍 Investigating the effect of the logits_to_keep argument")
    print("=" * 60)

    device = 'cuda'
    model_path = 'out/experiment_1_4_0/pretrain_512.pth'

    # Load the model
    config = LMConfig(
        dim=512, n_layers=8, n_heads=32, vocab_size=6400, max_seq_len=512,
        dropout=0.0, norm_eps=1e-5, rope_theta=1e6, use_moe=False
    )

    model = MiniMindLM(config)
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict, strict=False)
    model.to(device)
    model.eval()

    # Load the test data
    with open('dataset/stable/eval_data_from_train.json', 'r', encoding='utf-8') as f:
        sample = json.loads(f.readline().strip())

    text = sample['text']
    tokens = tokenizer.encode(text, add_special_tokens=False)

    input_tokens = tokens[:100]
    target_tokens = tokens[100:130]  # 30 target tokens
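    # NB: this assumes the first eval sample tokenizes to at least 130 tokens.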

    print(f"Test text length: {len(tokens)} tokens")
    print(f"Input: {len(input_tokens)} tokens")
    print(f"Target: {len(target_tokens)} tokens")

    with torch.no_grad():
        # Method 1: standard forward pass (as during training)
        full_input = torch.tensor([tokens[:130]], dtype=torch.long).to(device)
        outputs1 = model(full_input)
        logits1 = outputs1.logits
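
        # The logits at position i score the prediction of token i+1, so the
        # logits for target tokens 100..129 live at positions 99..128.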
        # Compute the loss (training-style)
        shift_logits1 = logits1[0, 99:129, :].contiguous()
        shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)
        loss1 = F.cross_entropy(shift_logits1, shift_labels, reduction='mean')

        print(f"\nMethod 1 (standard forward):")
        print(f"  logits shape: {logits1.shape}")
        print(f"  logits shape used for the loss: {shift_logits1.shape}")
        print(f"  Loss: {loss1.item():.4f}")

        # Method 2: logits_to_keep=30 (the eval_model.py way)
        outputs2 = model(full_input, logits_to_keep=30)
        logits2 = outputs2.logits

        if logits2 is not None:
            print(f"\nMethod 2 (logits_to_keep=30):")
            print(f"  logits shape: {logits2.shape}")

            # Compute the loss the way eval_model.py does
            shift_logits2 = logits2[0, -30:, :].contiguous()
            loss2 = F.cross_entropy(shift_logits2, shift_labels, reduction='mean')
            print(f"  logits shape used for the loss: {shift_logits2.shape}")
            print(f"  Loss: {loss2.item():.4f}")

            # Check whether the logits are identical
            expected_logits = logits1[0, 100:130, :]  # positions 100..129
            actual_logits = logits2[0, -30:, :]       # the last 30 positions

            print(f"\nElement-wise comparison:")
            print(f"  Expected logits shape: {expected_logits.shape}")
            print(f"  Actual logits shape: {actual_logits.shape}")

            # Check for equality
            are_equal = torch.allclose(expected_logits, actual_logits, rtol=1e-4)
            print(f"  Logits equal: {are_equal}")

            if not are_equal:
                diff = torch.abs(expected_logits - actual_logits).max()
                print(f"  Max difference: {diff.item():.6f}")

                # Inspect the difference at the first few positions
                for i in range(min(5, expected_logits.shape[0])):
                    pos_diff = torch.abs(expected_logits[i] - actual_logits[i]).max()
                    print(f"    Position {i} max difference: {pos_diff.item():.6f}")
        else:
            print("\nMethod 2: logits is None")

        # Method 3: different logits_to_keep values
        print(f"\nTesting different logits_to_keep values:")
        for keep_value in [10, 20, 30, 50, 100]:
            outputs_test = model(full_input, logits_to_keep=keep_value)
            if outputs_test.logits is not None:
                test_logits_shape = outputs_test.logits.shape
                print(f"  logits_to_keep={keep_value}: {test_logits_shape}")
            else:
                print(f"  logits_to_keep={keep_value}: None")


def check_model_forward_implementation():
    """Inspect how the model's forward method implements logits_to_keep."""
    print("\n" + "=" * 60)
    print("🔍 Inspecting the model's forward implementation")

    # Read the logits_to_keep-related code from the model source
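    # What to look for (assumption): in HuggingFace-style models the slicing
    # usually happens where the LM head is applied, e.g. something like
    #     logits = self.output(h[:, -logits_to_keep:, :])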
    try:
        with open('model/model_original.py', 'r', encoding='utf-8') as f:
            content = f.read()

        # Find the lines that mention logits_to_keep
        lines = content.split('\n')
        for i, line in enumerate(lines):
            if 'logits_to_keep' in line:
                print(f"Line {i+1}: {line.strip()}")
                # Print a few lines of surrounding context
                for j in range(max(0, i-2), min(len(lines), i+3)):
                    if j != i:
                        print(f"Line {j+1}: {lines[j].strip()}")
                print()
    except FileNotFoundError:
        print("Cannot read the model_original.py file")


def compare_with_original_eval_script():
    """
    Reproduce the behavior of the original eval_model.py script.
    """
    print("\n" + "=" * 60)
    print("🔍 Reproducing the behavior of the original eval_model.py")

    device = 'cuda'
    model_path = 'out/experiment_1_4_0/pretrain_512.pth'

    # Copy the relevant logic from eval_model.py
    config = LMConfig(
        dim=512, n_layers=8, n_heads=32, vocab_size=6400, max_seq_len=512,
        dropout=0.0, norm_eps=1e-5, rope_theta=1e6, use_moe=False
    )

    model = MiniMindLM(config)
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict, strict=False)
    model.to(device)
    model.eval()

    # Load the data
    with open('dataset/stable/eval_data_from_train.json', 'r', encoding='utf-8') as f:
        sample = json.loads(f.readline().strip())

    text = sample['text']
    tokens = tokenizer.encode(text, add_special_tokens=False)

    input_length = 100
    predict_length = 30

    input_tokens = tokens[:input_length]
    target_tokens = tokens[input_length:input_length + predict_length]

    print(f"Reproducing the eval_model.py computation:")
    print(f"  input_length: {input_length}")
    print(f"  predict_length: {predict_length}")

    with torch.no_grad():
        # Exactly the way eval_model.py does it
        loss_input_ids = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
        outputs = model(loss_input_ids, logits_to_keep=predict_length)

        print(f"  loss_input_ids shape: {loss_input_ids.shape}")
        print(f"  logits_to_keep argument: {predict_length}")

        logits = outputs.logits
        loss = None
        if logits is not None:
            print(f"  Output logits shape: {logits.shape}")

            # Reshape the logits and the targets
            shift_logits = logits[0, -predict_length:, :].contiguous()
            shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)

            print(f"  shift_logits shape: {shift_logits.shape}")
            print(f"  shift_labels shape: {shift_labels.shape}")

            # Compute the cross-entropy loss
            loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
            print(f"  Computed loss: {loss.item():.4f}")
        else:
            print("  logits is None")


if __name__ == "__main__":
    investigate_logits_to_keep_issue()
    check_model_forward_implementation()
    compare_with_original_eval_script()