#!/usr/bin/env python3
"""
Evaluate the inference quality of pretrained models.

Used to test how models trained in different experiments perform on eval_data.json.
"""
import os
import json
import argparse
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer
from model.LMConfig import LMConfig


def load_model(model_path, model_type, device, config_params=None):
    """
    Load the model and tokenizer.

    Args:
        model_path: path to the model weights file
        model_type: model type (model/model_original/model_no_feed)
        device: device to run on
        config_params: dict of model configuration parameters

    Returns:
        model: the loaded model
        tokenizer: the tokenizer instance
    """
    # Initialize the configuration
    if config_params:
        lm_config = LMConfig(**config_params)
    else:
        lm_config = LMConfig()

    # Print the configuration
    print("Model configuration:")
    print(f"  dim: {lm_config.dim}")
    print(f"  n_layers: {lm_config.n_layers}")
    print(f"  n_heads: {lm_config.n_heads}")
    print(f"  vocab_size: {lm_config.vocab_size}")
    print(f"  max_seq_len: {lm_config.max_seq_len}")
    if hasattr(lm_config, 'knowledge_num'):
        print(f"  knowledge_num: {lm_config.knowledge_num}")
        print(f"  knowledge_length: {lm_config.knowledge_length}")
        print(f"  knowledge_dim: {lm_config.knowledge_dim}")
    print()

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

    # Import the model class matching the requested model type
    if model_type == "model":
        from model.model import MiniMindLM
    elif model_type == "model_original":
        from model.model_original import MiniMindLM
    elif model_type == "model_no_feed":
        from model.model_no_feed import MiniMindLM
    elif model_type == "model_memory":
        from model.model_memory import MiniMindLM
    elif model_type.startswith("model_memory_"):
        # Support the generic model_memory_X_X_X naming scheme
        try:
            module = __import__(f"model.{model_type}", fromlist=["MiniMindLM"])
            MiniMindLM = getattr(module, "MiniMindLM")
        except (ImportError, AttributeError) as e:
            raise ValueError(f"Cannot import model type {model_type}: {e}")
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

    # Instantiate the model
    model = MiniMindLM(lm_config)

    # Load the weights
    if os.path.exists(model_path):
        print(f"Loading model weights from {model_path} ...")
        state_dict = torch.load(model_path, map_location=device)

        # Collect parameter names from the model and the checkpoint
        model_keys = set(model.state_dict().keys())
        checkpoint_keys = set(state_dict.keys())

        # Tally how the weights match up
        matched_keys = model_keys & checkpoint_keys
        missing_keys = model_keys - checkpoint_keys
        unexpected_keys = checkpoint_keys - model_keys

        print("\nWeight loading details:")
        print(f"  Total model parameters: {len(model_keys)}")
        print(f"  Parameters in checkpoint: {len(checkpoint_keys)}")
        print(f"  Matched parameters: {len(matched_keys)}")
        print(f"  Missing parameters: {len(missing_keys)}")
        print(f"  Unexpected parameters: {len(unexpected_keys)}")

        # List the missing and unexpected parameters in detail
        if missing_keys:
            print(f"\n❌ Missing parameters ({len(missing_keys)}):")
            for key in sorted(missing_keys):
                print(f"  - {key}")
        if unexpected_keys:
            print(f"\n⚠️ Unexpected parameters in checkpoint ({len(unexpected_keys)}):")
            for key in sorted(unexpected_keys):
                print(f"  + {key}")

        # Load the weights (allowing a partial match)
        try:
            incompatible_keys = model.load_state_dict(state_dict, strict=False)
            # Check the result
            if len(incompatible_keys.missing_keys) == 0 and len(incompatible_keys.unexpected_keys) == 0:
                print("\n✅ Weights loaded with a perfect match!")
            elif len(incompatible_keys.missing_keys) == 0:
                print("\n✅ Weights loaded (unexpected parameters ignored)")
            else:
                print("\n⚠️ Weights loaded only partially; some parameters are missing")
                print("  This may hurt model performance; check that the configuration parameters are correct")
            # Report the load success rate
            success_rate = len(matched_keys) / len(model_keys) * 100
            print(f"  Parameter load success rate: {success_rate:.1f}%")
            if success_rate < 90:
                print("  ❌ Warning: success rate is very low; the model may not work at all!")
            elif success_rate < 100:
                print("  ⚠️ Warning: some parameters are missing, which may hurt performance")
        except Exception as e:
            raise RuntimeError(f"Failed to load weights: {e}")

        # Verify the shapes of key layers
        print("🔍 Verifying key layer shapes:")
        key_layers = [
            'tok_embeddings.weight',
            'output.weight',
            'norm.weight',
        ]
        # Add per-layer entries
        for i in range(lm_config.n_layers):
            key_layers.extend([
                f'layers.{i}.attention_norm.weight',
                f'layers.{i}.ffn_norm.weight',
                f'layers.{i}.self_attention.wq.weight',
                f'layers.{i}.self_attention.wk.weight',
                f'layers.{i}.self_attention.wv.weight',
                f'layers.{i}.self_attention.wo.weight',
            ])
            # FFN layers (model_original has an FFN; other models may not)
            if f'layers.{i}.feed_forward.w1.weight' in model_keys:
                key_layers.extend([
                    f'layers.{i}.feed_forward.w1.weight',
                    f'layers.{i}.feed_forward.w2.weight',
                    f'layers.{i}.feed_forward.w3.weight',
                ])
        # Verify the KnowledgeDataset layers (model and model_no_feed only)
        if model_type in ['model', 'model_no_feed']:
            key_layers.extend([
                'knowledge_dataset.to_queries.0.weight',
                'knowledge_dataset.keys',
                'knowledge_dataset.knowledge_dataset',
            ])
            # Add the CrossAttention layers
            for i in range(lm_config.n_layers):
                key_layers.extend([
                    f'layers.{i}.cross_attention.to_q.weight',
                    f'layers.{i}.cross_attention.to_k.weight',
                    f'layers.{i}.cross_attention.to_v.weight',
                    f'layers.{i}.cross_attention.to_out.weight',
                ])

        # Check each key layer
        verified_layers = 0
        total_key_layers = 0
        for layer_name in key_layers:
            if layer_name in model_keys:  # only check layers that actually exist in the model
                total_key_layers += 1
                if layer_name in matched_keys:
                    verified_layers += 1
                    expected_shape = model.state_dict()[layer_name].shape
                    actual_shape = state_dict[layer_name].shape if layer_name in state_dict else "missing"
                    if layer_name in state_dict and expected_shape == actual_shape:
                        print(f"  {layer_name}: {actual_shape}")
                    else:
                        print(f"  {layer_name}: expected {expected_shape}, got {actual_shape}")
                else:
                    print(f"  {layer_name}: missing")

        print(f"\nKey layer verification: {verified_layers}/{total_key_layers} layers verified")
        if verified_layers == total_key_layers:
            print("✅ All key layers verified!")
        elif verified_layers / total_key_layers >= 0.9:
            print("⚠️ Most key layers verified; the model should work")
        else:
            print("❌ Too many key layers failed verification; the model may not work!")
        print()
    else:
        raise FileNotFoundError(f"Model file not found: {model_path}")

    model.to(device)
    model.eval()
    return model, tokenizer
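
# Example (illustrative; the path and model type follow the defaults used in main() below):
#   model, tokenizer = load_model('out/experiment_1_4_1/pretrain_512.pth',
#                                 'model_memory', 'cuda')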


def load_eval_data(data_path, num_samples=20):
    """
    Load the evaluation dataset.

    Args:
        data_path: path to the data file
        num_samples: number of samples to evaluate

    Returns:
        samples: list of data samples
    """
    data = []
    with open(data_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            line = line.strip()
            if line:  # skip empty lines
                try:
                    sample = json.loads(line)
                    data.append(sample)
                    if len(data) >= num_samples:
                        break
                except json.JSONDecodeError as e:
                    print(f"Warning: failed to parse JSON on line {line_num+1}: {e}")
                    continue

    # Keep only the first num_samples entries
    samples = data[:num_samples]
    print(f"Loaded {len(samples)} evaluation samples")
    return samples


def evaluate_sample(model, tokenizer, text, input_length=100, predict_length=100, device='cuda'):
    """
    Evaluate a single sample.

    Args:
        model: model instance
        tokenizer: tokenizer instance
        text: input text
        input_length: number of input tokens
        predict_length: number of tokens to predict
        device: device to run on

    Returns:
        input_text: the input text
        predicted_text: the predicted text
        ground_truth_text: the ground-truth text
        loss: the prediction loss (if computable)
        generation_stats: generation statistics (lengths and EOS info)
    """
    # Add BOS/EOS tokens, consistent with how the training data was built
    if not text.startswith(tokenizer.bos_token):
        text = f"{tokenizer.bos_token}{text}"
    if not text.endswith(tokenizer.eos_token):
        text = f"{text}{tokenizer.eos_token}"
    # Tokenize the text
    tokens = tokenizer.encode(text, add_special_tokens=False)

    # Make sure there are enough tokens
    if len(tokens) < input_length + predict_length:
        print(f"Warning: text too short, only {len(tokens)} tokens")
        return None, None, None, None, None

    # Split into input and target
    input_tokens = tokens[:input_length]
    target_tokens = tokens[input_length:input_length + predict_length]

    # Convert to a tensor
    input_ids = torch.tensor([input_tokens], dtype=torch.long).to(device)

    # Generate the prediction
    with torch.no_grad():
        # Use the generate method (parameters tuned to improve generation quality)
        generated = model.generate(
            input_ids,
            max_new_tokens=predict_length,
            temperature=1.0,
            top_p=0.95,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id
        )

        # Extract the generated tokens (drop the input part).
        # `generated` holds the full sequence, so the new tokens start at input_length.
        full_generated_tokens = generated[0].tolist()
        if len(full_generated_tokens) > input_length:
            predicted_tokens = full_generated_tokens[input_length:]
        else:
            # If the sequence is no longer than the input, nothing new was generated
            predicted_tokens = []

        # Check whether generation stopped early on an EOS token
        eos_found = False
        eos_position = -1
        actual_predicted_length = len(predicted_tokens)
        if predicted_tokens and tokenizer.eos_token_id is not None:
            try:
                eos_position = predicted_tokens.index(tokenizer.eos_token_id)
                eos_found = True
                # Keep only the content before the EOS token
                predicted_tokens = predicted_tokens[:eos_position]
                actual_predicted_length = len(predicted_tokens)
            except ValueError:
                # No EOS token found
                pass

        # Compute the loss with a forward pass
        loss_input_ids = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device)
        outputs = model(loss_input_ids)  # the logits_to_keep argument was removed

        logits = outputs.logits
        loss = None
        if logits is not None:
            # Slice the logits at the correct positions. In a causal Transformer,
            # the logits at position i predict the token at position i+1, so to
            # predict the tokens at positions input_length .. input_length+predict_length-1
            # we need the logits at positions input_length-1 .. input_length+predict_length-2.
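            # Worked example: with input_length=100 and predict_length=100,
            # logits[0, 99:199] are the next-token distributions for positions
            # 100..199, i.e. exactly target_tokens = tokens[100:200], so the
            # logits and labels below align one-to-one.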
            shift_logits = logits[0, input_length-1:input_length+predict_length-1, :].contiguous()
            shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)
            # Cross-entropy loss
            loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean')
            loss = loss.item()

    # Decode the texts
    input_text = tokenizer.decode(input_tokens, skip_special_tokens=True)
    # Only decode the tokens that were actually generated (capped at predict_length)
    actual_predicted_tokens = predicted_tokens[:predict_length] if predicted_tokens else []
    predicted_text = tokenizer.decode(actual_predicted_tokens, skip_special_tokens=True) if actual_predicted_tokens else "[no content generated]"
    ground_truth_text = tokenizer.decode(target_tokens, skip_special_tokens=True)

    # Return extra generation statistics alongside the decoded texts
    generation_stats = {
        'requested_length': predict_length,
        'actual_length': actual_predicted_length,
        'eos_found': eos_found,
        'eos_position': eos_position if eos_found else None,
        'truncated_by_eos': eos_found and eos_position < predict_length
    }
    return input_text, predicted_text, ground_truth_text, loss, generation_stats


def main():
    parser = argparse.ArgumentParser(description='Evaluate a pretrained model')
    parser.add_argument('--model_path', type=str, default='out/experiment_1_4_1/pretrain_512.pth',
                        help='path to the model weights file')
    parser.add_argument('--model_type', type=str, default='model_memory',
                        help='model type (supports model, model_original, model_no_feed, model_memory, model_memory_X_X_X, etc.)')
    parser.add_argument('--data_path', type=str, default='dataset/stable/eval_data.json',
                        help='path to the evaluation dataset')
    parser.add_argument('--num_samples', type=int, default=20,
                        help='number of samples to evaluate')
    parser.add_argument('--input_length', type=int, default=100,
                        help='input length in tokens')
    parser.add_argument('--predict_length', type=int, default=100,
                        help='prediction length in tokens')
    parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu',
                        help='device to run on')

    # Model architecture parameters
    parser.add_argument('--dim', type=int, default=512,
                        help='model dimension')
    parser.add_argument('--n_layers', type=int, default=8,
                        help='number of Transformer layers')
    parser.add_argument('--n_heads', type=int, default=32,
                        help='number of attention heads')
    parser.add_argument('--n_kv_heads', type=int, default=8,
                        help='number of KV attention heads')
    parser.add_argument('--vocab_size', type=int, default=6400,
                        help='vocabulary size')
    parser.add_argument('--max_seq_len', type=int, default=512,
                        help='maximum sequence length')
    parser.add_argument('--dropout', type=float, default=0.0,
                        help='dropout rate')
    parser.add_argument('--norm_eps', type=float, default=1e-5,
                        help='layer-norm epsilon')
    parser.add_argument('--rope_theta', type=float, default=1e6,
                        help='RoPE theta parameter')

    # KnowledgeDataset parameters (used only by model and model_no_feed)
    parser.add_argument('--knowledge_num', type=int, default=1048576,
                        help='number of knowledge entries')
    parser.add_argument('--knowledge_length', type=int, default=32,
                        help='length of a single knowledge entry')
    parser.add_argument('--knowledge_dim', type=int, default=128,
                        help='knowledge dimension')

    # MoE parameters
    parser.add_argument('--use_moe', action='store_true',
                        help='whether to use MoE')
    parser.add_argument('--num_experts_per_tok', type=int, default=2,
                        help='number of experts activated per token')
    parser.add_argument('--n_routed_experts', type=int, default=4,
                        help='number of routed experts')

    args = parser.parse_args()

    print("Evaluation configuration:")
    print(f"  Model path: {args.model_path}")
    print(f"  Model type: {args.model_type}")
    print(f"  Data path: {args.data_path}")
    print(f"  Number of samples: {args.num_samples}")
    print(f"  Input length: {args.input_length} tokens")
    print(f"  Prediction length: {args.predict_length} tokens")
    print(f"  Device: {args.device}")
    print()

    # Build the configuration dict
    config_params = {
        'dim': args.dim,
        'n_layers': args.n_layers,
        'n_heads': args.n_heads,
        'n_kv_heads': args.n_kv_heads,
        'vocab_size': args.vocab_size,
        'max_seq_len': args.max_seq_len,
        'dropout': args.dropout,
        'norm_eps': args.norm_eps,
        'rope_theta': args.rope_theta,
        'use_moe': args.use_moe,
        'num_experts_per_tok': args.num_experts_per_tok,
        'n_routed_experts': args.n_routed_experts,
    }

    # Only model, model_no_feed, and the model_memory family need KnowledgeDataset parameters
    if args.model_type in ['model', 'model_no_feed', 'model_memory'] or args.model_type.startswith('model_memory_'):
        config_params.update({
            'knowledge_num': args.knowledge_num,
            'knowledge_length': args.knowledge_length,
            'knowledge_dim': args.knowledge_dim,
        })

    # Load the model
    model, tokenizer = load_model(args.model_path, args.model_type, args.device, config_params)

    # Load the data
    samples = load_eval_data(args.data_path, args.num_samples)

    # Evaluate each sample
    total_loss = 0
    valid_samples = 0
    total_requested_tokens = 0
    total_actual_tokens = 0
    samples_with_eos = 0
    samples_truncated_by_eos = 0

    for i, sample in enumerate(samples):
        print(f"\n{'='*60}")
        print(f"Sample {i+1}/{len(samples)}")
        print(f"{'='*60}")
        text = sample['text']

        # Evaluate the sample
        input_text, predicted_text, ground_truth_text, loss, generation_stats = evaluate_sample(
            model, tokenizer, text,
            args.input_length, args.predict_length, args.device
        )
        if input_text is None:
            print("Skipping this sample (text too short)")
            continue

        # Print the results
        print(f"\nInput ({args.input_length} tokens):")
        print(f"  {input_text}")
        print(f"\nPrediction ({generation_stats['requested_length']} tokens requested, {generation_stats['actual_length']} generated):")
        print(f"  {predicted_text}")
        print(f"\nGround truth ({args.predict_length} tokens):")
        print(f"  {ground_truth_text}")

        # Print generation statistics
        print("\nGeneration statistics:")
        print(f"  Requested: {generation_stats['requested_length']} tokens")
        print(f"  Generated: {generation_stats['actual_length']} tokens")
        if generation_stats['eos_found']:
            print(f"  ✅ EOS token found at position {generation_stats['eos_position']}")
            if generation_stats['truncated_by_eos']:
                print("  ⚠️ Generation ended early at the EOS token")
            else:
                print("  ✅ EOS token appeared at the expected position")
        else:
            print("  ❌ No EOS token found (the maximum length was probably reached)")

        if loss is not None:
            print(f"\nLoss: {loss:.4f}")
            total_loss += loss
            valid_samples += 1

        # Update the generation tallies
        total_requested_tokens += generation_stats['requested_length']
        total_actual_tokens += generation_stats['actual_length']
        if generation_stats['eos_found']:
            samples_with_eos += 1
        if generation_stats['truncated_by_eos']:
            samples_truncated_by_eos += 1

    # Print the overall statistics
    if valid_samples > 0:
        print(f"\n{'='*60}")
        print("Overall statistics:")
        print(f"  Valid samples: {valid_samples}")
        print(f"  Average loss: {total_loss / valid_samples:.4f}")
        print()
        print("Generation statistics:")
        print(f"  Total tokens requested: {total_requested_tokens}")
        print(f"  Total tokens generated: {total_actual_tokens}")
        print(f"  Completion rate: {total_actual_tokens / total_requested_tokens * 100:.1f}%" if total_requested_tokens > 0 else "  Completion rate: N/A")
        print(f"  Samples with EOS: {samples_with_eos}/{len(samples)} ({samples_with_eos/len(samples)*100:.1f}%)" if len(samples) > 0 else "  Samples with EOS: N/A")
        print(f"  Samples truncated by EOS: {samples_truncated_by_eos}/{len(samples)} ({samples_truncated_by_eos/len(samples)*100:.1f}%)" if len(samples) > 0 else "  Samples truncated by EOS: N/A")
        print(f"  Average generated length per sample: {total_actual_tokens/len(samples):.1f} tokens" if len(samples) > 0 else "  Average generated length per sample: N/A")
        print(f"{'='*60}")


if __name__ == "__main__":
    main()