#!/usr/bin/env python3 """ 评估预训练模型的推理效果 用于测试不同实验中训练出来的模型在eval_data.json上的表现 """ import os import json import argparse import torch import torch.nn.functional as F from transformers import AutoTokenizer from model.LMConfig import LMConfig def load_model(model_path, model_type, device, config_params=None): """ 加载模型和tokenizer Args: model_path: 模型权重文件路径 model_type: 模型类型 (model/model_original/model_no_feed) device: 运行设备 config_params: 模型配置参数字典 Returns: model: 加载好的模型 tokenizer: tokenizer实例 """ # 初始化配置 if config_params: lm_config = LMConfig(**config_params) else: lm_config = LMConfig() # 打印配置信息 print(f"模型配置:") print(f" dim: {lm_config.dim}") print(f" n_layers: {lm_config.n_layers}") print(f" n_heads: {lm_config.n_heads}") print(f" vocab_size: {lm_config.vocab_size}") print(f" max_seq_len: {lm_config.max_seq_len}") if hasattr(lm_config, 'knowledge_num'): print(f" knowledge_num: {lm_config.knowledge_num}") print(f" knowledge_length: {lm_config.knowledge_length}") print(f" knowledge_dim: {lm_config.knowledge_dim}") print() # 加载tokenizer tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer') # 根据模型类型导入对应的模型类 if model_type == "model": from model.model import MiniMindLM elif model_type == "model_original": from model.model_original import MiniMindLM elif model_type == "model_no_feed": from model.model_no_feed import MiniMindLM else: raise ValueError(f"不支持的模型类型: {model_type}") # 初始化模型 model = MiniMindLM(lm_config) # 加载权重 if os.path.exists(model_path): print(f"正在从 {model_path} 加载模型权重...") # 加载权重文件 state_dict = torch.load(model_path, map_location=device) # 获取模型的参数名称 model_keys = set(model.state_dict().keys()) checkpoint_keys = set(state_dict.keys()) # 统计权重匹配情况 matched_keys = model_keys & checkpoint_keys missing_keys = model_keys - checkpoint_keys unexpected_keys = checkpoint_keys - model_keys print(f"\n权重加载详情:") print(f" 模型总参数数量: {len(model_keys)}") print(f" 权重文件参数数量: {len(checkpoint_keys)}") print(f" 成功匹配参数: {len(matched_keys)}") print(f" 缺失参数: {len(missing_keys)}") print(f" 多余参数: {len(unexpected_keys)}") # 详细列出缺失和多余的参数 if missing_keys: print(f"\n❌ 缺失的参数 ({len(missing_keys)}):") for key in sorted(missing_keys): print(f" - {key}") if unexpected_keys: print(f"\n⚠️ 权重文件中多余的参数 ({len(unexpected_keys)}):") for key in sorted(unexpected_keys): print(f" + {key}") # 加载权重(允许部分匹配) try: incompatible_keys = model.load_state_dict(state_dict, strict=False) # 检查加载结果 if len(incompatible_keys.missing_keys) == 0 and len(incompatible_keys.unexpected_keys) == 0: print(f"\n✅ 权重加载完全成功!") elif len(incompatible_keys.missing_keys) == 0: print(f"\n✅ 权重加载成功(忽略多余参数)") else: print(f"\n⚠️ 权重加载部分成功,存在缺失参数") print(f" 这可能影响模型性能,请检查模型配置参数是否正确") # 计算加载成功率 success_rate = len(matched_keys) / len(model_keys) * 100 print(f" 参数加载成功率: {success_rate:.1f}%") if success_rate < 90: print(f" ❌ 警告:加载成功率过低,模型可能无法正常工作!") elif success_rate < 100: print(f" ⚠️ 警告:存在缺失参数,可能影响模型性能") except Exception as e: raise RuntimeError(f"权重加载失败: {e}") # 验证关键层的形状 print("🔍 验证关键层形状:") key_layers = [ 'tok_embeddings.weight', 'output.weight', 'norm.weight', ] # 添加每一层的验证 for i in range(lm_config.n_layers): key_layers.extend([ f'layers.{i}.attention_norm.weight', f'layers.{i}.ffn_norm.weight', f'layers.{i}.self_attention.wq.weight', f'layers.{i}.self_attention.wk.weight', f'layers.{i}.self_attention.wv.weight', f'layers.{i}.self_attention.wo.weight', ]) # FFN层的验证(model_original有FFN,其他模型可能没有) if f'layers.{i}.feed_forward.w1.weight' in model_keys: key_layers.extend([ f'layers.{i}.feed_forward.w1.weight', f'layers.{i}.feed_forward.w2.weight', f'layers.{i}.feed_forward.w3.weight', ]) # 验证KnowledgeDataset相关层(仅model和model_no_feed) if model_type in ['model', 'model_no_feed']: key_layers.extend([ 'knowledge_dataset.to_queries.0.weight', 'knowledge_dataset.keys', 'knowledge_dataset.knowledge_dataset', ]) # 添加CrossAttention层 for i in range(lm_config.n_layers): key_layers.extend([ f'layers.{i}.cross_attention.to_q.weight', f'layers.{i}.cross_attention.to_k.weight', f'layers.{i}.cross_attention.to_v.weight', f'layers.{i}.cross_attention.to_out.weight', ]) # 检查关键层 verified_layers = 0 total_key_layers = 0 for layer_name in key_layers: if layer_name in model_keys: # 只检查模型中实际存在的层 total_key_layers += 1 if layer_name in matched_keys: verified_layers += 1 expected_shape = model.state_dict()[layer_name].shape actual_shape = state_dict[layer_name].shape if layer_name in state_dict else "缺失" if layer_name in state_dict and expected_shape == actual_shape: print(f" ✅ {layer_name}: {actual_shape}") else: print(f" ❌ {layer_name}: 期望 {expected_shape}, 实际 {actual_shape}") else: print(f" ❌ {layer_name}: 缺失") print(f"\n关键层验证结果: {verified_layers}/{total_key_layers} 层验证成功") if verified_layers == total_key_layers: print("✅ 所有关键层验证通过!") elif verified_layers / total_key_layers >= 0.9: print("⚠️ 大部分关键层验证通过,模型应该可以正常工作") else: print("❌ 关键层验证失败过多,模型可能无法正常工作!") print() else: raise FileNotFoundError(f"模型文件不存在: {model_path}") model.to(device) model.eval() return model, tokenizer def load_eval_data(data_path, num_samples=20): """ 加载评估数据集 Args: data_path: 数据文件路径 num_samples: 要评估的样本数量 Returns: samples: 数据样本列表 """ data = [] with open(data_path, 'r', encoding='utf-8') as f: for line_num, line in enumerate(f): line = line.strip() if line: # 跳过空行 try: sample = json.loads(line) data.append(sample) if len(data) >= num_samples: break except json.JSONDecodeError as e: print(f"警告:第{line_num+1}行JSON解析失败: {e}") continue # 只取前num_samples条数据 samples = data[:num_samples] print(f"加载了 {len(samples)} 条评估数据") return samples def evaluate_sample(model, tokenizer, text, input_length=100, predict_length=100, device='cuda'): """ 评估单个样本 Args: model: 模型实例 tokenizer: tokenizer实例 text: 输入文本 input_length: 输入token数量 predict_length: 预测token数量 device: 运行设备 Returns: input_text: 输入文本 predicted_text: 预测文本 ground_truth_text: 真实文本 loss: 预测损失(如果可计算) """ # 对文本进行分词 tokens = tokenizer.encode(text, add_special_tokens=False) # 确保有足够的token if len(tokens) < input_length + predict_length: print(f"警告:文本长度不足,只有 {len(tokens)} 个token") return None, None, None, None # 分割输入和目标 input_tokens = tokens[:input_length] target_tokens = tokens[input_length:input_length + predict_length] # 转换为张量 input_ids = torch.tensor([input_tokens], dtype=torch.long).to(device) # 生成预测 with torch.no_grad(): # 使用generate方法生成,调整参数改善生成质量 generated = model.generate( input_ids, max_new_tokens=predict_length, temperature=1.0, top_p=0.95, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id ) # 提取生成的token(去掉输入部分) # generated包含完整序列,需要从input_length位置开始提取新生成的部分 full_generated_tokens = generated[0].tolist() if len(full_generated_tokens) > input_length: predicted_tokens = full_generated_tokens[input_length:] else: # 如果生成序列长度不够,说明没有新生成内容 predicted_tokens = [] # 检查是否因EOS token提前结束生成 eos_found = False eos_position = -1 actual_predicted_length = len(predicted_tokens) if predicted_tokens and tokenizer.eos_token_id is not None: try: eos_position = predicted_tokens.index(tokenizer.eos_token_id) eos_found = True # 只保留EOS token之前的内容 predicted_tokens = predicted_tokens[:eos_position] actual_predicted_length = len(predicted_tokens) except ValueError: # 没有找到EOS token pass # 计算loss(使用forward方法) # 准备用于loss计算的输入 loss_input_ids = torch.tensor([tokens[:input_length + predict_length]], dtype=torch.long).to(device) outputs = model(loss_input_ids) # 移除logits_to_keep参数 # 计算loss logits = outputs.logits loss = None if logits is not None: # 重塑logits和目标 - 修复:使用正确的位置切片 shift_logits = logits[0, input_length:input_length + predict_length, :].contiguous() shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device) # 计算交叉熵损失 loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean') loss = loss.item() # 解码文本 input_text = tokenizer.decode(input_tokens, skip_special_tokens=True) # 只解码实际生成的token,限制在predict_length内 actual_predicted_tokens = predicted_tokens[:predict_length] if predicted_tokens else [] predicted_text = tokenizer.decode(actual_predicted_tokens, skip_special_tokens=True) if actual_predicted_tokens else "[未生成内容]" ground_truth_text = tokenizer.decode(target_tokens, skip_special_tokens=True) # 返回额外的生成统计信息 generation_stats = { 'requested_length': predict_length, 'actual_length': actual_predicted_length, 'eos_found': eos_found, 'eos_position': eos_position if eos_found else None, 'truncated_by_eos': eos_found and eos_position < predict_length } return input_text, predicted_text, ground_truth_text, loss, generation_stats def main(): parser = argparse.ArgumentParser(description='评估预训练模型') parser.add_argument('--model_path', type=str, default='out/experiment_1_4_0/pretrain_512.pth', help='模型权重文件路径') parser.add_argument('--model_type', type=str, default='model', choices=['model', 'model_original', 'model_no_feed'], help='模型类型') parser.add_argument('--data_path', type=str, default='dataset/stable/eval_data.json', help='评估数据集路径') parser.add_argument('--num_samples', type=int, default=20, help='评估样本数量') parser.add_argument('--input_length', type=int, default=100, help='输入token长度') parser.add_argument('--predict_length', type=int, default=100, help='预测token长度') parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu', help='运行设备') # 模型架构参数 parser.add_argument('--dim', type=int, default=512, help='模型维度') parser.add_argument('--n_layers', type=int, default=8, help='Transformer层数') parser.add_argument('--n_heads', type=int, default=32, help='注意力头数') parser.add_argument('--n_kv_heads', type=int, default=8, help='KV注意力头数') parser.add_argument('--vocab_size', type=int, default=6400, help='词汇表大小') parser.add_argument('--max_seq_len', type=int, default=512, help='最大序列长度') parser.add_argument('--dropout', type=float, default=0.0, help='Dropout率') parser.add_argument('--norm_eps', type=float, default=1e-5, help='层归一化epsilon') parser.add_argument('--rope_theta', type=float, default=1e6, help='RoPE theta参数') # KnowledgeDataset相关参数(仅model和model_no_feed使用) parser.add_argument('--knowledge_num', type=int, default=1048576, help='知识条目数量') parser.add_argument('--knowledge_length', type=int, default=32, help='单条知识长度') parser.add_argument('--knowledge_dim', type=int, default=128, help='知识维度') # MOE相关参数 parser.add_argument('--use_moe', action='store_true', help='是否使用MOE') parser.add_argument('--num_experts_per_tok', type=int, default=2, help='每个token激活的专家数') parser.add_argument('--n_routed_experts', type=int, default=4, help='路由专家数量') args = parser.parse_args() print(f"评估配置:") print(f" 模型路径: {args.model_path}") print(f" 模型类型: {args.model_type}") print(f" 数据路径: {args.data_path}") print(f" 样本数量: {args.num_samples}") print(f" 输入长度: {args.input_length} tokens") print(f" 预测长度: {args.predict_length} tokens") print(f" 运行设备: {args.device}") print() # 构建配置参数字典 config_params = { 'dim': args.dim, 'n_layers': args.n_layers, 'n_heads': args.n_heads, 'n_kv_heads': args.n_kv_heads, 'vocab_size': args.vocab_size, 'max_seq_len': args.max_seq_len, 'dropout': args.dropout, 'norm_eps': args.norm_eps, 'rope_theta': args.rope_theta, 'use_moe': args.use_moe, 'num_experts_per_tok': args.num_experts_per_tok, 'n_routed_experts': args.n_routed_experts, } # 只有model和model_no_feed需要KnowledgeDataset参数 if args.model_type in ['model', 'model_no_feed']: config_params.update({ 'knowledge_num': args.knowledge_num, 'knowledge_length': args.knowledge_length, 'knowledge_dim': args.knowledge_dim, }) # 加载模型 model, tokenizer = load_model(args.model_path, args.model_type, args.device, config_params) # 加载数据 samples = load_eval_data(args.data_path, args.num_samples) # 评估每个样本 total_loss = 0 valid_samples = 0 total_requested_tokens = 0 total_actual_tokens = 0 samples_with_eos = 0 samples_truncated_by_eos = 0 for i, sample in enumerate(samples): print(f"\n{'='*60}") print(f"样本 {i+1}/{len(samples)}") print(f"{'='*60}") text = sample['text'] # 评估样本 input_text, predicted_text, ground_truth_text, loss, generation_stats = evaluate_sample( model, tokenizer, text, args.input_length, args.predict_length, args.device ) if input_text is None: print("跳过该样本(文本长度不足)") continue # 打印结果 print(f"\n输入 ({args.input_length} tokens):") print(f" {input_text}") print(f"\n预测输出 (请求{generation_stats['requested_length']}个token, 实际生成{generation_stats['actual_length']}个):") print(f" {predicted_text}") print(f"\n真实值 ({args.predict_length} tokens):") print(f" {ground_truth_text}") # 打印生成统计信息 print(f"\n生成统计:") print(f" 请求生成: {generation_stats['requested_length']} tokens") print(f" 实际生成: {generation_stats['actual_length']} tokens") if generation_stats['eos_found']: print(f" ✅ 发现EOS token在位置 {generation_stats['eos_position']}") if generation_stats['truncated_by_eos']: print(f" ⚠️ 因EOS token提前结束生成") else: print(f" ✅ EOS token出现在预期位置") else: print(f" ❌ 未发现EOS token (可能达到最大长度限制)") if loss is not None: print(f"\nLoss: {loss:.4f}") total_loss += loss valid_samples += 1 # 更新生成统计 total_requested_tokens += generation_stats['requested_length'] total_actual_tokens += generation_stats['actual_length'] if generation_stats['eos_found']: samples_with_eos += 1 if generation_stats['truncated_by_eos']: samples_truncated_by_eos += 1 # 打印总体统计 if valid_samples > 0: print(f"\n{'='*60}") print(f"总体统计:") print(f" 有效样本数: {valid_samples}") print(f" 平均Loss: {total_loss / valid_samples:.4f}") print() print(f"生成统计:") print(f" 请求生成总tokens: {total_requested_tokens}") print(f" 实际生成总tokens: {total_actual_tokens}") print(f" 生成完成率: {total_actual_tokens / total_requested_tokens * 100:.1f}%" if total_requested_tokens > 0 else " 生成完成率: N/A") print(f" 发现EOS的样本: {samples_with_eos}/{len(samples)} ({samples_with_eos/len(samples)*100:.1f}%)" if len(samples) > 0 else " 发现EOS的样本: N/A") print(f" 被EOS截断的样本: {samples_truncated_by_eos}/{len(samples)} ({samples_truncated_by_eos/len(samples)*100:.1f}%)" if len(samples) > 0 else " 被EOS截断的样本: N/A") print(f" 平均每样本生成长度: {total_actual_tokens/len(samples):.1f} tokens" if len(samples) > 0 else " 平均每样本生成长度: N/A") print(f"{'='*60}") if __name__ == "__main__": main()