516 lines
20 KiB
Python
516 lines
20 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""
|
|||
|
|
评估预训练模型的推理效果
|
|||
|
|
用于测试不同实验中训练出来的模型在eval_data.json上的表现
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import os
|
|||
|
|
import json
|
|||
|
|
import argparse
|
|||
|
|
import torch
|
|||
|
|
import torch.nn.functional as F
|
|||
|
|
from transformers import AutoTokenizer
|
|||
|
|
from model.LMConfig import LMConfig
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _key_layer_names(lm_config, model_type, model_keys):
    """Return the parameter names that are spot-checked after loading weights."""
    names = [
        'tok_embeddings.weight',
        'output.weight',
        'norm.weight',
    ]

    for i in range(lm_config.n_layers):
        names.extend([
            f'layers.{i}.attention_norm.weight',
            f'layers.{i}.ffn_norm.weight',
            f'layers.{i}.self_attention.wq.weight',
            f'layers.{i}.self_attention.wk.weight',
            f'layers.{i}.self_attention.wv.weight',
            f'layers.{i}.self_attention.wo.weight',
        ])

        # Feed-forward weights are only listed when the instantiated model
        # actually has them for this layer.
        if f'layers.{i}.feed_forward.w1.weight' in model_keys:
            names.extend([
                f'layers.{i}.feed_forward.w1.weight',
                f'layers.{i}.feed_forward.w2.weight',
                f'layers.{i}.feed_forward.w3.weight',
            ])

    # KnowledgeDataset and cross-attention parameters are only requested for
    # the 'model' and 'model_no_feed' variants.
    if model_type in ('model', 'model_no_feed'):
        names.extend([
            'knowledge_dataset.to_queries.0.weight',
            'knowledge_dataset.keys',
            'knowledge_dataset.knowledge_dataset',
        ])
        for i in range(lm_config.n_layers):
            names.extend([
                f'layers.{i}.cross_attention.to_q.weight',
                f'layers.{i}.cross_attention.to_k.weight',
                f'layers.{i}.cross_attention.to_v.weight',
                f'layers.{i}.cross_attention.to_out.weight',
            ])

    return names


def _verify_key_layers(model_state, state_dict, key_layers):
    """Print a per-layer shape check and return ``(verified, total)`` counts."""
    verified = 0
    total = 0

    for layer_name in key_layers:
        # Names the model does not define are not part of the check at all.
        if layer_name not in model_state:
            continue
        total += 1

        if layer_name not in state_dict:
            print(f" ❌ {layer_name}: 缺失")
            continue

        expected_shape = model_state[layer_name].shape
        actual_shape = state_dict[layer_name].shape
        if expected_shape == actual_shape:
            # Count a layer as verified only when its shape really matches.
            verified += 1
            print(f" ✅ {layer_name}: {actual_shape}")
        else:
            print(f" ❌ {layer_name}: 期望 {expected_shape}, 实际 {actual_shape}")

    return verified, total


def load_model(model_path, model_type, device, config_params=None):
    """
    Build the model, load its weights from disk and load the tokenizer.

    The checkpoint is loaded with ``strict=False`` and a report is printed
    covering matched / missing / unexpected parameter names and a shape check
    of a fixed list of key layers.

    Args:
        model_path: Path to the weight file.
        model_type: One of ``model``, ``model_original`` or ``model_no_feed``;
            selects which module ``MiniMindLM`` is imported from.
        device: Device passed to ``torch.load(map_location=...)`` and
            ``model.to(...)``.
        config_params: Optional dict of keyword arguments for ``LMConfig``.
            When empty or None, ``LMConfig()`` defaults are used.

    Returns:
        model: The loaded model, moved to ``device`` and set to eval mode.
        tokenizer: The tokenizer loaded from ``./model/minimind_tokenizer``.

    Raises:
        ValueError: If ``model_type`` is not supported.
        FileNotFoundError: If ``model_path`` does not exist.
        RuntimeError: If ``load_state_dict`` fails.
    """
    lm_config = LMConfig(**config_params) if config_params else LMConfig()

    # Print the effective configuration.
    print("模型配置:")
    print(f" dim: {lm_config.dim}")
    print(f" n_layers: {lm_config.n_layers}")
    print(f" n_heads: {lm_config.n_heads}")
    print(f" vocab_size: {lm_config.vocab_size}")
    print(f" max_seq_len: {lm_config.max_seq_len}")
    if hasattr(lm_config, 'knowledge_num'):
        print(f" knowledge_num: {lm_config.knowledge_num}")
        print(f" knowledge_length: {lm_config.knowledge_length}")
        print(f" knowledge_dim: {lm_config.knowledge_dim}")
    print()

    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

    # Import the model class that matches the requested variant.
    if model_type == "model":
        from model.model import MiniMindLM
    elif model_type == "model_original":
        from model.model_original import MiniMindLM
    elif model_type == "model_no_feed":
        from model.model_no_feed import MiniMindLM
    else:
        raise ValueError(f"不支持的模型类型: {model_type}")

    model = MiniMindLM(lm_config)

    if not os.path.exists(model_path):
        raise FileNotFoundError(f"模型文件不存在: {model_path}")

    print(f"正在从 {model_path} 加载模型权重...")

    # NOTE(security): torch.load unpickles the file, so only load checkpoints
    # from trusted sources (recent PyTorch versions offer weights_only=True).
    state_dict = torch.load(model_path, map_location=device)

    # Take the model's state dict once; it is reused for the name comparison
    # and for the shape verification below.
    model_state = model.state_dict()
    model_keys = set(model_state)
    checkpoint_keys = set(state_dict)

    matched_keys = model_keys & checkpoint_keys
    missing_keys = model_keys - checkpoint_keys
    unexpected_keys = checkpoint_keys - model_keys

    print("\n权重加载详情:")
    print(f" 模型总参数数量: {len(model_keys)}")
    print(f" 权重文件参数数量: {len(checkpoint_keys)}")
    print(f" 成功匹配参数: {len(matched_keys)}")
    print(f" 缺失参数: {len(missing_keys)}")
    print(f" 多余参数: {len(unexpected_keys)}")

    # List missing and unexpected parameter names in detail.
    if missing_keys:
        print(f"\n❌ 缺失的参数 ({len(missing_keys)}):")
        for key in sorted(missing_keys):
            print(f" - {key}")

    if unexpected_keys:
        print(f"\n⚠️ 权重文件中多余的参数 ({len(unexpected_keys)}):")
        for key in sorted(unexpected_keys):
            print(f" + {key}")

    # Load the weights, tolerating partially matching key sets.
    try:
        incompatible_keys = model.load_state_dict(state_dict, strict=False)
    except Exception as e:
        # Chain the original error so the root cause stays in the traceback.
        raise RuntimeError(f"权重加载失败: {e}") from e

    if not incompatible_keys.missing_keys and not incompatible_keys.unexpected_keys:
        print("\n✅ 权重加载完全成功!")
    elif not incompatible_keys.missing_keys:
        print("\n✅ 权重加载成功(忽略多余参数)")
    else:
        print("\n⚠️ 权重加载部分成功,存在缺失参数")
        print(" 这可能影响模型性能,请检查模型配置参数是否正确")

        # Share of the model's parameter names found in the checkpoint.
        success_rate = len(matched_keys) / len(model_keys) * 100
        print(f" 参数加载成功率: {success_rate:.1f}%")

        if success_rate < 90:
            print(" ❌ 警告:加载成功率过低,模型可能无法正常工作!")
        elif success_rate < 100:
            print(" ⚠️ 警告:存在缺失参数,可能影响模型性能")

    # Verify presence and shape of the key layers.
    print("🔍 验证关键层形状:")
    key_layers = _key_layer_names(lm_config, model_type, model_keys)
    verified_layers, total_key_layers = _verify_key_layers(model_state, state_dict, key_layers)

    print(f"\n关键层验证结果: {verified_layers}/{total_key_layers} 层验证成功")

    # The equality test comes first, so the division below never sees a zero
    # denominator (0 == 0 is caught by the first branch).
    if verified_layers == total_key_layers:
        print("✅ 所有关键层验证通过!")
    elif verified_layers / total_key_layers >= 0.9:
        print("⚠️ 大部分关键层验证通过,模型应该可以正常工作")
    else:
        print("❌ 关键层验证失败过多,模型可能无法正常工作!")

    print()

    model.to(device)
    model.eval()

    return model, tokenizer
|
|||
|
|
|
|||
|
|
|
|||
|
|
def load_eval_data(data_path, num_samples=20):
    """
    Load evaluation samples from a JSON-lines file.

    Blank lines are skipped. Lines that fail to parse as JSON produce a
    warning and are skipped. Reading stops once ``num_samples`` records have
    been collected.

    Args:
        data_path: Path to the data file (one JSON document per line).
        num_samples: Number of samples to evaluate.

    Returns:
        samples: List of parsed samples, at most ``num_samples`` long.
    """
    records = []
    with open(data_path, 'r', encoding='utf-8') as handle:
        for idx, raw_line in enumerate(handle):
            stripped = raw_line.strip()
            if not stripped:
                # Skip blank lines.
                continue

            try:
                record = json.loads(stripped)
            except json.JSONDecodeError as e:
                print(f"警告:第{idx+1}行JSON解析失败: {e}")
                continue

            records.append(record)
            if len(records) >= num_samples:
                break

    # Keep only the first num_samples records.
    samples = records[:num_samples]
    print(f"加载了 {len(samples)} 条评估数据")

    return samples
|
|||
|
|
|
|||
|
|
|
|||
|
|
def evaluate_sample(model, tokenizer, text, input_length=100, predict_length=100, device='cuda'):
    """
    Evaluate a single text sample.

    The first ``input_length`` tokens of ``text`` are used as the prompt and
    the following ``predict_length`` tokens as the ground truth. The model
    generates a continuation of the prompt, and a cross-entropy loss of the
    ground-truth continuation is computed from one forward pass.

    Args:
        model: Model instance; must provide ``generate(...)`` and be callable
            on a token-id tensor, returning an object with ``.logits``.
        tokenizer: Tokenizer instance.
        text: Input text.
        input_length: Number of prompt tokens.
        predict_length: Number of tokens to predict.
        device: Device the tensors are moved to.

    Returns:
        A 5-tuple ``(input_text, predicted_text, ground_truth_text, loss,
        generation_stats)``. When the text has fewer than
        ``input_length + predict_length`` tokens, every element is ``None``.
        ``loss`` is ``None`` when it cannot be computed (no logits, no target
        tokens, or ``input_length < 1``).
    """
    tokens = tokenizer.encode(text, add_special_tokens=False)

    # Make sure there are enough tokens for both the prompt and the target.
    if len(tokens) < input_length + predict_length:
        print(f"警告:文本长度不足,只有 {len(tokens)} 个token")
        # Same arity as the normal return so callers can always unpack five values.
        return None, None, None, None, None

    # Split into prompt and target.
    input_tokens = tokens[:input_length]
    target_tokens = tokens[input_length:input_length + predict_length]

    input_ids = torch.tensor([input_tokens], dtype=torch.long).to(device)

    with torch.no_grad():
        generated = model.generate(
            input_ids,
            max_new_tokens=predict_length,
            temperature=1.0,
            top_p=0.95,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id
        )

        # `generated` holds the full sequence; drop the prompt part. Slicing
        # past the end yields [] when nothing new was generated.
        full_generated_tokens = generated[0].tolist()
        predicted_tokens = full_generated_tokens[input_length:]

        # Detect whether generation stopped early because of an EOS token and
        # keep only the tokens before it.
        eos_id = tokenizer.eos_token_id
        eos_found = eos_id is not None and eos_id in predicted_tokens
        eos_position = predicted_tokens.index(eos_id) if eos_found else -1
        if eos_found:
            predicted_tokens = predicted_tokens[:eos_position]
        actual_predicted_length = len(predicted_tokens)

        # Forward pass over prompt + ground truth for the loss.
        loss_input_ids = torch.tensor(
            [tokens[:input_length + predict_length]], dtype=torch.long
        ).to(device)
        outputs = model(loss_input_ids)

        logits = outputs.logits
        loss = None
        if logits is not None and input_length >= 1 and target_tokens:
            # Assumes the usual causal-LM convention: the logits at position
            # t score the token at position t + 1. The target at index
            # `input_length` is therefore scored by the logits at
            # `input_length - 1`, so the logits window is shifted one step
            # left relative to the targets.
            start = input_length - 1
            shift_logits = logits[0, start:start + len(target_tokens), :].contiguous()
            shift_labels = torch.tensor(target_tokens, dtype=torch.long).to(device)

            loss = F.cross_entropy(shift_logits, shift_labels, reduction='mean').item()

    # Decode the texts.
    input_text = tokenizer.decode(input_tokens, skip_special_tokens=True)
    # Only decode what was actually generated, capped at predict_length.
    actual_predicted_tokens = predicted_tokens[:predict_length]
    if actual_predicted_tokens:
        predicted_text = tokenizer.decode(actual_predicted_tokens, skip_special_tokens=True)
    else:
        predicted_text = "[未生成内容]"
    ground_truth_text = tokenizer.decode(target_tokens, skip_special_tokens=True)

    # Extra statistics about the generation.
    generation_stats = {
        'requested_length': predict_length,
        'actual_length': actual_predicted_length,
        'eos_found': eos_found,
        'eos_position': eos_position if eos_found else None,
        'truncated_by_eos': eos_found and eos_position < predict_length
    }

    return input_text, predicted_text, ground_truth_text, loss, generation_stats
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main():
    """
    Command-line entry point: parse arguments, load the model and the
    evaluation data, evaluate every sample and print per-sample results plus
    an overall summary.
    """
    parser = argparse.ArgumentParser(description='评估预训练模型')
    parser.add_argument('--model_path', type=str, default='out/experiment_1_4_0/pretrain_512.pth',
                        help='模型权重文件路径')
    parser.add_argument('--model_type', type=str, default='model',
                        choices=['model', 'model_original', 'model_no_feed'],
                        help='模型类型')
    parser.add_argument('--data_path', type=str, default='dataset/stable/eval_data.json',
                        help='评估数据集路径')
    parser.add_argument('--num_samples', type=int, default=20,
                        help='评估样本数量')
    parser.add_argument('--input_length', type=int, default=100,
                        help='输入token长度')
    parser.add_argument('--predict_length', type=int, default=100,
                        help='预测token长度')
    parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu',
                        help='运行设备')

    # Model architecture parameters.
    parser.add_argument('--dim', type=int, default=512,
                        help='模型维度')
    parser.add_argument('--n_layers', type=int, default=8,
                        help='Transformer层数')
    parser.add_argument('--n_heads', type=int, default=32,
                        help='注意力头数')
    parser.add_argument('--n_kv_heads', type=int, default=8,
                        help='KV注意力头数')
    parser.add_argument('--vocab_size', type=int, default=6400,
                        help='词汇表大小')
    parser.add_argument('--max_seq_len', type=int, default=512,
                        help='最大序列长度')
    parser.add_argument('--dropout', type=float, default=0.0,
                        help='Dropout率')
    parser.add_argument('--norm_eps', type=float, default=1e-5,
                        help='层归一化epsilon')
    parser.add_argument('--rope_theta', type=float, default=1e6,
                        help='RoPE theta参数')

    # KnowledgeDataset parameters (only used by model and model_no_feed).
    parser.add_argument('--knowledge_num', type=int, default=1048576,
                        help='知识条目数量')
    parser.add_argument('--knowledge_length', type=int, default=32,
                        help='单条知识长度')
    parser.add_argument('--knowledge_dim', type=int, default=128,
                        help='知识维度')

    # MoE parameters.
    parser.add_argument('--use_moe', action='store_true',
                        help='是否使用MOE')
    parser.add_argument('--num_experts_per_tok', type=int, default=2,
                        help='每个token激活的专家数')
    parser.add_argument('--n_routed_experts', type=int, default=4,
                        help='路由专家数量')

    args = parser.parse_args()

    print("评估配置:")
    print(f" 模型路径: {args.model_path}")
    print(f" 模型类型: {args.model_type}")
    print(f" 数据路径: {args.data_path}")
    print(f" 样本数量: {args.num_samples}")
    print(f" 输入长度: {args.input_length} tokens")
    print(f" 预测长度: {args.predict_length} tokens")
    print(f" 运行设备: {args.device}")
    print()

    # Build the keyword arguments for the model configuration.
    config_params = {
        'dim': args.dim,
        'n_layers': args.n_layers,
        'n_heads': args.n_heads,
        'n_kv_heads': args.n_kv_heads,
        'vocab_size': args.vocab_size,
        'max_seq_len': args.max_seq_len,
        'dropout': args.dropout,
        'norm_eps': args.norm_eps,
        'rope_theta': args.rope_theta,
        'use_moe': args.use_moe,
        'num_experts_per_tok': args.num_experts_per_tok,
        'n_routed_experts': args.n_routed_experts,
    }

    # Only model and model_no_feed take the KnowledgeDataset parameters.
    if args.model_type in ['model', 'model_no_feed']:
        config_params.update({
            'knowledge_num': args.knowledge_num,
            'knowledge_length': args.knowledge_length,
            'knowledge_dim': args.knowledge_dim,
        })

    model, tokenizer = load_model(args.model_path, args.model_type, args.device, config_params)

    samples = load_eval_data(args.data_path, args.num_samples)

    total_loss = 0
    valid_samples = 0
    # Samples that were actually evaluated (not skipped as too short); used as
    # the denominator for the per-sample generation statistics.
    evaluated_samples = 0
    total_requested_tokens = 0
    total_actual_tokens = 0
    samples_with_eos = 0
    samples_truncated_by_eos = 0

    for i, sample in enumerate(samples):
        print(f"\n{'='*60}")
        print(f"样本 {i+1}/{len(samples)}")
        print(f"{'='*60}")

        text = sample['text']

        result = evaluate_sample(
            model, tokenizer, text,
            args.input_length, args.predict_length, args.device
        )

        # A too-short text is signalled by a tuple of Nones. Check before
        # unpacking so a sentinel of a different length cannot raise here.
        if result[0] is None:
            print("跳过该样本(文本长度不足)")
            continue

        input_text, predicted_text, ground_truth_text, loss, generation_stats = result
        evaluated_samples += 1

        # Print the per-sample result.
        print(f"\n输入 ({args.input_length} tokens):")
        print(f" {input_text}")
        print(f"\n预测输出 (请求{generation_stats['requested_length']}个token, 实际生成{generation_stats['actual_length']}个):")
        print(f" {predicted_text}")
        print(f"\n真实值 ({args.predict_length} tokens):")
        print(f" {ground_truth_text}")

        # Print the generation statistics.
        print("\n生成统计:")
        print(f" 请求生成: {generation_stats['requested_length']} tokens")
        print(f" 实际生成: {generation_stats['actual_length']} tokens")
        if generation_stats['eos_found']:
            print(f" ✅ 发现EOS token在位置 {generation_stats['eos_position']}")
            if generation_stats['truncated_by_eos']:
                print(" ⚠️ 因EOS token提前结束生成")
            else:
                print(" ✅ EOS token出现在预期位置")
        else:
            print(" ❌ 未发现EOS token (可能达到最大长度限制)")

        if loss is not None:
            print(f"\nLoss: {loss:.4f}")
            total_loss += loss
            valid_samples += 1

        # Accumulate the generation statistics.
        total_requested_tokens += generation_stats['requested_length']
        total_actual_tokens += generation_stats['actual_length']
        if generation_stats['eos_found']:
            samples_with_eos += 1
        if generation_stats['truncated_by_eos']:
            samples_truncated_by_eos += 1

    # Print the overall summary.
    if valid_samples > 0:
        print(f"\n{'='*60}")
        print("总体统计:")
        print(f" 有效样本数: {valid_samples}")
        print(f" 平均Loss: {total_loss / valid_samples:.4f}")
        print()
        print("生成统计:")
        print(f" 请求生成总tokens: {total_requested_tokens}")
        print(f" 实际生成总tokens: {total_actual_tokens}")
        print(f" 生成完成率: {total_actual_tokens / total_requested_tokens * 100:.1f}%" if total_requested_tokens > 0 else " 生成完成率: N/A")
        # evaluated_samples >= valid_samples > 0 here, so the divisions are safe.
        # Skipped samples are excluded so they do not dilute the ratios.
        print(f" 发现EOS的样本: {samples_with_eos}/{evaluated_samples} ({samples_with_eos/evaluated_samples*100:.1f}%)")
        print(f" 被EOS截断的样本: {samples_truncated_by_eos}/{evaluated_samples} ({samples_truncated_by_eos/evaluated_samples*100:.1f}%)")
        print(f" 平均每样本生成长度: {total_actual_tokens/evaluated_samples:.1f} tokens")
        print(f"{'='*60}")
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Run the evaluation only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|