Minimind/preprocessing/test_preprocess_small.py

#!/usr/bin/env python3
"""
Small-scale test for the pretraining preprocessing script.

Run directly: python test_preprocess_small.py
"""
import sys
import os
import random

# Add the preprocessing directory to the module search path
sys.path.append('/home/pci/nas/AI_Large_Model_Team/ycz/Minimind/preprocessing')

# Import the main preprocessing module
from preprocess_pretrain import *

# Shrink the dataset configs for a small-scale test
DATASET_CONFIG["wikipedia"]["max_samples"] = 100
DATASET_CONFIG["gutenberg"]["max_samples"] = 50
DATASET_CONFIG["openwebtext"]["max_samples"] = 20
DATASET_CONFIG_EXTRA["wikipedia"]["max_samples"] = 50
DATASET_CONFIG_EXTRA["gutenberg"]["max_samples"] = 30
DATASET_CONFIG_EXTRA["openwebtext"]["max_samples"] = 15

# Redirect output to temporary files
OUTPUT_FILE = "/tmp/test_main.jsonl"
OUTPUT_FILE_EXTRA = "/tmp/test_extra.jsonl"
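
# Added safeguard (not in the original script): clear any stale outputs from a
# previous run. This assumes merge_datasets() could append to existing files
# rather than overwrite them; that behavior is an assumption, not verified here.
for _stale in (OUTPUT_FILE, OUTPUT_FILE_EXTRA):
    if os.path.exists(_stale):
        os.remove(_stale)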


def test_small_scale():
    """Run the preprocessing pipeline at a small scale."""
    print("Starting small scale test...")

    # Fix the random seed so runs are reproducible
    random.seed(42)

    try:
        # Initialize the tokenizer
        init_tokenizer()

        # Merge the datasets into the output files
        merge_datasets()

        # Check that the output files were created and count their lines
        if os.path.exists(OUTPUT_FILE):
            with open(OUTPUT_FILE, 'r') as f:
                main_lines = len(f.readlines())
            print(f"Main file created: {main_lines} lines")

        if os.path.exists(OUTPUT_FILE_EXTRA):
            with open(OUTPUT_FILE_EXTRA, 'r') as f:
                extra_lines = len(f.readlines())
            print(f"Extra file created: {extra_lines} lines")
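
        # Not part of the original test: spot-check the first record of the
        # main output. This assumes each JSONL line is a JSON object (commonly
        # with a "text" field); the exact schema written by preprocess_pretrain
        # is not verified here.
        if os.path.exists(OUTPUT_FILE):
            import json
            with open(OUTPUT_FILE, 'r') as f:
                first_line = f.readline().strip()
            if first_line:
                record = json.loads(first_line)
                print(f"First record keys: {list(record.keys())}")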

        print("Small scale test completed successfully!")

    except Exception as e:
        print(f"Test failed: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    test_small_scale()