#!/usr/bin/env python3
"""Small-scale test of the preprocessing pipeline."""

import os
import random
import sys
import traceback

# Make the preprocessing module importable.
sys.path.append('/home/pci/nas/AI_Large_Model_Team/ycz/Minimind/preprocessing')

# Import the main preprocessing module. The star import pulls in
# DATASET_CONFIG, DATASET_CONFIG_EXTRA, init_tokenizer, and merge_datasets;
# the module handle is kept so its globals can be rebound below.
import preprocess_pretrain
from preprocess_pretrain import *

# Shrink the configuration for a small-scale test. The config dicts are
# mutated in place, so preprocess_pretrain sees the reduced sample caps.
DATASET_CONFIG["wikipedia"]["max_samples"] = 100
DATASET_CONFIG["gutenberg"]["max_samples"] = 50
DATASET_CONFIG["openwebtext"]["max_samples"] = 20

DATASET_CONFIG_EXTRA["wikipedia"]["max_samples"] = 50
DATASET_CONFIG_EXTRA["gutenberg"]["max_samples"] = 30
DATASET_CONFIG_EXTRA["openwebtext"]["max_samples"] = 15

# Redirect the output paths. Rebinding OUTPUT_FILE here only changes this
# module's name, so also set the attributes on preprocess_pretrain itself
# (assuming merge_datasets() reads its own module-level globals).
OUTPUT_FILE = "/tmp/test_main.jsonl"
OUTPUT_FILE_EXTRA = "/tmp/test_extra.jsonl"
preprocess_pretrain.OUTPUT_FILE = OUTPUT_FILE
preprocess_pretrain.OUTPUT_FILE_EXTRA = OUTPUT_FILE_EXTRA
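
# Clear stale outputs from earlier runs so the line counts printed below
# reflect this run only (an addition; plain stdlib, no new assumptions).
for _path in (OUTPUT_FILE, OUTPUT_FILE_EXTRA):
    if os.path.exists(_path):
        os.remove(_path)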


def test_small_scale():
    """Run the full pipeline end to end on a handful of samples."""
    print("Starting small scale test...")

    # Fix the random seed so sampling is reproducible across runs.
    random.seed(42)

    try:
        # Initialize the tokenizer.
        init_tokenizer()

        # Merge the datasets.
        merge_datasets()

        # Check the output files.
        if os.path.exists(OUTPUT_FILE):
            with open(OUTPUT_FILE, 'r') as f:
                main_lines = len(f.readlines())
            print(f"Main file created: {main_lines} lines")
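            # Loose sanity check (an addition, assuming one JSON record per
            # line in the JSONL output): an empty file signals a silent failure.
            if main_lines == 0:
                print("WARNING: main output file exists but is empty")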

        if os.path.exists(OUTPUT_FILE_EXTRA):
            with open(OUTPUT_FILE_EXTRA, 'r') as f:
                extra_lines = len(f.readlines())
            print(f"Extra file created: {extra_lines} lines")

        print("Small scale test completed successfully!")

    except Exception as e:
        print(f"Test failed: {e}")
        traceback.print_exc()
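        # Exit non-zero so shell callers and CI can detect the failure.
        sys.exit(1)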


if __name__ == "__main__":
    test_small_scale()