#!/usr/bin/env python3
"""Small-scale smoke test for the pretraining preprocessing pipeline.

Caps each dataset's sample count to a handful of records, redirects the
output files to /tmp, and runs the full merge once to verify the pipeline
end-to-end.
"""
import os
import random
import sys

# Make the preprocessing package importable.
sys.path.append('/home/pci/nas/AI_Large_Model_Team/ycz/Minimind/preprocessing')

# Import the module object as well as its names. Rebinding module-level
# variables (e.g. OUTPUT_FILE) in *this* namespace does NOT affect the
# copies inside preprocess_pretrain, so plain-variable overrides must be
# written back onto the module object (bug fix — the original script only
# rebound local copies).
import preprocess_pretrain
from preprocess_pretrain import *

# Shrink the dataset caps for a quick test run. Mutating these dicts is
# safe across the wildcard import: both namespaces share the same dict
# objects, so the changes are visible inside preprocess_pretrain.
DATASET_CONFIG["wikipedia"]["max_samples"] = 100
DATASET_CONFIG["gutenberg"]["max_samples"] = 50
DATASET_CONFIG["openwebtext"]["max_samples"] = 20
DATASET_CONFIG_EXTRA["wikipedia"]["max_samples"] = 50
DATASET_CONFIG_EXTRA["gutenberg"]["max_samples"] = 30
DATASET_CONFIG_EXTRA["openwebtext"]["max_samples"] = 15

# Redirect output paths. Also set them on the module object so that
# merge_datasets() writes to the test locations — assumes merge_datasets
# reads the module-level OUTPUT_FILE/OUTPUT_FILE_EXTRA globals; TODO
# confirm against preprocess_pretrain.
OUTPUT_FILE = "/tmp/test_main.jsonl"
OUTPUT_FILE_EXTRA = "/tmp/test_extra.jsonl"
preprocess_pretrain.OUTPUT_FILE = OUTPUT_FILE
preprocess_pretrain.OUTPUT_FILE_EXTRA = OUTPUT_FILE_EXTRA


def _count_lines(path):
    """Return the number of lines in *path*, streaming instead of
    materializing the whole file with readlines()."""
    with open(path, "r", encoding="utf-8") as f:
        return sum(1 for _ in f)


def test_small_scale():
    """Run the preprocessing pipeline on tiny sample caps and report results.

    Prints progress to stdout; on any failure it prints the error and a
    full traceback instead of raising (this is a manual smoke-test script,
    so the broad except acts as the top-level boundary).
    """
    print("Starting small scale test...")

    # Fixed seed so any sampling inside the pipeline is reproducible.
    random.seed(42)

    try:
        # Initialize the tokenizer, then run the full dataset merge.
        init_tokenizer()
        merge_datasets()

        # Verify the output files were produced and report their sizes.
        if os.path.exists(OUTPUT_FILE):
            print(f"Main file created: {_count_lines(OUTPUT_FILE)} lines")

        if os.path.exists(OUTPUT_FILE_EXTRA):
            print(f"Extra file created: {_count_lines(OUTPUT_FILE_EXTRA)} lines")

        print("Small scale test completed successfully!")
    except Exception as e:
        print(f"Test failed: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    test_small_scale()