#!/usr/bin/env python3
"""Small-scale test of the preprocessing pipeline."""

import os
import random
import sys
import traceback

# Make the preprocessing module importable.
sys.path.append('/home/pci/nas/AI_Large_Model_Team/ycz/Minimind/preprocessing')

# Import the main preprocessing module. The star import pulls in
# DATASET_CONFIG, DATASET_CONFIG_EXTRA, init_tokenizer, and merge_datasets;
# the module handle is kept so its globals can be rebound below.
import preprocess_pretrain
from preprocess_pretrain import *

# Shrink the configuration for a small-scale test. The config dicts are
# mutated in place, so preprocess_pretrain sees the reduced sample caps.
DATASET_CONFIG["wikipedia"]["max_samples"] = 100
DATASET_CONFIG["gutenberg"]["max_samples"] = 50
DATASET_CONFIG["openwebtext"]["max_samples"] = 20

DATASET_CONFIG_EXTRA["wikipedia"]["max_samples"] = 50
DATASET_CONFIG_EXTRA["gutenberg"]["max_samples"] = 30
DATASET_CONFIG_EXTRA["openwebtext"]["max_samples"] = 15

# Redirect the output paths. Rebinding OUTPUT_FILE here only changes this
# module's name, so also set the attributes on preprocess_pretrain itself
# (assuming merge_datasets() reads its own module-level globals).
OUTPUT_FILE = "/tmp/test_main.jsonl"
OUTPUT_FILE_EXTRA = "/tmp/test_extra.jsonl"
preprocess_pretrain.OUTPUT_FILE = OUTPUT_FILE
preprocess_pretrain.OUTPUT_FILE_EXTRA = OUTPUT_FILE_EXTRA
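
# Clear stale outputs from earlier runs so the line counts printed below
# reflect this run only (an addition; plain stdlib, no new assumptions).
for _path in (OUTPUT_FILE, OUTPUT_FILE_EXTRA):
    if os.path.exists(_path):
        os.remove(_path)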


def test_small_scale():
    """Run the full pipeline end to end on a handful of samples."""
    print("Starting small scale test...")

    # Fix the random seed so sampling is reproducible across runs.
    random.seed(42)

    try:
        # Initialize the tokenizer.
        init_tokenizer()

        # Merge the datasets.
        merge_datasets()

        # Check the output files.
        if os.path.exists(OUTPUT_FILE):
            with open(OUTPUT_FILE, 'r') as f:
                main_lines = len(f.readlines())
            print(f"Main file created: {main_lines} lines")
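            # Loose sanity check (an addition, assuming one JSON record per
            # line in the JSONL output): an empty file signals a silent failure.
            if main_lines == 0:
                print("WARNING: main output file exists but is empty")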

        if os.path.exists(OUTPUT_FILE_EXTRA):
            with open(OUTPUT_FILE_EXTRA, 'r') as f:
                extra_lines = len(f.readlines())
            print(f"Extra file created: {extra_lines} lines")

        print("Small scale test completed successfully!")

    except Exception as e:
        print(f"Test failed: {e}")
        traceback.print_exc()
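        # Exit non-zero so shell callers and CI can detect the failure.
        sys.exit(1)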


if __name__ == "__main__":
    test_small_scale()