#!/bin/bash
#
# run_file/experiment_1.3.1.sh
#
# Experiment 1.3.1: launch train_pretrain_accelerate.py through
# `accelerate launch` on CUDA devices 0-3, configuring accelerate directly
# with command-line arguments.
#
# Requires: `conda` on PATH and a conda environment named ycz_accelerate.
# The training script and the ./dataset, ./cache and out paths below are
# relative, so they resolve against the caller's working directory.

# Print an error message to stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Activate the conda environment.
# Each step is checked explicitly: if `conda info --base` failed silently,
# the path below would collapse to /etc/profile.d/conda.sh, and a failed
# activation would otherwise let training start in the wrong environment.
conda_base=$(conda info --base) || die "cannot determine conda base (is conda on PATH?)"
# shellcheck source=/dev/null
source "${conda_base}/etc/profile.d/conda.sh" \
  || die "cannot source ${conda_base}/etc/profile.d/conda.sh"
conda activate ycz_accelerate || die "cannot activate conda env ycz_accelerate"

# Environment variables that help with debugging.
export NCCL_DEBUG=INFO
export PYTHONFAULTHANDLER=1

# Options consumed by `accelerate launch` itself.
launch_args=(
  --multi_gpu
  --num_processes=4
  --mixed_precision=bf16
  --main_process_port=29500
)

# Options forwarded to train_pretrain_accelerate.py.
train_args=(
  # Output, schedule and optimisation settings.
  --out_dir "out"
  --epochs 3
  --embedding_epoch 2
  --batch_size 48
  --learning_rate 2e-4
  --dtype bfloat16
  --use_swanlab
  --swanlab_project "MiniMind-Pretrain"
  --num_workers 1
  --accumulation_steps 32
  --grad_clip 1.0
  --warmup_iters 0
  --log_interval 100
  --save_interval 10000

  # Model shape.
  --dim 1024
  --n_layers 18
  --max_seq_len 512
  # NOTE(review): if the training script parses this option with a plain
  # bool() conversion, the string "False" may be treated as true - confirm
  # against the argument parser in train_pretrain_accelerate.py.
  --use_moe False

  # Training data and profiling.
  --data_path "./dataset/stable/merged_pretrain.jsonl"
  --profile
  --profile_interval 10
  --use_flash_attn

  # Knowledge database and clustering cache.
  --knowledge_num 1048576
  --knowledge_length 32
  --database_init_path "./dataset/stable/sentence_trex_data.json"
  --fast_clustering
  --cluster_cache_path "./cache/cluster_tokens_single.pt"

  # Monitoring and model identification.
  --memory_monitor_interval 10
  --model_type "model"
  --model_size 814.724
)

# The number of visible devices matches --num_processes=4 above.
# This is the last command, so its exit status becomes the script's.
CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
  "${launch_args[@]}" \
  train_pretrain_accelerate.py \
  "${train_args[@]}"