#!/bin/bash
#
# Experiment 1.3.0: launch train_pretrain_accelerate.py on 4 GPUs, configuring
# `accelerate` directly through command-line arguments (no accelerate config
# file is referenced here).
#
# Usage: run from the directory that contains train_pretrain_accelerate.py.
# Every path below ("out", ./dataset/..., ./cache/...) is relative to the
# current working directory.
#
# Requires: `conda` resolvable by the shell, with an environment named
# `ycz_accelerate` that provides the `accelerate` CLI.
#
# NOTE: `set -u` is intentionally not enabled; conda's activation hooks may
# reference unset variables. Critical steps are checked explicitly instead.

# Print an error message to stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# --- Activate the conda environment ------------------------------------------
command -v conda >/dev/null 2>&1 || die "conda not found"
conda_base=$(conda info --base) || die "'conda info --base' failed"
# Quoted so an install prefix containing spaces still resolves to one path.
# shellcheck source=/dev/null
source "${conda_base}/etc/profile.d/conda.sh" \
  || die "cannot source ${conda_base}/etc/profile.d/conda.sh"
# Stop here rather than launching training in the wrong Python environment.
conda activate ycz_accelerate \
  || die "cannot activate conda environment 'ycz_accelerate'"

# --- Environment variables that help with debugging ---------------------------
export NCCL_DEBUG=INFO          # verbose NCCL logging
export PYTHONFAULTHANDLER=1     # Python dumps tracebacks on fatal errors

# --- Options consumed by `accelerate launch` itself ---------------------------
launch_args=(
  --multi_gpu
  --num_processes=4             # one process per visible GPU (see below)
  --mixed_precision=bf16
  --main_process_port=29500
)

# --- Options forwarded to train_pretrain_accelerate.py ------------------------
# Flag semantics are defined by the training script, which is not part of this
# file; the groupings below follow the flag names only.
train_args=(
  # Output location and run length.
  --out_dir "out"
  --epochs 3
  --embedding_epoch 2

  # Batch / optimiser settings.
  --batch_size 48
  --learning_rate 2e-4
  --dtype bfloat16
  --num_workers 1
  --accumulation_steps 32
  --grad_clip 1.0
  --warmup_iters 0

  # Experiment tracking, logging and checkpointing.
  --use_swanlab
  --swanlab_project "MiniMind-Pretrain"
  --log_interval 100
  --save_interval 10000

  # Model shape.
  --dim 1024
  --n_layers 18
  --max_seq_len 512
  # NOTE(review): the literal string "False" is passed here; confirm the
  # training script parses it as a boolean (an argparse `type=bool` option
  # would treat any non-empty string as true).
  --use_moe False

  # Training data.
  --data_path "./dataset/stable/merged_pretrain.jsonl"

  # Profiling and attention implementation.
  --profile
  --profile_interval 10
  --use_flash_attn

  # Knowledge database options (1048576 == 2^20).
  --knowledge_num 1048576
  --knowledge_length 32
  --database_init_path "./dataset/stable/sentence_trex_data.json"
  --fast_clustering
  --cluster_cache_path "./cache/cluster_tokens_single.pt"

  # Monitoring and model selection.
  --memory_monitor_interval 10
  --model_type "model"
  --model_size 814.724          # presumably millions of parameters - TODO confirm
)

# --- Launch -------------------------------------------------------------------
# Four GPUs are exposed to match --num_processes=4 above. The script's exit
# status is that of `accelerate launch`.
CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
  "${launch_args[@]}" \
  train_pretrain_accelerate.py \
  "${train_args[@]}"