#!/bin/bash
#
# Launch a single-process pretraining run of train_pretrain_accelerate.py
# through `accelerate launch`, configuring accelerate entirely from
# command-line arguments (no accelerate config file is referenced here).
#
# Usage: run from the directory containing train_pretrain_accelerate.py;
# the relative paths below (./dataset, ./cache, out) are resolved against
# the current working directory.

# Activate the conda environment (left disabled; uncomment to enable).
# source $(conda info --base)/etc/profile.d/conda.sh
# conda activate ycz_accelerate

# Set environment variables to aid debugging.
export NCCL_DEBUG=INFO          # have NCCL emit INFO-level diagnostics
export PYTHONFAULTHANDLER=1     # have Python dump tracebacks on fatal errors

# Experiment 1.3.0 - configure accelerate directly via command-line arguments.

# Options consumed by `accelerate launch` itself.
launch_args=(
  --num_processes=1
  --mixed_precision=bf16
  --main_process_port=29500
)

# Training entry point handed to accelerate.
train_script="train_pretrain_accelerate.py"

# Options forwarded verbatim to the training script.
train_args=(
  --out_dir "out"
  --epochs 3
  --embedding_epoch 2
  --batch_size 128
  --learning_rate 8e-5
  --dtype bfloat16
  --use_swanlab
  --swanlab_project "MiniMind-Pretrain"
  --num_workers 1
  --accumulation_steps 16
  --grad_clip 0.5
  --warmup_iters 0
  --log_interval 100
  --save_interval 10000
  --dim 512
  --n_layers 8
  --max_seq_len 512
  --data_path "./dataset/stable/merged_pretrain.jsonl"
  --profile
  --profile_interval 10
  --use_flash_attn
  --knowledge_num 1048576
  --knowledge_length 32
  --database_init_path "./dataset/stable/sentence_trex_data.json"
  --fast_clustering
  --cluster_cache_path "./cache/cluster_tokens_single.pt"
  --memory_monitor_interval 10
  --model_type "model"
  --model_size 538
)

# CUDA_VISIBLE_DEVICES is set only for this command, exposing GPU 0 alone.
# This is the last command, so its exit status becomes the script's.
CUDA_VISIBLE_DEVICES=0 accelerate launch \
  "${launch_args[@]}" \
  "$train_script" \
  "${train_args[@]}"