Minimind/run_file/experiment_1.3.0.sh

47 lines
1.3 KiB
Bash

#!/bin/bash
# Environment setup for experiment 1.3.0: activate the conda environment and
# export debugging-related variables before the training command is launched.

# Load conda's shell integration so that `conda activate` is available in this
# non-interactive script. The base path is resolved into a quoted variable so
# an installation path containing spaces is not word-split, and each step is
# checked explicitly so training never starts in the wrong Python environment.
# (Explicit checks are used instead of `set -u`, since activation hooks may
# reference unset variables.)
conda_base="$(conda info --base)" || {
  printf '%s\n' "error: 'conda info --base' failed; is conda on PATH?" >&2
  exit 1
}
# shellcheck source=/dev/null
source "${conda_base}/etc/profile.d/conda.sh" || {
  printf '%s\n' "error: cannot source ${conda_base}/etc/profile.d/conda.sh" >&2
  exit 1
}
conda activate ycz_accelerate || {
  printf '%s\n' "error: failed to activate conda environment 'ycz_accelerate'" >&2
  exit 1
}

# Environment variables that help with debugging.
export NCCL_DEBUG=INFO        # have NCCL print informational debug output
export PYTHONFAULTHANDLER=1   # have Python dump tracebacks on fatal errors
# Experiment 1.3.0 - accelerate is configured directly through command-line
# arguments (no accelerate config file is passed on this command line).

# Options consumed by `accelerate launch` itself. The process count matches
# the four GPU indices exposed through CUDA_VISIBLE_DEVICES below.
launcher_opts=(
  --multi_gpu
  --num_processes=4
  --mixed_precision=bf16
  --main_process_port=29500
)

# Options forwarded verbatim to train_pretrain_accelerate.py. The exact meaning
# of each flag is defined by that script; the order is kept as-is.
trainer_opts=(
  --out_dir "out"
  --epochs 3
  --embedding_epoch 2
  --batch_size 64
  --learning_rate 8e-5
  --dtype bfloat16
  --use_swanlab
  --swanlab_project "MiniMind-Pretrain"
  --num_workers 1
  --accumulation_steps 16
  --grad_clip 0.5
  --warmup_iters 0
  --log_interval 100
  --save_interval 10000
  --dim 1024
  --n_layers 48
  --max_seq_len 512
  --data_path "./dataset/stable/merged_pretrain.jsonl"
  --profile
  --profile_interval 10
  --use_flash_attn
  --knowledge_num 1048576
  --knowledge_length 32
  --database_init_path "./dataset/stable/sentence_trex_data.json"
  --fast_clustering
  --cluster_cache_path "./cache/cluster_tokens_single.pt"
  --memory_monitor_interval 10
  --model_type "model_original"
  --model_size 538
)

# Relative paths (script and data) are resolved against the current working
# directory, so this must be run from the directory that contains them.
CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch "${launcher_opts[@]}" \
  train_pretrain_accelerate.py "${trainer_opts[@]}"