Minimind/run_file/experiment_template.sh
2025-09-06 15:12:05 +08:00

363 lines
14 KiB
Bash
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# ============================================================================
# MiniMind experiment script template - Experiment [VERSION]
# ============================================================================
#
# Legend:
# - [human-filled] - configured by the human researcher before the experiment
# - [AI-built]     - placeholders substituted automatically by the AI while
#                    the experiment script is being built
#
# Usage:
# 1. Copy this template to experiment_X.X.X.sh
# 2. Replace every [PLACEHOLDER] token
# 3. Run: bash run_file/experiment_X.X.X.sh
# ============================================================================
# ----------------------------------------------------------------------------
# [human-filled] Basic experiment information
# ----------------------------------------------------------------------------
EXPERIMENT_VERSION="[VERSION]" # Experiment version number, e.g. 1.4.1 (also used to name the log directory)
EXPERIMENT_DESCRIPTION="[DESCRIPTION]" # Short description of the experiment
RESEARCHER_NAME="[RESEARCHER]" # Name of the researcher
EXPERIMENT_DATE="$(date '+%Y-%m-%d %H:%M:%S')" # Experiment start time, captured automatically when the script starts
# ----------------------------------------------------------------------------
# [AI-built] Environment configuration
# ----------------------------------------------------------------------------
# Python environment setup
# Note: choose the activation method that matches the actual environment
# Option 1: Conda environment (if conda is used)
# source $(conda info --base)/etc/profile.d/conda.sh
# conda activate [CONDA_ENV]
# Option 2: uv virtual environment (recommended)
# export VIRTUAL_ENV="[VENV_PATH]"
# source "$VIRTUAL_ENV/bin/activate"
# Debugging and monitoring environment variables
export PYTHONFAULTHANDLER=1 # Python fault handler
# export NCCL_DEBUG=INFO # NCCL debug output (enable only while debugging)
# export CUDA_LAUNCH_BLOCKING=1 # Synchronous CUDA execution (severely hurts performance; enable only while debugging)
# Force unbuffered output so log lines are written immediately (does not affect GPU performance)
export PYTHONUNBUFFERED=1 # Python interpreter does not buffer its output
export PYTHONIOENCODING=utf-8 # Keep the I/O encoding consistent
# SwanLab configuration
export SWANLAB_API_KEY="[SWANLAB_API_KEY]" # [AI-built] SwanLab API key
export SWANLAB_PROJECT="[SWANLAB_PROJECT]" # [AI-built] SwanLab project name
# Logging configuration
# LOG_DIR is a relative path, so it is created under the directory the script
# is started from.
LOG_DIR="out/experiment_${EXPERIMENT_VERSION}"
mkdir -p "$LOG_DIR"
LOG_FILE="$LOG_DIR/experiment.log"
# ----------------------------------------------------------------------------
# [AI-built] Hardware configuration
# ----------------------------------------------------------------------------
# Note: CUDA_VISIBLE_DEVICES is a plain shell variable here (not exported); it
# is placed in front of the launch command when that command is assembled.
CUDA_VISIBLE_DEVICES="[CUDA_DEVICES]" # GPU devices, e.g. 0 or 0,1,2,3
NUM_PROCESSES="[NUM_PROCESSES]" # Number of processes, usually equal to the number of GPUs
MIXED_PRECISION="[MIXED_PRECISION]" # Mixed precision: bf16, fp16, no
MAIN_PROCESS_PORT="[MAIN_PROCESS_PORT]" # Main process port, default: 29500
# ----------------------------------------------------------------------------
# [AI-built] Model architecture parameters
# ----------------------------------------------------------------------------
MODEL_TYPE="[MODEL_TYPE]" # Model type: model, model_original, model_no_feed
MODEL_SIZE="[MODEL_SIZE]" # Model size (MB)
DIM="[DIM]" # Model dimension
N_LAYERS="[N_LAYERS]" # Number of Transformer layers
N_HEADS="[N_HEADS]" # Number of attention heads
MAX_SEQ_LEN="[MAX_SEQ_LEN]" # Maximum sequence length
USE_MOE="[USE_MOE]" # Whether to use MoE: true/false
# Knowledge base configuration
KNOWLEDGE_NUM="[KNOWLEDGE_NUM]" # Number of knowledge entries
KNOWLEDGE_LENGTH="[KNOWLEDGE_LENGTH]" # Length of a single knowledge entry
DISABLE_DB="[DISABLE_DB]" # Whether to disable the database: true/false (only the exact string "true" adds --disable_db)
# ----------------------------------------------------------------------------
# [AI-built] Training hyperparameters
# ----------------------------------------------------------------------------
EPOCHS="[EPOCHS]" # Number of training epochs
EMBEDDING_EPOCH="[EMBEDDING_EPOCH]" # Number of embedding-layer training epochs
BATCH_SIZE="[BATCH_SIZE]" # Batch size
ACCUMULATION_STEPS="[ACCUMULATION_STEPS]" # Gradient accumulation steps
LEARNING_RATE="[LEARNING_RATE]" # Learning rate
DTYPE="[DTYPE]" # Data type: bfloat16, float16, float32
GRAD_CLIP="[GRAD_CLIP]" # Gradient clipping threshold
WARMUP_ITERS="[WARMUP_ITERS]" # Number of warm-up iterations
# Data and cache paths
DATA_PATH="[DATA_PATH]" # Path to the training data (must be an existing file)
DATABASE_INIT_PATH="[DATABASE_INIT_PATH]" # Path to the database initialisation file (the literal "None" skips the existence check)
CLUSTER_CACHE_PATH="[CLUSTER_CACHE_PATH]" # Path to the clustering cache (the literal "None" omits --cluster_cache_path)
# Training configuration
NUM_WORKERS="[NUM_WORKERS]" # Number of data-loader worker processes
LOG_INTERVAL="[LOG_INTERVAL]" # Logging interval
SAVE_INTERVAL="[SAVE_INTERVAL]" # Model checkpoint interval
# Profiling configuration
USE_PROFILE="[USE_PROFILE]" # Whether to enable profiling: true/false
PROFILE_INTERVAL="[PROFILE_INTERVAL]" # Profiling interval (only passed on when USE_PROFILE is "true")
MEMORY_MONITOR_INTERVAL="[MEMORY_MONITOR_INTERVAL]" # Memory monitoring interval
# Advanced features (each flag is enabled only by the exact string "true")
USE_FLASH_ATTN="[USE_FLASH_ATTN]" # Whether to use Flash Attention: true/false
FAST_CLUSTERING="[FAST_CLUSTERING]" # Whether to use fast clustering: true/false
# ----------------------------------------------------------------------------
# [AI-built] Pre-flight checks
# ----------------------------------------------------------------------------
#######################################
# Verify that the machine can run the experiment before anything is launched.
# Globals (read):
#   CUDA_VISIBLE_DEVICES - comma-separated GPU indices to probe
#   DATA_PATH            - training data file that must exist
#   DATABASE_INIT_PATH   - database init file; the literal "None" skips the check
# Outputs:
#   Progress and error messages on stdout.
# Returns:
#   0 when every check passes. On the first failed check the whole script is
#   terminated with exit status 1.
#######################################
check_environment() {
  # Keep the parsed device list local so it does not leak into the script's
  # global namespace.
  local -a gpu_ids=()
  local gpu_id

  echo "🔍 环境检查中..."

  # nvidia-smi must be installed and able to talk to the driver.
  if ! nvidia-smi &> /dev/null; then
    echo "❌ 错误: 未检测到GPU或nvidia-smi不可用"
    exit 1
  fi

  # Every requested device must be individually queryable.
  IFS=',' read -ra gpu_ids <<< "$CUDA_VISIBLE_DEVICES"
  for gpu_id in "${gpu_ids[@]}"; do
    if ! nvidia-smi -i "$gpu_id" &> /dev/null; then
      echo "❌ 错误: GPU $gpu_id 不可用"
      exit 1
    fi
  done

  # PyTorch must be importable by the `python` found on PATH; on success the
  # version line is printed, import errors are suppressed.
  if ! python -c "import torch; print(f'PyTorch: {torch.__version__}')" 2>/dev/null; then
    echo "❌ 错误: PyTorch未正确安装"
    exit 1
  fi

  # The training data must be an existing regular file.
  if [[ ! -f "$DATA_PATH" ]]; then
    echo "❌ 错误: 训练数据文件不存在: $DATA_PATH"
    exit 1
  fi

  # The database init file is optional: "None" means "not used".
  if [[ "$DATABASE_INIT_PATH" != "None" && ! -f "$DATABASE_INIT_PATH" ]]; then
    echo "❌ 错误: 数据库初始化文件不存在: $DATABASE_INIT_PATH"
    exit 1
  fi

  echo "✅ 环境检查通过"
}
# ----------------------------------------------------------------------------
# [AI-built] Experiment information record
# ----------------------------------------------------------------------------
#######################################
# Write a plain-text summary of the experiment settings.
# Globals (read):
#   LOG_DIR plus the experiment, hardware, model, training and data-path
#   settings that appear in the report below.
# Outputs:
#   A progress line on stdout; the report itself goes to
#   $LOG_DIR/experiment_info.txt, replacing any existing file.
#######################################
log_experiment_info() {
  local info_file="${LOG_DIR}/experiment_info.txt"
  local rule="========================================"

  echo "📝 记录实验信息..."

  # One printf argument per report line; the group shares a single redirect.
  {
    printf '%s\n' \
      "${rule}" \
      "MiniMind 实验信息" \
      "${rule}" \
      "实验版本: ${EXPERIMENT_VERSION}" \
      "实验描述: ${EXPERIMENT_DESCRIPTION}" \
      "研究者: ${RESEARCHER_NAME}" \
      "开始时间: ${EXPERIMENT_DATE}" \
      "${rule}" \
      "硬件配置:" \
      "GPU设备: ${CUDA_VISIBLE_DEVICES}" \
      "进程数: ${NUM_PROCESSES}" \
      "混合精度: ${MIXED_PRECISION}" \
      "${rule}" \
      "模型配置:" \
      "模型类型: ${MODEL_TYPE}" \
      "模型大小: ${MODEL_SIZE} MB" \
      "维度: ${DIM}" \
      "层数: ${N_LAYERS}" \
      "注意力头数: ${N_HEADS}" \
      "最大序列长度: ${MAX_SEQ_LEN}" \
      "使用MOE: ${USE_MOE}" \
      "${rule}" \
      "训练配置:" \
      "训练轮次: ${EPOCHS}" \
      "批次大小: ${BATCH_SIZE}" \
      "学习率: ${LEARNING_RATE}" \
      "梯度累积: ${ACCUMULATION_STEPS}" \
      "数据类型: ${DTYPE}" \
      "${rule}" \
      "数据路径:" \
      "训练数据: ${DATA_PATH}" \
      "数据库初始化: ${DATABASE_INIT_PATH}" \
      "聚类缓存: ${CLUSTER_CACHE_PATH}" \
      "${rule}"
  } > "${info_file}"
}
# ----------------------------------------------------------------------------
# [AI-built] Main execution function
# ----------------------------------------------------------------------------
#######################################
# Assemble the accelerate launch command, write it into a wrapper script and
# start that wrapper in the background under nohup.
# Globals (read):
#   The experiment, hardware, model, training and SwanLab settings defined at
#   the top of the file, plus LOG_DIR and LOG_FILE.
#   PROJECT_ROOT (optional) - directory the wrapper changes into before
#     training; defaults to /home/pci/ycz/Code/pretrain-worktree.
#   TMPDIR (optional)       - where the wrapper script is created (default /tmp).
# Globals (written):
#   train_script - path of the generated wrapper script.
# Outputs:
#   Status messages on stdout; the command line, start time, PID and wrapper
#   path are appended to $LOG_FILE, as is all output of the training process.
# Returns:
#   0 when the background process is still alive 5 seconds after launch.
#   Terminates the script with exit 1 if the wrapper cannot be created or the
#   process has already died.
#######################################
run_experiment() {
  # Directory the wrapper runs in. The default is the checkout this template
  # was written for; set PROJECT_ROOT to launch from a different one.
  local project_root="${PROJECT_ROOT:-/home/pci/ycz/Code/pretrain-worktree}"
  local quoted_root
  local train_pid

  echo "🚀 开始执行实验 $EXPERIMENT_VERSION"
  echo "📄 实验描述: $EXPERIMENT_DESCRIPTION"
  echo "⏰ 开始时间: $EXPERIMENT_DATE"

  # The command is assembled as text because it is embedded verbatim in the
  # wrapper script below; the escaped quotes become real quotes there.
  local accelerate_cmd="CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"

  # Launch through uv when it is installed and the current directory is a
  # uv/pyproject project; otherwise call accelerate directly.
  if command -v uv &> /dev/null && [[ -f "pyproject.toml" ]]; then
    accelerate_cmd+=" uv run -p .venv python -m accelerate.commands.launch"
  else
    accelerate_cmd+=" accelerate launch"
  fi

  # accelerate launcher options
  if [[ "$NUM_PROCESSES" -gt 1 ]]; then
    accelerate_cmd+=" --multi_gpu"
  fi
  accelerate_cmd+=" --num_processes=$NUM_PROCESSES"
  accelerate_cmd+=" --mixed_precision=$MIXED_PRECISION"
  accelerate_cmd+=" --main_process_port=$MAIN_PROCESS_PORT"
  accelerate_cmd+=" train_pretrain_accelerate.py"

  # Training script arguments
  accelerate_cmd+=" --out_dir \"$LOG_DIR\""
  accelerate_cmd+=" --epochs $EPOCHS"
  accelerate_cmd+=" --embedding_epoch $EMBEDDING_EPOCH"
  accelerate_cmd+=" --batch_size $BATCH_SIZE"
  accelerate_cmd+=" --learning_rate $LEARNING_RATE"
  accelerate_cmd+=" --dtype $DTYPE"
  accelerate_cmd+=" --num_workers $NUM_WORKERS"
  accelerate_cmd+=" --accumulation_steps $ACCUMULATION_STEPS"
  accelerate_cmd+=" --grad_clip $GRAD_CLIP"
  accelerate_cmd+=" --warmup_iters $WARMUP_ITERS"
  accelerate_cmd+=" --log_interval $LOG_INTERVAL"
  accelerate_cmd+=" --save_interval $SAVE_INTERVAL"
  accelerate_cmd+=" --dim $DIM"
  accelerate_cmd+=" --n_layers $N_LAYERS"
  accelerate_cmd+=" --n_heads $N_HEADS"
  accelerate_cmd+=" --max_seq_len $MAX_SEQ_LEN"
  accelerate_cmd+=" --data_path \"$DATA_PATH\""
  accelerate_cmd+=" --knowledge_num $KNOWLEDGE_NUM"
  accelerate_cmd+=" --knowledge_length $KNOWLEDGE_LENGTH"
  accelerate_cmd+=" --database_init_path \"$DATABASE_INIT_PATH\""
  accelerate_cmd+=" --memory_monitor_interval $MEMORY_MONITOR_INTERVAL"
  accelerate_cmd+=" --model_type \"$MODEL_TYPE\""
  accelerate_cmd+=" --model_size $MODEL_SIZE"

  # Optional switches: each is added only for the exact string "true".
  if [[ "$USE_PROFILE" == "true" ]]; then
    accelerate_cmd+=" --profile"
    accelerate_cmd+=" --profile_interval $PROFILE_INTERVAL"
  fi
  if [[ "$USE_FLASH_ATTN" == "true" ]]; then
    accelerate_cmd+=" --use_flash_attn"
  fi
  if [[ "$FAST_CLUSTERING" == "true" ]]; then
    accelerate_cmd+=" --fast_clustering"
  fi
  if [[ "$DISABLE_DB" == "true" ]]; then
    accelerate_cmd+=" --disable_db"
  fi
  if [[ "$CLUSTER_CACHE_PATH" != "None" ]]; then
    accelerate_cmd+=" --cluster_cache_path \"$CLUSTER_CACHE_PATH\""
  fi

  # SwanLab tracking is always enabled.
  accelerate_cmd+=" --use_swanlab"
  accelerate_cmd+=" --swanlab_project \"$SWANLAB_PROJECT\""

  echo "📋 执行命令:"
  echo "$accelerate_cmd"
  echo

  # Record the command and the launch time (once) in the log file.
  echo "执行命令: $accelerate_cmd" >> "$LOG_FILE"
  echo "🔄 使用nohup后台运行训练输出将写入日志文件: $LOG_FILE"
  echo "开始时间: $(date)" >> "$LOG_FILE"

  # Create the wrapper with mktemp: a fixed name in a shared temp directory
  # can be pre-created or symlinked by another user. train_script stays
  # global so that cleanup hooks can refer to it.
  train_script=$(mktemp "${TMPDIR:-/tmp}/train_${EXPERIMENT_VERSION}.XXXXXX") || {
    echo "❌ 错误: 无法创建训练脚本"
    exit 1
  }

  # Shell-quote the directory so unusual paths survive being embedded in the
  # generated script.
  printf -v quoted_root '%q' "$project_root"

  # In the wrapper, the training status is captured immediately after the
  # command: reading \$? after an echo would report the echo's status instead.
  cat > "$train_script" << EOF
#!/bin/bash
cd $quoted_root || exit 1
source .venv/bin/activate
$accelerate_cmd
exit_code=\$?
echo "结束时间: \$(date)"
echo "退出代码: \$exit_code"
exit "\$exit_code"
EOF
  chmod +x "$train_script"

  # Run detached from the terminal; stdbuf requests line buffering.
  nohup stdbuf -oL -eL bash "$train_script" >> "$LOG_FILE" 2>&1 &
  train_pid=$!
  echo "🔥 训练进程已启动PID: $train_pid"
  echo "训练PID: $train_pid" >> "$LOG_FILE"
  echo "训练脚本: $train_script" >> "$LOG_FILE"

  # Give the process a few seconds, then confirm it did not die immediately.
  sleep 5

  if kill -0 "$train_pid" 2>/dev/null; then
    echo "✅ 训练进程正在后台运行"
    echo "📋 实时查看日志: tail -f $LOG_FILE"
    echo "📋 检查进程状态: ps -p $train_pid"
    echo "🛑 停止训练: kill $train_pid"
    echo "⏰ 预计训练时间: 根据配置而定"
    echo "📈 SwanLab: https://swanlab.cn/project/$SWANLAB_PROJECT"
    echo ""
    echo "训练正在后台运行,可以安全关闭终端。"
  else
    echo "❌ 训练进程启动失败"
    echo "📋 查看日志: $LOG_FILE"
    exit 1
  fi
}
# ----------------------------------------------------------------------------
# [AI-built] Cleanup function
# ----------------------------------------------------------------------------
#######################################
# Cleanup hook. Currently it only announces itself; real cleanup steps are
# meant to be added inside the body.
# Outputs:
#   One status line on stdout.
#######################################
cleanup() {
  printf '%s\n' "🧹 清理临时文件..."
  # Add cleanup logic here
}
# ----------------------------------------------------------------------------
# [AI-built] Signal handling
# ----------------------------------------------------------------------------
# cleanup runs on every exit path through the EXIT trap.
trap cleanup EXIT
# On Ctrl-C / termination: report the interruption and exit with status 130.
# cleanup is deliberately not called here, because `exit` fires the EXIT trap
# above; calling it in both places would run cleanup twice.
trap 'echo "❌ 实验被中断"; exit 130' INT TERM
# ----------------------------------------------------------------------------
# [AI-built] Main program entry
# ----------------------------------------------------------------------------
#######################################
# Top-level driver: print the opening banner, run the pre-flight checks,
# record the experiment information, launch the experiment, then print the
# closing banner with the current time.
# Globals (read):
#   EXPERIMENT_VERSION
# Arguments:
#   Accepted but not used.
# Outputs:
#   Banner lines on stdout, plus whatever the three steps print.
#######################################
main() {
  local banner="============================================================================"

  printf '%s\n' "$banner" "🧠 MiniMind 预训练实验" "$banner"

  # Validate first, then record, then launch.
  check_environment
  log_experiment_info
  run_experiment

  printf '%s\n' \
    "$banner" \
    "✅ 实验 $EXPERIMENT_VERSION 完成" \
    "📅 完成时间: $(date)" \
    "$banner"
}
# Script entry point: run main, forwarding any command-line arguments.
main "$@"