Minimind/run_file/experiment_1_4_0.sh
2025-08-01 15:54:21 +08:00

330 lines
12 KiB
Bash
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# ============================================================================
# MiniMind experiment script - Experiment 1.4.0
# ============================================================================
#
# Goal: build a baseline using model_original and the default parameter
# configuration.
# AI build completion time: placeholder only - a $(date ...) inside a comment
# is never expanded; the launch timestamp is captured in EXPERIMENT_DATE.
# ============================================================================
# ----------------------------------------------------------------------------
# [Filled in by a human] Basic experiment information
# ----------------------------------------------------------------------------
EXPERIMENT_VERSION='1_4_0'
EXPERIMENT_DESCRIPTION='Baseline实验使用model_original构建基准性能指标'
RESEARCHER_NAME='Human+Claude'
EXPERIMENT_DATE=$(date +'%Y-%m-%d %H:%M:%S')
# ----------------------------------------------------------------------------
# [AI-built] Environment configuration
# ----------------------------------------------------------------------------
# Python environment - uses the UV virtual environment.
VIRTUAL_ENV='/home/pci/ycz/Code/pretrain-worktree/.venv'
export VIRTUAL_ENV
. "${VIRTUAL_ENV}/bin/activate"
# Debugging and monitoring environment variables.
# CUDA_LAUNCH_BLOCKING=0 turns synchronous execution off for performance.
export NCCL_DEBUG=INFO PYTHONFAULTHANDLER=1 CUDA_LAUNCH_BLOCKING=0
# SwanLab configuration.
export SWANLAB_PROJECT='MiniMind-Baseline-Experiment'
# Logging configuration: one directory per experiment version.
LOG_DIR="out/experiment_${EXPERIMENT_VERSION}"
LOG_FILE="${LOG_DIR}/experiment.log"
mkdir -p "${LOG_DIR}"
# ----------------------------------------------------------------------------
# 🤖 [AI构建] 硬件配置
# ----------------------------------------------------------------------------
CUDA_VISIBLE_DEVICES=0            # single-GPU training
NUM_PROCESSES=1                   # single process
MIXED_PRECISION=bf16              # bfloat16 mixed precision
MAIN_PROCESS_PORT=29500           # default port
# ----------------------------------------------------------------------------
# [AI-built] Model architecture parameters - baseline configuration
# ----------------------------------------------------------------------------
MODEL_TYPE=model_original         # original Transformer architecture as the baseline
MODEL_SIZE=26.0                   # estimated model size
DIM=512                           # model dimension
N_LAYERS=8                        # number of Transformer layers
N_HEADS=32                        # number of attention heads
MAX_SEQ_LEN=512                   # maximum sequence length
USE_MOE=false                     # MoE is not used
# Knowledge-base settings - not needed for the baseline.
KNOWLEDGE_NUM=1048576             # default value kept, but unused
KNOWLEDGE_LENGTH=32               # default value kept, but unused
DISABLE_DB=true                   # disable the database feature
# ----------------------------------------------------------------------------
# [AI-built] Training hyperparameters - default configuration
# ----------------------------------------------------------------------------
EPOCHS=3                          # training epochs
EMBEDDING_EPOCH=2                 # embedding-layer training epochs
BATCH_SIZE=128                    # batch size
ACCUMULATION_STEPS=8              # gradient accumulation steps (lowers memory demand)
LEARNING_RATE=2e-4                # learning rate
DTYPE=bfloat16                    # data type
GRAD_CLIP=1.0                     # gradient clipping threshold
WARMUP_ITERS=0                    # warm-up iterations
# Data and cache paths.
DATA_PATH=/home/pci/ycz/Code/Minimind/dataset/stable/merged_pretrain.jsonl
DATABASE_INIT_PATH=None           # the baseline does not use the database
CLUSTER_CACHE_PATH=None           # the baseline does not use the cluster cache
# Training-loop settings.
NUM_WORKERS=1                     # data-loading worker processes
LOG_INTERVAL=1                    # logging interval
SAVE_INTERVAL=10000               # checkpoint save interval
# Profiling settings.
USE_PROFILE=true                  # enable profiling
PROFILE_INTERVAL=10               # profiling interval
MEMORY_MONITOR_INTERVAL=10        # memory monitoring interval
# Advanced features.
USE_FLASH_ATTN=true               # use Flash Attention
FAST_CLUSTERING=false             # clustering is not used
# ----------------------------------------------------------------------------
# 🤖 [AI构建] 预检查函数
# ----------------------------------------------------------------------------
#######################################
# Pre-flight checks run before the experiment is launched.
# Verifies, in order: nvidia-smi works, every GPU listed in
# CUDA_VISIBLE_DEVICES can be queried, PyTorch can be imported by `python`,
# and the training data file exists.
# Globals (read): CUDA_VISIBLE_DEVICES, DATA_PATH
# Outputs: progress messages on stdout; error messages on stderr.
# Exits: 1 (terminating the script) on the first failed check.
#######################################
check_environment() {
  # Keep the helper variables local so they do not leak into the caller.
  local -a gpu_ids=()
  local gpu_id

  echo "🔍 环境检查中..."

  # GPU availability.
  if ! nvidia-smi &> /dev/null; then
    echo "❌ 错误: 未检测到GPU或nvidia-smi不可用" >&2
    exit 1
  fi

  # Each requested CUDA device; the list is comma-separated.
  IFS=',' read -ra gpu_ids <<< "$CUDA_VISIBLE_DEVICES"
  for gpu_id in "${gpu_ids[@]}"; do
    if ! nvidia-smi -i "$gpu_id" &> /dev/null; then
      echo "❌ 错误: GPU $gpu_id 不可用" >&2
      exit 1
    fi
  done

  # Python environment: the import prints the PyTorch version on success.
  if ! python -c "import torch; print(f'PyTorch: {torch.__version__}')" 2>/dev/null; then
    echo "❌ 错误: PyTorch未正确安装" >&2
    exit 1
  fi

  # Training data file.
  if [[ ! -f "$DATA_PATH" ]]; then
    echo "❌ 错误: 训练数据文件不存在: $DATA_PATH" >&2
    exit 1
  fi

  echo "✅ 环境检查通过"
}
# ----------------------------------------------------------------------------
# 🤖 [AI构建] 实验信息记录
# ----------------------------------------------------------------------------
#######################################
# Write a human-readable summary of the experiment configuration to
# $LOG_DIR/experiment_info.txt, overwriting any previous file.
# Globals (read): LOG_DIR and the experiment, hardware, model, training and
#   data-path variables listed in the report below.
# Outputs: one progress line on stdout; the report file.
#######################################
log_experiment_info() {
  local -r info_file="${LOG_DIR}/experiment_info.txt"
  local -r rule='========================================'

  echo "📝 记录实验信息..."

  # One array element per output line; printf appends the newline to each.
  local -a report=(
    "$rule"
    "MiniMind Baseline实验信息"
    "$rule"
    "实验版本: ${EXPERIMENT_VERSION}"
    "实验描述: ${EXPERIMENT_DESCRIPTION}"
    "研究者: ${RESEARCHER_NAME}"
    "开始时间: ${EXPERIMENT_DATE}"
    "$rule"
    "硬件配置:"
    "GPU设备: ${CUDA_VISIBLE_DEVICES}"
    "进程数: ${NUM_PROCESSES}"
    "混合精度: ${MIXED_PRECISION}"
    "$rule"
    "模型配置:"
    "模型类型: ${MODEL_TYPE} (Baseline)"
    "模型大小: ${MODEL_SIZE} MB"
    "维度: ${DIM}"
    "层数: ${N_LAYERS}"
    "注意力头数: ${N_HEADS}"
    "最大序列长度: ${MAX_SEQ_LEN}"
    "使用MOE: ${USE_MOE}"
    "禁用数据库: ${DISABLE_DB}"
    "$rule"
    "训练配置:"
    "训练轮次: ${EPOCHS}"
    "批次大小: ${BATCH_SIZE}"
    "学习率: ${LEARNING_RATE}"
    "梯度累积: ${ACCUMULATION_STEPS}"
    "数据类型: ${DTYPE}"
    "$rule"
    "数据路径:"
    "训练数据: ${DATA_PATH}"
    "数据库初始化: ${DATABASE_INIT_PATH}"
    "聚类缓存: ${CLUSTER_CACHE_PATH}"
    "$rule"
  )
  printf '%s\n' "${report[@]}" > "$info_file"
}
# ----------------------------------------------------------------------------
# 🤖 [AI构建] 主执行函数
# ----------------------------------------------------------------------------
#######################################
# Build the accelerate launch command, write it into a temporary wrapper
# script, start that wrapper in the background with nohup, and confirm after
# a short delay that the process is still alive.
# Globals (read): EXPERIMENT_VERSION, EXPERIMENT_DESCRIPTION, EXPERIMENT_DATE,
#   CUDA_VISIBLE_DEVICES, NUM_PROCESSES, MIXED_PRECISION, MAIN_PROCESS_PORT,
#   LOG_DIR, LOG_FILE, SWANLAB_PROJECT, the model/training parameters passed
#   on the command line below, and the USE_PROFILE / USE_FLASH_ATTN /
#   DISABLE_DB switches.
# Outputs: progress messages on stdout; the command line, start time, PID and
#   wrapper path appended to LOG_FILE; the job's own output appended to
#   LOG_FILE by the background process.
# Exits: 1 if the wrapper script cannot be created or the background process
#   is no longer running 5 seconds after launch.
#######################################
run_experiment() {
  # Directory the wrapper script changes into before training starts.
  local -r project_dir="/home/pci/ycz/Code/pretrain-worktree"
  local -a launch_cmd=()
  local accelerate_cmd cuda_devices_q train_script train_pid

  echo "🚀 开始执行Baseline实验 $EXPERIMENT_VERSION"
  echo "📄 实验描述: $EXPERIMENT_DESCRIPTION"
  echo "⏰ 开始时间: $EXPERIMENT_DATE"

  # Prefer `uv run` when uv is installed and the project has a pyproject.toml.
  # The probe uses project_dir, because that is where the command will run,
  # not whatever directory the launcher happened to be started from.
  if command -v uv &> /dev/null && [[ -f "$project_dir/pyproject.toml" ]]; then
    launch_cmd=(uv run python -m accelerate.commands.launch)
  else
    launch_cmd=(accelerate launch)
  fi

  # accelerate options, then the training script and its arguments.
  # NOTE: a relative LOG_DIR is resolved against project_dir by the wrapper,
  # whereas LOG_FILE below is resolved against the launcher's working
  # directory; start the launcher from project_dir to keep them together.
  launch_cmd+=(
    "--num_processes=$NUM_PROCESSES"
    "--mixed_precision=$MIXED_PRECISION"
    "--main_process_port=$MAIN_PROCESS_PORT"
    train_pretrain_accelerate.py
    --out_dir "$LOG_DIR"
    --epochs "$EPOCHS"
    --embedding_epoch "$EMBEDDING_EPOCH"
    --batch_size "$BATCH_SIZE"
    --learning_rate "$LEARNING_RATE"
    --dtype "$DTYPE"
    --num_workers "$NUM_WORKERS"
    --accumulation_steps "$ACCUMULATION_STEPS"
    --grad_clip "$GRAD_CLIP"
    --warmup_iters "$WARMUP_ITERS"
    --log_interval "$LOG_INTERVAL"
    --save_interval "$SAVE_INTERVAL"
    --dim "$DIM"
    --n_layers "$N_LAYERS"
    --n_heads "$N_HEADS"
    --max_seq_len "$MAX_SEQ_LEN"
    --data_path "$DATA_PATH"
    --knowledge_num "$KNOWLEDGE_NUM"
    --knowledge_length "$KNOWLEDGE_LENGTH"
    --memory_monitor_interval "$MEMORY_MONITOR_INTERVAL"
    --model_type "$MODEL_TYPE"
    --model_size "$MODEL_SIZE"
    --swanlab_online false
  )

  # Optional switches.
  if [[ "$USE_PROFILE" == "true" ]]; then
    launch_cmd+=(--profile --profile_interval "$PROFILE_INTERVAL")
  fi
  if [[ "$USE_FLASH_ATTN" == "true" ]]; then
    launch_cmd+=(--use_flash_attn)
  fi
  if [[ "$DISABLE_DB" == "true" ]]; then
    launch_cmd+=(--disable_db)
  fi

  # SwanLab configuration.
  launch_cmd+=(--use_swanlab --swanlab_project "$SWANLAB_PROJECT")

  # Render the argument vector as one shell-quoted line so it survives being
  # written into (and re-parsed from) the wrapper script unchanged.
  printf -v accelerate_cmd '%q ' "${launch_cmd[@]}"
  accelerate_cmd=${accelerate_cmd% }
  printf -v cuda_devices_q '%q' "$CUDA_VISIBLE_DEVICES"
  accelerate_cmd="CUDA_VISIBLE_DEVICES=${cuda_devices_q} ${accelerate_cmd}"

  echo "📋 执行命令:"
  echo "$accelerate_cmd"
  echo

  # Record the command and the start time (once) in the log file.
  echo "执行命令: $accelerate_cmd" >> "$LOG_FILE"
  echo "开始时间: $(date)" >> "$LOG_FILE"
  echo "🔄 使用nohup后台运行训练输出将写入日志文件: $LOG_FILE"

  # Create the wrapper with mktemp so its path in the shared temp directory
  # is not predictable. It is left in place because the background job is
  # still reading it after this launcher exits.
  train_script=$(mktemp "${TMPDIR:-/tmp}/train_${EXPERIMENT_VERSION}.XXXXXX") || {
    echo "❌ 错误: 无法创建临时训练脚本" >&2
    exit 1
  }

  # Unescaped $... is expanded now; \$... is left for the wrapper to expand.
  # The wrapper saves the training exit status immediately, before any echo
  # can overwrite $?, then reports and returns it.
  cat > "$train_script" << EOF
#!/bin/bash
cd "$project_dir" || exit 1
source "$project_dir/.venv/bin/activate"
$accelerate_cmd
exit_code=\$?
echo "结束时间: \$(date)"
echo "退出代码: \$exit_code"
exit "\$exit_code"
EOF
  chmod +x "$train_script"

  # Start the wrapper in the background, detached from the terminal.
  nohup bash "$train_script" >> "$LOG_FILE" 2>&1 &
  train_pid=$!
  echo "🔥 训练进程已启动PID: $train_pid"
  echo "训练PID: $train_pid" >> "$LOG_FILE"
  echo "训练脚本: $train_script" >> "$LOG_FILE"

  # Give the process a few seconds, then check that it is still running.
  sleep 5
  if kill -0 "$train_pid" 2>/dev/null; then
    echo "✅ 训练进程正在后台运行"
    echo "📋 实时查看日志: tail -f $LOG_FILE"
    echo "📋 检查进程状态: ps -p $train_pid"
    echo "🛑 停止训练: kill $train_pid"
    echo "⏰ 预计训练时间: 约17小时"
    echo "📈 SwanLab: https://swanlab.cn/project/$SWANLAB_PROJECT"
    echo ""
    echo "训练正在后台运行,可以安全关闭终端。"
  else
    echo "❌ 训练进程启动失败"
    echo "📋 查看日志: $LOG_FILE"
    exit 1
  fi
}
# ----------------------------------------------------------------------------
# 🤖 [AI构建] 清理函数
# ----------------------------------------------------------------------------
#######################################
# Cleanup hook installed by the script's traps.
# Currently only announces the cleanup; actual cleanup logic can be added
# in the body.
# Outputs: one status line on stdout.
#######################################
cleanup() {
  printf '%s\n' "🧹 清理临时文件..."
}
# ----------------------------------------------------------------------------
# 🤖 [AI构建] 信号处理
# ----------------------------------------------------------------------------
# Run cleanup once on every exit path, including the exits triggered below.
trap cleanup EXIT
# On interruption only report and exit: the EXIT trap above performs the
# cleanup, so calling cleanup here as well would run it twice.
# Exit status follows the 128 + signal-number convention (INT=2, TERM=15).
trap 'echo "❌ 实验被中断"; exit 130' INT
trap 'echo "❌ 实验被中断"; exit 143' TERM
# ----------------------------------------------------------------------------
# 🤖 [AI构建] 主程序入口
# ----------------------------------------------------------------------------
#######################################
# Script entry point: print a banner, run the environment checks, write the
# experiment summary, then launch the experiment.
# run_experiment only starts the training job in the background, so the
# closing message reports a launch rather than a finished experiment.
# Globals (read): EXPERIMENT_VERSION
# Arguments: accepted but not used.
# Outputs: banner and status lines on stdout.
#######################################
main() {
  local -r banner="============================================================================"

  printf '%s\n' "$banner"
  echo "🧠 MiniMind Baseline预训练实验"
  printf '%s\n' "$banner"

  # Checks and initialisation.
  check_environment
  log_experiment_info

  # Launch the experiment (returns once the background job is confirmed).
  run_experiment

  printf '%s\n' "$banner"
  echo "✅ Baseline实验 $EXPERIMENT_VERSION 已在后台启动"
  echo "📅 启动时间: $(date)"
  printf '%s\n' "$banner"
}
# Run the main program, forwarding all command-line arguments unchanged.
main "$@"