
#!/bin/bash
# ============================================================================
# MiniMind Experiment Script - Experiment 1.4.3
# ============================================================================
#
# 🎯 Goal: verify how using the complete information affects memory-query quality
# 📝 Description: use the complete information h instead of the attention output h_attn for memory lookup and cross-attention
# 🔬 Hypothesis: the complete information carries richer context and should improve memory-query precision and text coherence
# ============================================================================
# ----------------------------------------------------------------------------
# 🧑‍🔬 Experiment metadata
# ----------------------------------------------------------------------------
EXPERIMENT_VERSION="1_4_3"
EXPERIMENT_DESCRIPTION="Complete information (h) for memory query instead of attention output (h_attn)"
RESEARCHER_NAME="Human-AI Collaboration"
EXPERIMENT_DATE="$(date '+%Y-%m-%d %H:%M:%S')"
# ----------------------------------------------------------------------------
# 🤖 Environment configuration
# ----------------------------------------------------------------------------
# The UV-managed virtualenv is invoked below via `uv run` and .venv/bin/python
export PYTHONFAULTHANDLER=1
export CUDA_LAUNCH_BLOCKING=0 # keep at 0 for performance (set to 1 only when debugging)
# SwanLab configuration
export SWANLAB_PROJECT="MiniMind-Memory-Query-Enhancement"
# Logging configuration
LOG_DIR="out/experiment_${EXPERIMENT_VERSION}"
mkdir -p "$LOG_DIR"
LOG_FILE="$LOG_DIR/experiment.log"
# ----------------------------------------------------------------------------
# 🤖 Hardware configuration
# ----------------------------------------------------------------------------
CUDA_VISIBLE_DEVICES="0"
NUM_PROCESSES="1"
MIXED_PRECISION="bf16"
MAIN_PROCESS_PORT="29500"
# ----------------------------------------------------------------------------
# 🤖 Model architecture parameters
# ----------------------------------------------------------------------------
MODEL_TYPE="model" # standard model, already modified for complete-information queries
MODEL_SIZE="26.0"
DIM="512"
N_LAYERS="8"
N_HEADS="32"
MAX_SEQ_LEN="512"
USE_MOE="false"
# Memory bank configuration (kept identical to 1.4.2 for comparison)
KNOWLEDGE_NUM="65536" # 64K memory entries (256x256, a perfect square)
KNOWLEDGE_DIM="128" # memory vector dimension
KNOWLEDGE_LENGTH="32" # length of a single memory entry
NUM_SELECTED="8" # memories selected per query
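
# --- Illustrative addition (not part of the original experiment setup) ---
# The Product Key Memory layout above relies on KNOWLEDGE_NUM being a perfect
# square (65536 = 256 x 256), so each query scores 2 x 256 sub-keys rather
# than all 65536 full keys. A minimal sanity check for that assumption:
pk_side=$(awk -v n="$KNOWLEDGE_NUM" 'BEGIN { printf "%d", sqrt(n) }')
if (( pk_side * pk_side != KNOWLEDGE_NUM )); then
    echo "⚠️ Warning: KNOWLEDGE_NUM=$KNOWLEDGE_NUM is not a perfect square; the 256x256 product-key decomposition will not hold"
fi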
# ----------------------------------------------------------------------------
# 🤖 Training hyperparameters (identical to 1.4.2)
# ----------------------------------------------------------------------------
EPOCHS="3"
EMBEDDING_EPOCH="2"
BATCH_SIZE="64" # kept consistent with the control experiment
ACCUMULATION_STEPS="8"
LEARNING_RATE="2e-4"
DTYPE="bfloat16"
GRAD_CLIP="1.0"
WARMUP_ITERS="0"
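# Note (added for clarity): with gradient accumulation, the effective batch
# size per optimizer step is BATCH_SIZE x ACCUMULATION_STEPS = 64 x 8 = 512.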
# Data paths
DATA_PATH="/home/pci/ycz/Code/Minimind/dataset/stable/merged_pretrain.jsonl"
DATABASE_INIT_PATH="None" # randomly initialize the memory bank, for consistency with 1.4.2
CLUSTER_CACHE_PATH="None"
# Training configuration
NUM_WORKERS="1"
LOG_INTERVAL="1"
SAVE_INTERVAL="10000"
# Profiling configuration
USE_PROFILE="true"
PROFILE_INTERVAL="10"
MEMORY_MONITOR_INTERVAL="10"
# Advanced features
USE_FLASH_ATTN="true"
USE_SWANLAB="true"
SWANLAB_ONLINE="false"
# ----------------------------------------------------------------------------
# 🤖 Pre-flight checks
# ----------------------------------------------------------------------------
check_environment() {
    echo "🔍 Checking environment..."
    # Check GPU availability
    if ! nvidia-smi &> /dev/null; then
        echo "❌ Error: no GPU detected or nvidia-smi is unavailable"
        exit 1
    fi
    # Check the CUDA device
    if ! nvidia-smi -i "$CUDA_VISIBLE_DEVICES" &> /dev/null; then
        echo "❌ Error: GPU $CUDA_VISIBLE_DEVICES is unavailable"
        exit 1
    fi
    # Check the Python environment
    if ! .venv/bin/python -c "import torch; print(f'PyTorch: {torch.__version__}')" 2>/dev/null; then
        echo "❌ Error: PyTorch is not installed correctly"
        exit 1
    fi
    # Check the training data file
    if [[ ! -f "$DATA_PATH" ]]; then
        echo "❌ Error: training data file not found: $DATA_PATH"
        exit 1
    fi
    # Verify the modification in model.py (the pattern matches the exact
    # modified line, including its original inline comment)
    if ! grep -q "h = x + h_attn # 计算完整信息" model/model.py; then
        echo "❌ Error: complete-information query modification not found in model.py"
        echo "Please confirm that MiniMindBlock.forward has been modified accordingly"
        exit 1
    fi
    echo "✅ Environment check passed"
}
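
# --- Optional extra check (illustrative sketch, not part of the original script) ---
# check_environment verifies that the GPU exists but not how much memory is
# free; this helper warns when free GPU memory looks low for BATCH_SIZE=64 at
# MAX_SEQ_LEN=512. The 20000 MiB threshold is an assumption, not a measured
# requirement. Call it alongside check_environment from main() if desired.
check_gpu_memory() {
    local free_mib
    free_mib=$(nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits -i "$CUDA_VISIBLE_DEVICES")
    if (( free_mib < 20000 )); then
        echo "⚠️ Warning: only ${free_mib} MiB free on GPU $CUDA_VISIBLE_DEVICES; training may OOM"
    fi
}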
# ----------------------------------------------------------------------------
# 🤖 Experiment info logging
# ----------------------------------------------------------------------------
log_experiment_info() {
    echo "📝 Recording experiment info..."
    cat > "$LOG_DIR/experiment_info.txt" << EOF
========================================
MiniMind Memory Query Enhancement Experiment
========================================
Experiment version: $EXPERIMENT_VERSION
Description: $EXPERIMENT_DESCRIPTION
Researcher: $RESEARCHER_NAME
Start time: $EXPERIMENT_DATE
========================================
Core changes:
- Memory queries use the complete information h instead of the attention output h_attn
- Cross-attention input also uses the complete information h
- Product Key Memory selection mechanism unchanged
- Cross-attention architecture unchanged
========================================
Technical details:
Old scheme: db, db_embeddings = self.knowledge_dataset.search_index(h_attn)
            h_attn = self.cross_attention(h_attn, db_embeddings)
New scheme: h = x + h_attn  # compute the complete information
            db, db_embeddings = self.knowledge_dataset.search_index(h)
            memory_output = self.cross_attention(h, db_embeddings)
========================================
Control experiments:
- Baseline: 1.4.0 (model_original, Loss: 1.9)
- Comparison: 1.4.1 (h_attn queries, Loss: 0.6, but fragmented text)
- This experiment: 1.4.3 (complete-information h queries)
========================================
Hardware configuration:
GPU device: $CUDA_VISIBLE_DEVICES
Processes: $NUM_PROCESSES
Mixed precision: $MIXED_PRECISION
========================================
Model configuration:
Model type: $MODEL_TYPE (complete-information query variant)
Model size: $MODEL_SIZE MB
Dimension: $DIM
Layers: $N_LAYERS
Attention heads: $N_HEADS
Max sequence length: $MAX_SEQ_LEN
Memory bank entries: $KNOWLEDGE_NUM
Memory vector dimension: $KNOWLEDGE_DIM
Memories selected per query: $NUM_SELECTED
========================================
Training configuration:
Epochs: $EPOCHS
Batch size: $BATCH_SIZE
Learning rate: $LEARNING_RATE
Gradient accumulation: $ACCUMULATION_STEPS
Dtype: $DTYPE
========================================
Data paths:
Training data: $DATA_PATH
Memory bank init: $DATABASE_INIT_PATH
========================================
EOF
}
# ----------------------------------------------------------------------------
# 🤖 Main execution function
# ----------------------------------------------------------------------------
run_experiment() {
    echo "🚀 Starting experiment $EXPERIMENT_VERSION"
    echo "📄 Description: $EXPERIMENT_DESCRIPTION"
    echo "⏰ Start time: $EXPERIMENT_DATE"
    # Build the training command
    local train_cmd="CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES uv run python -m accelerate.commands.launch"
    train_cmd+=" --num_processes=$NUM_PROCESSES"
    train_cmd+=" --mixed_precision=$MIXED_PRECISION"
    train_cmd+=" --main_process_port=$MAIN_PROCESS_PORT"
    train_cmd+=" train_pretrain_accelerate.py"
    # Add training arguments
    train_cmd+=" --out_dir \"$LOG_DIR\""
    train_cmd+=" --epochs $EPOCHS"
    train_cmd+=" --embedding_epoch $EMBEDDING_EPOCH"
    train_cmd+=" --batch_size $BATCH_SIZE"
    train_cmd+=" --learning_rate $LEARNING_RATE"
    train_cmd+=" --dtype $DTYPE"
    train_cmd+=" --num_workers $NUM_WORKERS"
    train_cmd+=" --accumulation_steps $ACCUMULATION_STEPS"
    train_cmd+=" --grad_clip $GRAD_CLIP"
    train_cmd+=" --warmup_iters $WARMUP_ITERS"
    train_cmd+=" --log_interval $LOG_INTERVAL"
    train_cmd+=" --save_interval $SAVE_INTERVAL"
    train_cmd+=" --dim $DIM"
    train_cmd+=" --n_layers $N_LAYERS"
    train_cmd+=" --n_heads $N_HEADS"
    train_cmd+=" --max_seq_len $MAX_SEQ_LEN"
    train_cmd+=" --data_path \"$DATA_PATH\""
    train_cmd+=" --knowledge_num $KNOWLEDGE_NUM"
    train_cmd+=" --knowledge_length $KNOWLEDGE_LENGTH"
    train_cmd+=" --knowledge_dim $KNOWLEDGE_DIM"
    train_cmd+=" --memory_monitor_interval $MEMORY_MONITOR_INTERVAL"
    train_cmd+=" --model_type \"$MODEL_TYPE\""
    train_cmd+=" --model_size $MODEL_SIZE"
    train_cmd+=" --swanlab_online $SWANLAB_ONLINE"
    train_cmd+=" --database_init_path \"$DATABASE_INIT_PATH\""
    # Optional arguments
    if [[ "$USE_PROFILE" == "true" ]]; then
        train_cmd+=" --profile"
        train_cmd+=" --profile_interval $PROFILE_INTERVAL"
    fi
    if [[ "$USE_FLASH_ATTN" == "true" ]]; then
        train_cmd+=" --use_flash_attn"
    fi
    if [[ "$USE_SWANLAB" == "true" ]]; then
        train_cmd+=" --use_swanlab"
        train_cmd+=" --swanlab_project \"$SWANLAB_PROJECT\""
    fi
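    # Note on quoting (added for clarity): the escaped \" around path arguments
    # are deliberate. $train_cmd is expanded verbatim into the /tmp wrapper
    # script below, and the embedded quotes keep paths intact there even if
    # they were ever to contain spaces.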
echo "📋 执行命令:"
echo "$train_cmd"
echo
# 记录命令到日志文件
echo "执行命令: $train_cmd" >> "$LOG_FILE"
echo "开始时间: $(date)" >> "$LOG_FILE"
# 使用nohup执行训练后台运行
echo "🔄 使用nohup后台运行训练输出将写入日志文件: $LOG_FILE"
# 创建训练脚本
train_script="/tmp/train_${EXPERIMENT_VERSION}.sh"
cat > "$train_script" << EOF
#!/bin/bash
cd /home/pci/ycz/Code/pretrain-worktree
export PYTHONFAULTHANDLER=1
export SWANLAB_PROJECT="$SWANLAB_PROJECT"
$train_cmd
echo "结束时间: \$(date)"
echo "退出代码: \$?"
EOF
chmod +x "$train_script"
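    # Why a wrapper script: nohup-ing the wrapper (rather than the raw command)
    # keeps training alive after the terminal closes, and lets the wrapper
    # record the end time and exit code next to the training output in $LOG_FILE.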
    # Launch in the background with nohup
    nohup bash "$train_script" >> "$LOG_FILE" 2>&1 &
    local train_pid=$!
    echo "🔥 Training process started, PID: $train_pid"
    echo "Training PID: $train_pid" >> "$LOG_FILE"
    echo "Training script: $train_script" >> "$LOG_FILE"
    # Give the process a few seconds to come up
    sleep 5
    # Check whether the process is still running
    if kill -0 "$train_pid" 2>/dev/null; then
        echo "✅ Training is running in the background"
        echo "📋 Follow the log: tail -f $LOG_FILE"
        echo "📋 Check process status: ps aux | grep train_pretrain_accelerate"
        echo "🛑 Stop training: kill $train_pid"
        echo "⏰ Estimated training time: 10-15 hours (3 epochs, RTX 4090)"
        echo "📈 SwanLab: local mode; view results in the output directory"
        echo ""
        echo "🎯 Experiment focus:"
        echo "  - Compare query quality: complete information h vs attention output h_attn"
        echo "  - Verify whether text coherence improves"
        echo "  - Watch loss convergence and generation quality"
        echo "  - Expectation: loss stays low while text coherence improves"
        echo ""
        echo "Training is running in the background; it is safe to close this terminal."
    else
        echo "❌ Failed to start the training process"
        echo "📋 Check the log: $LOG_FILE"
        exit 1
    fi
}
# ----------------------------------------------------------------------------
# 🤖 Cleanup function
# ----------------------------------------------------------------------------
cleanup() {
    echo "🧹 Cleaning up temporary files..."
    # Remove the temporary training script
    if [[ -f "/tmp/train_${EXPERIMENT_VERSION}.sh" ]]; then
        rm -f "/tmp/train_${EXPERIMENT_VERSION}.sh"
    fi
}
# ----------------------------------------------------------------------------
# 🤖 Signal handling
# ----------------------------------------------------------------------------
trap cleanup EXIT
trap 'echo "❌ Experiment interrupted"; cleanup; exit 130' INT TERM
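# Note on trap semantics (added for clarity): the EXIT trap fires on every exit
# path, including the normal exit right after training is backgrounded. Removing
# the /tmp wrapper then is still safe, because the running bash process holds an
# open file descriptor to it. exit 130 follows the 128 + SIGINT(2) convention.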
# ----------------------------------------------------------------------------
# 🤖 Main entry point
# ----------------------------------------------------------------------------
main() {
    echo "============================================================================"
    echo "🧠 MiniMind Memory Query Enhancement Experiment"
    echo "============================================================================"
    echo "🎯 Experiment version: $EXPERIMENT_VERSION"
    echo "📝 Goal: complete-information queries vs attention-output queries"
    echo "🔬 Core hypothesis: complete information improves memory-query precision and text coherence"
    echo "============================================================================"
    # Run checks and initialization
    check_environment
    log_experiment_info
    # Run the experiment
    run_experiment
    echo "============================================================================"
    echo "✅ Experiment $EXPERIMENT_VERSION launched"
    echo "📅 Launch time: $(date)"
    echo "🔍 Comparison: 1.4.1 (h_attn queries) vs 1.4.3 (complete-information h queries)"
    echo "============================================================================"
}
# Run the main program
main "$@"