Minimind/run_file/experiment_1_4_8.sh
Yu Chengzhang 495fc412cd Experiment 1.4.8: Memory Bank多样性检查 + knowledge_num优化
- 修改 model_memory_1_4_8.py: 增加记忆选择多样性监控机制
- 优化 ds_config.json: 调整DeepSpeed配置以支持更大知识库
- 更新 experiment_1_4_8.sh: 配置knowledge_num=1048576提升记忆容量
- 新增 experiment_1_4_7-04.sh: 补充实验对比脚本
- 模型版本管理: 创建model_memory_1_4_8.py用于后续评估

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-01 15:35:14 +08:00

395 lines
14 KiB
Bash
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# ============================================================================
# MiniMind 实验脚本 - Experiment 1.4.8
# ============================================================================
#
# 🎯 实验目标:
# 基于实验1.4.7升级GatedMemoryFusion从门控MLP为交叉注意力机制
#
# 使用方法:
# bash run_file/experiment_1_4_8.sh
# ============================================================================
# ----------------------------------------------------------------------------
# 🧑‍🔬 实验基本信息
# ----------------------------------------------------------------------------
EXPERIMENT_VERSION="1.4.8"
EXPERIMENT_DESCRIPTION="交叉注意力记忆融合机制实验 - 从门控MLP升级为Cross-Attention"
RESEARCHER_NAME="AI Assistant"
EXPERIMENT_DATE="$(date '+%Y-%m-%d %H:%M:%S')"
# ----------------------------------------------------------------------------
# 🤖 环境配置
# ----------------------------------------------------------------------------
# 调试和监控环境变量
export NCCL_DEBUG=INFO
export PYTHONFAULTHANDLER=1
export CUDA_LAUNCH_BLOCKING=1
# SwanLab 配置
export SWANLAB_PROJECT="MiniMind-Experiment-1.4.8"
# 日志配置
LOG_DIR="out/experiment_${EXPERIMENT_VERSION//./_}"
mkdir -p "$LOG_DIR"
LOG_FILE="$LOG_DIR/experiment.log"
# ----------------------------------------------------------------------------
# 🤖 硬件配置
# ----------------------------------------------------------------------------
CUDA_VISIBLE_DEVICES="0"
NUM_PROCESSES="1"
MIXED_PRECISION="bf16"
MAIN_PROCESS_PORT="29500"
# ----------------------------------------------------------------------------
# 🤖 模型架构参数
# ----------------------------------------------------------------------------
MODEL_TYPE="model_memory" # 🔥 使用升级的Cross-Attention Memory模型
MODEL_SIZE="50.0"
DIM="512"
N_LAYERS="8"
N_HEADS="32"
MAX_SEQ_LEN="512"
USE_MOE="false"
# 知识库配置沿用1.4.7配置确保对比公平)
KNOWLEDGE_NUM="1048576" # 1024x1024 = 1048576 (1M entries)
KNOWLEDGE_LENGTH="8" # 每个记忆条目32个token与1.4.7保持一致)
KNOWLEDGE_DIM="128" # 知识向量维度
DISABLE_DB="false"
# ----------------------------------------------------------------------------
# 🤖 训练超参数
# ----------------------------------------------------------------------------
EPOCHS="3"
EMBEDDING_EPOCH="2"
BATCH_SIZE="48" # 与1.4.7保持一致
ACCUMULATION_STEPS="8" # 与1.4.7保持一致
LEARNING_RATE="2e-4"
DTYPE="bfloat16"
GRAD_CLIP="1.0"
WARMUP_ITERS="0"
# 平衡损失配置
BALANCE_LOSS_COEF="0.01" # 与1.4.7保持一致
# 数据和缓存路径沿用1.4.7保证对比公平性)
DATA_PATH="/home/zym/Code/stable/merged_pretrain.jsonl"
DATABASE_INIT_PATH="/home/zym/Code/stable/sentence_trex_data.json"
CLUSTER_CACHE_PATH="cache/memory_bank_init_1048576_32.pt" # 使用1.4.7的缓存配置
VAL_DATA_PATH="/home/zym/Code/stable/eval_data.json"
# 训练配置
NUM_WORKERS="1"
LOG_INTERVAL="100"
VAL_INTERVAL="100"
SAVE_INTERVAL="10000"
# 性能分析配置
USE_PROFILE="true"
PROFILE_INTERVAL="10"
MEMORY_MONITOR_INTERVAL="100"
# 高级功能
USE_FLASH_ATTN="true"
FAST_CLUSTERING="true"
# ----------------------------------------------------------------------------
# 🤖 预检查函数
# ----------------------------------------------------------------------------
check_environment() {
echo "🔍 环境检查中..."
# 检查GPU可用性
if ! nvidia-smi &> /dev/null; then
echo "❌ 错误: 未检测到GPU或nvidia-smi不可用"
exit 1
fi
# 检查CUDA设备
if ! nvidia-smi -i "$CUDA_VISIBLE_DEVICES" &> /dev/null; then
echo "❌ 错误: GPU $CUDA_VISIBLE_DEVICES 不可用"
exit 1
fi
# # 检查Python环境
# if ! .venv/bin/python -c "import torch; print(f'PyTorch: {torch.__version__}')" 2>/dev/null; then
# echo "❌ 错误: PyTorch未正确安装"
# exit 1
# fi
# 检查数据文件
if [[ ! -f "$DATA_PATH" ]]; then
echo "❌ 错误: 训练数据文件不存在: $DATA_PATH"
exit 1
fi
if [[ ! -f "$DATABASE_INIT_PATH" ]]; then
echo "❌ 错误: 数据库初始化文件不存在: $DATABASE_INIT_PATH"
exit 1
fi
# # 🔥 检查Cross-Attention Memory模型实现
# if ! .venv/bin/python -c "from model.model_memory import *; print('Cross-Attention Memory模型实现检查通过')" 2>/dev/null; then
# echo "❌ 错误: Cross-Attention Memory模型实现存在问题"
# echo "请确保model/model_memory.py文件存在且可正常导入"
# exit 1
# fi
# # 检查新的GatedMemoryFusion实现
# if ! .venv/bin/python -c "from model.model_memory import GatedMemoryFusion; import torch.nn as nn; fusion = GatedMemoryFusion(type('Config', (), {'dim': 512})()); assert hasattr(fusion, 'cross_attention'), 'Missing cross_attention'; print('GatedMemoryFusion交叉注意力检查通过')" 2>/dev/null; then
# echo "❌ 错误: GatedMemoryFusion缺少交叉注意力机制"
# exit 1
# fi
echo "✅ 环境检查通过"
}
# ----------------------------------------------------------------------------
# 🤖 实验信息记录
# ----------------------------------------------------------------------------
log_experiment_info() {
echo "📝 记录实验信息..."
cat > "$LOG_DIR/experiment_info.txt" << EOF
========================================
MiniMind 实验信息
========================================
实验版本: $EXPERIMENT_VERSION
实验描述: $EXPERIMENT_DESCRIPTION
研究者: $RESEARCHER_NAME
开始时间: $EXPERIMENT_DATE
========================================
硬件配置:
GPU设备: $CUDA_VISIBLE_DEVICES
进程数: $NUM_PROCESSES
混合精度: $MIXED_PRECISION
========================================
模型配置:
模型类型: $MODEL_TYPE (Cross-Attention Memory)
模型大小: $MODEL_SIZE MB
维度: $DIM
层数: $N_LAYERS
注意力头数: $N_HEADS
最大序列长度: $MAX_SEQ_LEN
知识库大小: $KNOWLEDGE_NUM (1M entries)
知识长度: $KNOWLEDGE_LENGTH (token序列)
知识维度: $KNOWLEDGE_DIM (兼容性保留)
========================================
训练配置:
训练轮次: $EPOCHS
批次大小: $BATCH_SIZE
学习率: $LEARNING_RATE
梯度累积: $ACCUMULATION_STEPS
数据类型: $DTYPE
平衡损失系数: $BALANCE_LOSS_COEF
========================================
Cross-Attention Memory配置:
融合机制: Cross-Attention (vs 1.4.6的门控MLP)
注意力头数: 8头 (dim=512 -> 8*64)
注意力Dropout: 0.1
融合Dropout: 0.15 (比普通Dropout稍高)
层标准化: 是 (残差连接后)
注意力熵正则化: 0.01 (可调整)
温度参数: 可训练 (防止过度集中)
========================================
数据路径:
训练数据: $DATA_PATH
验证数据: $VAL_DATA_PATH
数据库初始化: $DATABASE_INIT_PATH
聚类缓存: $CLUSTER_CACHE_PATH
========================================
EOF
}
# ----------------------------------------------------------------------------
# 🤖 主执行函数
# ----------------------------------------------------------------------------
run_experiment() {
echo "🚀 开始执行实验 $EXPERIMENT_VERSION"
echo "📄 实验描述: $EXPERIMENT_DESCRIPTION"
echo "⏰ 开始时间: $EXPERIMENT_DATE"
# 构建训练命令
local train_cmd="CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES python train_pretrain_accelerate.py"
# 添加训练参数
train_cmd+=" --out_dir \"$LOG_DIR\""
train_cmd+=" --epochs $EPOCHS"
train_cmd+=" --embedding_epoch $EMBEDDING_EPOCH"
train_cmd+=" --batch_size $BATCH_SIZE"
train_cmd+=" --learning_rate $LEARNING_RATE"
train_cmd+=" --dtype $DTYPE"
train_cmd+=" --num_workers $NUM_WORKERS"
train_cmd+=" --accumulation_steps $ACCUMULATION_STEPS"
train_cmd+=" --grad_clip $GRAD_CLIP"
train_cmd+=" --warmup_iters $WARMUP_ITERS"
train_cmd+=" --log_interval $LOG_INTERVAL"
train_cmd+=" --val_interval $VAL_INTERVAL"
train_cmd+=" --save_interval $SAVE_INTERVAL"
train_cmd+=" --dim $DIM"
train_cmd+=" --n_layers $N_LAYERS"
train_cmd+=" --n_heads $N_HEADS"
train_cmd+=" --max_seq_len $MAX_SEQ_LEN"
train_cmd+=" --data_path \"$DATA_PATH\""
train_cmd+=" --val_data_path \"$VAL_DATA_PATH\""
train_cmd+=" --knowledge_num $KNOWLEDGE_NUM"
train_cmd+=" --knowledge_length $KNOWLEDGE_LENGTH"
train_cmd+=" --database_init_path \"$DATABASE_INIT_PATH\""
train_cmd+=" --memory_monitor_interval $MEMORY_MONITOR_INTERVAL"
train_cmd+=" --model_type \"$MODEL_TYPE\""
train_cmd+=" --model_size $MODEL_SIZE"
train_cmd+=" --balance_loss_coef $BALANCE_LOSS_COEF"
# 可选参数
if [[ "$USE_PROFILE" == "true" ]]; then
train_cmd+=" --profile"
train_cmd+=" --profile_interval $PROFILE_INTERVAL"
fi
if [[ "$USE_FLASH_ATTN" == "true" ]]; then
train_cmd+=" --use_flash_attn"
fi
if [[ "$FAST_CLUSTERING" == "true" ]]; then
train_cmd+=" --fast_clustering"
fi
if [[ "$CLUSTER_CACHE_PATH" != "None" ]]; then
train_cmd+=" --cluster_cache_path \"$CLUSTER_CACHE_PATH\""
fi
# SwanLab配置
train_cmd+=" --use_swanlab"
train_cmd+=" --swanlab_project \"$SWANLAB_PROJECT\""
# train_cmd+=" --swanlab_online False"
echo "📋 执行命令:"
echo "$train_cmd"
echo
# 记录命令到日志文件
echo "执行命令: $train_cmd" >> "$LOG_FILE"
echo "开始时间: $(date)" >> "$LOG_FILE"
# 使用nohup执行训练后台运行输出写入日志文件
echo "🔄 使用nohup后台运行训练输出将写入日志文件: $LOG_FILE"
# 创建训练脚本
train_script="/tmp/train_${EXPERIMENT_VERSION//./_}.sh"
cat > "$train_script" << EOF
#!/bin/bash
cd /home/zym/Code/Minimind
source /home/user/miniconda3/bin/activate
conda activate minimind
$train_cmd
echo "结束时间: \$(date)"
echo "退出代码: \$?"
EOF
chmod +x "$train_script"
# 使用nohup后台运行
nohup bash "$train_script" >> "$LOG_FILE" 2>&1 &
local train_pid=$!
echo "🔥 训练进程已启动PID: $train_pid"
echo "训练PID: $train_pid" >> "$LOG_FILE"
echo "训练脚本: $train_script" >> "$LOG_FILE"
# 等待几秒确保进程启动
sleep 5
# 检查进程是否还在运行
if kill -0 $train_pid 2>/dev/null; then
echo "✅ 训练进程正在后台运行"
echo "📋 实时查看日志: tail -f $LOG_FILE"
echo "📋 检查进程状态: ps -p $train_pid"
echo "🛑 停止训练: kill $train_pid"
echo "📈 SwanLab: https://swanlab.cn/project/$SWANLAB_PROJECT"
echo ""
echo "🧠 Cross-Attention记忆融合机制正在测试中..."
echo " 🔥 融合机制: 门控MLP → 交叉注意力 (8头)"
echo " 🔥 注意力维度: 512维 → 8头*64维/头"
echo " 🔥 Dropout策略: 注意力(0.1) + 融合(0.15)"
echo " 🔥 层标准化: 残差连接后应用"
echo " 🔥 温度参数: 可训练防过度集中"
echo " 🔥 正则化: 注意力熵正则化(0.01)"
echo ""
echo "📊 与实验1.4.7对比:"
echo " - 融合机制: 门控MLP → Cross-Attention"
echo " - 表达能力: 线性变换 → 多头注意力"
echo " - 记忆交互: 串联拼接 → 查询-键-值交互"
echo " - 正则化: 基础Dropout → 熵正则化"
echo ""
echo "训练正在后台运行,可以安全关闭终端。"
echo ""
echo "🎯 预期改进:"
echo " - 推理Loss < 2.47 (优于1.4.7的2.47)"
echo " - 记忆选择更精准和适应性"
echo " - 生成文本连贯性显著提升"
echo " - 利用1.4.7的文本初始化优势"
echo ""
echo "⏱️ 预计训练时间: 15-20小时"
echo "📊 预计GPU占用: ~23GB"
echo ""
else
echo "❌ 训练进程启动失败"
echo "📋 查看日志: $LOG_FILE"
exit 1
fi
}
# ----------------------------------------------------------------------------
# 🤖 清理函数
# ----------------------------------------------------------------------------
cleanup() {
echo "🧹 清理临时文件..."
# 删除临时验证文件
rm -f /tmp/temp_val.jsonl
}
# ----------------------------------------------------------------------------
# 🤖 信号处理
# ----------------------------------------------------------------------------
trap cleanup EXIT
trap 'echo "❌ 实验被中断"; cleanup; exit 130' INT TERM
# ----------------------------------------------------------------------------
# 🤖 主程序入口
# ----------------------------------------------------------------------------
main() {
echo "============================================================================"
echo "🧠 MiniMind 预训练实验 1.4.8"
echo "🎯 Cross-Attention记忆融合机制 - 从门控MLP升级为多头注意力"
echo "============================================================================"
echo ""
echo "🔥 核心创新:"
echo " ► 融合机制: 门控MLP → Cross-Attention (8头)"
echo " ► 交互方式: 串联拼接 → 查询-键-值交互"
echo " ► 正则化: 基础Dropout → 注意力熵正则化"
echo " ► 自适应: 固定权重 → 可训练温度参数"
echo ""
echo "🎯 实验假设:"
echo " ✓ 交叉注意力提供更精准的记忆选择"
echo " ✓ 多头机制捕获记忆多维特征"
echo " ✓ 熵正则化防止注意力过度集中"
echo ""
echo "============================================================================"
# 执行检查和初始化
check_environment
log_experiment_info
# 运行实验
run_experiment
echo "============================================================================"
echo "✅ 实验 $EXPERIMENT_VERSION 启动完成"
echo "📅 启动时间: $(date)"
echo "============================================================================"
}
# 执行主程序
main "$@"