#!/bin/bash
#########################################################
# Experiment 1.4.7 - Memory Bank text initialization + partial freezing
#
# Goals:
#   1. Evaluate initializing memory_bank from meaningful text.
#   2. Evaluate partially freezing memory_bank (freeze_ratio=0.2).
#
# Key features:
#   - memory_bank is initialized from the sentence_trex_data.json text data
#   - 20% of the memory_bank entries are frozen to protect important knowledge
#   - Token-based memory mechanism + EMA updates
#   - Product Key Memory architecture
#########################################################

echo "=========================================="
echo "🚀 开始实验 1.4.7 - Memory Bank优化"
echo "🔥 新特性: 文本初始化 + 部分冻结机制"
echo "=========================================="

# ---------------------------------------------------------------------------
# Experiment identity and output locations
# ---------------------------------------------------------------------------
EXPERIMENT_NAME="experiment_1_4_7-04"
OUTPUT_DIR="out/${EXPERIMENT_NAME}"
LOG_FILE="${OUTPUT_DIR}/experiment.log"
PID_FILE="${OUTPUT_DIR}/train.pid"

# The log and PID files live in this directory, so fail early if it cannot
# be created instead of producing confusing redirect errors later on.
if ! mkdir -p -- "$OUTPUT_DIR"; then
  echo "❌ 错误: 无法创建输出目录: $OUTPUT_DIR" >&2
  exit 1
fi

echo "📂 实验输出目录: $OUTPUT_DIR"
echo "📝 日志文件: $LOG_FILE"

# ---------------------------------------------------------------------------
# Core model parameters
# ---------------------------------------------------------------------------
MODEL_TYPE="model_memory"   # use the memory architecture
DIM=512
N_LAYERS=8
N_HEADS=32
MAX_SEQ_LEN=512

# ---------------------------------------------------------------------------
# Memory bank configuration - key parameters of experiment 1.4.7
# ---------------------------------------------------------------------------
KNOWLEDGE_NUM=1048576       # 1M memory entries (2^20)
KNOWLEDGE_LENGTH=8          # 8 tokens per memory entry
KNOWLEDGE_DIM=128           # memory vector dimension
FREEZE_RATIO=0.2            # fraction of memory entries to freeze

# EMA update configuration
USE_EMA_UPDATE="True"
EMA_DECAY=0.9               # EMA decay rate
EMA_UPDATE_FREQ=5           # EMA update frequency

# NOTE(review): FREEZE_RATIO, USE_EMA_UPDATE, EMA_DECAY and EMA_UPDATE_FREQ are
# only printed in the summary; this script never passes them to the training
# command. Presumably the trainer falls back to its own defaults - confirm
# that those defaults match the values shown here.

# ---------------------------------------------------------------------------
# Training configuration
# ---------------------------------------------------------------------------
EPOCHS=3
BATCH_SIZE=48
ACCUMULATION_STEPS=8
LEARNING_RATE=2e-4
DTYPE="bfloat16"
GRAD_CLIP=1.0
BALANCE_LOSS_COEF=0.01      # balance loss coefficient

# ---------------------------------------------------------------------------
# Data paths
# ---------------------------------------------------------------------------
DATA_PATH="/home/pci/ycz/Code/Minimind/dataset/stable/merged_pretrain.jsonl"
# Text data used to initialize the memory bank.
DATABASE_INIT_PATH="/home/pci/ycz/Code/Minimind/dataset/stable/sentence_trex_data.json"
# Cache file for the memory initialization, keyed by bank size and entry length.
CACHE_PATH="cache/memory_bank_init_${KNOWLEDGE_NUM}_${KNOWLEDGE_LENGTH}.pt"

# ---------------------------------------------------------------------------
# GPU and performance configuration
# ---------------------------------------------------------------------------
export CUDA_VISIBLE_DEVICES=0
NUM_WORKERS=1
MIXED_PRECISION="bf16"      # NOTE(review): not referenced anywhere in this script

# ---------------------------------------------------------------------------
# Monitoring configuration
# ---------------------------------------------------------------------------
USE_SWANLAB="True"
SWANLAB_PROJECT="MiniMind-Experiment-1.4.7"
SWANLAB_ONLINE="False"      # offline mode

# Validation and logging configuration
LOG_INTERVAL=100
VAL_INTERVAL=200
PROFILE="True"
PROFILE_INTERVAL=10
MEMORY_MONITOR="False"      # memory monitoring disabled to reduce overhead

# Number of frozen entries, derived from FREEZE_RATIO so the summary cannot
# drift from the configured ratio. Bash arithmetic is integer-only, hence awk;
# the result is truncated towards zero.
FROZEN_COUNT=$(awk -v n="$KNOWLEDGE_NUM" -v r="$FREEZE_RATIO" \
  'BEGIN { printf "%d", n * r }')

# ---------------------------------------------------------------------------
# Configuration summary
# ---------------------------------------------------------------------------
echo "=========================================="
echo "📋 实验配置摘要"
echo "=========================================="
echo "🔥 核心特性:"
echo " - Model Type: $MODEL_TYPE"
echo " - Memory Bank Size: $KNOWLEDGE_NUM 条"
echo " - Memory Length: $KNOWLEDGE_LENGTH tokens"
echo " - Freeze Ratio: $FREEZE_RATIO (冻结 $FROZEN_COUNT 条记忆)"
echo " - Text Initialization: $DATABASE_INIT_PATH"
echo ""
echo "🏗️ 模型架构:"
echo " - Dimension: $DIM"
echo " - Layers: $N_LAYERS"
echo " - Heads: $N_HEADS"
echo " - Max Seq Len: $MAX_SEQ_LEN"
echo ""
echo "📚 训练设置:"
echo " - Epochs: $EPOCHS"
echo " - Batch Size: $BATCH_SIZE"
echo " - Learning Rate: $LEARNING_RATE"
echo " - Data Type: $DTYPE"
echo ""
echo "⚡ EMA配置:"
echo " - EMA Decay: $EMA_DECAY"
echo " - Update Frequency: $EMA_UPDATE_FREQ"
echo ""
echo "📊 监控:"
echo " - SwanLab Project: $SWANLAB_PROJECT"
echo " - Log Interval: $LOG_INTERVAL"
echo "=========================================="

# ---------------------------------------------------------------------------
# Verify that the required input files exist before launching anything
# ---------------------------------------------------------------------------
echo "🔍 检查必要文件..."
if [[ ! -f "$DATA_PATH" ]]; then
  echo "❌ 错误: 训练数据文件不存在: $DATA_PATH" >&2
  exit 1
fi

if [[ ! -f "$DATABASE_INIT_PATH" ]]; then
  echo "❌ 错误: Memory初始化数据文件不存在: $DATABASE_INIT_PATH" >&2
  exit 1
fi
echo "✅ 文件检查通过"

# ---------------------------------------------------------------------------
# Build the training command (same pattern as experiment_1_4_6.sh).
# The command is kept as one string because it is later expanded verbatim
# into a generated launcher script; the escaped quotes around path-like
# values are interpreted when that launcher runs.
# ---------------------------------------------------------------------------
TRAIN_CMD="CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES .venv/bin/python train_pretrain_accelerate.py"
TRAIN_CMD+=" --out_dir \"$OUTPUT_DIR\""
TRAIN_CMD+=" --epochs $EPOCHS"
TRAIN_CMD+=" --embedding_epoch 2"
TRAIN_CMD+=" --batch_size $BATCH_SIZE"
TRAIN_CMD+=" --learning_rate $LEARNING_RATE"
TRAIN_CMD+=" --dtype $DTYPE"
TRAIN_CMD+=" --num_workers $NUM_WORKERS"
TRAIN_CMD+=" --accumulation_steps $ACCUMULATION_STEPS"
TRAIN_CMD+=" --grad_clip $GRAD_CLIP"
TRAIN_CMD+=" --warmup_iters 0"
TRAIN_CMD+=" --log_interval $LOG_INTERVAL"
TRAIN_CMD+=" --val_interval $VAL_INTERVAL"
TRAIN_CMD+=" --dim $DIM"
TRAIN_CMD+=" --n_layers $N_LAYERS"
TRAIN_CMD+=" --n_heads $N_HEADS"
TRAIN_CMD+=" --max_seq_len $MAX_SEQ_LEN"
TRAIN_CMD+=" --data_path \"$DATA_PATH\""
TRAIN_CMD+=" --knowledge_num $KNOWLEDGE_NUM"
TRAIN_CMD+=" --knowledge_length $KNOWLEDGE_LENGTH"
TRAIN_CMD+=" --knowledge_dim $KNOWLEDGE_DIM"
TRAIN_CMD+=" --database_init_path \"$DATABASE_INIT_PATH\""
TRAIN_CMD+=" --cluster_cache_path \"$CACHE_PATH\""
TRAIN_CMD+=" --model_type \"$MODEL_TYPE\""
TRAIN_CMD+=" --balance_loss_coef $BALANCE_LOSS_COEF"

# Value-less flags. --use_swanlab and --profile follow their configuration
# toggles (the same way --memory_monitor does below), so flipping
# USE_SWANLAB / PROFILE in the configuration actually takes effect.
if [[ "$USE_SWANLAB" == "True" ]]; then
  TRAIN_CMD+=" --use_swanlab"
fi
if [[ "$PROFILE" == "True" ]]; then
  TRAIN_CMD+=" --profile"
fi
TRAIN_CMD+=" --use_flash_attn"

# Optional parameters that take a value.
TRAIN_CMD+=" --swanlab_project \"$SWANLAB_PROJECT\""
TRAIN_CMD+=" --swanlab_online $SWANLAB_ONLINE"
TRAIN_CMD+=" --profile_interval $PROFILE_INTERVAL"

# Memory monitor flag (only when enabled).
if [[ "$MEMORY_MONITOR" == "True" ]]; then
  TRAIN_CMD+=" --memory_monitor"
fi

echo ""
echo "🚀 启动训练..."
echo "📝 完整训练命令:"
echo "$TRAIN_CMD"
echo ""
echo "⏰ 预计训练时间: 约6-8小时"
echo "📊 实时监控: 查看 $LOG_FILE"
echo ""

# Record the command and the start time at the top of this run's log section.
{
  echo "执行命令: $TRAIN_CMD"
  echo "开始时间: $(date)"
} >> "$LOG_FILE"

# Generate the launcher script (same pattern as experiment 1.4.6).
# It is stored next to the experiment log rather than under a fixed name in
# /tmp, so concurrent runs cannot overwrite each other's launcher and the
# exact script that was executed stays with the results.
TRAIN_SCRIPT="${OUTPUT_DIR}/train_1_4_7-04.sh"

# The heredoc delimiter is unquoted on purpose: $TRAIN_CMD is expanded now,
# while the \$-escaped expressions are evaluated when the launcher runs.
if ! cat > "$TRAIN_SCRIPT" << EOF
#!/bin/bash
cd /home/pci/ycz/Code/pretrain-worktree || { echo "❌ 错误: 无法进入工作目录" >&2; exit 1; }
source /home/pci/ycz/Code/pretrain-worktree/.venv/bin/activate
$TRAIN_CMD
train_status=\$?
echo "结束时间: \$(date)"
echo "退出代码: \$train_status"
exit "\$train_status"
EOF
then
  echo "❌ 错误: 无法写入训练脚本: $TRAIN_SCRIPT" >&2
  exit 1
fi
chmod +x "$TRAIN_SCRIPT"

# Run the launcher in the background, detached from the terminal; stdout and
# stderr are both appended to the experiment log.
nohup bash "$TRAIN_SCRIPT" >> "$LOG_FILE" 2>&1 &
TRAIN_PID=$!
# Persist the PID of the background launcher so the run can be located or
# stopped later.
echo "$TRAIN_PID" > "$PID_FILE"

echo "=========================================="
echo "✅ 实验1.4.7已启动"
echo "🆔 进程ID: $TRAIN_PID"
echo "📝 日志文件: $LOG_FILE"
echo "📊 监控命令: tail -f $LOG_FILE"
echo "🛑 停止命令: kill $TRAIN_PID"
echo "=========================================="
echo ""
echo "🔥 实验1.4.7 - Memory Bank优化特性:"
echo " ✨ 文本数据初始化 (sentence_trex_data.json)"
echo " ✨ 部分冻结机制 (freeze_ratio=0.2)"
echo " ✨ Token-based EMA更新"
echo " ✨ Product Key Memory架构"
echo ""
echo "📋 监控要点:"
echo " - 初始化阶段:观察文本数据加载和缓存"
echo " - 训练阶段:关注frozen_memories统计"
echo " - EMA更新:监控update_ratio和coverage指标"
echo " - 生成质量:对比词组连贯性改善"
echo ""
echo "⚡ 进程状态检查:"
echo "ps aux | grep $TRAIN_PID"
echo ""

# Give the background process a moment to start (or to fail fast), then
# report whether it is still alive.
sleep 2
if ps -p "$TRAIN_PID" > /dev/null; then
  echo "✅ 训练进程正在运行 (PID: $TRAIN_PID)"

  # Preview the first lines that show up in the log.
  echo ""
  echo "📋 初始日志预览:"
  echo "----------------------------------------"
  # The existence check is explicit: a trailing "|| echo" after the pipeline
  # would only see head's exit status and therefore never report a missing log.
  if [[ -f "$LOG_FILE" ]]; then
    # tail -f never ends on its own; timeout caps the preview at 5 seconds
    # and head stops it early once 10 lines have been printed.
    timeout 5 tail -f "$LOG_FILE" | head -n 10
  else
    echo "日志文件尚未生成,请稍等..."
  fi
  echo "----------------------------------------"
else
  echo "❌ 训练进程启动失败,请检查日志:"
  echo "cat $LOG_FILE"
fi

echo ""
echo "🎯 实验1.4.7核心验证点:"
echo " 1. Memory bank是否成功用文本数据初始化"
echo " 2. 冻结机制是否正常工作 (20%条目不更新)"
echo " 3. 生成质量是否有明显改善"
echo " 4. 训练稳定性是否提升"
echo ""
echo "📖 实验记录: experiment/EXPERIMENT_1_4_7-04.md"
echo "🚀 实验1.4.7启动完成!"