Minimind/run_file/experiment_1_4_9-02.sh
iomgaa afd4fd1f0f Experiment 1.4.9: Memory Bank优化 - 顺序冻结 + 相似度Loss + 维度修复
🔬 实验基础: 基于实验1.4.7的重要改进
🎯 研究目标: 提升Memory Bank的知识保护和检索准确性

🚀 三大核心创新:

1️⃣ 智能冻结策略改进
• 从随机冻结 → 顺序冻结前20%记忆条目
• 保护重要知识: 假设前面的记忆条目更重要,需要优先保护
• freeze_ratio=0.2: 冻结前20%的memory_bank条目

2️⃣ 查询-知识相似度Loss
• 新增相似度监督信号: 衡量查询向量与选中知识的匹配度
• 余弦相似度计算: F.cosine_similarity(query, selected_memory)
• 相似度统计: 平均值、最大值、最小值、标准差全方位监控

3️⃣ 维度截断问题修复
• 统一维度处理: knowledge_dim → dim,避免信息截断
• concat_dim修正: dim + num_selected * dim (之前是knowledge_dim)
• 记忆向量完整保留: 解决查询结果维度被不当压缩的问题

🏗️ 架构优化细节:
• GatedMemoryFusion维度一致性: 统一使用dim维度
• 记忆池化策略: 使用平均池化压缩knowledge_length维度
• 残差连接增强: 改进memory_output与主路径的融合

📊 实验配置:
• experiment_1_4_9-02: 8层网络完整测试
• experiment_1_4_9-04: 1层网络最小验证
• EMA更新机制: decay=0.9, update_freq=5
• 数据库初始化: sentence_trex_data.json文本数据

💡 技术假设:
顺序冻结策略能更好地保护重要知识,相似度Loss能提升检索精度,
维度统一能减少信息丢失,三者结合将显著改善Memory Bank性能。

🛠️ 基础设施改进:
• UUID映射系统: 跟踪记忆条目的原始数据源
• 增强缓存机制: 支持映射文件自动生成
• 监控系统升级: 相似度统计信息实时追踪

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-05 14:24:48 +08:00

247 lines
7.8 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
#########################################################
# Experiment 1.4.9-02 - Memory Bank text initialization + partial freezing
#
# Goals:
# 1. Validate initializing the memory_bank from meaningful text
# 2. Validate partially freezing the memory_bank (freeze_ratio=0.2)
#
# Key features:
# - memory_bank is initialized from the sentence_trex_data.json text data
# - 20% of the memory_bank entries are frozen to protect important knowledge
# - Token-based memory mechanism + EMA updates
# - Product Key Memory architecture
#########################################################
echo "=========================================="
echo "🚀 开始实验 1.4.9-02 - Memory Bank优化"
echo "🔥 新特性: 文本初始化 + 部分冻结机制"
echo "=========================================="
# Experiment identity; every output location below is derived from it.
EXPERIMENT_NAME="experiment_1_4_9-02"
OUTPUT_DIR="out/${EXPERIMENT_NAME}"
LOG_FILE="${OUTPUT_DIR}/experiment.log"
PID_FILE="${OUTPUT_DIR}/train.pid"
# Create the output directory. The log file and PID file both live inside it,
# so there is no point continuing if it cannot be created.
if ! mkdir -p -- "$OUTPUT_DIR"; then
  echo "❌ 错误: 无法创建输出目录: $OUTPUT_DIR" >&2
  exit 1
fi
echo "📂 实验输出目录: $OUTPUT_DIR"
echo "📝 日志文件: $LOG_FILE"
# ---------------------------------------------------------
# Core model hyperparameters
# ---------------------------------------------------------
# Selects the memory-based architecture.
MODEL_TYPE='model_memory'
DIM=512
N_LAYERS=8
N_HEADS=32
MAX_SEQ_LEN=512

# ---------------------------------------------------------
# Memory bank settings - the key parameters of experiment 1.4.9-02
# ---------------------------------------------------------
# Number of memory entries (2^20, about 1M).
KNOWLEDGE_NUM=1048576
# Tokens per memory entry.
KNOWLEDGE_LENGTH=8
# Memory vector dimension.
KNOWLEDGE_DIM=128
# New in this experiment: fraction of memory entries to freeze.
# NOTE(review): in the visible script this value is only printed in the
# configuration summary and is not passed to the training command - confirm
# the trainer obtains it some other way.
FREEZE_RATIO=0.2

# ---------------------------------------------------------
# EMA update settings
# ---------------------------------------------------------
# NOTE(review): EMA_DECAY and EMA_UPDATE_FREQ are only printed in the summary,
# and USE_EMA_UPDATE is not referenced elsewhere in the visible script - confirm.
USE_EMA_UPDATE='True'
# EMA decay rate.
EMA_DECAY=0.9
# EMA update frequency.
EMA_UPDATE_FREQ=5

# ---------------------------------------------------------
# Training settings
# ---------------------------------------------------------
EPOCHS=3
BATCH_SIZE=48
ACCUMULATION_STEPS=8
LEARNING_RATE=2e-4
DTYPE='bfloat16'
GRAD_CLIP=1.0
# Balance-loss coefficient.
BALANCE_LOSS_COEF=0.01

# ---------------------------------------------------------
# Data locations
# ---------------------------------------------------------
DATA_PATH='/home/zym/Code/stable/merged_pretrain.jsonl'
# Text data used to initialize the memory bank.
DATABASE_INIT_PATH='/home/zym/Code/stable/sentence_trex_data.json'
# Cache for the initialized memory bank; keyed by bank size and entry length.
CACHE_PATH="cache/memory_bank_init_${KNOWLEDGE_NUM}_${KNOWLEDGE_LENGTH}.pt"

# ---------------------------------------------------------
# GPU and performance settings
# ---------------------------------------------------------
CUDA_VISIBLE_DEVICES=0
export CUDA_VISIBLE_DEVICES
NUM_WORKERS=8
# NOTE(review): not referenced elsewhere in the visible script.
MIXED_PRECISION='bf16'

# ---------------------------------------------------------
# Experiment tracking
# ---------------------------------------------------------
# NOTE(review): USE_SWANLAB is not referenced elsewhere in the visible script.
USE_SWANLAB='True'
SWANLAB_PROJECT='MiniMind-Experiment-1.4.9-02'
# Offline mode.
SWANLAB_ONLINE='False'

# ---------------------------------------------------------
# Validation, logging and profiling
# ---------------------------------------------------------
LOG_INTERVAL=100
VAL_INTERVAL=200
# NOTE(review): PROFILE is not referenced elsewhere in the visible script.
PROFILE='True'
PROFILE_INTERVAL=10
# Memory monitoring is switched off to reduce overhead.
MEMORY_MONITOR='False'
#######################################
# Print how many memory entries a given freeze ratio covers.
# Arguments: $1 - total number of memory entries (integer)
#            $2 - fraction of entries to freeze (decimal, e.g. 0.2)
# Outputs:   the entry count, truncated to an integer, on stdout
#######################################
frozen_entry_count() {
  # Bash arithmetic is integer-only, so awk performs the fractional multiply.
  # The tiny epsilon stops binary floating-point error from truncating an
  # exact product one too low (e.g. 100 * 0.29).
  awk -v total="$1" -v ratio="$2" 'BEGIN { printf "%d\n", total * ratio + 1e-9 }'
}

# Human-readable summary of the configuration, printed before launch.
echo "=========================================="
echo "📋 实验配置摘要"
echo "=========================================="
echo "🔥 核心特性:"
echo " - Model Type: $MODEL_TYPE"
echo " - Memory Bank Size: $KNOWLEDGE_NUM"
echo " - Memory Length: $KNOWLEDGE_LENGTH tokens"
# The frozen count is derived from FREEZE_RATIO so the summary stays correct
# when the ratio is edited (it used to be hard-coded as 20%).
echo " - Freeze Ratio: $FREEZE_RATIO (冻结 $(frozen_entry_count "$KNOWLEDGE_NUM" "$FREEZE_RATIO") 条记忆)"
echo " - Text Initialization: $DATABASE_INIT_PATH"
echo ""
echo "🏗️ 模型架构:"
echo " - Dimension: $DIM"
echo " - Layers: $N_LAYERS"
echo " - Heads: $N_HEADS"
echo " - Max Seq Len: $MAX_SEQ_LEN"
echo ""
echo "📚 训练设置:"
echo " - Epochs: $EPOCHS"
echo " - Batch Size: $BATCH_SIZE"
echo " - Learning Rate: $LEARNING_RATE"
echo " - Data Type: $DTYPE"
echo ""
echo "⚡ EMA配置:"
echo " - EMA Decay: $EMA_DECAY"
echo " - Update Frequency: $EMA_UPDATE_FREQ"
echo ""
echo "📊 监控:"
echo " - SwanLab Project: $SWANLAB_PROJECT"
echo " - Log Interval: $LOG_INTERVAL"
echo "=========================================="
# Pre-flight check: both input files must exist before anything is launched.
echo "🔍 检查必要文件..."
# Training corpus.
[[ -f "$DATA_PATH" ]] || {
  echo "❌ 错误: 训练数据文件不存在: $DATA_PATH"
  exit 1
}
# Text data used to initialize the memory bank.
[[ -f "$DATABASE_INIT_PATH" ]] || {
  echo "❌ 错误: Memory初始化数据文件不存在: $DATABASE_INIT_PATH"
  exit 1
}
echo "✅ 文件检查通过"
# Assemble the training command line into the single string TRAIN_CMD
# (following the pattern used by experiment_1_4_6.sh). Path-like and
# free-text values are wrapped in literal double quotes so they survive
# being written into, and re-parsed from, the generated launch script.
#
# NOTE(review): FREEZE_RATIO, EMA_DECAY and EMA_UPDATE_FREQ are not forwarded
# here, and --profile is passed regardless of the PROFILE variable - confirm
# this is intended.
train_args=(
  "--out_dir \"$OUTPUT_DIR\""
  "--epochs $EPOCHS"
  "--embedding_epoch 2"
  "--batch_size $BATCH_SIZE"
  "--learning_rate $LEARNING_RATE"
  "--dtype $DTYPE"
  "--num_workers $NUM_WORKERS"
  "--accumulation_steps $ACCUMULATION_STEPS"
  "--grad_clip $GRAD_CLIP"
  "--warmup_iters 0"
  "--log_interval $LOG_INTERVAL"
  "--val_interval $VAL_INTERVAL"
  "--dim $DIM"
  "--n_layers $N_LAYERS"
  "--n_heads $N_HEADS"
  "--max_seq_len $MAX_SEQ_LEN"
  "--data_path \"$DATA_PATH\""
  "--knowledge_num $KNOWLEDGE_NUM"
  "--knowledge_length $KNOWLEDGE_LENGTH"
  "--knowledge_dim $KNOWLEDGE_DIM"
  "--database_init_path \"$DATABASE_INIT_PATH\""
  "--cluster_cache_path \"$CACHE_PATH\""
  "--model_type \"$MODEL_TYPE\""
  "--balance_loss_coef $BALANCE_LOSS_COEF"
  # Value-less switches. --use_swanlab is currently left disabled.
  # "--use_swanlab"
  "--profile"
  "--use_flash_attn"
  # Optional parameters that carry a value.
  "--swanlab_project \"$SWANLAB_PROJECT\""
  "--swanlab_online $SWANLAB_ONLINE"
  "--profile_interval $PROFILE_INTERVAL"
)
# The memory monitor switch is appended only when it is enabled.
if [[ "$MEMORY_MONITOR" == "True" ]]; then
  train_args+=("--memory_monitor")
fi
# Join everything with single spaces behind the interpreter prefix.
TRAIN_CMD="CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES python train_pretrain_accelerate.py"
for train_arg in "${train_args[@]}"; do
  TRAIN_CMD+=" ${train_arg}"
done
echo ""
echo "🚀 启动训练..."
echo "📝 完整训练命令:"
echo "$TRAIN_CMD"
echo ""
echo "⏰ 预计训练时间: 约6-8小时"
echo "📊 实时监控: 查看 $LOG_FILE"
echo ""
# Record the exact command and the start time at the top of this run's log.
echo "执行命令: $TRAIN_CMD" >> "$LOG_FILE"
echo "开始时间: $(date)" >> "$LOG_FILE"

#######################################
# Write a small wrapper script that runs a command line, then reports the
# finish time and the command's exit status.
# Arguments: $1 - path of the script file to (over)write
#            $2 - command line to run, inserted verbatim
# Outputs:   none; creates/overwrites $1 and marks it executable
#######################################
write_train_script() {
  local script_path=$1
  local train_cmd=$2
  # The exit status is captured immediately after the command. Reading $?
  # after the first echo would report the echo's status (always 0) instead.
  cat > "$script_path" << EOF
#!/bin/bash
$train_cmd
exit_code=\$?
echo "结束时间: \$(date)"
echo "退出代码: \$exit_code"
exit \$exit_code
EOF
  chmod +x "$script_path"
}

# Use an unpredictable temp file rather than a fixed /tmp name, so concurrent
# launches cannot overwrite each other's script. The file is left in place
# after the run so the launched command can be inspected.
TRAIN_SCRIPT=$(mktemp "${TMPDIR:-/tmp}/train_1_4_9-02.XXXXXX") || {
  echo "❌ 错误: 无法创建临时训练脚本" >&2
  exit 1
}
write_train_script "$TRAIN_SCRIPT" "$TRAIN_CMD"
# Run detached in the background so training survives the terminal closing;
# stdout and stderr are both appended to the experiment log.
nohup bash "$TRAIN_SCRIPT" >> "$LOG_FILE" 2>&1 &
TRAIN_PID=$!
echo "$TRAIN_PID" > "$PID_FILE"
# Post-launch banner: process id, log location, handy monitor/stop commands,
# and a reminder of what to watch for during the run. Emitted as a single
# here-document; $TRAIN_PID and $LOG_FILE are expanded in place.
cat << EOF
==========================================
✅ 实验1.4.9-02已启动
🆔 进程ID: $TRAIN_PID
📝 日志文件: $LOG_FILE
📊 监控命令: tail -f $LOG_FILE
🛑 停止命令: kill $TRAIN_PID
==========================================

🔥 实验1.4.9-02 - Memory Bank优化特性:
 ✨ 文本数据初始化 (sentence_trex_data.json)
 ✨ 部分冻结机制 (freeze_ratio=0.2)
 ✨ Token-based EMA更新
 ✨ Product Key Memory架构

📋 监控要点:
 - 初始化阶段:观察文本数据加载和缓存
 - 训练阶段关注frozen_memories统计
 - EMA更新监控update_ratio和coverage指标
 - 生成质量:对比词组连贯性改善

⚡ 进程状态检查:
ps aux | grep $TRAIN_PID

EOF
# Give the background job a moment to start (or fail fast), then report
# whether it is still alive.
sleep 2
if ps -p "$TRAIN_PID" > /dev/null; then
  echo "✅ 训练进程正在运行 (PID: $TRAIN_PID)"
  # Show the most recent log lines as a quick preview.
  echo ""
  echo "📋 初始日志预览:"
  echo "----------------------------------------"
  # Test the log explicitly instead of chaining `|| echo` after a pipeline:
  # a pipeline's status is that of its last command (head), so the fallback
  # could never fire, and `tail -f` could stall the script for up to 5 s.
  if [[ -s "$LOG_FILE" ]]; then
    tail -n 10 -- "$LOG_FILE"
  else
    echo "日志文件尚未生成,请稍等..."
  fi
  echo "----------------------------------------"
else
  echo "❌ 训练进程启动失败,请检查日志:"
  echo "cat $LOG_FILE"
fi
# Closing checklist: the points this experiment is meant to verify, plus a
# pointer to the experiment record. Emitted as a single here-document.
cat << EOF

🎯 实验1.4.9-02核心验证点:
 1. Memory bank是否成功用文本数据初始化
 2. 冻结机制是否正常工作 (20%条目不更新)
 3. 生成质量是否有明显改善
 4. 训练稳定性是否提升

📖 实验记录: experiment/EXPERIMENT_1_4_9-02.md
🚀 实验1.4.9-02启动完成
EOF