#!/bin/bash
#########################################################
# Experiment 1.4.9-02 - Memory Bank text initialization + partial freezing
#
# Goals:
#   1. Evaluate initializing memory_bank from meaningful text
#   2. Evaluate partial memory_bank freezing (freeze_ratio=0.2)
#
# Key features:
#   - memory_bank is initialized from the sentence_trex_data.json text data
#   - 20% of the memory_bank entries are frozen to protect important knowledge
#   - Token-based memory mechanism + EMA updates
#   - Product Key Memory architecture
#########################################################

echo "=========================================="
echo "🚀 开始实验 1.4.9-02 - Memory Bank优化"
echo "🔥 新特性: 文本初始化 + 部分冻结机制"
echo "=========================================="

# Experiment identity and output locations. All paths are relative, so the
# script must be started from the directory that should contain "out/".
EXPERIMENT_NAME="experiment_1_4_9-02"
OUTPUT_DIR="out/${EXPERIMENT_NAME}"
LOG_FILE="${OUTPUT_DIR}/experiment.log"
PID_FILE="${OUTPUT_DIR}/train.pid"

# Create the output directory. Abort if that fails: the log file and the PID
# file both live inside it, so nothing later in the script could succeed.
mkdir -p -- "$OUTPUT_DIR" || exit 1

echo "📂 实验输出目录: $OUTPUT_DIR"
echo "📝 日志文件: $LOG_FILE"
# Core model configuration
MODEL_TYPE="model_memory"  # use the memory architecture
DIM=512
N_LAYERS=8
N_HEADS=32
MAX_SEQ_LEN=512

# Memory Bank configuration - key parameters of experiment 1.4.9-02
KNOWLEDGE_NUM=1048576  # 1M memory entries (2^20)
KNOWLEDGE_LENGTH=8     # 8 tokens per memory entry
KNOWLEDGE_DIM=128      # memory vector dimension
FREEZE_RATIO=0.2       # new feature: freeze 20% of the memory entries

# EMA update configuration
USE_EMA_UPDATE="True"
EMA_DECAY=0.9      # EMA decay rate
EMA_UPDATE_FREQ=5  # EMA update frequency

# NOTE(review): nothing in this script appears to forward FREEZE_RATIO,
# USE_EMA_UPDATE, EMA_DECAY, EMA_UPDATE_FREQ, MIXED_PRECISION or USE_SWANLAB
# to the training command; they are at most printed in the summary. Confirm
# that the training code's own defaults match the values declared here.

# Training configuration
EPOCHS=3
BATCH_SIZE=48
ACCUMULATION_STEPS=8
LEARNING_RATE=2e-4
DTYPE="bfloat16"
GRAD_CLIP=1.0
BALANCE_LOSS_COEF=0.01  # balance loss coefficient

# Data paths
DATA_PATH="./dataset/stable/merged_pretrain.jsonl"
DATABASE_INIT_PATH="./dataset/stable/sentence_trex_data.json"  # text data used for initialization
# Cache for the initialized memory bank; the name encodes entry count and length.
CACHE_PATH="cache/memory_bank_init_${KNOWLEDGE_NUM}_${KNOWLEDGE_LENGTH}.pt"

# GPU and performance configuration
export CUDA_VISIBLE_DEVICES=0
NUM_WORKERS=8
MIXED_PRECISION="bf16"

# Monitoring configuration
USE_SWANLAB="True"
SWANLAB_PROJECT="MiniMind-Experiment-1.4.9-02"
SWANLAB_ONLINE="False"  # offline mode

# Validation and logging configuration
LOG_INTERVAL=100
VAL_INTERVAL=200
PROFILE="True"
PROFILE_INTERVAL=10
MEMORY_MONITOR="False"  # memory monitoring disabled to reduce overhead
#######################################
# Print a human-readable summary of the experiment configuration.
# Globals (read):
#   MODEL_TYPE, KNOWLEDGE_NUM, KNOWLEDGE_LENGTH, FREEZE_RATIO,
#   DATABASE_INIT_PATH, DIM, N_LAYERS, N_HEADS, MAX_SEQ_LEN, EPOCHS,
#   BATCH_SIZE, LEARNING_RATE, DTYPE, EMA_DECAY, EMA_UPDATE_FREQ,
#   SWANLAB_PROJECT, LOG_INTERVAL
# Outputs:
#   Writes the summary to stdout.
#######################################
print_config_summary() {
  local frozen_count

  # Derive the frozen-entry count from FREEZE_RATIO instead of a hard-coded
  # 20%, so the summary stays correct when the ratio is changed. Bash
  # arithmetic is integer-only, hence awk; "%d" truncates the product, which
  # gives the same result as the former integer division for ratio 0.2.
  frozen_count=$(awk -v n="$KNOWLEDGE_NUM" -v r="$FREEZE_RATIO" \
    'BEGIN { printf "%d", n * r }')

  echo "=========================================="
  echo "📋 实验配置摘要"
  echo "=========================================="
  echo "🔥 核心特性:"
  echo " - Model Type: $MODEL_TYPE"
  echo " - Memory Bank Size: $KNOWLEDGE_NUM 条"
  echo " - Memory Length: $KNOWLEDGE_LENGTH tokens"
  echo " - Freeze Ratio: $FREEZE_RATIO (冻结 ${frozen_count} 条记忆)"
  echo " - Text Initialization: $DATABASE_INIT_PATH"
  echo ""
  echo "🏗️ 模型架构:"
  echo " - Dimension: $DIM"
  echo " - Layers: $N_LAYERS"
  echo " - Heads: $N_HEADS"
  echo " - Max Seq Len: $MAX_SEQ_LEN"
  echo ""
  echo "📚 训练设置:"
  echo " - Epochs: $EPOCHS"
  echo " - Batch Size: $BATCH_SIZE"
  echo " - Learning Rate: $LEARNING_RATE"
  echo " - Data Type: $DTYPE"
  echo ""
  echo "⚡ EMA配置:"
  echo " - EMA Decay: $EMA_DECAY"
  echo " - Update Frequency: $EMA_UPDATE_FREQ"
  echo ""
  echo "📊 监控:"
  echo " - SwanLab Project: $SWANLAB_PROJECT"
  echo " - Log Interval: $LOG_INTERVAL"
  echo "=========================================="
}

print_config_summary
# Verify that the required input files exist before anything is launched.
# Error messages go to stderr so they remain visible when stdout is redirected.
echo "🔍 检查必要文件..."
if [[ ! -f "$DATA_PATH" ]]; then
  echo "❌ 错误: 训练数据文件不存在: $DATA_PATH" >&2
  exit 1
fi

if [[ ! -f "$DATABASE_INIT_PATH" ]]; then
  echo "❌ 错误: Memory初始化数据文件不存在: $DATABASE_INIT_PATH" >&2
  exit 1
fi

echo "✅ 文件检查通过"
#######################################
# Assemble the training command line into TRAIN_CMD (modelled on the pattern
# that worked in experiment_1_4_6.sh).
#
# TRAIN_CMD is deliberately a single string with escaped quotes around
# path-like values: the script later echoes it, appends it to the log and
# expands it verbatim into a generated launcher script.
#
# Globals (read):
#   CUDA_VISIBLE_DEVICES, OUTPUT_DIR, EPOCHS, BATCH_SIZE, LEARNING_RATE,
#   DTYPE, NUM_WORKERS, ACCUMULATION_STEPS, GRAD_CLIP, LOG_INTERVAL,
#   VAL_INTERVAL, DIM, N_LAYERS, N_HEADS, MAX_SEQ_LEN, DATA_PATH,
#   KNOWLEDGE_NUM, KNOWLEDGE_LENGTH, KNOWLEDGE_DIM, DATABASE_INIT_PATH,
#   CACHE_PATH, MODEL_TYPE, BALANCE_LOSS_COEF, PROFILE, SWANLAB_PROJECT,
#   SWANLAB_ONLINE, PROFILE_INTERVAL, MEMORY_MONITOR
# Globals (written):
#   TRAIN_CMD
#######################################
build_train_cmd() {
  TRAIN_CMD="CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES python train_pretrain_accelerate.py"
  TRAIN_CMD+=" --out_dir \"$OUTPUT_DIR\""
  TRAIN_CMD+=" --epochs $EPOCHS"
  TRAIN_CMD+=" --embedding_epoch 2"
  TRAIN_CMD+=" --batch_size $BATCH_SIZE"
  TRAIN_CMD+=" --learning_rate $LEARNING_RATE"
  TRAIN_CMD+=" --dtype $DTYPE"
  TRAIN_CMD+=" --num_workers $NUM_WORKERS"
  TRAIN_CMD+=" --accumulation_steps $ACCUMULATION_STEPS"
  TRAIN_CMD+=" --grad_clip $GRAD_CLIP"
  TRAIN_CMD+=" --warmup_iters 0"
  TRAIN_CMD+=" --log_interval $LOG_INTERVAL"
  TRAIN_CMD+=" --val_interval $VAL_INTERVAL"
  TRAIN_CMD+=" --dim $DIM"
  TRAIN_CMD+=" --n_layers $N_LAYERS"
  TRAIN_CMD+=" --n_heads $N_HEADS"
  TRAIN_CMD+=" --max_seq_len $MAX_SEQ_LEN"
  TRAIN_CMD+=" --data_path \"$DATA_PATH\""
  TRAIN_CMD+=" --knowledge_num $KNOWLEDGE_NUM"
  TRAIN_CMD+=" --knowledge_length $KNOWLEDGE_LENGTH"
  TRAIN_CMD+=" --knowledge_dim $KNOWLEDGE_DIM"
  TRAIN_CMD+=" --database_init_path \"$DATABASE_INIT_PATH\""
  TRAIN_CMD+=" --cluster_cache_path \"$CACHE_PATH\""
  TRAIN_CMD+=" --model_type \"$MODEL_TYPE\""
  TRAIN_CMD+=" --balance_loss_coef $BALANCE_LOSS_COEF"

  # NOTE(review): FREEZE_RATIO and the EMA_* settings are not forwarded here;
  # confirm whether the training script expects flags for them.

  # Optional value-less flags.
  # The SwanLab switch is currently disabled (USE_SWANLAB is not consulted):
  # TRAIN_CMD+=" --use_swanlab"
  # Honour the PROFILE toggle the same way MEMORY_MONITOR is honoured below;
  # with the shipped PROFILE="True" the flag is still emitted.
  if [[ "$PROFILE" == "True" ]]; then
    TRAIN_CMD+=" --profile"
  fi
  TRAIN_CMD+=" --use_flash_attn"

  # Optional parameters that carry a value.
  TRAIN_CMD+=" --swanlab_project \"$SWANLAB_PROJECT\""
  TRAIN_CMD+=" --swanlab_online $SWANLAB_ONLINE"
  TRAIN_CMD+=" --profile_interval $PROFILE_INTERVAL"

  # Memory monitor flag, only when enabled.
  if [[ "$MEMORY_MONITOR" == "True" ]]; then
    TRAIN_CMD+=" --memory_monitor"
  fi
}

build_train_cmd
#######################################
# Write a standalone launcher script that runs the given command line and
# then reports the end time and that command's exit code.
# Arguments:
#   $1 - path of the launcher script to create
#   $2 - command line to embed verbatim in the launcher
# Returns:
#   Status of the final chmod (non-zero if the file could not be prepared).
#######################################
write_train_script() {
  local script_path=$1
  local train_cmd=$2

  # Unquoted delimiter on purpose: $train_cmd is expanded now, while the
  # escaped \$... parts are evaluated when the generated script runs.
  # The exit status is captured right after the command; printing "\$?" after
  # an echo would always report the echo's status (0).
  cat > "$script_path" << EOF
#!/bin/bash
$train_cmd
exit_code=\$?
echo "结束时间: \$(date)"
echo "退出代码: \$exit_code"
exit "\$exit_code"
EOF
  chmod +x "$script_path"
}

echo ""
echo "🚀 启动训练..."
echo "📝 完整训练命令:"
echo "$TRAIN_CMD"
echo ""
echo "⏰ 预计训练时间: 约6-8小时"
echo "📊 实时监控: 查看 $LOG_FILE"
echo ""

# Record the command and the start time in the log file.
echo "执行命令: $TRAIN_CMD" >> "$LOG_FILE"
echo "开始时间: $(date)" >> "$LOG_FILE"

# Create the launcher script (same approach as experiment 1.4.6). mktemp
# gives an unpredictable name, so another user on a shared machine cannot
# pre-create or swap the file. The file is left in place after launch.
TRAIN_SCRIPT=$(mktemp "${TMPDIR:-/tmp}/train_1_4_9-02.XXXXXX") || exit 1
write_train_script "$TRAIN_SCRIPT" "$TRAIN_CMD"

# Run the launcher in the background, detached from the terminal via nohup;
# stdout and stderr are appended to the log file.
nohup bash "$TRAIN_SCRIPT" >> "$LOG_FILE" 2>&1 &
TRAIN_PID=$!
echo "$TRAIN_PID" > "$PID_FILE"
#######################################
# Print the post-launch information: PID, log location, handy monitoring and
# stop commands, the experiment's feature list and what to watch for.
# Globals (read):
#   TRAIN_PID, LOG_FILE, FREEZE_RATIO
# Outputs:
#   Writes the information to stdout.
#######################################
print_launch_info() {
  echo "=========================================="
  echo "✅ 实验1.4.9-02已启动"
  echo "🆔 进程ID: $TRAIN_PID"
  echo "📝 日志文件: $LOG_FILE"
  echo "📊 监控命令: tail -f $LOG_FILE"
  echo "🛑 停止命令: kill $TRAIN_PID"
  echo "=========================================="
  echo ""
  echo "🔥 实验1.4.9-02 - Memory Bank优化特性:"
  echo " ✨ 文本数据初始化 (sentence_trex_data.json)"
  # Report the configured ratio rather than a hard-coded 0.2 so the message
  # cannot drift from FREEZE_RATIO.
  echo " ✨ 部分冻结机制 (freeze_ratio=${FREEZE_RATIO})"
  echo " ✨ Token-based EMA更新"
  echo " ✨ Product Key Memory架构"
  echo ""
  echo "📋 监控要点:"
  echo " - 初始化阶段:观察文本数据加载和缓存"
  echo " - 训练阶段:关注frozen_memories统计"
  echo " - EMA更新:监控update_ratio和coverage指标"
  echo " - 生成质量:对比词组连贯性改善"
  echo ""
  echo "⚡ 进程状态检查:"
  echo "ps aux | grep $TRAIN_PID"
  echo ""
}

print_launch_info
# Show the initial process state after giving the trainer a moment to start.
sleep 2
if ps -p "$TRAIN_PID" > /dev/null; then
  echo "✅ 训练进程正在运行 (PID: $TRAIN_PID)"

  # Preview the log output.
  echo ""
  echo "📋 初始日志预览:"
  echo "----------------------------------------"
  # Test for the log explicitly: a "|| fallback" after the pipeline would
  # never fire, because the pipeline's status is that of head, which succeeds
  # even when tail cannot open the file.
  if [[ -s "$LOG_FILE" ]]; then
    # Follow the log for at most 5 seconds, stopping after 10 lines.
    timeout 5 tail -f "$LOG_FILE" | head -n 10
  else
    echo "日志文件尚未生成,请稍等..."
  fi
  echo "----------------------------------------"
else
  echo "❌ 训练进程启动失败,请检查日志:"
  echo "cat $LOG_FILE"
fi
# Closing notes: what this experiment is meant to verify and where the
# write-up lives.
echo ""
echo "🎯 实验1.4.9-02核心验证点:"
echo " 1. Memory bank是否成功用文本数据初始化"
echo " 2. 冻结机制是否正常工作 (20%条目不更新)"
echo " 3. 生成质量是否有明显改善"
echo " 4. 训练稳定性是否提升"
echo ""
echo "📖 实验记录: experiment/EXPERIMENT_1_4_9-02.md"
echo "🚀 实验1.4.9-02启动完成!"