This commit is contained in:
iomgaa 2025-09-05 04:00:43 +00:00
parent 92d950fc42
commit f9a7f96607
5 changed files with 500 additions and 5 deletions

View File

@ -337,7 +337,7 @@ class MiniMindBlock(nn.Module):
self.memory_gate = MemoryGate(config)
self.gated_memory_fusion = GatedMemoryFusion(config)
self.attentionpool = nn.Linear(config.dim, 1)
# self.attentionpool = nn.Linear(config.dim, 1)
def forward(self, x, pos_cis, memory_bank, tok_embeddings, collect_ema_stats=False):
"""

View File

@ -0,0 +1,249 @@
#!/bin/bash
#########################################################
# Experiment 1.4.7 - memory bank text initialization + partial freezing
#
# Goals:
#   1. Validate initializing memory_bank from meaningful text data.
#   2. Validate the partial memory_bank freeze mechanism
#      (controlled by FREEZE_RATIO below).
#
# Key features:
#   - memory_bank initialized from sentence_trex_data.json
#   - a FREEZE_RATIO fraction of memory_bank entries is frozen to
#     protect important knowledge
#   - token-based memory mechanism + EMA updates
#   - Product Key Memory architecture
#########################################################

echo "=========================================="
echo "🚀 开始实验 1.4.7 - Memory Bank优化"
echo "🔥 新特性: 文本初始化 + 部分冻结机制"
echo "=========================================="

# Experiment layout
EXPERIMENT_NAME="experiment_1_4_7-06"
OUTPUT_DIR="out/${EXPERIMENT_NAME}"
LOG_FILE="${OUTPUT_DIR}/experiment.log"
PID_FILE="${OUTPUT_DIR}/train.pid"

# Create the output directory (quoted in case the path ever contains spaces)
mkdir -p "$OUTPUT_DIR"
echo "📂 实验输出目录: $OUTPUT_DIR"
echo "📝 日志文件: $LOG_FILE"

# Core model parameters
MODEL_TYPE="model_memory"   # 🔥 memory-augmented architecture
DIM=512
N_LAYERS=8
N_HEADS=32
MAX_SEQ_LEN=512

# 🔥 Memory bank configuration - key parameters of experiment 1.4.7
KNOWLEDGE_NUM=1048576   # 1M memory entries (2^20)
KNOWLEDGE_LENGTH=8      # tokens per memory entry
KNOWLEDGE_DIM=128       # memory vector dimension
FREEZE_RATIO=0.6        # 🔥 new feature: fraction of memory entries frozen
# Derive the frozen-entry count from FREEZE_RATIO instead of a hard-coded
# percentage, so the printed summary always matches the actual setting.
FROZEN_COUNT=$(awk "BEGIN{printf \"%d\", $KNOWLEDGE_NUM * $FREEZE_RATIO}")
# NOTE(review): FREEZE_RATIO is never forwarded to train_pretrain_accelerate.py
# below — confirm the trainer reads it from elsewhere, or add a flag once it
# supports one.

# EMA update configuration
# NOTE(review): these values are displayed but not forwarded to the training
# command either — confirm the trainer's defaults match.
USE_EMA_UPDATE="True"
EMA_DECAY=0.9           # EMA decay rate
EMA_UPDATE_FREQ=5       # EMA update frequency (steps)

# Training configuration
EPOCHS=3
BATCH_SIZE=48
ACCUMULATION_STEPS=8
LEARNING_RATE=2e-4
DTYPE="bfloat16"
GRAD_CLIP=1.0
BALANCE_LOSS_COEF=0.01  # balance loss coefficient

# Data paths
DATA_PATH="/home/zym/Code/stable/merged_pretrain.jsonl"
DATABASE_INIT_PATH="/home/zym/Code/stable/sentence_trex_data.json"  # 🔥 text-based initialization
CACHE_PATH="cache/memory_bank_init_${KNOWLEDGE_NUM}_${KNOWLEDGE_LENGTH}.pt"  # 🔥 memory init cache

# GPU / performance configuration
export CUDA_VISIBLE_DEVICES=0
NUM_WORKERS=8
MIXED_PRECISION="bf16"

# Monitoring configuration
USE_SWANLAB="True"
SWANLAB_PROJECT="MiniMind-Experiment-1.4.7"
SWANLAB_ONLINE="False"  # offline mode

# Validation / logging configuration
LOG_INTERVAL=100
VAL_INTERVAL=200
PROFILE="True"
PROFILE_INTERVAL=10
MEMORY_MONITOR="False"  # memory monitor disabled to reduce overhead

echo "=========================================="
echo "📋 实验配置摘要"
echo "=========================================="
echo "🔥 核心特性:"
echo " - Model Type: $MODEL_TYPE"
echo " - Memory Bank Size: $KNOWLEDGE_NUM"
echo " - Memory Length: $KNOWLEDGE_LENGTH tokens"
echo " - Freeze Ratio: $FREEZE_RATIO (冻结 $FROZEN_COUNT 条记忆)"
echo " - Text Initialization: $DATABASE_INIT_PATH"
echo ""
echo "🏗️ 模型架构:"
echo " - Dimension: $DIM"
echo " - Layers: $N_LAYERS"
echo " - Heads: $N_HEADS"
echo " - Max Seq Len: $MAX_SEQ_LEN"
echo ""
echo "📚 训练设置:"
echo " - Epochs: $EPOCHS"
echo " - Batch Size: $BATCH_SIZE"
echo " - Learning Rate: $LEARNING_RATE"
echo " - Data Type: $DTYPE"
echo ""
echo "⚡ EMA配置:"
echo " - EMA Decay: $EMA_DECAY"
echo " - Update Frequency: $EMA_UPDATE_FREQ"
echo ""
echo "📊 监控:"
echo " - SwanLab Project: $SWANLAB_PROJECT"
echo " - Log Interval: $LOG_INTERVAL"
echo "=========================================="

# Sanity-check required input files before launching anything
echo "🔍 检查必要文件..."
if [[ ! -f "$DATA_PATH" ]]; then
    echo "❌ 错误: 训练数据文件不存在: $DATA_PATH"
    exit 1
fi
if [[ ! -f "$DATABASE_INIT_PATH" ]]; then
    echo "❌ 错误: Memory初始化数据文件不存在: $DATABASE_INIT_PATH"
    exit 1
fi
echo "✅ 文件检查通过"

# Build the training command (follows the pattern of experiment_1_4_6.sh)
TRAIN_CMD="CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES python train_pretrain_accelerate.py"
TRAIN_CMD+=" --out_dir \"$OUTPUT_DIR\""
TRAIN_CMD+=" --epochs $EPOCHS"
TRAIN_CMD+=" --embedding_epoch 2"
TRAIN_CMD+=" --batch_size $BATCH_SIZE"
TRAIN_CMD+=" --learning_rate $LEARNING_RATE"
TRAIN_CMD+=" --dtype $DTYPE"
TRAIN_CMD+=" --num_workers $NUM_WORKERS"
TRAIN_CMD+=" --accumulation_steps $ACCUMULATION_STEPS"
TRAIN_CMD+=" --grad_clip $GRAD_CLIP"
TRAIN_CMD+=" --warmup_iters 0"
TRAIN_CMD+=" --log_interval $LOG_INTERVAL"
TRAIN_CMD+=" --val_interval $VAL_INTERVAL"
TRAIN_CMD+=" --dim $DIM"
TRAIN_CMD+=" --n_layers $N_LAYERS"
TRAIN_CMD+=" --n_heads $N_HEADS"
TRAIN_CMD+=" --max_seq_len $MAX_SEQ_LEN"
TRAIN_CMD+=" --data_path \"$DATA_PATH\""
TRAIN_CMD+=" --knowledge_num $KNOWLEDGE_NUM"
TRAIN_CMD+=" --knowledge_length $KNOWLEDGE_LENGTH"
TRAIN_CMD+=" --knowledge_dim $KNOWLEDGE_DIM"
TRAIN_CMD+=" --database_init_path \"$DATABASE_INIT_PATH\""
TRAIN_CMD+=" --cluster_cache_path \"$CACHE_PATH\""
TRAIN_CMD+=" --model_type \"$MODEL_TYPE\""
TRAIN_CMD+=" --balance_loss_coef $BALANCE_LOSS_COEF"

# Optional boolean flags (no value)
TRAIN_CMD+=" --use_swanlab"
TRAIN_CMD+=" --profile"
TRAIN_CMD+=" --use_flash_attn"

# Optional valued arguments
TRAIN_CMD+=" --swanlab_project \"$SWANLAB_PROJECT\""
# --swanlab_online deliberately omitted: offline mode (SWANLAB_ONLINE=False)
TRAIN_CMD+=" --profile_interval $PROFILE_INTERVAL"

# Memory monitor flag, only when enabled
if [[ "$MEMORY_MONITOR" == "True" ]]; then
    TRAIN_CMD+=" --memory_monitor"
fi

echo ""
echo "🚀 启动训练..."
echo "📝 完整训练命令:"
echo "$TRAIN_CMD"
echo ""
echo "⏰ 预计训练时间: 约6-8小时"
echo "📊 实时监控: 查看 $LOG_FILE"
echo ""

# Record command and start time in the log file
echo "执行命令: $TRAIN_CMD" >> "$LOG_FILE"
echo "开始时间: $(date)" >> "$LOG_FILE"

# Create the wrapper script (same pattern that worked in 1.4.6).
# $TRAIN_CMD expands now; \$(date) and \$RC expand when the wrapper runs.
# The exit code is captured into RC immediately after the command — the
# previous version printed \$? after an intervening echo, which reported
# the echo's status instead of the training run's.
TRAIN_SCRIPT="/tmp/train_1_4_7-06.sh"
cat > "$TRAIN_SCRIPT" << EOF
#!/bin/bash
cd /home/zym/Code/Minimind
source /home/user/miniconda3/bin/activate
conda activate minimind
$TRAIN_CMD
RC=\$?
echo "结束时间: \$(date)"
echo "退出代码: \$RC"
EOF
chmod +x "$TRAIN_SCRIPT"

# Launch the wrapper in the background with nohup
nohup bash "$TRAIN_SCRIPT" >> "$LOG_FILE" 2>&1 &
TRAIN_PID=$!
echo "$TRAIN_PID" > "$PID_FILE"

echo "=========================================="
echo "✅ 实验1.4.7已启动"
echo "🆔 进程ID: $TRAIN_PID"
echo "📝 日志文件: $LOG_FILE"
echo "📊 监控命令: tail -f $LOG_FILE"
echo "🛑 停止命令: kill $TRAIN_PID"
echo "=========================================="
echo ""
echo "🔥 实验1.4.7 - Memory Bank优化特性:"
echo " ✨ 文本数据初始化 (sentence_trex_data.json)"
echo " ✨ 部分冻结机制 (freeze_ratio=$FREEZE_RATIO)"
echo " ✨ Token-based EMA更新"
echo " ✨ Product Key Memory架构"
echo ""
echo "📋 监控要点:"
echo " - 初始化阶段:观察文本数据加载和缓存"
echo " - 训练阶段关注frozen_memories统计"
echo " - EMA更新监控update_ratio和coverage指标"
echo " - 生成质量:对比词组连贯性改善"
echo ""
echo "⚡ 进程状态检查:"
echo "ps aux | grep $TRAIN_PID"
echo ""

# Show the initial process status after a short grace period
sleep 2
if ps -p "$TRAIN_PID" > /dev/null; then
    echo "✅ 训练进程正在运行 (PID: $TRAIN_PID)"
    # Preview the first lines of the log
    echo ""
    echo "📋 初始日志预览:"
    echo "----------------------------------------"
    timeout 5 tail -f "$LOG_FILE" | head -10 || echo "日志文件尚未生成,请稍等..."
    echo "----------------------------------------"
else
    echo "❌ 训练进程启动失败,请检查日志:"
    echo "cat $LOG_FILE"
fi

echo ""
echo "🎯 实验1.4.7核心验证点:"
echo " 1. Memory bank是否成功用文本数据初始化"
echo " 2. 冻结机制是否正常工作 (冻结比例 $FREEZE_RATIO)"
echo " 3. 生成质量是否有明显改善"
echo " 4. 训练稳定性是否提升"
echo ""
echo "📖 实验记录: experiment/EXPERIMENT_1_4_7-06.md"
echo "🚀 实验1.4.7启动完成!"

View File

@ -59,8 +59,8 @@ GRAD_CLIP=1.0
BALANCE_LOSS_COEF=0.01 # 平衡损失系数
# 数据路径配置
DATA_PATH="./dataset/stable/merged_pretrain.jsonl"
DATABASE_INIT_PATH="./dataset/stable/sentence_trex_data.json" # 🔥 文本数据初始化
DATA_PATH="/home/zym/Code/stable/merged_pretrain.jsonl"
DATABASE_INIT_PATH="/home/zym/Code/stable/sentence_trex_data.json" # 🔥 文本数据初始化
CACHE_PATH="cache/memory_bank_init_${KNOWLEDGE_NUM}_${KNOWLEDGE_LENGTH}.pt" # 🔥 Memory初始化缓存
# GPU和性能配置

View File

@ -0,0 +1,246 @@
#!/bin/bash
#########################################################
# Experiment 1.4.9-04 - memory bank text initialization + partial freezing
#
# Goals:
#   1. Validate initializing memory_bank from meaningful text data.
#   2. Validate the partial memory_bank freeze mechanism
#      (controlled by FREEZE_RATIO below).
#
# Key features:
#   - memory_bank initialized from sentence_trex_data.json
#   - a FREEZE_RATIO fraction of memory_bank entries is frozen to
#     protect important knowledge
#   - token-based memory mechanism + EMA updates
#   - Product Key Memory architecture
#########################################################

echo "=========================================="
echo "🚀 开始实验 1.4.9-04 - Memory Bank优化"
echo "🔥 新特性: 文本初始化 + 部分冻结机制"
echo "=========================================="

# Experiment layout
EXPERIMENT_NAME="experiment_1_4_9-04"
OUTPUT_DIR="out/${EXPERIMENT_NAME}"
LOG_FILE="${OUTPUT_DIR}/experiment.log"
PID_FILE="${OUTPUT_DIR}/train.pid"

# Create the output directory (quoted in case the path ever contains spaces)
mkdir -p "$OUTPUT_DIR"
echo "📂 实验输出目录: $OUTPUT_DIR"
echo "📝 日志文件: $LOG_FILE"

# Core model parameters
MODEL_TYPE="model_memory"   # 🔥 memory-augmented architecture
DIM=512
N_LAYERS=1   # NOTE(review): 1 layer vs 8 in experiment 1.4.7 — confirm intentional ablation
N_HEADS=32
MAX_SEQ_LEN=512

# 🔥 Memory bank configuration - key parameters of experiment 1.4.9-04
KNOWLEDGE_NUM=1048576   # 1M memory entries (2^20)
KNOWLEDGE_LENGTH=8      # tokens per memory entry
KNOWLEDGE_DIM=128       # memory vector dimension
FREEZE_RATIO=0.2        # 🔥 new feature: fraction of memory entries frozen
# Derive the frozen-entry count from FREEZE_RATIO instead of a hard-coded
# percentage, so the printed summary always matches the actual setting.
FROZEN_COUNT=$(awk "BEGIN{printf \"%d\", $KNOWLEDGE_NUM * $FREEZE_RATIO}")
# NOTE(review): FREEZE_RATIO is never forwarded to train_pretrain_accelerate.py
# below — confirm the trainer reads it from elsewhere, or add a flag once it
# supports one.

# EMA update configuration
# NOTE(review): these values are displayed but not forwarded to the training
# command either — confirm the trainer's defaults match.
USE_EMA_UPDATE="True"
EMA_DECAY=0.9           # EMA decay rate
EMA_UPDATE_FREQ=5       # EMA update frequency (steps)

# Training configuration
EPOCHS=3
BATCH_SIZE=48
ACCUMULATION_STEPS=8
LEARNING_RATE=2e-4
DTYPE="bfloat16"
GRAD_CLIP=1.0
BALANCE_LOSS_COEF=0.01  # balance loss coefficient

# Data paths
DATA_PATH="/home/zym/Code/stable/merged_pretrain.jsonl"
DATABASE_INIT_PATH="/home/zym/Code/stable/sentence_trex_data.json"  # 🔥 text-based initialization
CACHE_PATH="cache/memory_bank_init_${KNOWLEDGE_NUM}_${KNOWLEDGE_LENGTH}.pt"  # 🔥 memory init cache

# GPU / performance configuration
export CUDA_VISIBLE_DEVICES=0
NUM_WORKERS=8
MIXED_PRECISION="bf16"

# Monitoring configuration
USE_SWANLAB="True"
SWANLAB_PROJECT="MiniMind-Experiment-1.4.9-04"
SWANLAB_ONLINE="False"  # offline mode

# Validation / logging configuration
LOG_INTERVAL=100
VAL_INTERVAL=200
PROFILE="True"
PROFILE_INTERVAL=10
MEMORY_MONITOR="False"  # memory monitor disabled to reduce overhead

echo "=========================================="
echo "📋 实验配置摘要"
echo "=========================================="
echo "🔥 核心特性:"
echo " - Model Type: $MODEL_TYPE"
echo " - Memory Bank Size: $KNOWLEDGE_NUM"
echo " - Memory Length: $KNOWLEDGE_LENGTH tokens"
echo " - Freeze Ratio: $FREEZE_RATIO (冻结 $FROZEN_COUNT 条记忆)"
echo " - Text Initialization: $DATABASE_INIT_PATH"
echo ""
echo "🏗️ 模型架构:"
echo " - Dimension: $DIM"
echo " - Layers: $N_LAYERS"
echo " - Heads: $N_HEADS"
echo " - Max Seq Len: $MAX_SEQ_LEN"
echo ""
echo "📚 训练设置:"
echo " - Epochs: $EPOCHS"
echo " - Batch Size: $BATCH_SIZE"
echo " - Learning Rate: $LEARNING_RATE"
echo " - Data Type: $DTYPE"
echo ""
echo "⚡ EMA配置:"
echo " - EMA Decay: $EMA_DECAY"
echo " - Update Frequency: $EMA_UPDATE_FREQ"
echo ""
echo "📊 监控:"
echo " - SwanLab Project: $SWANLAB_PROJECT"
echo " - Log Interval: $LOG_INTERVAL"
echo "=========================================="

# Sanity-check required input files before launching anything
echo "🔍 检查必要文件..."
if [[ ! -f "$DATA_PATH" ]]; then
    echo "❌ 错误: 训练数据文件不存在: $DATA_PATH"
    exit 1
fi
if [[ ! -f "$DATABASE_INIT_PATH" ]]; then
    echo "❌ 错误: Memory初始化数据文件不存在: $DATABASE_INIT_PATH"
    exit 1
fi
echo "✅ 文件检查通过"

# Build the training command (follows the pattern of experiment_1_4_6.sh)
TRAIN_CMD="CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES python train_pretrain_accelerate.py"
TRAIN_CMD+=" --out_dir \"$OUTPUT_DIR\""
TRAIN_CMD+=" --epochs $EPOCHS"
TRAIN_CMD+=" --embedding_epoch 2"
TRAIN_CMD+=" --batch_size $BATCH_SIZE"
TRAIN_CMD+=" --learning_rate $LEARNING_RATE"
TRAIN_CMD+=" --dtype $DTYPE"
TRAIN_CMD+=" --num_workers $NUM_WORKERS"
TRAIN_CMD+=" --accumulation_steps $ACCUMULATION_STEPS"
TRAIN_CMD+=" --grad_clip $GRAD_CLIP"
TRAIN_CMD+=" --warmup_iters 0"
TRAIN_CMD+=" --log_interval $LOG_INTERVAL"
TRAIN_CMD+=" --val_interval $VAL_INTERVAL"
TRAIN_CMD+=" --dim $DIM"
TRAIN_CMD+=" --n_layers $N_LAYERS"
TRAIN_CMD+=" --n_heads $N_HEADS"
TRAIN_CMD+=" --max_seq_len $MAX_SEQ_LEN"
TRAIN_CMD+=" --data_path \"$DATA_PATH\""
TRAIN_CMD+=" --knowledge_num $KNOWLEDGE_NUM"
TRAIN_CMD+=" --knowledge_length $KNOWLEDGE_LENGTH"
TRAIN_CMD+=" --knowledge_dim $KNOWLEDGE_DIM"
TRAIN_CMD+=" --database_init_path \"$DATABASE_INIT_PATH\""
TRAIN_CMD+=" --cluster_cache_path \"$CACHE_PATH\""
TRAIN_CMD+=" --model_type \"$MODEL_TYPE\""
TRAIN_CMD+=" --balance_loss_coef $BALANCE_LOSS_COEF"

# Optional boolean flags (no value)
# NOTE(review): --use_swanlab is disabled here, yet --swanlab_project and
# --swanlab_online are still passed below — confirm the trainer ignores
# them when swanlab is off.
# TRAIN_CMD+=" --use_swanlab"
TRAIN_CMD+=" --profile"
TRAIN_CMD+=" --use_flash_attn"

# Optional valued arguments
TRAIN_CMD+=" --swanlab_project \"$SWANLAB_PROJECT\""
# NOTE(review): the trainer declares --swanlab_online with argparse
# type=bool, which treats ANY non-empty string (including "False") as
# True — passing $SWANLAB_ONLINE here likely does not disable online mode.
TRAIN_CMD+=" --swanlab_online $SWANLAB_ONLINE"
TRAIN_CMD+=" --profile_interval $PROFILE_INTERVAL"

# Memory monitor flag, only when enabled
if [[ "$MEMORY_MONITOR" == "True" ]]; then
    TRAIN_CMD+=" --memory_monitor"
fi

echo ""
echo "🚀 启动训练..."
echo "📝 完整训练命令:"
echo "$TRAIN_CMD"
echo ""
echo "⏰ 预计训练时间: 约6-8小时"
echo "📊 实时监控: 查看 $LOG_FILE"
echo ""

# Record command and start time in the log file
echo "执行命令: $TRAIN_CMD" >> "$LOG_FILE"
echo "开始时间: $(date)" >> "$LOG_FILE"

# Create the wrapper script (same pattern that worked in 1.4.6).
# $TRAIN_CMD expands now; \$(date) and \$RC expand when the wrapper runs.
# The exit code is captured into RC immediately after the command — the
# previous version printed \$? after an intervening echo, which reported
# the echo's status instead of the training run's.
# NOTE(review): unlike experiment_1_4_7, this wrapper does not cd into the
# repo or activate the conda env — confirm the launching shell already has
# both, or the python invocation will fail.
TRAIN_SCRIPT="/tmp/train_1_4_9-04.sh"
cat > "$TRAIN_SCRIPT" << EOF
#!/bin/bash
$TRAIN_CMD
RC=\$?
echo "结束时间: \$(date)"
echo "退出代码: \$RC"
EOF
chmod +x "$TRAIN_SCRIPT"

# Launch the wrapper in the background with nohup
nohup bash "$TRAIN_SCRIPT" >> "$LOG_FILE" 2>&1 &
TRAIN_PID=$!
echo "$TRAIN_PID" > "$PID_FILE"

echo "=========================================="
echo "✅ 实验1.4.9-04已启动"
echo "🆔 进程ID: $TRAIN_PID"
echo "📝 日志文件: $LOG_FILE"
echo "📊 监控命令: tail -f $LOG_FILE"
echo "🛑 停止命令: kill $TRAIN_PID"
echo "=========================================="
echo ""
echo "🔥 实验1.4.9-04 - Memory Bank优化特性:"
echo " ✨ 文本数据初始化 (sentence_trex_data.json)"
echo " ✨ 部分冻结机制 (freeze_ratio=$FREEZE_RATIO)"
echo " ✨ Token-based EMA更新"
echo " ✨ Product Key Memory架构"
echo ""
echo "📋 监控要点:"
echo " - 初始化阶段:观察文本数据加载和缓存"
echo " - 训练阶段关注frozen_memories统计"
echo " - EMA更新监控update_ratio和coverage指标"
echo " - 生成质量:对比词组连贯性改善"
echo ""
echo "⚡ 进程状态检查:"
echo "ps aux | grep $TRAIN_PID"
echo ""

# Show the initial process status after a short grace period
sleep 2
if ps -p "$TRAIN_PID" > /dev/null; then
    echo "✅ 训练进程正在运行 (PID: $TRAIN_PID)"
    # Preview the first lines of the log
    echo ""
    echo "📋 初始日志预览:"
    echo "----------------------------------------"
    timeout 5 tail -f "$LOG_FILE" | head -10 || echo "日志文件尚未生成,请稍等..."
    echo "----------------------------------------"
else
    echo "❌ 训练进程启动失败,请检查日志:"
    echo "cat $LOG_FILE"
fi

echo ""
echo "🎯 实验1.4.9-04核心验证点:"
echo " 1. Memory bank是否成功用文本数据初始化"
echo " 2. 冻结机制是否正常工作 (冻结比例 $FREEZE_RATIO)"
echo " 3. 生成质量是否有明显改善"
echo " 4. 训练稳定性是否提升"
echo ""
echo "📖 实验记录: experiment/EXPERIMENT_1_4_9-04.md"
echo "🚀 实验1.4.9-04启动完成"

View File

@ -1356,7 +1356,7 @@ def main():
parser.add_argument("--model_size", type=float, default=50.0, help="模型大小")
parser.add_argument("--swanlab_online", type=bool, default=False, help="是否使用在线SwanLab服务")
parser.add_argument("--balance_loss_coef", type=float, default=0.01, help="平衡损失系数")
parser.add_argument("--val_data_path", type=str, default="dataset/stable/eval_data.json", help="验证数据集路径")
parser.add_argument("--val_data_path", type=str, default="/home/zym/Code/stable/eval_data.json", help="验证数据集路径")
parser.add_argument("--val_interval", type=int, default=100, help="验证评估间隔")
args = parser.parse_args()