#!/bin/bash
# ============================================================================
# MiniMind Experiment Script - Experiment 1.4.10
# ============================================================================
#
# 🎯 Experiment goal:
# Build on Experiment 1.4.9 to implement a four-loss system: CE + Balance + Similarity + Diversity.
# Core innovations: Gumbel-Softmax selection, a differentiable similarity loss,
# and a diversity constraint on the candidate set.
#
# Usage:
# bash run_file/experiment_1_4_10.sh
# ============================================================================
# ----------------------------------------------------------------------------
# 🧑‍🔬 Experiment metadata
# ----------------------------------------------------------------------------
EXPERIMENT_VERSION="1.4.10"
EXPERIMENT_DESCRIPTION="Four-loss system experiment - Gumbel-Softmax + differentiable similarity loss + diversity constraint"
RESEARCHER_NAME="AI Assistant"
EXPERIMENT_DATE="$(date '+%Y-%m-%d %H:%M:%S')"
# ----------------------------------------------------------------------------
# 🤖 Environment configuration
# ----------------------------------------------------------------------------
# Debugging and monitoring environment variables
export NCCL_DEBUG=INFO
export PYTHONFAULTHANDLER=1
export CUDA_LAUNCH_BLOCKING=1
# SwanLab configuration
export SWANLAB_PROJECT="MiniMind-Experiment-1.4.10"
# Logging configuration
LOG_DIR="out/experiment_${EXPERIMENT_VERSION}"
mkdir -p "$LOG_DIR"
LOG_FILE="$LOG_DIR/experiment.log"
# ----------------------------------------------------------------------------
# 🤖 Hardware configuration
# ----------------------------------------------------------------------------
CUDA_VISIBLE_DEVICES="0,1,2,3"
NUM_PROCESSES="4"
MIXED_PRECISION="bf16"
MAIN_PROCESS_PORT="29500"
# ----------------------------------------------------------------------------
# 🤖 Model architecture parameters
# ----------------------------------------------------------------------------
MODEL_TYPE="model_memory" # 🔥 Token-based Memory model
MODEL_SIZE="50.0"
DIM="512"
N_LAYERS="8"
N_HEADS="16"
MAX_SEQ_LEN="512"
USE_MOE="false"
# 🔥 Knowledge base configuration (tuned for the four-loss system)
KNOWLEDGE_NUM="1048576" # 1M entries
KNOWLEDGE_LENGTH="8" # 🔥 8 tokens per knowledge entry
KNOWLEDGE_DIM="128" # Kept for compatibility
DISABLE_DB="false"
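# Note: 1048576 = 1024 x 1024 (2^20). This is consistent with a Product-Key
# layout using two 1024-entry sub-key tables (an assumption inferred from the
# Product Key candidate generation described below, not verified in model_memory).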
# ----------------------------------------------------------------------------
# 🤖 Training hyperparameters
# ----------------------------------------------------------------------------
EPOCHS="3"
EMBEDDING_EPOCH="2"
BATCH_SIZE="48" # 🔥 Reduced batch size to fit the heavier per-step computation
ACCUMULATION_STEPS="8" # 🔥 More accumulation steps to keep the effective batch size
LEARNING_RATE="2e-4" # 🔥 Slightly lower learning rate for stability
DTYPE="bfloat16"
GRAD_CLIP="1.0"
WARMUP_ITERS="0"
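# Effective global batch size (assuming gradients are synchronized across the
# 4 processes configured above):
#   BATCH_SIZE (48) x ACCUMULATION_STEPS (8) x NUM_PROCESSES (4) = 1536 sequences
#   per optimizer step, i.e. roughly 1536 x MAX_SEQ_LEN (512) ≈ 786K tokens.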
# 🔥 Four-loss system configuration
BALANCE_LOSS_COEF="0.01" # Balance loss coefficient
SIMILARITY_LOSS_COEF="0.8" # 🔥 Similarity loss coefficient (core loss)
DIVERSITY_LOSS_COEF="0.2" # 🔥 Diversity loss coefficient (discourages duplicate candidates)
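# Minimal sketch of how these coefficients are assumed to combine (the
# authoritative formula lives in train_pretrain_accelerate.py):
#   total_loss = ce_loss
#              + BALANCE_LOSS_COEF    * balance_loss      # 0.01
#              + SIMILARITY_LOSS_COEF * similarity_loss   # 0.8
#              + DIVERSITY_LOSS_COEF  * diversity_loss    # 0.2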
# Data and cache paths
DATA_PATH="dataset/stable/merged_pretrain.jsonl"
DATABASE_INIT_PATH="dataset/stable/sentence_trex_data.json"
CLUSTER_CACHE_PATH="None" # Disable the clustering cache
VAL_DATA_PATH="dataset/stable/eval_data.json"
# Training configuration
NUM_WORKERS="8"
LOG_INTERVAL="100" # 🔥 Frequent logging to watch all four losses
VAL_INTERVAL="100"
SAVE_INTERVAL="10000"
# Profiling configuration
USE_PROFILE="true"
PROFILE_INTERVAL="10"
MEMORY_MONITOR_INTERVAL="100"
# Advanced features
USE_FLASH_ATTN="true"
FAST_CLUSTERING="true"
# Freeze ratio
FREEZE_RATIO="0.2"
# ----------------------------------------------------------------------------
# 🤖 Pre-flight check function
# ----------------------------------------------------------------------------
check_environment() {
    echo "🔍 Checking environment..."
    # Check GPU availability
    if ! nvidia-smi &> /dev/null; then
        echo "❌ Error: no GPU detected or nvidia-smi is unavailable"
        exit 1
    fi
    # Check the CUDA devices
    if ! nvidia-smi -i "$CUDA_VISIBLE_DEVICES" &> /dev/null; then
        echo "❌ Error: GPU(s) $CUDA_VISIBLE_DEVICES are not available"
        exit 1
    fi
    # Check the data files
    if [[ ! -f "$DATA_PATH" ]]; then
        echo "❌ Error: training data file not found: $DATA_PATH"
        exit 1
    fi
    if [[ ! -f "$DATABASE_INIT_PATH" ]]; then
        echo "❌ Error: database initialization file not found: $DATABASE_INIT_PATH"
        exit 1
    fi
    echo "✅ Environment check passed"
}
# ----------------------------------------------------------------------------
# 🤖 Experiment info logging
# ----------------------------------------------------------------------------
log_experiment_info() {
    echo "📝 Logging experiment info..."
    cat > "$LOG_DIR/experiment_info.txt" << EOF
========================================
MiniMind Experiment Info
========================================
Experiment version: $EXPERIMENT_VERSION
Experiment description: $EXPERIMENT_DESCRIPTION
Researcher: $RESEARCHER_NAME
Start time: $EXPERIMENT_DATE
========================================
Hardware configuration:
GPU devices: $CUDA_VISIBLE_DEVICES
Number of processes: $NUM_PROCESSES
Mixed precision: $MIXED_PRECISION
========================================
Model configuration:
Model type: $MODEL_TYPE (Token-based Memory + four-loss system)
Model size: $MODEL_SIZE MB
Dimension: $DIM
Layers: $N_LAYERS
Attention heads: $N_HEADS
Max sequence length: $MAX_SEQ_LEN
Knowledge entries: $KNOWLEDGE_NUM (1M entries)
Knowledge length: $KNOWLEDGE_LENGTH (tokens per entry)
Knowledge dim: $KNOWLEDGE_DIM (kept for compatibility)
========================================
Training configuration:
Epochs: $EPOCHS
Batch size: $BATCH_SIZE (tuned for GPU memory)
Learning rate: $LEARNING_RATE (tuned for stability)
Gradient accumulation: $ACCUMULATION_STEPS (keeps the effective batch size)
Data type: $DTYPE
========================================
🔥 Four-loss system configuration:
Balance loss coefficient: $BALANCE_LOSS_COEF (balances memory selection)
Similarity loss coefficient: $SIMILARITY_LOSS_COEF (semantic matching)
Diversity loss coefficient: $DIVERSITY_LOSS_COEF (candidate-set diversity)
========================================
🔥 Gumbel-Softmax configuration:
Number of candidates: 32 (generated via Product Keys)
Number selected: 1 (best candidate chosen by Gumbel-Softmax)
Temperature: 1.0 (balances exploration and exploitation)
Selection mechanism: hard selection + Straight-Through Estimator
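Reference (assumed PyTorch call, not taken verbatim from the training code):
  torch.nn.functional.gumbel_softmax(candidate_scores, tau=1.0, hard=True)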
========================================
🔥 Core innovation vs. the previous approach:
Previous method: average fusion of 16 memories (no semantic targeting)
New method: 32 candidates → 1 best (driven by semantic similarity)
Old similarity loss: computed under no_grad (not differentiable)
New similarity loss: differentiable (directly guides learning)
New diversity term: enforces variation within the candidate set
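Assumed form of the differentiable similarity loss (the authoritative definition
is in train_pretrain_accelerate.py): similarity_loss = 1 - cosine(query hidden state,
selected memory embedding), evaluated inside the autograd graph rather than under no_grad.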
========================================
Data paths:
Training data: $DATA_PATH
Validation data: $VAL_DATA_PATH
Database init: $DATABASE_INIT_PATH
Cluster cache: $CLUSTER_CACHE_PATH
========================================
Expected improvements:
1. Similarity loss: from oscillation to a steady decrease
2. Memory selection: more precise semantic matching
3. Generated text: better coherence and relevance
4. Loss balance: CE dominates, the other losses assist
========================================
EOF
}
# ----------------------------------------------------------------------------
# 🤖 Main execution function
# ----------------------------------------------------------------------------
run_experiment() {
    echo "🚀 Starting experiment $EXPERIMENT_VERSION"
    echo "📄 Description: $EXPERIMENT_DESCRIPTION"
    echo "⏰ Start time: $EXPERIMENT_DATE"
    # Build the training command
    local train_cmd="CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES accelerate launch --config_file accelerate_config.yaml train_pretrain_accelerate.py"
    # Add training arguments
train_cmd+=" --out_dir \"$LOG_DIR\""
train_cmd+=" --epochs $EPOCHS"
train_cmd+=" --embedding_epoch $EMBEDDING_EPOCH"
train_cmd+=" --batch_size $BATCH_SIZE"
train_cmd+=" --learning_rate $LEARNING_RATE"
train_cmd+=" --dtype $DTYPE"
train_cmd+=" --num_workers $NUM_WORKERS"
train_cmd+=" --accumulation_steps $ACCUMULATION_STEPS"
train_cmd+=" --grad_clip $GRAD_CLIP"
train_cmd+=" --warmup_iters $WARMUP_ITERS"
train_cmd+=" --log_interval $LOG_INTERVAL"
train_cmd+=" --val_interval $VAL_INTERVAL"
train_cmd+=" --save_interval $SAVE_INTERVAL"
train_cmd+=" --dim $DIM"
train_cmd+=" --n_layers $N_LAYERS"
train_cmd+=" --n_heads $N_HEADS"
train_cmd+=" --max_seq_len $MAX_SEQ_LEN"
train_cmd+=" --data_path \"$DATA_PATH\""
train_cmd+=" --val_data_path \"$VAL_DATA_PATH\""
train_cmd+=" --knowledge_num $KNOWLEDGE_NUM"
train_cmd+=" --knowledge_length $KNOWLEDGE_LENGTH"
train_cmd+=" --database_init_path \"$DATABASE_INIT_PATH\""
train_cmd+=" --memory_monitor_interval $MEMORY_MONITOR_INTERVAL"
train_cmd+=" --model_type \"$MODEL_TYPE\""
train_cmd+=" --model_size $MODEL_SIZE"
train_cmd+=" --freeze_ratio $FREEZE_RATIO"
    # 🔥 Four-loss system parameters
    train_cmd+=" --balance_loss_coef $BALANCE_LOSS_COEF"
    train_cmd+=" --similarity_loss_coef $SIMILARITY_LOSS_COEF"
    train_cmd+=" --diversity_loss_coef $DIVERSITY_LOSS_COEF"
    # Optional parameters
    if [[ "$USE_PROFILE" == "true" ]]; then
        train_cmd+=" --profile"
        train_cmd+=" --profile_interval $PROFILE_INTERVAL"
    fi
    if [[ "$USE_FLASH_ATTN" == "true" ]]; then
        train_cmd+=" --use_flash_attn"
    fi
    if [[ "$FAST_CLUSTERING" == "true" ]]; then
        train_cmd+=" --fast_clustering"
    fi
    if [[ "$CLUSTER_CACHE_PATH" != "None" ]]; then
        train_cmd+=" --cluster_cache_path \"$CLUSTER_CACHE_PATH\""
    fi
    # SwanLab configuration
    train_cmd+=" --use_swanlab"
    train_cmd+=" --swanlab_project \"$SWANLAB_PROJECT\""
    # train_cmd+=" --swanlab_online True"
echo "📋 执行命令:"
echo "$train_cmd"
echo
# 记录命令到日志文件
echo "执行命令: $train_cmd" >> "$LOG_FILE"
echo "开始时间: $(date)" >> "$LOG_FILE"
# 使用nohup执行训练后台运行输出写入日志文件
echo "🔄 使用nohup后台运行训练输出将写入日志文件: $LOG_FILE"
# 创建训练脚本
train_script="/tmp/train_${EXPERIMENT_VERSION}.sh"
cat > "$train_script" << EOF
#!/bin/bash
# cd /home/pci/nas/AI_Large_Model_Team/ycz/Minimind
# source .venv/bin/activate
$train_cmd
echo "结束时间: \$(date)"
echo "退出代码: \$?"
EOF
    chmod +x "$train_script"
    # Launch in the background with nohup
    nohup bash "$train_script" >> "$LOG_FILE" 2>&1 &
    local train_pid=$!
    echo "🔥 Training process started, PID: $train_pid"
    echo "Training PID: $train_pid" >> "$LOG_FILE"
    echo "Training script: $train_script" >> "$LOG_FILE"
    # Wait a few seconds to make sure the process has started
    sleep 5
    # Check whether the process is still running
    if kill -0 $train_pid 2>/dev/null; then
        echo "✅ Training process is running in the background"
        echo "📋 Follow the log: tail -f $LOG_FILE"
        echo "📋 Check process status: ps -p $train_pid"
        echo "🛑 Stop training: kill $train_pid"
        echo "📈 SwanLab: https://swanlab.cn/project/$SWANLAB_PROJECT"
        echo ""
        echo "🧠 Four-loss system under test..."
        echo "   🔥 Loss structure: CE + Balance + Similarity + Diversity"
        echo "   🔥 Candidate mechanism: 32 candidates → Gumbel-Softmax selects the best 1"
        echo "   🔥 Similarity loss: differentiable optimization (fixes the oscillation issue)"
        echo "   🔥 Diversity constraint: regularizes variation within the candidate set"
        echo "   🔥 Selection strategy: semantic-similarity driven vs. uniform averaging"
        echo ""
        echo "📊 Comparison with Experiment 1.4.9:"
        echo "   - Selection mechanism: average fusion → best-candidate selection"
        echo "   - Similarity loss: non-differentiable → differentiable"
        echo "   - Candidate diversity: unconstrained → diversity regularization"
        echo "   - Loss system: three losses → four balanced losses"
        echo ""
        echo "Training is running in the background; it is safe to close this terminal."
        echo ""
        echo "🎯 Expected improvements:"
        echo "   - Similarity loss: stable convergence (no more oscillation)"
        echo "   - CE loss: < 0.8 (better language modeling)"
        echo "   - Generation quality: more coherent text output"
        echo "   - Memory selection: more precise semantic matching"
        echo ""
        echo "⏱️ Estimated training time: 18-22 hours (extra compute overhead)"
        echo "📊 Estimated GPU memory: ~24 GB (Gumbel-Softmax + diversity computation)"
        echo ""
        echo "🔍 Key metrics to monitor:"
        echo "   - Similarity loss: expected to go from oscillating around 1.9 to a steady decrease"
        echo "   - Diversity loss: should stay moderate to avoid over-penalizing"
        echo "   - Selection entropy: tracks selection diversity"
        echo "   - Selected similarity: tracks the similarity of the chosen memory"
        echo ""
    else
        echo "❌ Failed to start the training process"
        echo "📋 Check the log: $LOG_FILE"
        exit 1
    fi
}
# ----------------------------------------------------------------------------
# 🤖 Cleanup function
# ----------------------------------------------------------------------------
cleanup() {
echo "🧹 清理临时文件..."
# 删除临时验证文件
rm -f /tmp/temp_val.jsonl
}
# ----------------------------------------------------------------------------
# 🤖 Signal handling
# ----------------------------------------------------------------------------
trap cleanup EXIT
trap 'echo "❌ Experiment interrupted"; cleanup; exit 130' INT TERM
# ----------------------------------------------------------------------------
# 🤖 主程序入口
# ----------------------------------------------------------------------------
main() {
    echo "============================================================================"
    echo "🧠 MiniMind Pretraining Experiment 1.4.10"
    echo "🎯 Four-loss system - Gumbel-Softmax + differentiable similarity loss + diversity constraint"
    echo "============================================================================"
    echo ""
    echo "🔥 Core innovations:"
    echo "   ► Four-loss architecture: CE + Balance + Similarity + Diversity"
    echo "   ► Gumbel-Softmax: 32 candidates → 1 best (differentiable discrete selection)"
    echo "   ► Similarity loss: differentiable optimization (fixes oscillation)"
    echo "   ► Diversity constraint: regularizes variation within the candidate set"
    echo "   ► Semantic selection: similarity-driven vs. average fusion"
    echo ""
    echo "🎯 Experiment hypotheses:"
    echo "   ✓ A differentiable similarity loss resolves the oscillation problem"
    echo "   ✓ Semantics-driven selection improves the quality of memory use"
    echo "   ✓ The diversity constraint prevents candidate-set collapse"
    echo "   ✓ Balancing the four losses improves overall model performance"
    echo ""
    echo "🔧 Key technical details:"
    echo "   ► The Straight-Through Estimator keeps gradients flowing"
    echo "   ► Candidate-set diversity is computed from a cosine-similarity matrix (see the sketch below)"
    echo "   ► Gumbel noise adds stochasticity to the selection"
    echo "   ► Hard selection keeps the choice discrete; soft gradients keep it differentiable"
    echo ""
    echo "============================================================================"
    # Run checks and initialization
    check_environment
    log_experiment_info
    # Run the experiment
    run_experiment
    echo "============================================================================"
    echo "✅ Experiment $EXPERIMENT_VERSION launched"
    echo "📅 Launch time: $(date)"
    echo "============================================================================"
}
# Run the main program
main "$@"