update
This commit is contained in:
parent
7b42ea2f93
commit
b5dcf0c576
@ -32,14 +32,6 @@
|
|||||||
"bf16": {
|
"bf16": {
|
||||||
"enabled": true
|
"enabled": true
|
||||||
},
|
},
|
||||||
"activation_checkpointing": {
|
|
||||||
"partition_activations": true,
|
|
||||||
"cpu_checkpointing": true,
|
|
||||||
"contiguous_memory_optimization": false,
|
|
||||||
"number_checkpoints": null,
|
|
||||||
"synchronize_checkpoint_boundary": false,
|
|
||||||
"profile": false
|
|
||||||
},
|
|
||||||
"aio": {
|
"aio": {
|
||||||
"block_size": 1048576,
|
"block_size": 1048576,
|
||||||
"queue_depth": 8,
|
"queue_depth": 8,
|
||||||
|
|||||||
@ -5,20 +5,23 @@
|
|||||||
# ============================================================================
|
# ============================================================================
|
||||||
#
|
#
|
||||||
# 🎯 实验目标:
|
# 🎯 实验目标:
|
||||||
# 基于实验1.4.10,通过三大优化策略解决80GB显存不足问题:
|
# 基于实验1.4.10,通过二大安全优化策略解决80GB显存不足问题:
|
||||||
# 1. 候选项数量优化:32→16 (减少50%候选相关显存)
|
# 1. 候选项数量优化:32→16 (减少50%候选相关显存)
|
||||||
# 2. 梯度检查点:减少60-80%激活显存占用
|
# 2. 强化DeepSpeed:参数+优化器CPU offload + 异步I/O优化
|
||||||
# 3. 强化DeepSpeed:参数offload + 激活检查点 + 异步I/O优化
|
#
|
||||||
|
# 📝 优化策略说明:
|
||||||
|
# - 不使用梯度检查点:避免对四损失系统和Gumbel-Softmax的数值稳定性影响
|
||||||
|
# - 专注安全优化:确保训练质量的同时减少显存占用
|
||||||
#
|
#
|
||||||
# 使用方法:
|
# 使用方法:
|
||||||
# bash run_file/experiment_1_4_10_optimized.sh
|
# bash run_file/experiment_1_4_10.sh
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
|
|
||||||
# ----------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------
|
||||||
# 🧑‍🔬 实验基本信息
|
# 🧑‍🔬 实验基本信息
|
||||||
# ----------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------
|
||||||
EXPERIMENT_VERSION="1.4.10_optimized"
|
EXPERIMENT_VERSION="1.4.10_optimized"
|
||||||
EXPERIMENT_DESCRIPTION="四损失系统优化版 - 三大显存优化策略实现"
|
EXPERIMENT_DESCRIPTION="四损失系统优化版 - 二大安全显存优化策略实现"
|
||||||
RESEARCHER_NAME="AI Assistant"
|
RESEARCHER_NAME="AI Assistant"
|
||||||
EXPERIMENT_DATE="$(date '+%Y-%m-%d %H:%M:%S')"
|
EXPERIMENT_DATE="$(date '+%Y-%m-%d %H:%M:%S')"
|
||||||
|
|
||||||
@ -152,10 +155,9 @@ MiniMind 实验信息 - 显存优化版
|
|||||||
研究者: $RESEARCHER_NAME
|
研究者: $RESEARCHER_NAME
|
||||||
开始时间: $EXPERIMENT_DATE
|
开始时间: $EXPERIMENT_DATE
|
||||||
========================================
|
========================================
|
||||||
🔥 三大显存优化策略:
|
🔥 二大安全显存优化策略:
|
||||||
1. 候选项数量优化: 32→16 (减少50%候选相关显存)
|
1. 候选项数量优化: 32→16 (减少50%候选相关显存)
|
||||||
2. 梯度检查点启用: 减少60-80%激活显存占用
|
2. 强化DeepSpeed配置: 参数+优化器CPU offload + 异步I/O
|
||||||
3. 强化DeepSpeed配置: 参数offload + 激活检查点
|
|
||||||
========================================
|
========================================
|
||||||
硬件配置:
|
硬件配置:
|
||||||
GPU设备: $CUDA_VISIBLE_DEVICES
|
GPU设备: $CUDA_VISIBLE_DEVICES
|
||||||
@ -189,8 +191,8 @@ GPU设备: $CUDA_VISIBLE_DEVICES
|
|||||||
========================================
|
========================================
|
||||||
🔥 显存优化对比:
|
🔥 显存优化对比:
|
||||||
原始候选项: 32个 → 优化版: 16个 (减少50%)
|
原始候选项: 32个 → 优化版: 16个 (减少50%)
|
||||||
原始激活显存: 100% → 梯度检查点: 20-40% (减少60-80%)
|
原始参数+优化器: GPU → DeepSpeed offload: CPU (大幅减少GPU占用)
|
||||||
原始参数显存: GPU → DeepSpeed offload: CPU (减少参数GPU占用)
|
数值稳定性: 保持原版稳定性,不使用梯度检查点
|
||||||
========================================
|
========================================
|
||||||
数据路径:
|
数据路径:
|
||||||
训练数据: $DATA_PATH
|
训练数据: $DATA_PATH
|
||||||
@ -199,9 +201,10 @@ GPU设备: $CUDA_VISIBLE_DEVICES
|
|||||||
聚类缓存: $CLUSTER_CACHE_PATH
|
聚类缓存: $CLUSTER_CACHE_PATH
|
||||||
========================================
|
========================================
|
||||||
预期显存使用:
|
预期显存使用:
|
||||||
预计GPU显存: 30-45GB (原版需80GB+)
|
预计GPU显存: 45-55GB (原版需80GB+)
|
||||||
优化效果: 62-75%显存节省
|
优化效果: 31-44%显存节省 (保守估计)
|
||||||
A800 80GB兼容性: ✅ 应该能正常运行
|
A800 80GB兼容性: ✅ 应该能正常运行
|
||||||
|
数值稳定性: ✅ 完全保持,无梯度检查点风险
|
||||||
========================================
|
========================================
|
||||||
EOF
|
EOF
|
||||||
}
|
}
|
||||||
@ -216,9 +219,9 @@ run_experiment() {
|
|||||||
echo ""
|
echo ""
|
||||||
echo "🔥 显存优化摘要:"
|
echo "🔥 显存优化摘要:"
|
||||||
echo " ► 候选项数量: 32→16 (50%减少)"
|
echo " ► 候选项数量: 32→16 (50%减少)"
|
||||||
echo " ► 梯度检查点: 激活显存减少60-80%"
|
|
||||||
echo " ► DeepSpeed优化: 参数+优化器CPU offload"
|
echo " ► DeepSpeed优化: 参数+优化器CPU offload"
|
||||||
echo " ► 批次大小调整: 48→24 (保持有效批次大小)"
|
echo " ► 批次大小调整: 48→24 (保持有效批次大小)"
|
||||||
|
echo " ► 数值稳定性: 保持完整,避免梯度检查点风险"
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
# 构建训练命令
|
# 构建训练命令
|
||||||
@ -323,17 +326,17 @@ EOF
|
|||||||
echo "📈 SwanLab: https://swanlab.cn/project/$SWANLAB_PROJECT"
|
echo "📈 SwanLab: https://swanlab.cn/project/$SWANLAB_PROJECT"
|
||||||
echo ""
|
echo ""
|
||||||
echo "🧠 显存优化版四损失系统正在测试中..."
|
echo "🧠 显存优化版四损失系统正在测试中..."
|
||||||
echo " 🔥 三大优化策略已启用"
|
echo " 🔥 二大安全优化策略已启用"
|
||||||
echo " 🔥 损失结构: CE + Balance + Similarity + Diversity"
|
echo " 🔥 损失结构: CE + Balance + Similarity + Diversity"
|
||||||
echo " 🔥 候选机制: 16个候选 → Gumbel-Softmax选择1个最佳"
|
echo " 🔥 候选机制: 16个候选 → Gumbel-Softmax选择1个最佳"
|
||||||
echo " 🔥 梯度检查点: 自动激活显存减少60-80%"
|
echo " 🔥 数值稳定性: 完全保持,无梯度检查点干扰"
|
||||||
echo " 🔥 DeepSpeed优化: 参数+优化器CPU offload"
|
echo " 🔥 DeepSpeed优化: 参数+优化器CPU offload"
|
||||||
echo ""
|
echo ""
|
||||||
echo "📊 与原版1.4.10对比:"
|
echo "📊 与原版1.4.10对比:"
|
||||||
echo " - 候选项数量: 32→16 (50%减少)"
|
echo " - 候选项数量: 32→16 (50%减少)"
|
||||||
echo " - 显存占用: ~80GB → ~35GB (62%节省)"
|
echo " - 显存占用: ~80GB → ~50GB (37%节省,保守估计)"
|
||||||
echo " - 批次大小: 48→24 (保持有效批次)"
|
echo " - 批次大小: 48→24 (保持有效批次)"
|
||||||
echo " - 激活显存: 梯度检查点大幅减少"
|
echo " - 数值稳定性: 完全保持,无风险优化"
|
||||||
echo ""
|
echo ""
|
||||||
echo "训练正在后台运行,可以安全关闭终端。"
|
echo "训练正在后台运行,可以安全关闭终端。"
|
||||||
echo ""
|
echo ""
|
||||||
@ -343,8 +346,8 @@ EOF
|
|||||||
echo " - 四损失收敛: 与原版期望一致"
|
echo " - 四损失收敛: 与原版期望一致"
|
||||||
echo " - 生成质量: 保持原版目标质量"
|
echo " - 生成质量: 保持原版目标质量"
|
||||||
echo ""
|
echo ""
|
||||||
echo "⏱️ 预计训练时间: 20-24小时 (优化导致轻微增加)"
|
echo "⏱️ 预计训练时间: 18-20小时 (无梯度检查点重复计算)"
|
||||||
echo "📊 预计GPU占用: 30-45GB (A800兼容)"
|
echo "📊 预计GPU占用: 45-55GB (A800兼容)"
|
||||||
echo ""
|
echo ""
|
||||||
echo "🔍 关键监控指标:"
|
echo "🔍 关键监控指标:"
|
||||||
echo " - GPU显存占用: 应保持在70GB以下"
|
echo " - GPU显存占用: 应保持在70GB以下"
|
||||||
@ -380,26 +383,26 @@ trap 'echo "❌ 实验被中断"; cleanup; exit 130' INT TERM
|
|||||||
main() {
|
main() {
|
||||||
echo "============================================================================"
|
echo "============================================================================"
|
||||||
echo "🧠 MiniMind 预训练实验 1.4.10 优化版"
|
echo "🧠 MiniMind 预训练实验 1.4.10 优化版"
|
||||||
echo "🎯 四损失系统 + 三大显存优化策略"
|
echo "🎯 四损失系统 + 二大安全显存优化策略"
|
||||||
echo "============================================================================"
|
echo "============================================================================"
|
||||||
echo ""
|
echo ""
|
||||||
echo "🔥 核心优化策略:"
|
echo "🔥 核心优化策略:"
|
||||||
echo " ► 候选项数量优化: 32→16个 (50%显存减少)"
|
echo " ► 候选项数量优化: 32→16个 (50%显存减少)"
|
||||||
echo " ► 梯度检查点: 激活显存减少60-80%"
|
|
||||||
echo " ► 强化DeepSpeed: 参数+优化器CPU offload + 异步I/O"
|
echo " ► 强化DeepSpeed: 参数+优化器CPU offload + 异步I/O"
|
||||||
echo " ► 批次优化: 24 batch × 16 accum × 4 GPU = 1536 有效批次"
|
echo " ► 批次优化: 24 batch × 16 accum × 4 GPU = 1536 有效批次"
|
||||||
|
echo " ► 数值稳定性: 避免梯度检查点风险,保持训练质量"
|
||||||
echo ""
|
echo ""
|
||||||
echo "🎯 显存优化目标:"
|
echo "🎯 显存优化目标:"
|
||||||
echo " ✓ 原版1.4.10: 需要80GB+ → 优化版: 30-45GB"
|
echo " ✓ 原版1.4.10: 需要80GB+ → 优化版: 45-55GB (保守估计)"
|
||||||
echo " ✓ A800 80GB兼容: 从无法运行 → 完全兼容"
|
echo " ✓ A800 80GB兼容: 从无法运行 → 完全兼容"
|
||||||
echo " ✓ 训练质量保持: 四损失系统功能完整"
|
echo " ✓ 训练质量保持: 四损失系统功能完整,数值稳定"
|
||||||
echo " ✓ 收敛行为一致: 与原版1.4.10期望一致"
|
echo " ✓ 收敛行为一致: 与原版1.4.10期望完全一致"
|
||||||
echo ""
|
echo ""
|
||||||
echo "🔧 技术实现细节:"
|
echo "🔧 技术实现细节:"
|
||||||
echo " ► LMConfig.py: num_candidates 32→16"
|
echo " ► LMConfig.py: num_candidates 32→16 (核心显存优化)"
|
||||||
echo " ► train_pretrain_accelerate.py: 梯度检查点启用"
|
echo " ► train_pretrain_accelerate.py: 移除梯度检查点,保持数值稳定"
|
||||||
echo " ► ds_config.json: 参数offload + 激活检查点 + 异步I/O"
|
echo " ► ds_config.json: 参数+优化器offload + 异步I/O优化"
|
||||||
echo " ► 批次调整: 保持1536有效批次大小"
|
echo " ► 批次调整: 48→24,accumulation 8→16,保持1536有效批次"
|
||||||
echo ""
|
echo ""
|
||||||
echo "============================================================================"
|
echo "============================================================================"
|
||||||
|
|
||||||
@ -413,7 +416,7 @@ main() {
|
|||||||
echo "============================================================================"
|
echo "============================================================================"
|
||||||
echo "✅ 实验 $EXPERIMENT_VERSION 启动完成"
|
echo "✅ 实验 $EXPERIMENT_VERSION 启动完成"
|
||||||
echo "📅 启动时间: $(date)"
|
echo "📅 启动时间: $(date)"
|
||||||
echo "🎯 优化目标: 从80GB+显存需求降至30-45GB,A800兼容"
|
echo "🎯 优化目标: 从80GB+显存需求降至45-55GB,A800兼容,数值稳定"
|
||||||
echo "============================================================================"
|
echo "============================================================================"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -708,23 +708,9 @@ def init_model(lm_config, pretrained_embedding_path=None, database_init_path=Non
|
|||||||
if hasattr(module, 'weight'):
|
if hasattr(module, 'weight'):
|
||||||
nn.init.ones_(module.weight)
|
nn.init.ones_(module.weight)
|
||||||
|
|
||||||
# 🔥 实验1.4.10优化: 启用梯度检查点以减少显存占用
|
# 🔥 实验1.4.10优化: 专注于安全的显存优化策略
|
||||||
if hasattr(model, 'gradient_checkpointing_enable'):
|
# 不使用梯度检查点,避免对四损失系统和Gumbel-Softmax的数值稳定性影响
|
||||||
model.gradient_checkpointing_enable()
|
Logger("✅ 显存优化策略:候选项减少(32→16) + DeepSpeed参数offload")
|
||||||
Logger("✅ 梯度检查点已启用 - 预计减少激活显存占用60-80%")
|
|
||||||
else:
|
|
||||||
# 手动为每个Transformer层启用梯度检查点
|
|
||||||
from torch.utils.checkpoint import checkpoint
|
|
||||||
if hasattr(model, 'layers'):
|
|
||||||
def make_checkpoint_forward(original_forward):
|
|
||||||
def checkpoint_forward(*args, **kwargs):
|
|
||||||
return checkpoint(original_forward, *args, **kwargs, use_reentrant=False)
|
|
||||||
return checkpoint_forward
|
|
||||||
|
|
||||||
for layer_idx, layer in enumerate(model.layers):
|
|
||||||
# 包装layer的forward方法以使用checkpoint
|
|
||||||
layer.forward = make_checkpoint_forward(layer.forward)
|
|
||||||
Logger("✅ 手动梯度检查点已启用 - 预计减少激活显存占用60-80%")
|
|
||||||
|
|
||||||
# 记忆库初始化
|
# 记忆库初始化
|
||||||
if database_init_path and os.path.exists(database_init_path):
|
if database_init_path and os.path.exists(database_init_path):
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user