This commit is contained in:
iomgaa 2025-09-06 15:12:05 +08:00
parent cb3152dc94
commit 2f6995d667
4 changed files with 27 additions and 15 deletions

View File

@@ -622,10 +622,12 @@ class MiniMindLM(PreTrainedModel):
# 固定冻结前面的条目
freeze_mask[:freeze_num] = True
self.register_buffer('freeze_mask', freeze_mask, persistent=False)
print(f"🔥 Memory bank freezing enabled: {freeze_num}/{params.knowledge_num} entries ({params.freeze_ratio*100:.1f}%) frozen")
print(f"🔥 Memory bank freezing enabled: {freeze_num}/{params.knowledge_num} entries ({params.freeze_ratio*100:.1f}%) frozen", flush=True)
import sys; sys.stdout.flush()
else:
self.register_buffer('freeze_mask', torch.zeros(params.knowledge_num, dtype=torch.bool), persistent=False)
print(f"🔥 Memory bank freezing disabled: all entries can be updated")
print(f"🔥 Memory bank freezing disabled: all entries can be updated", flush=True)
import sys; sys.stdout.flush()
self.OUT = CausalLMOutputWithPast()

View File

@@ -40,8 +40,8 @@ LOG_FILE="$LOG_DIR/experiment.log"
# ----------------------------------------------------------------------------
# 🤖 硬件配置
# ----------------------------------------------------------------------------
CUDA_VISIBLE_DEVICES="0"
NUM_PROCESSES="1"
CUDA_VISIBLE_DEVICES="0,1,2,3"
NUM_PROCESSES="4"
MIXED_PRECISION="bf16"
MAIN_PROCESS_PORT="29500"
@@ -66,9 +66,9 @@ DISABLE_DB="false"
# 🤖 训练超参数
# ----------------------------------------------------------------------------
EPOCHS="3"
EMBEDDING_EPOCH="2"
BATCH_SIZE="32" # 🔥 降低批次大小以适应更复杂的计算
ACCUMULATION_STEPS="12" # 🔥 增加累积步数保持有效批次大小
EMBEDDING_EPOCH="42"
BATCH_SIZE="4" # 🔥 降低批次大小以适应更复杂的计算
ACCUMULATION_STEPS="4" # 🔥 增加累积步数保持有效批次大小
LEARNING_RATE="2e-4" # 🔥 适度降低学习率提升稳定性
DTYPE="bfloat16"
GRAD_CLIP="1.0"
@@ -86,7 +86,7 @@ CLUSTER_CACHE_PATH="None" # 禁用聚类缓存
VAL_DATA_PATH="dataset/stable/eval_data.json"
# 训练配置
NUM_WORKERS="1"
NUM_WORKERS="8"
LOG_INTERVAL="100" # 🔥 更频繁的日志记录观察四个损失
VAL_INTERVAL="100"
SAVE_INTERVAL="10000"
@@ -215,7 +215,7 @@ run_experiment() {
echo "⏰ 开始时间: $EXPERIMENT_DATE"
# 构建训练命令
local train_cmd="CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES python train_pretrain_accelerate.py"
local train_cmd="CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES accelerate launch --config_file accelerate_config.yaml train_pretrain_accelerate.py"
# 添加训练参数
train_cmd+=" --out_dir \"$LOG_DIR\""

View File

@@ -36,10 +36,14 @@ EXPERIMENT_DATE="$(date '+%Y-%m-%d %H:%M:%S')" # 自动记录实验开始时间
# export VIRTUAL_ENV="[VENV_PATH]"
# source "$VIRTUAL_ENV/bin/activate"
# 调试和监控环境变量
export NCCL_DEBUG=INFO # NCCL 调试信息
# 调试和监控环境变量
export PYTHONFAULTHANDLER=1 # Python 故障处理
export CUDA_LAUNCH_BLOCKING=1 # CUDA 同步执行(调试用)
# export NCCL_DEBUG=INFO # NCCL 调试信息(仅调试时启用)
# export CUDA_LAUNCH_BLOCKING=1 # CUDA 同步执行(严重影响性能,仅调试时启用)
# 🔥 强制禁用输出缓冲确保日志立即写入不影响GPU性能
export PYTHONUNBUFFERED=1 # Python 解释器不缓冲输出
export PYTHONIOENCODING=utf-8 # 确保编码一致性
# SwanLab 配置
export SWANLAB_API_KEY="[SWANLAB_API_KEY]" # 🤖 [AI构建] SwanLab API密钥
@@ -292,8 +296,8 @@ echo "退出代码: \$?"
EOF
chmod +x "$train_script"
# 使用nohup后台运行
nohup bash "$train_script" >> "$LOG_FILE" 2>&1 &
# 使用nohup后台运行并使用stdbuf禁用缓冲
nohup stdbuf -oL -eL bash "$train_script" >> "$LOG_FILE" 2>&1 &
local train_pid=$!
echo "🔥 训练进程已启动PID: $train_pid"

View File

@@ -1,6 +1,10 @@
import os
# 设置环境变量 - 将wandb替换为SwanLab
# os.environ["SWANLAB_MODE"] = "online" # SwanLab使用在线模式
# 🔥 强制禁用输出缓冲,确保日志立即写入
os.environ['PYTHONUNBUFFERED'] = '1' # Python 解释器不缓冲输出
os.environ['PYTHONIOENCODING'] = 'utf-8' # 确保编码一致性
import platform
import argparse
from tqdm import tqdm
@@ -92,7 +96,9 @@ def log_memory_status(step, prefetch_batches, accelerator, stage="", detailed=Fa
def Logger(msg, accelerator=None):
# 如果没有提供accelerator则只在主进程打印
if accelerator is None or accelerator.is_main_process:
print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] {msg}")
print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] {msg}", flush=True) # 强制刷新输出缓冲
import sys
sys.stdout.flush() # 确保立即写入
# Helper function to format seconds into HH:MM:SS
def format_time(seconds):