From 2f6995d667163b5a1010a5a2d2df3e45419ae6d8 Mon Sep 17 00:00:00 2001 From: iomgaa Date: Sat, 6 Sep 2025 15:12:05 +0800 Subject: [PATCH] update --- model/model_memory.py | 6 ++++-- run_file/experiment_1_4_10.sh | 14 +++++++------- run_file/experiment_template.sh | 14 +++++++++----- train_pretrain_accelerate.py | 8 +++++++- 4 files changed, 27 insertions(+), 15 deletions(-) diff --git a/model/model_memory.py b/model/model_memory.py index 82e0a4a..cbf2d51 100644 --- a/model/model_memory.py +++ b/model/model_memory.py @@ -622,10 +622,12 @@ class MiniMindLM(PreTrainedModel): # 固定冻结前面的条目 freeze_mask[:freeze_num] = True self.register_buffer('freeze_mask', freeze_mask, persistent=False) - print(f"🔥 Memory bank freezing enabled: {freeze_num}/{params.knowledge_num} entries ({params.freeze_ratio*100:.1f}%) frozen") + print(f"🔥 Memory bank freezing enabled: {freeze_num}/{params.knowledge_num} entries ({params.freeze_ratio*100:.1f}%) frozen", flush=True) + import sys; sys.stdout.flush() else: self.register_buffer('freeze_mask', torch.zeros(params.knowledge_num, dtype=torch.bool), persistent=False) - print(f"🔥 Memory bank freezing disabled: all entries can be updated") + print(f"🔥 Memory bank freezing disabled: all entries can be updated", flush=True) + import sys; sys.stdout.flush() self.OUT = CausalLMOutputWithPast() diff --git a/run_file/experiment_1_4_10.sh b/run_file/experiment_1_4_10.sh index 226abcf..d0aa2ed 100644 --- a/run_file/experiment_1_4_10.sh +++ b/run_file/experiment_1_4_10.sh @@ -40,8 +40,8 @@ LOG_FILE="$LOG_DIR/experiment.log" # ---------------------------------------------------------------------------- # 🤖 硬件配置 # ---------------------------------------------------------------------------- -CUDA_VISIBLE_DEVICES="0" -NUM_PROCESSES="1" +CUDA_VISIBLE_DEVICES="0,1,2,3" +NUM_PROCESSES="4" MIXED_PRECISION="bf16" MAIN_PROCESS_PORT="29500" @@ -66,9 +66,9 @@ DISABLE_DB="false" # 🤖 训练超参数 # ---------------------------------------------------------------------------- EPOCHS="3" -EMBEDDING_EPOCH="2" -BATCH_SIZE="32" # 🔥 降低批次大小以适应更复杂的计算 -ACCUMULATION_STEPS="12" # 🔥 增加累积步数保持有效批次大小 +EMBEDDING_EPOCH="42" +BATCH_SIZE="4" # 🔥 降低批次大小以适应更复杂的计算 +ACCUMULATION_STEPS="4" # 🔥 增加累积步数保持有效批次大小 LEARNING_RATE="2e-4" # 🔥 适度降低学习率提升稳定性 DTYPE="bfloat16" GRAD_CLIP="1.0" @@ -86,7 +86,7 @@ CLUSTER_CACHE_PATH="None" # 禁用聚类缓存 VAL_DATA_PATH="dataset/stable/eval_data.json" # 训练配置 -NUM_WORKERS="1" +NUM_WORKERS="8" LOG_INTERVAL="100" # 🔥 更频繁的日志记录观察四个损失 VAL_INTERVAL="100" SAVE_INTERVAL="10000" @@ -215,7 +215,7 @@ run_experiment() { echo "⏰ 开始时间: $EXPERIMENT_DATE" # 构建训练命令 - local train_cmd="CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES python train_pretrain_accelerate.py" + local train_cmd="CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES accelerate launch --config_file accelerate_config.yaml train_pretrain_accelerate.py" # 添加训练参数 train_cmd+=" --out_dir \"$LOG_DIR\"" diff --git a/run_file/experiment_template.sh b/run_file/experiment_template.sh index 2520af1..a2733f4 100644 --- a/run_file/experiment_template.sh +++ b/run_file/experiment_template.sh @@ -36,10 +36,14 @@ EXPERIMENT_DATE="$(date '+%Y-%m-%d %H:%M:%S')" # 自动记录实验开始时间 # export VIRTUAL_ENV="[VENV_PATH]" # source "$VIRTUAL_ENV/bin/activate" -# 调试和监控环境变量 -export NCCL_DEBUG=INFO # NCCL 调试信息 +# 调试和监控环境变量 export PYTHONFAULTHANDLER=1 # Python 故障处理 -export CUDA_LAUNCH_BLOCKING=1 # CUDA 同步执行(调试用) +# export NCCL_DEBUG=INFO # NCCL 调试信息(仅调试时启用) +# export CUDA_LAUNCH_BLOCKING=1 # CUDA 同步执行(严重影响性能,仅调试时启用) + +# 🔥 强制禁用输出缓冲,确保日志立即写入(不影响GPU性能) +export PYTHONUNBUFFERED=1 # Python 解释器不缓冲输出 +export PYTHONIOENCODING=utf-8 # 确保编码一致性 # SwanLab 配置 export SWANLAB_API_KEY="[SWANLAB_API_KEY]" # 🤖 [AI构建] SwanLab API密钥 @@ -292,8 +296,8 @@ echo "退出代码: \$?" EOF chmod +x "$train_script" - # 使用nohup后台运行 - nohup bash "$train_script" >> "$LOG_FILE" 2>&1 & + # 使用nohup后台运行,并使用stdbuf禁用缓冲 + nohup stdbuf -oL -eL bash "$train_script" >> "$LOG_FILE" 2>&1 & local train_pid=$! echo "🔥 训练进程已启动,PID: $train_pid" diff --git a/train_pretrain_accelerate.py b/train_pretrain_accelerate.py index 6fe94d8..22600cd 100644 --- a/train_pretrain_accelerate.py +++ b/train_pretrain_accelerate.py @@ -1,6 +1,10 @@ import os # 设置环境变量 - 将wandb替换为SwanLab # os.environ["SWANLAB_MODE"] = "online" # SwanLab使用在线模式 + +# 🔥 强制禁用输出缓冲,确保日志立即写入 +os.environ['PYTHONUNBUFFERED'] = '1' # Python 解释器不缓冲输出 +os.environ['PYTHONIOENCODING'] = 'utf-8' # 确保编码一致性 import platform import argparse from tqdm import tqdm @@ -92,7 +96,9 @@ def log_memory_status(step, prefetch_batches, accelerator, stage="", detailed=Fa def Logger(msg, accelerator=None): # 如果没有提供accelerator,则只在主进程打印 if accelerator is None or accelerator.is_main_process: - print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] {msg}") + print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] {msg}", flush=True) # 强制刷新输出缓冲 + import sys + sys.stdout.flush() # 确保立即写入 # Helper function to format seconds into HH:MM:SS def format_time(seconds):