diff --git a/model/model_memory.py b/model/model_memory.py
index 975e1f8..a4c474f 100644
--- a/model/model_memory.py
+++ b/model/model_memory.py
@@ -337,7 +337,7 @@ class MiniMindBlock(nn.Module):
         self.memory_gate = MemoryGate(config)
         self.gated_memory_fusion = GatedMemoryFusion(config)
 
-        self.attentionpool = nn.Linear(config.dim, 1)
+        # self.attentionpool = nn.Linear(config.dim, 1)
 
     def forward(self, x, pos_cis, memory_bank, tok_embeddings, collect_ema_stats=False):
         """
diff --git a/run_file/experiment_1_4_7-06.sh b/run_file/experiment_1_4_7-06.sh
new file mode 100644
index 0000000..962212e
--- /dev/null
+++ b/run_file/experiment_1_4_7-06.sh
@@ -0,0 +1,249 @@
+#!/bin/bash
+
+#########################################################
+# Experiment 1.4.7 - memory bank text initialization + partial freezing
+#
+# Goals:
+# 1. Evaluate initializing memory_bank from meaningful text
+# 2. Evaluate the partial memory_bank freezing mechanism (freeze_ratio=0.6)
+#
+# Key features:
+# - Initialize memory_bank from the sentence_trex_data.json text data
+# - Freeze 60% of the memory_bank entries to protect important knowledge
+# - Token-based memory mechanism + EMA updates
+# - Product Key Memory architecture
+#########################################################
+
+echo "=========================================="
+echo "🚀 开始实验 1.4.7 - Memory Bank优化"
+echo "🔥 新特性: 文本初始化 + 部分冻结机制"
+echo "=========================================="
+
+# Experiment configuration
+EXPERIMENT_NAME="experiment_1_4_7-06"
+OUTPUT_DIR="out/${EXPERIMENT_NAME}"
+LOG_FILE="${OUTPUT_DIR}/experiment.log"
+PID_FILE="${OUTPUT_DIR}/train.pid"
+
+# Create the output directory
+mkdir -p "$OUTPUT_DIR"
+
+echo "📂 实验输出目录: $OUTPUT_DIR"
+echo "📝 日志文件: $LOG_FILE"
+
+# Core model parameters
+MODEL_TYPE="model_memory"  # use the memory architecture
+DIM=512
+N_LAYERS=8
+N_HEADS=32
+MAX_SEQ_LEN=512
+
+# Memory bank configuration - key parameters of experiment 1.4.7
+KNOWLEDGE_NUM=1048576  # 1M memory entries (2^20)
+KNOWLEDGE_LENGTH=8  # 8 tokens per memory entry
+KNOWLEDGE_DIM=128  # memory vector dimension
+FREEZE_RATIO=0.6  # freeze 60% of the entries. NOTE(review): only echoed below, never added to TRAIN_CMD - confirm how the trainer receives it
+
+# EMA update configuration. NOTE(review): none of these three values is added to TRAIN_CMD in this script
+USE_EMA_UPDATE="True"
+EMA_DECAY=0.9  # EMA decay rate
+EMA_UPDATE_FREQ=5  # EMA update frequency
+
+# Training configuration
+EPOCHS=3
+BATCH_SIZE=48
+ACCUMULATION_STEPS=8
+LEARNING_RATE=2e-4
+DTYPE="bfloat16"
+GRAD_CLIP=1.0
+BALANCE_LOSS_COEF=0.01  # balance loss coefficient
+
+# Data paths
+DATA_PATH="/home/zym/Code/stable/merged_pretrain.jsonl"
+DATABASE_INIT_PATH="/home/zym/Code/stable/sentence_trex_data.json"  # text data used to initialize the memory bank
+CACHE_PATH="cache/memory_bank_init_${KNOWLEDGE_NUM}_${KNOWLEDGE_LENGTH}.pt"  # cache for the memory bank initialization
+
+# GPU and performance configuration
+export CUDA_VISIBLE_DEVICES=0
+NUM_WORKERS=8
+MIXED_PRECISION="bf16"
+
+# Monitoring configuration
+USE_SWANLAB="True"
+SWANLAB_PROJECT="MiniMind-Experiment-1.4.7"
+SWANLAB_ONLINE="False"  # offline mode
+
+# Validation and logging configuration
+LOG_INTERVAL=100
+VAL_INTERVAL=200
+PROFILE="True"
+PROFILE_INTERVAL=10
+MEMORY_MONITOR="False"  # memory monitoring disabled to reduce overhead
+
+echo "=========================================="
+echo "📋 实验配置摘要"
+echo "=========================================="
+echo "🔥 核心特性:"
+echo " - Model Type: $MODEL_TYPE"
+echo " - Memory Bank Size: $KNOWLEDGE_NUM 条"
+echo " - Memory Length: $KNOWLEDGE_LENGTH tokens"
+echo " - Freeze Ratio: $FREEZE_RATIO (冻结 $(awk -v n="$KNOWLEDGE_NUM" -v r="$FREEZE_RATIO" 'BEGIN { printf "%d", n * r }') 条记忆)"  # count derived from FREEZE_RATIO; bash arithmetic is integer-only, hence awk
+echo " - Text Initialization: $DATABASE_INIT_PATH"
+echo ""
+echo "🏗️ 模型架构:"
+echo " - Dimension: $DIM"
+echo " - Layers: $N_LAYERS"
+echo " - Heads: $N_HEADS"
+echo " - Max Seq Len: $MAX_SEQ_LEN"
+echo ""
+echo "📚 训练设置:"
+echo " - Epochs: $EPOCHS"
+echo " - Batch Size: $BATCH_SIZE"
+echo " - Learning Rate: $LEARNING_RATE"
+echo " - Data Type: $DTYPE"
+echo ""
+echo "⚡ EMA配置:"
+echo " - EMA Decay: $EMA_DECAY"
+echo " - Update Frequency: $EMA_UPDATE_FREQ"
+echo ""
+echo "📊 监控:"
+echo " - SwanLab Project: $SWANLAB_PROJECT"
+echo " - Log Interval: $LOG_INTERVAL"
+echo "=========================================="
+
+# Check that the required files exist
+echo "🔍 检查必要文件..."
+if [[ ! -f "$DATA_PATH" ]]; then
+    echo "❌ 错误: 训练数据文件不存在: $DATA_PATH"
+    exit 1
+fi
+
+if [[ ! -f "$DATABASE_INIT_PATH" ]]; then
+    echo "❌ 错误: Memory初始化数据文件不存在: $DATABASE_INIT_PATH"
+    exit 1
+fi
+
+echo "✅ 文件检查通过"
+
+# Build the training command - follows the pattern used by experiment_1_4_6.sh
+TRAIN_CMD="CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES python train_pretrain_accelerate.py"
+TRAIN_CMD+=" --out_dir \"$OUTPUT_DIR\""
+TRAIN_CMD+=" --epochs $EPOCHS"
+TRAIN_CMD+=" --embedding_epoch 2"
+TRAIN_CMD+=" --batch_size $BATCH_SIZE"
+TRAIN_CMD+=" --learning_rate $LEARNING_RATE"
+TRAIN_CMD+=" --dtype $DTYPE"
+TRAIN_CMD+=" --num_workers $NUM_WORKERS"
+TRAIN_CMD+=" --accumulation_steps $ACCUMULATION_STEPS"
+TRAIN_CMD+=" --grad_clip $GRAD_CLIP"
+TRAIN_CMD+=" --warmup_iters 0"
+TRAIN_CMD+=" --log_interval $LOG_INTERVAL"
+TRAIN_CMD+=" --val_interval $VAL_INTERVAL"
+TRAIN_CMD+=" --dim $DIM"
+TRAIN_CMD+=" --n_layers $N_LAYERS"
+TRAIN_CMD+=" --n_heads $N_HEADS"
+TRAIN_CMD+=" --max_seq_len $MAX_SEQ_LEN"
+TRAIN_CMD+=" --data_path \"$DATA_PATH\""
+TRAIN_CMD+=" --knowledge_num $KNOWLEDGE_NUM"
+TRAIN_CMD+=" --knowledge_length $KNOWLEDGE_LENGTH"
+TRAIN_CMD+=" --knowledge_dim $KNOWLEDGE_DIM"
+TRAIN_CMD+=" --database_init_path \"$DATABASE_INIT_PATH\""
+TRAIN_CMD+=" --cluster_cache_path \"$CACHE_PATH\""
+TRAIN_CMD+=" --model_type \"$MODEL_TYPE\""
+TRAIN_CMD+=" --balance_loss_coef $BALANCE_LOSS_COEF"
+
+# Optional flag parameters (flags that take no value)
+TRAIN_CMD+=" --use_swanlab"
+TRAIN_CMD+=" --profile"
+TRAIN_CMD+=" --use_flash_attn"
+
+# Optional parameters that take a value
+TRAIN_CMD+=" --swanlab_project \"$SWANLAB_PROJECT\""
+[[ "$SWANLAB_ONLINE" == "True" ]] && TRAIN_CMD+=" --swanlab_online True"  # pass only when enabled: the parser uses type=bool, which turns any non-empty value (even "False") into True
+TRAIN_CMD+=" --profile_interval $PROFILE_INTERVAL"
+
+# Add the memory monitor flag (if enabled)
+if [[ "$MEMORY_MONITOR" == "True" ]]; then
+    TRAIN_CMD+=" --memory_monitor"
+fi
+
+echo ""
+echo "🚀 启动训练..."
+echo "📝 完整训练命令:"
+echo "$TRAIN_CMD"
+echo ""
+echo "⏰ 预计训练时间: 约6-8小时"
+echo "📊 实时监控: 查看 $LOG_FILE"
+echo ""
+
+# Record the command in the log file
+echo "执行命令: $TRAIN_CMD" >> "$LOG_FILE"
+echo "开始时间: $(date)" >> "$LOG_FILE"
+
+# Generate the wrapper script (same pattern as 1.4.6); the exit code is echoed directly after the training command so that $? still refers to it
+TRAIN_SCRIPT="/tmp/train_1_4_7-06.sh"
+cat > "$TRAIN_SCRIPT" << EOF
+#!/bin/bash
+cd /home/zym/Code/Minimind
+source /home/user/miniconda3/bin/activate
+conda activate minimind
+$TRAIN_CMD
+echo "退出代码: \$?"
+echo "结束时间: \$(date)"
+EOF
+chmod +x "$TRAIN_SCRIPT"
+
+# Run the training script in the background with nohup
+nohup bash "$TRAIN_SCRIPT" >> "$LOG_FILE" 2>&1 &
+TRAIN_PID=$!
+echo "$TRAIN_PID" > "$PID_FILE"
+
+echo "=========================================="
+echo "✅ 实验1.4.7已启动"
+echo "🆔 进程ID: $TRAIN_PID"
+echo "📝 日志文件: $LOG_FILE"
+echo "📊 监控命令: tail -f $LOG_FILE"
+echo "🛑 停止命令: kill $TRAIN_PID"
+echo "=========================================="
+echo ""
+echo "🔥 实验1.4.7 - Memory Bank优化特性:"
+echo " ✨ 文本数据初始化 (sentence_trex_data.json)"
+echo " ✨ 部分冻结机制 (freeze_ratio=$FREEZE_RATIO)"
+echo " ✨ Token-based EMA更新"
+echo " ✨ Product Key Memory架构"
+echo ""
+echo "📋 监控要点:"
+echo " - 初始化阶段:观察文本数据加载和缓存"
+echo " - 训练阶段:关注frozen_memories统计"
+echo " - EMA更新:监控update_ratio和coverage指标"
+echo " - 生成质量:对比词组连贯性改善"
+echo ""
+echo "⚡ 进程状态检查:"
+echo "ps aux | grep $TRAIN_PID"
+echo ""
+
+# Show the initial process status
+sleep 2
+if ps -p "$TRAIN_PID" > /dev/null; then
+    echo "✅ 训练进程正在运行 (PID: $TRAIN_PID)"
+
+    # Show the first few log lines
+    echo ""
+    echo "📋 初始日志预览:"
+    echo "----------------------------------------"
+    timeout 5 tail -f "$LOG_FILE" | head -10 || echo "日志文件尚未生成,请稍等..."
+    echo "----------------------------------------"
+else
+    echo "❌ 训练进程启动失败,请检查日志:"
+    echo "cat $LOG_FILE"
+fi
+
+echo ""
+echo "🎯 实验1.4.7核心验证点:"
+echo " 1. Memory bank是否成功用文本数据初始化"
+echo " 2. 冻结机制是否正常工作 (freeze_ratio=$FREEZE_RATIO 的条目不更新)"
+echo " 3. 生成质量是否有明显改善"
+echo " 4. 训练稳定性是否提升"
+echo ""
+echo "📖 实验记录: experiment/EXPERIMENT_1_4_7-06.md"
+echo "🚀 实验1.4.7启动完成!"
\ No newline at end of file
diff --git a/run_file/experiment_1_4_9-02.sh b/run_file/experiment_1_4_9-02.sh
index b5603d5..e36ca9e 100644
--- a/run_file/experiment_1_4_9-02.sh
+++ b/run_file/experiment_1_4_9-02.sh
@@ -59,8 +59,8 @@ GRAD_CLIP=1.0
 BALANCE_LOSS_COEF=0.01 # 平衡损失系数
 
 # 数据路径配置
-DATA_PATH="./dataset/stable/merged_pretrain.jsonl"
-DATABASE_INIT_PATH="./dataset/stable/sentence_trex_data.json" # 🔥 文本数据初始化
+DATA_PATH="/home/zym/Code/stable/merged_pretrain.jsonl"
+DATABASE_INIT_PATH="/home/zym/Code/stable/sentence_trex_data.json" # text data used to initialize the memory bank
 CACHE_PATH="cache/memory_bank_init_${KNOWLEDGE_NUM}_${KNOWLEDGE_LENGTH}.pt" # 🔥 Memory初始化缓存
 
 # GPU和性能配置
@@ -243,4 +243,4 @@ echo " 3. 生成质量是否有明显改善"
 echo " 4. 训练稳定性是否提升"
 echo ""
 echo "📖 实验记录: experiment/EXPERIMENT_1_4_9-02.md"
-echo "🚀 实验1.4.9-02启动完成!"
\ No newline at end of file
+echo "🚀 实验1.4.9-02启动完成!"
diff --git a/run_file/experiment_1_4_9-04.sh b/run_file/experiment_1_4_9-04.sh
new file mode 100644
index 0000000..bd1b1c4
--- /dev/null
+++ b/run_file/experiment_1_4_9-04.sh
@@ -0,0 +1,246 @@
+#!/bin/bash
+
+#########################################################
+# Experiment 1.4.9-04 - memory bank text initialization + partial freezing
+#
+# Goals:
+# 1. Evaluate initializing memory_bank from meaningful text
+# 2. Evaluate the partial memory_bank freezing mechanism (freeze_ratio=0.2)
+#
+# Key features:
+# - Initialize memory_bank from the sentence_trex_data.json text data
+# - Freeze 20% of the memory_bank entries to protect important knowledge
+# - Token-based memory mechanism + EMA updates
+# - Product Key Memory architecture
+#########################################################
+
+echo "=========================================="
+echo "🚀 开始实验 1.4.9-04 - Memory Bank优化"
+echo "🔥 新特性: 文本初始化 + 部分冻结机制"
+echo "=========================================="
+
+# Experiment configuration
+EXPERIMENT_NAME="experiment_1_4_9-04"
+OUTPUT_DIR="out/${EXPERIMENT_NAME}"
+LOG_FILE="${OUTPUT_DIR}/experiment.log"
+PID_FILE="${OUTPUT_DIR}/train.pid"
+
+# Create the output directory
+mkdir -p "$OUTPUT_DIR"
+
+echo "📂 实验输出目录: $OUTPUT_DIR"
+echo "📝 日志文件: $LOG_FILE"
+
+# Core model parameters
+MODEL_TYPE="model_memory"  # use the memory architecture
+DIM=512
+N_LAYERS=1
+N_HEADS=32
+MAX_SEQ_LEN=512
+
+# Memory bank configuration - key parameters of experiment 1.4.9-04
+KNOWLEDGE_NUM=1048576  # 1M memory entries (2^20)
+KNOWLEDGE_LENGTH=8  # 8 tokens per memory entry
+KNOWLEDGE_DIM=128  # memory vector dimension
+FREEZE_RATIO=0.2  # freeze 20% of the entries. NOTE(review): only echoed below, never added to TRAIN_CMD - confirm how the trainer receives it
+
+# EMA update configuration. NOTE(review): none of these three values is added to TRAIN_CMD in this script
+USE_EMA_UPDATE="True"
+EMA_DECAY=0.9  # EMA decay rate
+EMA_UPDATE_FREQ=5  # EMA update frequency
+
+# Training configuration
+EPOCHS=3
+BATCH_SIZE=48
+ACCUMULATION_STEPS=8
+LEARNING_RATE=2e-4
+DTYPE="bfloat16"
+GRAD_CLIP=1.0
+BALANCE_LOSS_COEF=0.01  # balance loss coefficient
+
+# Data paths
+DATA_PATH="/home/zym/Code/stable/merged_pretrain.jsonl"
+DATABASE_INIT_PATH="/home/zym/Code/stable/sentence_trex_data.json"  # text data used to initialize the memory bank
+CACHE_PATH="cache/memory_bank_init_${KNOWLEDGE_NUM}_${KNOWLEDGE_LENGTH}.pt"  # cache for the memory bank initialization
+
+# GPU and performance configuration
+export CUDA_VISIBLE_DEVICES=0
+NUM_WORKERS=8
+MIXED_PRECISION="bf16"
+
+# Monitoring configuration
+USE_SWANLAB="True"
+SWANLAB_PROJECT="MiniMind-Experiment-1.4.9-04"
+SWANLAB_ONLINE="False"  # offline mode
+
+# Validation and logging configuration
+LOG_INTERVAL=100
+VAL_INTERVAL=200
+PROFILE="True"
+PROFILE_INTERVAL=10
+MEMORY_MONITOR="False"  # memory monitoring disabled to reduce overhead
+
+echo "=========================================="
+echo "📋 实验配置摘要"
+echo "=========================================="
+echo "🔥 核心特性:"
+echo " - Model Type: $MODEL_TYPE"
+echo " - Memory Bank Size: $KNOWLEDGE_NUM 条"
+echo " - Memory Length: $KNOWLEDGE_LENGTH tokens"
+echo " - Freeze Ratio: $FREEZE_RATIO (冻结 $(awk -v n="$KNOWLEDGE_NUM" -v r="$FREEZE_RATIO" 'BEGIN { printf "%d", n * r }') 条记忆)"  # count derived from FREEZE_RATIO; bash arithmetic is integer-only, hence awk
+echo " - Text Initialization: $DATABASE_INIT_PATH"
+echo ""
+echo "🏗️ 模型架构:"
+echo " - Dimension: $DIM"
+echo " - Layers: $N_LAYERS"
+echo " - Heads: $N_HEADS"
+echo " - Max Seq Len: $MAX_SEQ_LEN"
+echo ""
+echo "📚 训练设置:"
+echo " - Epochs: $EPOCHS"
+echo " - Batch Size: $BATCH_SIZE"
+echo " - Learning Rate: $LEARNING_RATE"
+echo " - Data Type: $DTYPE"
+echo ""
+echo "⚡ EMA配置:"
+echo " - EMA Decay: $EMA_DECAY"
+echo " - Update Frequency: $EMA_UPDATE_FREQ"
+echo ""
+echo "📊 监控:"
+echo " - SwanLab Project: $SWANLAB_PROJECT"
+echo " - Log Interval: $LOG_INTERVAL"
+echo "=========================================="
+
+# Check that the required files exist
+echo "🔍 检查必要文件..."
+if [[ ! -f "$DATA_PATH" ]]; then
+    echo "❌ 错误: 训练数据文件不存在: $DATA_PATH"
+    exit 1
+fi
+
+if [[ ! -f "$DATABASE_INIT_PATH" ]]; then
+    echo "❌ 错误: Memory初始化数据文件不存在: $DATABASE_INIT_PATH"
+    exit 1
+fi
+
+echo "✅ 文件检查通过"
+
+# Build the training command - follows the pattern used by experiment_1_4_6.sh
+TRAIN_CMD="CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES python train_pretrain_accelerate.py"
+TRAIN_CMD+=" --out_dir \"$OUTPUT_DIR\""
+TRAIN_CMD+=" --epochs $EPOCHS"
+TRAIN_CMD+=" --embedding_epoch 2"
+TRAIN_CMD+=" --batch_size $BATCH_SIZE"
+TRAIN_CMD+=" --learning_rate $LEARNING_RATE"
+TRAIN_CMD+=" --dtype $DTYPE"
+TRAIN_CMD+=" --num_workers $NUM_WORKERS"
+TRAIN_CMD+=" --accumulation_steps $ACCUMULATION_STEPS"
+TRAIN_CMD+=" --grad_clip $GRAD_CLIP"
+TRAIN_CMD+=" --warmup_iters 0"
+TRAIN_CMD+=" --log_interval $LOG_INTERVAL"
+TRAIN_CMD+=" --val_interval $VAL_INTERVAL"
+TRAIN_CMD+=" --dim $DIM"
+TRAIN_CMD+=" --n_layers $N_LAYERS"
+TRAIN_CMD+=" --n_heads $N_HEADS"
+TRAIN_CMD+=" --max_seq_len $MAX_SEQ_LEN"
+TRAIN_CMD+=" --data_path \"$DATA_PATH\""
+TRAIN_CMD+=" --knowledge_num $KNOWLEDGE_NUM"
+TRAIN_CMD+=" --knowledge_length $KNOWLEDGE_LENGTH"
+TRAIN_CMD+=" --knowledge_dim $KNOWLEDGE_DIM"
+TRAIN_CMD+=" --database_init_path \"$DATABASE_INIT_PATH\""
+TRAIN_CMD+=" --cluster_cache_path \"$CACHE_PATH\""
+TRAIN_CMD+=" --model_type \"$MODEL_TYPE\""
+TRAIN_CMD+=" --balance_loss_coef $BALANCE_LOSS_COEF"
+
+# Optional flag parameters (flags that take no value)
+# TRAIN_CMD+=" --use_swanlab"
+TRAIN_CMD+=" --profile"
+TRAIN_CMD+=" --use_flash_attn"
+
+# Optional parameters that take a value
+TRAIN_CMD+=" --swanlab_project \"$SWANLAB_PROJECT\""
+[[ "$SWANLAB_ONLINE" == "True" ]] && TRAIN_CMD+=" --swanlab_online True"  # pass only when enabled: the parser uses type=bool, which turns any non-empty value (even "False") into True
+TRAIN_CMD+=" --profile_interval $PROFILE_INTERVAL"
+
+# Add the memory monitor flag (if enabled)
+if [[ "$MEMORY_MONITOR" == "True" ]]; then
+    TRAIN_CMD+=" --memory_monitor"
+fi
+
+echo ""
+echo "🚀 启动训练..."
+echo "📝 完整训练命令:"
+echo "$TRAIN_CMD"
+echo ""
+echo "⏰ 预计训练时间: 约6-8小时"
+echo "📊 实时监控: 查看 $LOG_FILE"
+echo ""
+
+# Record the command in the log file
+echo "执行命令: $TRAIN_CMD" >> "$LOG_FILE"
+echo "开始时间: $(date)" >> "$LOG_FILE"
+
+# Generate the wrapper script (same pattern as 1.4.6); the exit code is echoed directly after the training command so that $? still refers to it
+TRAIN_SCRIPT="/tmp/train_1_4_9-04.sh"
+cat > "$TRAIN_SCRIPT" << EOF
+#!/bin/bash
+$TRAIN_CMD
+echo "退出代码: \$?"
+echo "结束时间: \$(date)"
+EOF
+chmod +x "$TRAIN_SCRIPT"
+
+# Run the training script in the background with nohup
+nohup bash "$TRAIN_SCRIPT" >> "$LOG_FILE" 2>&1 &
+TRAIN_PID=$!
+echo "$TRAIN_PID" > "$PID_FILE"
+
+echo "=========================================="
+echo "✅ 实验1.4.9-04已启动"
+echo "🆔 进程ID: $TRAIN_PID"
+echo "📝 日志文件: $LOG_FILE"
+echo "📊 监控命令: tail -f $LOG_FILE"
+echo "🛑 停止命令: kill $TRAIN_PID"
+echo "=========================================="
+echo ""
+echo "🔥 实验1.4.9-04 - Memory Bank优化特性:"
+echo " ✨ 文本数据初始化 (sentence_trex_data.json)"
+echo " ✨ 部分冻结机制 (freeze_ratio=$FREEZE_RATIO)"
+echo " ✨ Token-based EMA更新"
+echo " ✨ Product Key Memory架构"
+echo ""
+echo "📋 监控要点:"
+echo " - 初始化阶段:观察文本数据加载和缓存"
+echo " - 训练阶段:关注frozen_memories统计"
+echo " - EMA更新:监控update_ratio和coverage指标"
+echo " - 生成质量:对比词组连贯性改善"
+echo ""
+echo "⚡ 进程状态检查:"
+echo "ps aux | grep $TRAIN_PID"
+echo ""
+
+# Show the initial process status
+sleep 2
+if ps -p "$TRAIN_PID" > /dev/null; then
+    echo "✅ 训练进程正在运行 (PID: $TRAIN_PID)"
+
+    # Show the first few log lines
+    echo ""
+    echo "📋 初始日志预览:"
+    echo "----------------------------------------"
+    timeout 5 tail -f "$LOG_FILE" | head -10 || echo "日志文件尚未生成,请稍等..."
+    echo "----------------------------------------"
+else
+    echo "❌ 训练进程启动失败,请检查日志:"
+    echo "cat $LOG_FILE"
+fi
+
+echo ""
+echo "🎯 实验1.4.9-04核心验证点:"
+echo " 1. Memory bank是否成功用文本数据初始化"
+echo " 2. 冻结机制是否正常工作 (20%条目不更新)"
+echo " 3. 生成质量是否有明显改善"
+echo " 4. 训练稳定性是否提升"
+echo ""
+echo "📖 实验记录: experiment/EXPERIMENT_1_4_9-04.md"
+echo "🚀 实验1.4.9-04启动完成!"
diff --git a/train_pretrain_accelerate.py b/train_pretrain_accelerate.py
index 35c6d65..e622af7 100644
--- a/train_pretrain_accelerate.py
+++ b/train_pretrain_accelerate.py
@@ -1356,7 +1356,7 @@ def main():
     parser.add_argument("--model_size", type=float, default=50.0, help="模型大小")
     parser.add_argument("--swanlab_online", type=bool, default=False, help="是否使用在线SwanLab服务")
     parser.add_argument("--balance_loss_coef", type=float, default=0.01, help="平衡损失系数")
-    parser.add_argument("--val_data_path", type=str, default="dataset/stable/eval_data.json", help="验证数据集路径")
+    parser.add_argument("--val_data_path", type=str, default="/home/zym/Code/stable/eval_data.json", help="验证数据集路径")
     parser.add_argument("--val_interval", type=int, default=100, help="验证评估间隔")
 
     args = parser.parse_args()