diff --git a/train_pretrain_accelerate.py b/train_pretrain_accelerate.py
index eac86a0..97c98c9 100644
--- a/train_pretrain_accelerate.py
+++ b/train_pretrain_accelerate.py
@@ -228,15 +228,15 @@ def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, a
     best_loss = float('10000')
 
     # Add CUDA events for performance profiling (main process only)
-    if args.profile and accelerator.is_main_process:
-        data_start = torch.cuda.Event(enable_timing=True)
-        data_end = torch.cuda.Event(enable_timing=True)
-        forward_start = torch.cuda.Event(enable_timing=True)
-        forward_end = torch.cuda.Event(enable_timing=True)
-        backward_start = torch.cuda.Event(enable_timing=True)
-        backward_end = torch.cuda.Event(enable_timing=True)
-        optimizer_start = torch.cuda.Event(enable_timing=True)
-        optimizer_end = torch.cuda.Event(enable_timing=True)
+    # if args.profile and accelerator.is_main_process:
+    #     data_start = torch.cuda.Event(enable_timing=True)
+    #     data_end = torch.cuda.Event(enable_timing=True)
+    #     forward_start = torch.cuda.Event(enable_timing=True)
+    #     forward_end = torch.cuda.Event(enable_timing=True)
+    #     backward_start = torch.cuda.Event(enable_timing=True)
+    #     backward_end = torch.cuda.Event(enable_timing=True)
+    #     optimizer_start = torch.cuda.Event(enable_timing=True)
+    #     optimizer_end = torch.cuda.Event(enable_timing=True)
 
     # Prefetch data
     prefetch_factor = 2  # number of batches to prefetch
@@ -257,8 +257,8 @@ def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, a
     for step in range(total_steps_in_epoch):
         try:
             # Time data loading (main process only)
-            if args.profile and accelerator.is_main_process:
-                data_start.record()
+            # if args.profile and accelerator.is_main_process:
+            #     data_start.record()
 
             # Use prefetched data
             if prefetch_batches:
@@ -276,16 +276,16 @@ def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, a
                 pass
 
             # End timing for data loading (main process only)
-            if args.profile and accelerator.is_main_process:
-                data_end.record()
+            # if args.profile and accelerator.is_main_process:
+            #     data_end.record()
 
             # Update learning rate
             if scheduler is not None:
                 scheduler.step()
 
             # Time forward pass (main process only)
-            if args.profile and accelerator.is_main_process:
-                forward_start.record()
+            # if args.profile and accelerator.is_main_process:
+            #     forward_start.record()
 
             # Forward pass
             with ctx:
@@ -311,24 +311,24 @@ def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, a
                 loss = loss / args.accumulation_steps
 
             # End timing for forward pass (main process only)
-            if args.profile and accelerator.is_main_process:
-                forward_end.record()
+            # if args.profile and accelerator.is_main_process:
+            #     forward_end.record()
 
             # Time backward pass (main process only)
-            if args.profile and accelerator.is_main_process:
-                backward_start.record()
+            # if args.profile and accelerator.is_main_process:
+            #     backward_start.record()
 
             # Backward pass
             # When using DeepSpeed, it handles gradient accumulation and gradient clipping automatically
             accelerator.backward(loss)
 
             # End timing for backward pass (main process only)
-            if args.profile and accelerator.is_main_process:
-                backward_end.record()
+            # if args.profile and accelerator.is_main_process:
+            #     backward_end.record()
 
             # Time optimizer step (main process only)
-            if args.profile and accelerator.is_main_process:
-                optimizer_start.record()
+            # if args.profile and accelerator.is_main_process:
+            #     optimizer_start.record()
 
             # Optimizer step - when using DeepSpeed, it handles gradient accumulation and gradient clipping automatically
             # The optimizer step only runs once the accumulation step count is reached
@@ -340,8 +340,8 @@ def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, a
                 optimizer.zero_grad()
 
             # End timing for optimizer step (main process only)
-            if args.profile and accelerator.is_main_process:
-                optimizer_end.record()
+            # if args.profile and accelerator.is_main_process:
+            #     optimizer_end.record()
 
             # Print training info (main process only)
             if (step + 1) % args.log_interval == 0 and accelerator.is_main_process:
@@ -419,7 +419,7 @@ def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, a
 
             # Save the model (main process only)
             loss_total = loss.item() * args.accumulation_steps
-            if best_loss > loss_total and accelerator.is_main_process:
+            if (epoch > 1 or best_loss > loss_total) and accelerator.is_main_process:
                 best_loss = loss_total
                 # Use the moe_path variable defined at the start of the function
                 ckp = f'{args.save_dir}/pretrain_{args.dim}{moe_path}.pth'
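
For reference, the profiling that these hunks disable is built on paired torch.cuda.Event markers recorded around each phase of the training step. The sketch below isolates that timing pattern, assuming a CUDA device is available; the helper name time_cuda_section and the matmul workload are illustrative and are not part of train_pretrain_accelerate.py.

import torch

def time_cuda_section(fn):
    # Run fn() on the current CUDA stream and return its elapsed time in milliseconds.
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()            # mark the beginning of the section on the stream
    fn()
    end.record()              # mark the end of the section
    torch.cuda.synchronize()  # wait for the GPU so elapsed_time() reads a completed pair
    return start.elapsed_time(end)

if torch.cuda.is_available():
    x = torch.randn(2048, 2048, device="cuda")
    print(f"matmul: {time_cuda_section(lambda: x @ x):.2f} ms")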