diff --git a/train_pretrain_accelerate.py b/train_pretrain_accelerate.py
index eac86a0..97c98c9 100644
--- a/train_pretrain_accelerate.py
+++ b/train_pretrain_accelerate.py
@@ -228,15 +228,15 @@ def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, a
     best_loss = float('10000')
 
     # Add CUDA events for performance profiling (main process only)
-    if args.profile and accelerator.is_main_process:
-        data_start = torch.cuda.Event(enable_timing=True)
-        data_end = torch.cuda.Event(enable_timing=True)
-        forward_start = torch.cuda.Event(enable_timing=True)
-        forward_end = torch.cuda.Event(enable_timing=True)
-        backward_start = torch.cuda.Event(enable_timing=True)
-        backward_end = torch.cuda.Event(enable_timing=True)
-        optimizer_start = torch.cuda.Event(enable_timing=True)
-        optimizer_end = torch.cuda.Event(enable_timing=True)
+    # if args.profile and accelerator.is_main_process:
+    #     data_start = torch.cuda.Event(enable_timing=True)
+    #     data_end = torch.cuda.Event(enable_timing=True)
+    #     forward_start = torch.cuda.Event(enable_timing=True)
+    #     forward_end = torch.cuda.Event(enable_timing=True)
+    #     backward_start = torch.cuda.Event(enable_timing=True)
+    #     backward_end = torch.cuda.Event(enable_timing=True)
+    #     optimizer_start = torch.cuda.Event(enable_timing=True)
+    #     optimizer_end = torch.cuda.Event(enable_timing=True)
 
     # Prefetch data
     prefetch_factor = 2  # number of batches to prefetch
@@ -257,8 +257,8 @@ def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, a
     for step in range(total_steps_in_epoch):
         try:
             # Time data loading (main process only)
-            if args.profile and accelerator.is_main_process:
-                data_start.record()
+            # if args.profile and accelerator.is_main_process:
+            #     data_start.record()
 
             # Use prefetched data
             if prefetch_batches:
@@ -276,16 +276,16 @@ def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, a
                 pass
 
             # End timing for data loading (main process only)
-            if args.profile and accelerator.is_main_process:
-                data_end.record()
+            # if args.profile and accelerator.is_main_process:
+            #     data_end.record()
 
             # Update learning rate
             if scheduler is not None:
                 scheduler.step()
 
             # Time forward pass (main process only)
-            if args.profile and accelerator.is_main_process:
-                forward_start.record()
+            # if args.profile and accelerator.is_main_process:
+            #     forward_start.record()
 
             # Forward pass
             with ctx:
@@ -311,24 +311,24 @@ def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, a
                 loss = loss / args.accumulation_steps
 
             # End timing for forward pass (main process only)
-            if args.profile and accelerator.is_main_process:
-                forward_end.record()
+            # if args.profile and accelerator.is_main_process:
+            #     forward_end.record()
 
             # Time backward pass (main process only)
-            if args.profile and accelerator.is_main_process:
-                backward_start.record()
+            # if args.profile and accelerator.is_main_process:
+            #     backward_start.record()
 
             # Backward pass
             # When using DeepSpeed, it handles gradient accumulation and gradient clipping automatically
             accelerator.backward(loss)
 
             # End timing for backward pass (main process only)
-            if args.profile and accelerator.is_main_process:
-                backward_end.record()
+            # if args.profile and accelerator.is_main_process:
+            #     backward_end.record()
 
             # Time optimizer step (main process only)
-            if args.profile and accelerator.is_main_process:
-                optimizer_start.record()
+            # if args.profile and accelerator.is_main_process:
+            #     optimizer_start.record()
 
             # Optimizer step - when using DeepSpeed, it handles gradient accumulation and gradient clipping automatically
             # The optimizer step only runs once the accumulation step count is reached
@@ -340,8 +340,8 @@ def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, a
                 optimizer.zero_grad()
 
             # End timing for optimizer step (main process only)
-            if args.profile and accelerator.is_main_process:
-                optimizer_end.record()
+            # if args.profile and accelerator.is_main_process:
+            #     optimizer_end.record()
 
             # Print training info (main process only)
             if (step + 1) % args.log_interval == 0 and accelerator.is_main_process:
@@ -419,7 +419,7 @@ def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, a
 
             # Save the model (main process only)
             loss_total = loss.item() * args.accumulation_steps
-            if best_loss > loss_total and accelerator.is_main_process:
+            if (epoch > 1 or best_loss > loss_total) and accelerator.is_main_process:
                 best_loss = loss_total
                 # Use the moe_path variable defined at the start of the function
                 ckp = f'{args.save_dir}/pretrain_{args.dim}{moe_path}.pth'
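
For reference, the profiling that these hunks disable is built on paired torch.cuda.Event markers recorded around each phase of the training step. The sketch below isolates that timing pattern, assuming a CUDA device is available; the helper name time_cuda_section and the matmul workload are illustrative and are not part of train_pretrain_accelerate.py.

import torch

def time_cuda_section(fn):
    # Run fn() on the current CUDA stream and return its elapsed time in milliseconds.
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()            # mark the beginning of the section on the stream
    fn()
    end.record()              # mark the end of the section
    torch.cuda.synchronize()  # wait for the GPU so elapsed_time() reads a completed pair
    return start.elapsed_time(end)

if torch.cuda.is_available():
    x = torch.randn(2048, 2048, device="cuda")
    print(f"matmul: {time_cuda_section(lambda: x @ x):.2f} ms")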