update
This commit is contained in:
parent 44cd7b4d72
commit 5f19adcffa
@@ -228,15 +228,15 @@ def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, a
     best_loss = float('10000')

     # Add CUDA events to profile performance (main process only)
-    if args.profile and accelerator.is_main_process:
-        data_start = torch.cuda.Event(enable_timing=True)
-        data_end = torch.cuda.Event(enable_timing=True)
-        forward_start = torch.cuda.Event(enable_timing=True)
-        forward_end = torch.cuda.Event(enable_timing=True)
-        backward_start = torch.cuda.Event(enable_timing=True)
-        backward_end = torch.cuda.Event(enable_timing=True)
-        optimizer_start = torch.cuda.Event(enable_timing=True)
-        optimizer_end = torch.cuda.Event(enable_timing=True)
+    # if args.profile and accelerator.is_main_process:
+    #     data_start = torch.cuda.Event(enable_timing=True)
+    #     data_end = torch.cuda.Event(enable_timing=True)
+    #     forward_start = torch.cuda.Event(enable_timing=True)
+    #     forward_end = torch.cuda.Event(enable_timing=True)
+    #     backward_start = torch.cuda.Event(enable_timing=True)
+    #     backward_end = torch.cuda.Event(enable_timing=True)
+    #     optimizer_start = torch.cuda.Event(enable_timing=True)
+    #     optimizer_end = torch.cuda.Event(enable_timing=True)

     # Prefetch data
     prefetch_factor = 2  # number of batches to prefetch
@@ -257,8 +257,8 @@ def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, a
     for step in range(total_steps_in_epoch):
         try:
             # Time data loading (main process only)
-            if args.profile and accelerator.is_main_process:
-                data_start.record()
+            # if args.profile and accelerator.is_main_process:
+            #     data_start.record()

             # Use the prefetched data
             if prefetch_batches:
@@ -276,16 +276,16 @@ def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, a
                 pass

             # Time end of data loading (main process only)
-            if args.profile and accelerator.is_main_process:
-                data_end.record()
+            # if args.profile and accelerator.is_main_process:
+            #     data_end.record()

             # Update the learning rate
             if scheduler is not None:
                 scheduler.step()

             # Time the forward pass (main process only)
-            if args.profile and accelerator.is_main_process:
-                forward_start.record()
+            # if args.profile and accelerator.is_main_process:
+            #     forward_start.record()

             # Forward pass
             with ctx:
@@ -311,24 +311,24 @@ def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, a
                 loss = loss / args.accumulation_steps

             # Time end of the forward pass (main process only)
-            if args.profile and accelerator.is_main_process:
-                forward_end.record()
+            # if args.profile and accelerator.is_main_process:
+            #     forward_end.record()

             # Time the backward pass (main process only)
-            if args.profile and accelerator.is_main_process:
-                backward_start.record()
+            # if args.profile and accelerator.is_main_process:
+            #     backward_start.record()

             # Backward pass
             # With DeepSpeed, gradient accumulation and gradient clipping are handled automatically
             accelerator.backward(loss)

             # Time end of the backward pass (main process only)
-            if args.profile and accelerator.is_main_process:
-                backward_end.record()
+            # if args.profile and accelerator.is_main_process:
+            #     backward_end.record()

             # Time the optimizer step (main process only)
-            if args.profile and accelerator.is_main_process:
-                optimizer_start.record()
+            # if args.profile and accelerator.is_main_process:
+            #     optimizer_start.record()

             # Optimizer step - with DeepSpeed, gradient accumulation and gradient clipping are handled automatically
             # The optimizer step only runs once the accumulation step count is reached
@@ -340,8 +340,8 @@ def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, a
                 optimizer.zero_grad()

             # Time end of the optimizer step (main process only)
-            if args.profile and accelerator.is_main_process:
-                optimizer_end.record()
+            # if args.profile and accelerator.is_main_process:
+            #     optimizer_end.record()

             # Print training info (main process only)
             if (step + 1) % args.log_interval == 0 and accelerator.is_main_process:
@@ -419,7 +419,7 @@ def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, a

             # Save the model (main process only)
             loss_total = loss.item() * args.accumulation_steps
-            if best_loss > loss_total and accelerator.is_main_process:
+            if epoch > 1 or best_loss > loss_total and accelerator.is_main_process:
                 best_loss = loss_total
                 # Use the moe_path variable defined at the start of the function
                 ckp = f'{args.save_dir}/pretrain_{args.dim}{moe_path}.pth'
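
For context on the profiling code toggled in the hunks above: torch.cuda.Event pairs only become useful once they are read back with elapsed_time() after the GPU has caught up. A minimal sketch of that read-back pattern follows; the timed work and the print format are illustrative, not taken from this repo.

import torch

data_start = torch.cuda.Event(enable_timing=True)
data_end = torch.cuda.Event(enable_timing=True)

data_start.record()        # enqueues a timestamp on the current CUDA stream
do_timed_work()            # hypothetical stand-in for the data-loading step
data_end.record()

torch.cuda.synchronize()   # elapsed_time() is only valid after both events complete
print(f"data loading: {data_start.elapsed_time(data_end):.2f} ms")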
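The prefetch_factor / prefetch_batches names in the first two hunks describe a small read-ahead buffer over the DataLoader iterator; the code that fills the buffer falls outside these hunks, so the following is only a sketch of that general pattern, with every name an assumption.

from collections import deque

prefetch_factor = 2                # batches to keep buffered ahead of use
train_iter = iter(train_loader)    # assumed DataLoader iterator
prefetch_batches = deque()

def refill(buf, it, n):
    # Top the buffer up to n batches; stop quietly at the end of the epoch.
    while len(buf) < n:
        try:
            buf.append(next(it))
        except StopIteration:
            break

refill(prefetch_batches, train_iter, prefetch_factor)
batch = prefetch_batches.popleft() if prefetch_batches else None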
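One detail of the changed save condition in the last hunk: Python binds `and` tighter than `or`, so the new line is parsed as shown below, which lets every process take the save path on any epoch after the first.

# How Python parses the new condition:
if epoch > 1 or (best_loss > loss_total and accelerator.is_main_process):
    ...

# If the intent is "save on improved loss, main process only, after epoch 1"
# (an assumption about intent), the grouping must be explicit:
if (epoch > 1 or best_loss > loss_total) and accelerator.is_main_process:
    ...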