Added wandb (添加了wandb)

parent 0fa4d17d26
commit 06a66d88c9

.gitignore (vendored) | 2 ++
@@ -1,2 +1,4 @@
 /model/__pycache__
 /dataset
+/wandb
+/out
@@ -37,7 +37,7 @@ def get_lr(it, all):
     return min_lr + coeff * (learning_rate - min_lr)


-def train_epoch(epoch, accumulation_steps=8):
+def train_epoch(epoch, wandb, accumulation_steps=8):
     start_time = time.time()
     for step, (X, Y) in enumerate(train_loader):
         X = X.to(device)
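The first context line of the hunk above, `return min_lr + coeff * (learning_rate - min_lr)`, is the tail of the cosine learning-rate schedule in `get_lr`. A hedged sketch of such a schedule follows; the warmup branch and every name other than `learning_rate`, `min_lr`, and `coeff` are illustrative assumptions, not code from this commit.

```python
import math

# Sketch of a warmup + cosine-decay schedule whose final line matches the
# diff context above. Values and the warmup handling are assumptions.
learning_rate = 2e-4
min_lr = learning_rate / 10

def get_lr(it, all_iters, warmup_iters=0):
    if warmup_iters and it < warmup_iters:
        return learning_rate * it / warmup_iters            # linear warmup (assumed)
    decay_ratio = (it - warmup_iters) / (all_iters - warmup_iters)
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))   # decays from 1 to 0
    return min_lr + coeff * (learning_rate - min_lr)

print(get_lr(500, 1000))  # halfway through decay: ~1.1e-04
```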
@@ -73,6 +73,10 @@ def train_epoch(epoch, accumulation_steps=8):
                     loss.item() * accumulation_steps,
                     optimizer.param_groups[-1]['lr'],
                     spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60))
+            if wandb != None:
+                wandb.log({"loss": loss.item() * accumulation_steps,
+                           "lr": optimizer.param_groups[-1]['lr'],
+                           "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60})

         if (step + 1) % 1000 == 0 and (not ddp or dist.get_rank() == 0):
             model.eval()
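The `epoch_Time` value logged above is an estimate of the minutes left in the current epoch: the projected total epoch time minus the time already spent, both floored to whole minutes. A small worked example with hypothetical numbers:

```python
# Hypothetical numbers, only to illustrate the epoch_Time expression logged above.
spend_time = 120.0      # seconds elapsed since the epoch started (assumed)
step = 9                # current step index (assumed)
iter_per_epoch = 1000   # steps per epoch (assumed)

epoch_time = spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60
print(epoch_time)       # 12 s/step * 1000 steps // 60 = 200 min total, minus 2 min spent = 198.0
```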
@@ -138,6 +142,17 @@ if __name__ == "__main__":
     tokens_per_iter = batch_size * max_seq_len
     torch.manual_seed(1337)
     device_type = device if "cuda" in device else "cpu"
+
+    use_wandb = True  # whether to use wandb
+    wandb_project = "MiniMind-Pretrain"
+    wandb_run_name = f"MiniMind-Pretrain-Epoch-{epochs}-BatchSize-{batch_size}-LearningRate-{learning_rate}"
+    if use_wandb:
+        import wandb
+        wandb.init(project=wandb_project, name=wandb_run_name)
+    else:
+        wandb = None
+
+
     ctx = (
         nullcontext()
         if device_type == "cpu"
@@ -186,4 +201,4 @@ if __name__ == "__main__":
     # training loop
     iter_per_epoch = len(train_loader)
     for epoch in range(epochs):
-        train_epoch(epoch)
+        train_epoch(epoch, wandb)
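Taken together, the hunks above follow a single pattern: import and initialise wandb only when a `use_wandb` flag is set, bind the name `wandb` to `None` otherwise, pass that object into `train_epoch`, and log only when it is not `None`. Below is a minimal standalone sketch of that pattern, assuming the `wandb` package is installed and configured; the dummy loop and placeholder values are illustrative and are not the repository's `train_epoch`.

```python
import random
import time

use_wandb = True  # set False to run without any wandb dependency

if use_wandb:
    import wandb
    wandb.init(project="MiniMind-Pretrain", name="demo-run")  # placeholder run name
else:
    wandb = None  # downstream code treats None as "logging disabled"


def train_epoch(epoch, wandb, iter_per_epoch=10):
    """Dummy loop standing in for the real training loop."""
    start_time = time.time()
    for step in range(iter_per_epoch):
        loss = random.random()   # stand-in for the real training loss
        lr = 2e-4                # stand-in for optimizer.param_groups[-1]['lr']
        spend_time = time.time() - start_time
        if wandb is not None:    # same guard as the diff's `if wandb != None`
            wandb.log({"loss": loss,
                       "lr": lr,
                       "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60})


for epoch in range(1):
    train_epoch(epoch, wandb)

if wandb is not None:
    wandb.finish()  # flush and close the run
```

Passing the module (or `None`) as a function argument keeps the training code free of any hard wandb import when logging is turned off.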
@@ -43,7 +43,7 @@ def get_lr(it, all):


 # ------------------------------------------------------------------------------
-def train_epoch(epoch):
+def train_epoch(epoch, wandb):
     start_time = time.time()
     for step, (X, Y, loss_mask) in enumerate(train_loader):
         X = X.to(device)
@@ -85,6 +85,9 @@ def train_epoch(epoch):
                     loss,
                     optimizer.param_groups[-1]['lr'],
                     spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60))
+            if use_wandb != None:
+                wandb.log({"loss": loss, "lr": optimizer.param_groups[-1]['lr'],
+                           "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60})

         if (step + 1) % 1000 == 0 and (not ddp or dist.get_rank() == 0):
             model.eval()
@@ -157,6 +160,16 @@ if __name__ == "__main__":
     os.makedirs(out_dir, exist_ok=True)
     torch.manual_seed(1337)
     device_type = device if "cuda" in device else "cpu"
+
+    use_wandb = True  # whether to use wandb
+    wandb_project = "MiniMind-Full-SFT"
+    wandb_run_name = f"MiniMind-Full-SFT-Epoch-{epochs}-BatchSize-{batch_size}-LearningRate-{learning_rate}"
+    if use_wandb:
+        import wandb
+        wandb.init(project=wandb_project, name=wandb_run_name)
+    else:
+        wandb = None
+
     ctx = (
         nullcontext()
         if device_type == "cpu"
@@ -206,5 +219,5 @@ if __name__ == "__main__":
         model = DistributedDataParallel(model, device_ids=[ddp_local_rank])

     # training loop
-    for epoch in range(epochs):
+    for epoch in range(epochs,wandb):
         train_epoch(epoch)
@@ -35,7 +35,7 @@ def get_lr(it):


 # ------------------------------------------------------------------------------
-def train_epoch(epoch):
+def train_epoch(epoch, wandb):
     start_time = time.time()
     for step, (X, Y, loss_mask) in enumerate(train_loader):
         X = X.to(device)
@@ -72,6 +72,9 @@ def train_epoch(epoch):
                     loss.item(),
                     optimizer.param_groups[-1]['lr'],
                     spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60))
+            if use_wandb != None:
+                wandb.log({"loss": loss.item(), "lr": optimizer.param_groups[-1]['lr'],
+                           "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60})


 def find_all_linear_names(model):
@@ -127,6 +130,16 @@ if __name__ == "__main__":
     os.makedirs(out_dir, exist_ok=True)
     torch.manual_seed(1337)
     device_type = device if "cuda" in device else "cpu"
+
+    use_wandb = True  # whether to use wandb
+    wandb_project = "MiniMind-LoRA"
+    wandb_run_name = f"MiniMind-LoRA-Epoch-{epochs}-BatchSize-{batch_size}-LearningRate-{learning_rate}"
+    if use_wandb:
+        import wandb
+        wandb.init(project=wandb_project, name=wandb_run_name)
+    else:
+        wandb = None
+
     ctx = (
         nullcontext()
         if device_type == "cpu"
@@ -162,5 +175,5 @@ if __name__ == "__main__":
     raw_model = model
     # training loop
     for epoch in range(epochs):
-        train_epoch(epoch)
+        train_epoch(epoch, wandb)
     model.save_pretrained('minimind')
@@ -72,6 +72,7 @@ https://github.com/user-attachments/assets/88b98128-636e-43bc-a419-b1b1403c2055
 - Training supports single-machine single-GPU and single-machine multi-GPU (DDP, DeepSpeed) setups. Training can be stopped at any point and resumed from any point.
 - Code for testing the model on the Ceval dataset.
 - Implements a basic OpenAI-API chat interface for easy integration into third-party ChatUIs (FastGPT, Open-WebUI, etc.).
+- Uses wandb to visualize the training process.

 We hope this open-source project helps LLM beginners get started quickly!
@@ -81,6 +81,7 @@ The project includes:
 - Code for testing the model on the Ceval dataset.
 - Implementation of a basic chat interface compatible with OpenAI's API, facilitating integration into third-party Chat
   UIs (such as FastGPT, Open-WebUI, etc.).
+- Use wandb to visualize the training process.

 We hope this open-source project helps LLM beginners get started quickly!
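The README bullets added above only say that wandb visualises training. As a hedged usage note (not part of this commit): `wandb.init` also accepts a `mode` argument, so a run can be kept local when no account is configured and uploaded later with the `wandb sync` CLI; the local `wandb/` run directory it writes is what the new `/wandb` entry in `.gitignore` excludes.

```python
import wandb

# mode="offline" writes run data under ./wandb instead of uploading it;
# mode="disabled" turns every wandb call into a no-op.
# Project and run names here are placeholders, not the repository's defaults.
run = wandb.init(project="MiniMind-Pretrain", name="local-debug", mode="offline")
run.log({"loss": 3.14, "lr": 2e-4})
run.finish()
```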