diff --git a/.gitignore b/.gitignore
index 09f33a7..cbb3920 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
 /model/__pycache__
-/dataset
\ No newline at end of file
+/dataset
+/wandb
+/out
\ No newline at end of file
diff --git a/1-pretrain.py b/1-pretrain.py
index d1d84ec..8560126 100644
--- a/1-pretrain.py
+++ b/1-pretrain.py
@@ -37,7 +37,7 @@ def get_lr(it, all):
     return min_lr + coeff * (learning_rate - min_lr)
 
 
-def train_epoch(epoch, accumulation_steps=8):
+def train_epoch(epoch, wandb, accumulation_steps=8):
     start_time = time.time()
     for step, (X, Y) in enumerate(train_loader):
         X = X.to(device)
@@ -73,6 +73,10 @@ def train_epoch(epoch, accumulation_steps=8):
                     loss.item() * accumulation_steps,
                     optimizer.param_groups[-1]['lr'],
                     spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60))
+            if wandb is not None:
+                wandb.log({"loss": loss.item() * accumulation_steps,
+                           "lr": optimizer.param_groups[-1]['lr'],
+                           "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60})
 
         if (step + 1) % 1000 == 0 and (not ddp or dist.get_rank() == 0):
             model.eval()
@@ -138,6 +142,17 @@ if __name__ == "__main__":
     tokens_per_iter = batch_size * max_seq_len
     torch.manual_seed(1337)
     device_type = device if "cuda" in device else "cpu"
+
+    use_wandb = True  # whether to use wandb
+    wandb_project = "MiniMind-Pretrain"
+    wandb_run_name = f"MiniMind-Pretrain-Epoch-{epochs}-BatchSize-{batch_size}-LearningRate-{learning_rate}"
+    if use_wandb:
+        import wandb
+        wandb.init(project=wandb_project, name=wandb_run_name)
+    else:
+        wandb = None
+
+
     ctx = (
         nullcontext()
         if device_type == "cpu"
@@ -186,4 +201,4 @@ if __name__ == "__main__":
     # training loop
     iter_per_epoch = len(train_loader)
     for epoch in range(epochs):
-        train_epoch(epoch)
+        train_epoch(epoch, wandb)
diff --git a/3-full_sft.py b/3-full_sft.py
index 3c94597..c50dedf 100644
--- a/3-full_sft.py
+++ b/3-full_sft.py
@@ -43,7 +43,7 @@ def get_lr(it, all):
 
 
 # ------------------------------------------------------------------------------
-def train_epoch(epoch):
+def train_epoch(epoch, wandb):
     start_time = time.time()
     for step, (X, Y, loss_mask) in enumerate(train_loader):
         X = X.to(device)
@@ -85,6 +85,9 @@ def train_epoch(epoch):
                     loss,
                     optimizer.param_groups[-1]['lr'],
                     spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60))
+            if wandb is not None:
+                wandb.log({"loss": loss, "lr": optimizer.param_groups[-1]['lr'],
+                           "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60})
 
         if (step + 1) % 1000 == 0 and (not ddp or dist.get_rank() == 0):
             model.eval()
@@ -157,6 +160,16 @@ if __name__ == "__main__":
     os.makedirs(out_dir, exist_ok=True)
     torch.manual_seed(1337)
     device_type = device if "cuda" in device else "cpu"
+
+    use_wandb = True  # whether to use wandb
+    wandb_project = "MiniMind-Full-SFT"
+    wandb_run_name = f"MiniMind-Full-SFT-Epoch-{epochs}-BatchSize-{batch_size}-LearningRate-{learning_rate}"
+    if use_wandb:
+        import wandb
+        wandb.init(project=wandb_project, name=wandb_run_name)
+    else:
+        wandb = None
+
     ctx = (
         nullcontext()
         if device_type == "cpu"
@@ -206,5 +219,5 @@ if __name__ == "__main__":
         model = DistributedDataParallel(model, device_ids=[ddp_local_rank])
 
     # training loop
     for epoch in range(epochs):
-        train_epoch(epoch)
+        train_epoch(epoch, wandb)
diff --git a/4-lora_sft.py b/4-lora_sft.py
index 936461d..128041a 100644
--- a/4-lora_sft.py
+++ b/4-lora_sft.py
@@ -35,7 +35,7 @@ def get_lr(it):
 
 
 # ------------------------------------------------------------------------------
-def train_epoch(epoch):
+def train_epoch(epoch, wandb):
     start_time = time.time()
     for step, (X, Y, loss_mask) in enumerate(train_loader):
         X = X.to(device)
@@ -72,6 +72,9 @@ def train_epoch(epoch):
                     loss.item(),
                     optimizer.param_groups[-1]['lr'],
                     spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60))
+            if wandb is not None:
+                wandb.log({"loss": loss.item(), "lr": optimizer.param_groups[-1]['lr'],
+                           "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60})
 
 
 def find_all_linear_names(model):
@@ -127,6 +130,16 @@ if __name__ == "__main__":
     os.makedirs(out_dir, exist_ok=True)
     torch.manual_seed(1337)
     device_type = device if "cuda" in device else "cpu"
+
+    use_wandb = True  # whether to use wandb
+    wandb_project = "MiniMind-LoRA"
+    wandb_run_name = f"MiniMind-LoRA-Epoch-{epochs}-BatchSize-{batch_size}-LearningRate-{learning_rate}"
+    if use_wandb:
+        import wandb
+        wandb.init(project=wandb_project, name=wandb_run_name)
+    else:
+        wandb = None
+
     ctx = (
         nullcontext()
         if device_type == "cpu"
@@ -162,5 +175,5 @@ if __name__ == "__main__":
     raw_model = model
     # training loop
     for epoch in range(epochs):
-        train_epoch(epoch)
+        train_epoch(epoch, wandb)
     model.save_pretrained('minimind')
diff --git a/README.md b/README.md
index ffaa032..4a78f3a 100644
--- a/README.md
+++ b/README.md
@@ -72,6 +72,7 @@ https://github.com/user-attachments/assets/88b98128-636e-43bc-a419-b1b1403c2055
 - 训练支持单机单卡、单机多卡(DDP、DeepSpeed)训练。训练过程中支持在任意位置停止,及在任意位置继续训练。
 - 在Ceval数据集上进行模型测试的代码。
 - 实现Openai-Api基本的chat接口,便于集成到第三方ChatUI使用(FastGPT、Open-WebUI等)。
+- 使用wandb可视化训练流程。
 
 希望此开源项目可以帮助LLM初学者快速入门!
diff --git a/README_en.md b/README_en.md
index e2e558a..192b3a6 100644
--- a/README_en.md
+++ b/README_en.md
@@ -81,6 +81,7 @@ The project includes:
 - Code for testing the model on the Ceval dataset.
 - Implementation of a basic chat interface compatible with OpenAI's API, facilitating integration into third-party Chat UIs (such as FastGPT, Open-WebUI, etc.).
+- Use wandb to visualize the training process.
 
 We hope this open-source project helps LLM beginners get started quickly!
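
Note: below is a minimal, standalone sketch of the optional-wandb pattern this patch wires into each training script. The project/run names and the dummy loop values are illustrative placeholders, not taken from the repository.

    use_wandb = True  # set False to train without wandb

    if use_wandb:
        import wandb
        wandb.init(project="MiniMind-Pretrain", name="example-run")
    else:
        wandb = None  # downstream code checks for None before logging

    for step in range(3):
        loss = 1.0 / (step + 1)  # stand-in for the real training loss
        lr = 1e-4                # stand-in for optimizer.param_groups[-1]['lr']
        if wandb is not None:
            wandb.log({"loss": loss, "lr": lr})

    if wandb is not None:
        wandb.finish()

Keeping wandb as a regular argument (or None) lets the training functions stay import-free when logging is disabled, and under DDP only rank 0 calls wandb.log, since the logging sits inside the existing rank-guarded print block.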