From b4359b3335776a583377ee70771f126b15108e95 Mon Sep 17 00:00:00 2001 From: gongjy <2474590974@qq.com> Date: Mon, 23 Sep 2024 20:11:19 +0800 Subject: [PATCH 1/3] fix data_process bug --- README.md | 2 ++ README_en.md | 2 ++ data_process.py | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d4f871e..ffaa032 100644 --- a/README.md +++ b/README.md @@ -687,6 +687,8 @@ minimind模型本身没有使用较大的数据集训练,也没有针对回答     + +  ## 😊鸣谢 diff --git a/README_en.md b/README_en.md index 7c88224..e2e558a 100644 --- a/README_en.md +++ b/README_en.md @@ -756,6 +756,8 @@ your model with third-party UIs, such as fastgpt, OpenWebUI, etc.     + +  ## 😊Thanks for diff --git a/data_process.py b/data_process.py index 9c03628..047ff0e 100644 --- a/data_process.py +++ b/data_process.py @@ -95,7 +95,7 @@ def process_seq_monkey(chunk_size=50000): if doc_ids: arr = np.array(doc_ids, dtype=np.uint16) - with open(f'./dataset/clean_seq_monkey.bin', 'wb') as f: + with open(f'./dataset/clean_seq_monkey.bin', 'ab') as f: f.write(arr.tobytes()) From 235b6c6fd3e8b159d468f7e8e52d5352bc236ac4 Mon Sep 17 00:00:00 2001 From: gongjy <2474590974@qq.com> Date: Mon, 23 Sep 2024 22:14:52 +0800 Subject: [PATCH 2/3] update wandb monitor --- 1-pretrain.py | 7 ++++--- 3-full_sft.py | 15 +++++++++------ 4-lora_sft.py | 20 +++++++++++--------- README.md | 3 +-- README_en.md | 5 +---- 5 files changed, 26 insertions(+), 24 deletions(-) diff --git a/1-pretrain.py b/1-pretrain.py index 8560126..4a0bb29 100644 --- a/1-pretrain.py +++ b/1-pretrain.py @@ -73,7 +73,8 @@ def train_epoch(epoch, wandb, accumulation_steps=8): loss.item() * accumulation_steps, optimizer.param_groups[-1]['lr'], spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60)) - if wandb != None: + + if (use_wandb is not None) and (not ddp or dist.get_rank() == 0): wandb.log({"loss": loss.item() * accumulation_steps, "lr": optimizer.param_groups[-1]['lr'], "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60}) @@ -124,6 +125,7 @@ def init_distributed_mode(): DEVICE = f"cuda:{ddp_local_rank}" torch.cuda.set_device(DEVICE) + # torchrun --nproc_per_node 2 1-pretrain.py # I/O if __name__ == "__main__": @@ -143,7 +145,7 @@ if __name__ == "__main__": torch.manual_seed(1337) device_type = device if "cuda" in device else "cpu" - use_wandb = True #是否使用wandb + use_wandb = False # 是否使用wandb wandb_project = "MiniMind-Pretrain" wandb_run_name = f"MiniMind-Pretrain-Epoch-{epochs}-BatchSize-{batch_size}-LearningRate-{learning_rate}" if use_wandb: @@ -152,7 +154,6 @@ if __name__ == "__main__": else: wandb = None - ctx = ( nullcontext() if device_type == "cpu" diff --git a/3-full_sft.py b/3-full_sft.py index c50dedf..a2f9b8d 100644 --- a/3-full_sft.py +++ b/3-full_sft.py @@ -85,9 +85,11 @@ def train_epoch(epoch, wandb): loss, optimizer.param_groups[-1]['lr'], spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60)) - if use_wandb != None: - wandb.log({"loss": loss, "lr": optimizer.param_groups[-1]['lr'], - "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60}) + + if (use_wandb is not None) and (not ddp or dist.get_rank() == 0): + wandb.log({"loss": loss, + "lr": optimizer.param_groups[-1]['lr'], + "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60}) if (step + 1) % 1000 == 0 and (not ddp or dist.get_rank() == 0): model.eval() @@ -161,11 +163,12 @@ if __name__ == "__main__": torch.manual_seed(1337) device_type = device if "cuda" in device else "cpu" - use_wandb = 
True #是否使用wandb + use_wandb = False # 是否使用wandb wandb_project = "MiniMind-Full-SFT" wandb_run_name = f"MiniMind-Full-SFT-Epoch-{epochs}-BatchSize-{batch_size}-LearningRate-{learning_rate}" if use_wandb: import wandb + wandb.init(project=wandb_project, name=wandb_run_name) else: wandb = None @@ -219,5 +222,5 @@ if __name__ == "__main__": model = DistributedDataParallel(model, device_ids=[ddp_local_rank]) # training loop - for epoch in range(epochs,wandb): - train_epoch(epoch) + for epoch in range(epochs): + train_epoch(epoch, wandb) diff --git a/4-lora_sft.py b/4-lora_sft.py index 128041a..2dfd22b 100644 --- a/4-lora_sft.py +++ b/4-lora_sft.py @@ -72,9 +72,10 @@ def train_epoch(epoch, wandb): loss.item(), optimizer.param_groups[-1]['lr'], spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60)) - if use_wandb != None: - wandb.log({"loss": loss.item(), "lr": optimizer.param_groups[-1]['lr'], - "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60}) + + if use_wandb is not None: + wandb.log({"loss": loss.item(), "lr": optimizer.param_groups[-1]['lr'], + "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60}) def find_all_linear_names(model): @@ -91,8 +92,8 @@ def find_all_linear_names(model): def init_model(): - model_name_or_path = "./minimind" - tokenizer_name_or_path = "./minimind" + model_name_or_path = "./minimind-v1-small" + tokenizer_name_or_path = "./minimind-v1-small" tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, trust_remote_code=True, use_fast=False) model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True).to(device) @@ -131,11 +132,12 @@ if __name__ == "__main__": torch.manual_seed(1337) device_type = device if "cuda" in device else "cpu" - use_wandb = True #是否使用wandb - wandb_project = "MiniMind-LoRA" - wandb_run_name = f"MiniMind-LoRA-Epoch-{epochs}-BatchSize-{batch_size}-LearningRate-{learning_rate}" + use_wandb = False # 是否使用wandb + wandb_project = "MiniMind-LoRA-SFT" + wandb_run_name = f"MiniMind-LoRA-SFT-Epoch-{epochs}-BatchSize-{batch_size}-LearningRate-{learning_rate}" if use_wandb: import wandb + wandb.init(project=wandb_project, name=wandb_run_name) else: wandb = None @@ -150,7 +152,7 @@ if __name__ == "__main__": model, tokenizer = init_model() # -----init dataloader------ - df = pd.read_csv('./dataset/sft_data.csv') + df = pd.read_csv('./dataset/sft_data_single.csv') df = df.sample(frac=1.0) train_ds = SFTDataset(df, tokenizer, max_length=max_seq_len) train_loader = DataLoader( diff --git a/README.md b/README.md index 4a78f3a..d7f8c34 100644 --- a/README.md +++ b/README.md @@ -69,10 +69,9 @@ https://github.com/user-attachments/assets/88b98128-636e-43bc-a419-b1b1403c2055 - 公开MiniMind模型代码(包含Dense和MoE模型)、Pretrain、SFT指令微调、LoRA微调、DPO偏好优化的全过程代码、数据集和来源。 - 兼容`transformers`、`accelerate`、`trl`、`peft`等流行框架。 -- 训练支持单机单卡、单机多卡(DDP、DeepSpeed)训练。训练过程中支持在任意位置停止,及在任意位置继续训练。 +- 训练支持单机单卡、单机多卡(DDP、DeepSpeed)训练,使用wandb可视化训练流程。支持在任意位置停止,及在任意位置继续训练。 - 在Ceval数据集上进行模型测试的代码。 - 实现Openai-Api基本的chat接口,便于集成到第三方ChatUI使用(FastGPT、Open-WebUI等)。 -- 使用wandb可视化训练流程。 希望此开源项目可以帮助LLM初学者快速入门! diff --git a/README_en.md b/README_en.md index 192b3a6..4b1bbbf 100644 --- a/README_en.md +++ b/README_en.md @@ -75,13 +75,10 @@ The project includes: - Public MiniMind model code (including Dense and MoE models), code for Pretrain, SFT instruction fine-tuning, LoRA fine-tuning, and DPO preference optimization, along with datasets and sources. 
- Compatibility with popular frameworks such as `transformers`, `accelerate`, `trl`, and `peft`. -- Training support for single-GPU and multi-GPU setups(DDP、DeepSpeed). The training process allows for stopping and - resuming at any - point. +- Training support for single-GPU and multi-GPU setups(DDP、DeepSpeed), Use wandb to visualize the training process. The training process allows for stopping and resuming at any point. - Code for testing the model on the Ceval dataset. - Implementation of a basic chat interface compatible with OpenAI's API, facilitating integration into third-party Chat UIs (such as FastGPT, Open-WebUI, etc.). -- Use wandb to visualize the training process. We hope this open-source project helps LLM beginners get started quickly! From 7947fa17fb3d764ca60a981c5323c1134ef3897e Mon Sep 17 00:00:00 2001 From: gongjy <2474590974@qq.com> Date: Mon, 23 Sep 2024 22:16:21 +0800 Subject: [PATCH 3/3] update wandb monitor --- 1-pretrain.py | 2 +- 3-full_sft.py | 2 +- 4-lora_sft.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/1-pretrain.py b/1-pretrain.py index 4a0bb29..50fee2a 100644 --- a/1-pretrain.py +++ b/1-pretrain.py @@ -74,7 +74,7 @@ def train_epoch(epoch, wandb, accumulation_steps=8): optimizer.param_groups[-1]['lr'], spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60)) - if (use_wandb is not None) and (not ddp or dist.get_rank() == 0): + if (wandb is not None) and (not ddp or dist.get_rank() == 0): wandb.log({"loss": loss.item() * accumulation_steps, "lr": optimizer.param_groups[-1]['lr'], "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60}) diff --git a/3-full_sft.py b/3-full_sft.py index a2f9b8d..c413de0 100644 --- a/3-full_sft.py +++ b/3-full_sft.py @@ -86,7 +86,7 @@ def train_epoch(epoch, wandb): optimizer.param_groups[-1]['lr'], spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60)) - if (use_wandb is not None) and (not ddp or dist.get_rank() == 0): + if (wandb is not None) and (not ddp or dist.get_rank() == 0): wandb.log({"loss": loss, "lr": optimizer.param_groups[-1]['lr'], "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60}) diff --git a/4-lora_sft.py b/4-lora_sft.py index 2dfd22b..ab8ba31 100644 --- a/4-lora_sft.py +++ b/4-lora_sft.py @@ -73,7 +73,7 @@ def train_epoch(epoch, wandb): optimizer.param_groups[-1]['lr'], spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60)) - if use_wandb is not None: + if wandb is not None: wandb.log({"loss": loss.item(), "lr": optimizer.param_groups[-1]['lr'], "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60})
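
For reference, the net effect of the three patches can be restated as a small standalone sketch. This is not code from the repository: the function names (`append_chunk`, `log_step`) and the `ddp`/`loss`/`lr`/`epoch_time` parameters are placeholders chosen for illustration, while the dataset path, the `uint16` dtype, the append mode, and the `wandb is not None` plus rank-0 guard mirror the hunks above.

```python
# Minimal sketch (not part of the patches) of the two behavioral fixes in this series.
import numpy as np
import torch.distributed as dist


def append_chunk(doc_ids, path='./dataset/clean_seq_monkey.bin'):
    """Patch 1/3: write each tokenized chunk in append mode.

    process_seq_monkey(chunk_size=50000) presumably writes once per chunk;
    opening with 'wb' truncated the file on every write, so earlier chunks
    were lost. 'ab' lets the chunks accumulate into a single .bin file.
    """
    if doc_ids:
        arr = np.array(doc_ids, dtype=np.uint16)
        with open(path, 'ab') as f:
            f.write(arr.tobytes())


def log_step(wandb, ddp, loss, lr, epoch_time):
    """Patches 2-3: guard wandb.log() on the run object, not the flag.

    When use_wandb is False, wandb.init() never runs and the scripts set
    wandb = None, so the check must be `wandb is not None` (the old
    `use_wandb != None` is always true for a bool). Under DDP, only rank 0
    logs, avoiding one duplicate metric stream per process; the 4-lora_sft.py
    hunk has no DDP branch, so its guard is just the None check.
    """
    if wandb is None:
        return
    if ddp and dist.get_rank() != 0:
        return
    wandb.log({"loss": loss, "lr": lr, "epoch_Time": epoch_time})
```

Keeping the rank-0 check next to the None check means a single guard covers both the wandb-disabled case and multi-process DDP runs, which is the shape the series converges on in PATCH 3/3.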