From b4359b3335776a583377ee70771f126b15108e95 Mon Sep 17 00:00:00 2001 From: gongjy <2474590974@qq.com> Date: Mon, 23 Sep 2024 20:11:19 +0800 Subject: [PATCH 1/3] fix data_process bug --- README.md | 2 ++ README_en.md | 2 ++ data_process.py | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d4f871e..ffaa032 100644 --- a/README.md +++ b/README.md @@ -687,6 +687,8 @@ minimind模型本身没有使用较大的数据集训练,也没有针对回答     + +  ## 😊鸣谢 diff --git a/README_en.md b/README_en.md index 7c88224..e2e558a 100644 --- a/README_en.md +++ b/README_en.md @@ -756,6 +756,8 @@ your model with third-party UIs, such as fastgpt, OpenWebUI, etc.     + +  ## 😊Thanks for diff --git a/data_process.py b/data_process.py index 9c03628..047ff0e 100644 --- a/data_process.py +++ b/data_process.py @@ -95,7 +95,7 @@ def process_seq_monkey(chunk_size=50000): if doc_ids: arr = np.array(doc_ids, dtype=np.uint16) - with open(f'./dataset/clean_seq_monkey.bin', 'wb') as f: + with open(f'./dataset/clean_seq_monkey.bin', 'ab') as f: f.write(arr.tobytes()) From 235b6c6fd3e8b159d468f7e8e52d5352bc236ac4 Mon Sep 17 00:00:00 2001 From: gongjy <2474590974@qq.com> Date: Mon, 23 Sep 2024 22:14:52 +0800 Subject: [PATCH 2/3] update wandb monitor --- 1-pretrain.py | 7 ++++--- 3-full_sft.py | 15 +++++++++------ 4-lora_sft.py | 20 +++++++++++--------- README.md | 3 +-- README_en.md | 5 +---- 5 files changed, 26 insertions(+), 24 deletions(-) diff --git a/1-pretrain.py b/1-pretrain.py index 8560126..4a0bb29 100644 --- a/1-pretrain.py +++ b/1-pretrain.py @@ -73,7 +73,8 @@ def train_epoch(epoch, wandb, accumulation_steps=8): loss.item() * accumulation_steps, optimizer.param_groups[-1]['lr'], spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60)) - if wandb != None: + + if (use_wandb is not None) and (not ddp or dist.get_rank() == 0): wandb.log({"loss": loss.item() * accumulation_steps, "lr": optimizer.param_groups[-1]['lr'], "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60}) @@ -124,6 +125,7 @@ def init_distributed_mode(): DEVICE = f"cuda:{ddp_local_rank}" torch.cuda.set_device(DEVICE) + # torchrun --nproc_per_node 2 1-pretrain.py # I/O if __name__ == "__main__": @@ -143,7 +145,7 @@ if __name__ == "__main__": torch.manual_seed(1337) device_type = device if "cuda" in device else "cpu" - use_wandb = True #是否使用wandb + use_wandb = False # 是否使用wandb wandb_project = "MiniMind-Pretrain" wandb_run_name = f"MiniMind-Pretrain-Epoch-{epochs}-BatchSize-{batch_size}-LearningRate-{learning_rate}" if use_wandb: @@ -152,7 +154,6 @@ if __name__ == "__main__": else: wandb = None - ctx = ( nullcontext() if device_type == "cpu" diff --git a/3-full_sft.py b/3-full_sft.py index c50dedf..a2f9b8d 100644 --- a/3-full_sft.py +++ b/3-full_sft.py @@ -85,9 +85,11 @@ def train_epoch(epoch, wandb): loss, optimizer.param_groups[-1]['lr'], spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60)) - if use_wandb != None: - wandb.log({"loss": loss, "lr": optimizer.param_groups[-1]['lr'], - "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60}) + + if (use_wandb is not None) and (not ddp or dist.get_rank() == 0): + wandb.log({"loss": loss, + "lr": optimizer.param_groups[-1]['lr'], + "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60}) if (step + 1) % 1000 == 0 and (not ddp or dist.get_rank() == 0): model.eval() @@ -161,11 +163,12 @@ if __name__ == "__main__": torch.manual_seed(1337) device_type = device if "cuda" in device else "cpu" - use_wandb = 
True #是否使用wandb + use_wandb = False # 是否使用wandb wandb_project = "MiniMind-Full-SFT" wandb_run_name = f"MiniMind-Full-SFT-Epoch-{epochs}-BatchSize-{batch_size}-LearningRate-{learning_rate}" if use_wandb: import wandb + wandb.init(project=wandb_project, name=wandb_run_name) else: wandb = None @@ -219,5 +222,5 @@ if __name__ == "__main__": model = DistributedDataParallel(model, device_ids=[ddp_local_rank]) # training loop - for epoch in range(epochs,wandb): - train_epoch(epoch) + for epoch in range(epochs): + train_epoch(epoch, wandb) diff --git a/4-lora_sft.py b/4-lora_sft.py index 128041a..2dfd22b 100644 --- a/4-lora_sft.py +++ b/4-lora_sft.py @@ -72,9 +72,10 @@ def train_epoch(epoch, wandb): loss.item(), optimizer.param_groups[-1]['lr'], spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60)) - if use_wandb != None: - wandb.log({"loss": loss.item(), "lr": optimizer.param_groups[-1]['lr'], - "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60}) + + if use_wandb is not None: + wandb.log({"loss": loss.item(), "lr": optimizer.param_groups[-1]['lr'], + "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60}) def find_all_linear_names(model): @@ -91,8 +92,8 @@ def find_all_linear_names(model): def init_model(): - model_name_or_path = "./minimind" - tokenizer_name_or_path = "./minimind" + model_name_or_path = "./minimind-v1-small" + tokenizer_name_or_path = "./minimind-v1-small" tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, trust_remote_code=True, use_fast=False) model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True).to(device) @@ -131,11 +132,12 @@ if __name__ == "__main__": torch.manual_seed(1337) device_type = device if "cuda" in device else "cpu" - use_wandb = True #是否使用wandb - wandb_project = "MiniMind-LoRA" - wandb_run_name = f"MiniMind-LoRA-Epoch-{epochs}-BatchSize-{batch_size}-LearningRate-{learning_rate}" + use_wandb = False # 是否使用wandb + wandb_project = "MiniMind-LoRA-SFT" + wandb_run_name = f"MiniMind-LoRA-SFT-Epoch-{epochs}-BatchSize-{batch_size}-LearningRate-{learning_rate}" if use_wandb: import wandb + wandb.init(project=wandb_project, name=wandb_run_name) else: wandb = None @@ -150,7 +152,7 @@ if __name__ == "__main__": model, tokenizer = init_model() # -----init dataloader------ - df = pd.read_csv('./dataset/sft_data.csv') + df = pd.read_csv('./dataset/sft_data_single.csv') df = df.sample(frac=1.0) train_ds = SFTDataset(df, tokenizer, max_length=max_seq_len) train_loader = DataLoader( diff --git a/README.md b/README.md index 4a78f3a..d7f8c34 100644 --- a/README.md +++ b/README.md @@ -69,10 +69,9 @@ https://github.com/user-attachments/assets/88b98128-636e-43bc-a419-b1b1403c2055 - 公开MiniMind模型代码(包含Dense和MoE模型)、Pretrain、SFT指令微调、LoRA微调、DPO偏好优化的全过程代码、数据集和来源。 - 兼容`transformers`、`accelerate`、`trl`、`peft`等流行框架。 -- 训练支持单机单卡、单机多卡(DDP、DeepSpeed)训练。训练过程中支持在任意位置停止,及在任意位置继续训练。 +- 训练支持单机单卡、单机多卡(DDP、DeepSpeed)训练,使用wandb可视化训练流程。支持在任意位置停止,及在任意位置继续训练。 - 在Ceval数据集上进行模型测试的代码。 - 实现Openai-Api基本的chat接口,便于集成到第三方ChatUI使用(FastGPT、Open-WebUI等)。 -- 使用wandb可视化训练流程。 希望此开源项目可以帮助LLM初学者快速入门! diff --git a/README_en.md b/README_en.md index 192b3a6..4b1bbbf 100644 --- a/README_en.md +++ b/README_en.md @@ -75,13 +75,10 @@ The project includes: - Public MiniMind model code (including Dense and MoE models), code for Pretrain, SFT instruction fine-tuning, LoRA fine-tuning, and DPO preference optimization, along with datasets and sources. 
- Compatibility with popular frameworks such as `transformers`, `accelerate`, `trl`, and `peft`. -- Training support for single-GPU and multi-GPU setups(DDP、DeepSpeed). The training process allows for stopping and - resuming at any - point. +- Training support for single-GPU and multi-GPU setups(DDP、DeepSpeed), Use wandb to visualize the training process. The training process allows for stopping and resuming at any point. - Code for testing the model on the Ceval dataset. - Implementation of a basic chat interface compatible with OpenAI's API, facilitating integration into third-party Chat UIs (such as FastGPT, Open-WebUI, etc.). -- Use wandb to visualize the training process. We hope this open-source project helps LLM beginners get started quickly! From 7947fa17fb3d764ca60a981c5323c1134ef3897e Mon Sep 17 00:00:00 2001 From: gongjy <2474590974@qq.com> Date: Mon, 23 Sep 2024 22:16:21 +0800 Subject: [PATCH 3/3] update wandb monitor --- 1-pretrain.py | 2 +- 3-full_sft.py | 2 +- 4-lora_sft.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/1-pretrain.py b/1-pretrain.py index 4a0bb29..50fee2a 100644 --- a/1-pretrain.py +++ b/1-pretrain.py @@ -74,7 +74,7 @@ def train_epoch(epoch, wandb, accumulation_steps=8): optimizer.param_groups[-1]['lr'], spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60)) - if (use_wandb is not None) and (not ddp or dist.get_rank() == 0): + if (wandb is not None) and (not ddp or dist.get_rank() == 0): wandb.log({"loss": loss.item() * accumulation_steps, "lr": optimizer.param_groups[-1]['lr'], "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60}) diff --git a/3-full_sft.py b/3-full_sft.py index a2f9b8d..c413de0 100644 --- a/3-full_sft.py +++ b/3-full_sft.py @@ -86,7 +86,7 @@ def train_epoch(epoch, wandb): optimizer.param_groups[-1]['lr'], spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60)) - if (use_wandb is not None) and (not ddp or dist.get_rank() == 0): + if (wandb is not None) and (not ddp or dist.get_rank() == 0): wandb.log({"loss": loss, "lr": optimizer.param_groups[-1]['lr'], "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60}) diff --git a/4-lora_sft.py b/4-lora_sft.py index 2dfd22b..ab8ba31 100644 --- a/4-lora_sft.py +++ b/4-lora_sft.py @@ -73,7 +73,7 @@ def train_epoch(epoch, wandb): optimizer.param_groups[-1]['lr'], spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60)) - if use_wandb is not None: + if wandb is not None: wandb.log({"loss": loss.item(), "lr": optimizer.param_groups[-1]['lr'], "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60})
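
For reference, the net effect of the three patches can be restated as a small standalone sketch. This is not code from the repository: the function names (`append_chunk`, `log_step`) and the `ddp`/`loss`/`lr`/`epoch_time` parameters are placeholders chosen for illustration, while the dataset path, the `uint16` dtype, the append mode, and the `wandb is not None` plus rank-0 guard mirror the hunks above.

```python
# Minimal sketch (not part of the patches) of the two behavioral fixes in this series.
import numpy as np
import torch.distributed as dist


def append_chunk(doc_ids, path='./dataset/clean_seq_monkey.bin'):
    """Patch 1/3: write each tokenized chunk in append mode.

    process_seq_monkey(chunk_size=50000) presumably writes once per chunk;
    opening with 'wb' truncated the file on every write, so earlier chunks
    were lost. 'ab' lets the chunks accumulate into a single .bin file.
    """
    if doc_ids:
        arr = np.array(doc_ids, dtype=np.uint16)
        with open(path, 'ab') as f:
            f.write(arr.tobytes())


def log_step(wandb, ddp, loss, lr, epoch_time):
    """Patches 2-3: guard wandb.log() on the run object, not the flag.

    When use_wandb is False, wandb.init() never runs and the scripts set
    wandb = None, so the check must be `wandb is not None` (the old
    `use_wandb != None` is always true for a bool). Under DDP, only rank 0
    logs, avoiding one duplicate metric stream per process; the 4-lora_sft.py
    hunk has no DDP branch, so its guard is just the None check.
    """
    if wandb is None:
        return
    if ddp and dist.get_rank() != 0:
        return
    wandb.log({"loss": loss, "lr": lr, "epoch_Time": epoch_time})
```

Keeping the rank-0 check next to the None check means a single guard covers both the wandb-disabled case and multi-process DDP runs, which is the shape the series converges on in PATCH 3/3.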