Merge branch 'master' into wandb

2024-09-24 14:09:54 +08:00 · 2024-09-24 14:09:54 +08:00 · 5dd4e15aa2
commit 5dd4e15aa2
parent d7a056a545 7947fa17fb
6 changed files with 16 additions and 12 deletions
--- a/1-pretrain.py
+++ b/1-pretrain.py
@ -74,7 +74,8 @@ def train_epoch(epoch, wandb):
                    loss.item() * args.accumulation_steps,
                    optimizer.param_groups[-1]['lr'],
                    spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60))
-            if wandb is not None:
+
+            if (wandb is not None) and (not ddp or dist.get_rank() == 0):
                wandb.log({"loss": loss.item() * args.accumulation_steps,
                           "lr": optimizer.param_groups[-1]['lr'],
                           "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60})
@ -115,6 +116,7 @@ def init_distributed_mode():
    DEVICE = f"cuda:{ddp_local_rank}"
    torch.cuda.set_device(DEVICE)

+
 # torchrun --nproc_per_node 2 1-pretrain.py
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="MiniMind Pretraining")
@ -149,6 +151,7 @@ if __name__ == "__main__":
    args.wandb_run_name = f"MiniMind-Pretrain-Epoch-{args.epochs}-BatchSize-{args.batch_size}-LearningRate-{args.learning_rate}"

    ctx = nullcontext() if device_type == "cpu" else torch.cuda.amp.autocast()
+
    ddp = int(os.environ.get("RANK", -1)) != -1  # is this a ddp run?
    ddp_local_rank, DEVICE = 0, "cuda:0"
    if ddp:
--- a/3-full_sft.py
+++ b/3-full_sft.py
@ -80,8 +80,9 @@ def train_epoch(epoch, wandb):
                    loss.item(),
                    optimizer.param_groups[-1]['lr'],
                    spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60))
-            if wandb is not None:
-                wandb.log({"loss": loss.item(),
+
+            if (wandb is not None) and (not ddp or dist.get_rank() == 0):
+                wandb.log({"loss": loss,
                           "lr": optimizer.param_groups[-1]['lr'],
                           "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60})

--- a/4-lora_sft.py
+++ b/4-lora_sft.py
@ -102,8 +102,8 @@ def find_all_linear_names(model):


 def init_model():
-    model_name_or_path = "./minimind"
-    tokenizer_name_or_path = "./minimind"
+    model_name_or_path = "./minimind-v1-small"
+    tokenizer_name_or_path = "./minimind-v1-small"
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, trust_remote_code=True, use_fast=False)
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True).to(args.device)

--- a/README.md
+++ b/README.md
@ -69,10 +69,9 @@ https://github.com/user-attachments/assets/88b98128-636e-43bc-a419-b1b1403c2055

 - 公开MiniMind模型代码（包含Dense和MoE模型）、Pretrain、SFT指令微调、LoRA微调、DPO偏好优化的全过程代码、数据集和来源。
 - 兼容`transformers`、`accelerate`、`trl`、`peft`等流行框架。
- 训练支持单机单卡、单机多卡(DDP、DeepSpeed)训练。训练过程中支持在任意位置停止，及在任意位置继续训练。
+- 训练支持单机单卡、单机多卡(DDP、DeepSpeed)训练，使用wandb可视化训练流程。支持在任意位置停止，及在任意位置继续训练。
 - 在Ceval数据集上进行模型测试的代码。
 - 实现Openai-Api基本的chat接口，便于集成到第三方ChatUI使用（FastGPT、Open-WebUI等）。
- 使用wandb可视化训练流程。

 希望此开源项目可以帮助LLM初学者快速入门！

@ -696,6 +695,8 @@ minimind模型本身没有使用较大的数据集训练，也没有针对回答
 &nbsp;
 <a href="https://github.com/chuanzhubin"><img src="https://avatars.githubusercontent.com/u/2813798" width="70px" height="70px"/></a>
 &nbsp;
+<a href="https://github.com/iomgaa-ycz"><img src="https://avatars.githubusercontent.com/u/124225682" width="70px" height="70px"/></a>
+&nbsp;

 ## 😊鸣谢

--- a/README_en.md
+++ b/README_en.md
@ -75,13 +75,10 @@ The project includes:
 - Public MiniMind model code (including Dense and MoE models), code for Pretrain, SFT instruction fine-tuning, LoRA
  fine-tuning, and DPO preference optimization, along with datasets and sources.
 - Compatibility with popular frameworks such as `transformers`, `accelerate`, `trl`, and `peft`.
- Training support for single-GPU and multi-GPU setups(DDP、DeepSpeed). The training process allows for stopping and
-  resuming at any
-  point.
+- Training support for single-GPU and multi-GPU setups (DDP, DeepSpeed), with wandb used to visualize the training process. Training can be stopped and resumed at any point.
 - Code for testing the model on the Ceval dataset.
 - Implementation of a basic chat interface compatible with OpenAI's API, facilitating integration into third-party Chat
  UIs (such as FastGPT, Open-WebUI, etc.).
- Use wandb to visualize the training process.

 We hope this open-source project helps LLM beginners get started quickly!

@ -764,6 +761,8 @@ your model with third-party UIs, such as fastgpt, OpenWebUI, etc.
 &nbsp;
 <a href="https://github.com/chuanzhubin"><img src="https://avatars.githubusercontent.com/u/2813798" width="70px" height="70px"/></a>
 &nbsp;
+<a href="https://github.com/iomgaa-ycz"><img src="https://avatars.githubusercontent.com/u/124225682" width="70px" height="70px"/></a>
+&nbsp;

 ## 😊Thanks for

--- a/data_process.py
+++ b/data_process.py
@ -95,7 +95,7 @@ def process_seq_monkey(chunk_size=50000):

    if doc_ids:
        arr = np.array(doc_ids, dtype=np.uint16)
-        with open(f'./dataset/clean_seq_monkey.bin', 'wb') as f:
+        with open(f'./dataset/clean_seq_monkey.bin', 'ab') as f:
            f.write(arr.tobytes())