From d2f5ef4355880ea502d1f7efcc130ed3e1e13ec3 Mon Sep 17 00:00:00 2001
From: gongjy <2474590974@qq.com>
Date: Tue, 11 Feb 2025 23:52:40 +0800
Subject: [PATCH] update lr

---
 README.md         | 47 ++++++++++++++++++++++++-------------------
 README_en.md      | 51 ++++++++++++++++++++++++++++-------------------
 train_full_sft.py |  2 +-
 train_pretrain.py |  2 +-
 4 files changed, 59 insertions(+), 43 deletions(-)

diff --git a/README.md b/README.md
index 98e9dcd..d6e2742 100644
--- a/README.md
+++ b/README.md
@@ -209,22 +209,26 @@ git clone https://github.com/jingyaogong/minimind.git
 
 ## Ⅰ 测试已有模型效果
 
-### 1.下载模型
+### 1.环境准备
+
+```bash
+pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
+```
+
+### 2.下载模型
 
 ```bash
-# step 1
 git clone https://huggingface.co/jingyaogong/MiniMind2
 ```
 
-### 2.命令行问答
+### 3.命令行问答
 
 ```bash
-# step 2
-# load=1: load from transformers-hf model
+# load=0: load from pytorch model, load=1: load from transformers-hf model
 python eval_model.py --load 1
 ```
 
-### 3.或启动WebUI
+### 4.或启动WebUI
 
 ```bash
 # 可能需要`python>=3.10` 安装 `pip install streamlit`
@@ -323,26 +327,29 @@ python eval_model.py --model_mode 1 # 默认为0:测试pretrain模型效果,
 
 单机N卡启动训练方式 (DDP, 支持多机多卡集群)
 
 ```bash
-torchrun --nproc_per_node 3 train_xxx.py
+torchrun --nproc_per_node N train_xxx.py
 ```
 
 注:其它须知
 
-* 单机N卡启动训练 (DeepSpeed)
-  ```bash
-  deepspeed --master_port 29500 --num_gpus=N train_xxx.py
-  ```
+单机N卡启动训练 (DeepSpeed)
 
-* 可根据需要开启wandb记录训练过程
-  ```bash
-  # 需要登录: wandb login
-  torchrun --nproc_per_node N train_xxx.py --use_wandb
-  # and
-  python train_xxx.py --use_wandb
-  ```
-  通过添加`--use_wandb`参数,可以记录训练过程,训练完成后,可以在wandb网站上查看训练过程。通过修改`wandb_project`
-  和`wandb_run_name`参数,可以指定项目名称和运行名称。
+```bash
+deepspeed --master_port 29500 --num_gpus=N train_xxx.py
+```
+
+可根据需要开启wandb记录训练过程
+
+```bash
+# 需要登录: wandb login
+torchrun --nproc_per_node N train_xxx.py --use_wandb
+# and
+python train_xxx.py --use_wandb
+```
+
+通过添加`--use_wandb`参数,可以记录训练过程,训练完成后,可以在wandb网站上查看训练过程。通过修改`wandb_project`
+和`wandb_run_name`参数,可以指定项目名称和运行名称。
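The `--use_wandb`, `wandb_project`, and `wandb_run_name` flags referenced in the README hunk above map onto a fairly standard wandb setup. The sketch below is illustrative only: the flag names come from the README, but the internal wiring of MiniMind's train_xxx.py is an assumption, not a copy of the repository code. It also restricts logging to rank 0, a common convention under torchrun/DDP so that N processes do not each open a run.

```python
# Minimal sketch of --use_wandb / wandb_project / wandb_run_name wiring (assumed, not
# the actual MiniMind training script). Requires `pip install wandb` and `wandb login`.
import argparse
import os

import wandb

parser = argparse.ArgumentParser()
parser.add_argument("--use_wandb", action="store_true")
parser.add_argument("--wandb_project", type=str, default="MiniMind")
parser.add_argument("--wandb_run_name", type=str, default="pretrain-demo")
args = parser.parse_args()

# Under torchrun every process gets a RANK env var; only rank 0 should create the run.
is_main_process = int(os.environ.get("RANK", 0)) == 0

run = None
if args.use_wandb and is_main_process:
    run = wandb.init(project=args.wandb_project, name=args.wandb_run_name)

for step in range(3):          # stand-in for the real training loop
    loss = 1.0 / (step + 1)
    if run is not None:
        run.log({"loss": loss, "step": step})

if run is not None:
    run.finish()
```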
diff --git a/README_en.md b/README_en.md
index 7c0be6c..988ffea 100644
--- a/README_en.md
+++ b/README_en.md
@@ -221,22 +221,28 @@ git clone https://github.com/jingyaogong/minimind.git
 
 ## Ⅰ Test Pre-trained Model
 
-### 1. Download the Model
+
+### 1. Environment Setup
+
+```bash
+pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
+```
+
+
+### 2. Download the Model
 
 ```bash
-# step 1
 git clone https://huggingface.co/jingyaogong/MiniMind2
 ```
 
-### 2. Command-line Q&A
+### 3. Command-line Q&A
 
 ```bash
-# step 2
-# load=1: load from transformers-hf model
+# load=0: load from pytorch model, load=1: load from transformers-hf model
 python eval_model.py --load 1
 ```
 
-### 3. Or Start WebUI
+### 4. Or Start WebUI
 
 ```bash
 # You may need `python>=3.10` and install `pip install streamlit`.
@@ -347,27 +353,30 @@ SFT-Chat model, 2: RLHF-Chat model, 3: Reason model.
 
 Start training with N GPUs on a single machine (DDP, supports multi-node, multi-GPU clusters):
 
 ```bash
-torchrun --nproc_per_node 3 train_xxx.py
+torchrun --nproc_per_node N train_xxx.py
 ```
 
 Note: Others
 
-* Start training with N GPUs on a single machine (DeepSpeed):
-  ```bash
-  deepspeed --master_port 29500 --num_gpus=N train_xxx.py
-  ```
+Start training with N GPUs on a single machine (DeepSpeed):
 
-* Enable wandb to record the training process if needed:
-  ```bash
-  # Need to log in: wandb login
-  torchrun --nproc_per_node N train_xxx.py --use_wandb
-  # and
-  python train_xxx.py --use_wandb
-  ```
-  By adding the `--use_wandb` parameter, the training process will be recorded, and after training, you can view the
-  process on the wandb website. Modify the `wandb_project` and `wandb_run_name` parameters to specify project and run
-  names.
+```bash
+deepspeed --master_port 29500 --num_gpus=N train_xxx.py
+```
+
+Enable wandb to record the training process if needed:
+
+```bash
+# Need to log in: wandb login
+torchrun --nproc_per_node N train_xxx.py --use_wandb
+# and
+python train_xxx.py --use_wandb
+```
+
+By adding the `--use_wandb` parameter, the training process will be recorded, and after training, you can view the
+process on the wandb website. Modify the `wandb_project` and `wandb_run_name` parameters to specify project and run
+names.
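The `torchrun --nproc_per_node N` and DeepSpeed commands shown in both READMEs assume that the training script joins the process group created by the launcher. Below is a minimal sketch of that script-side pattern; MiniMind's actual train_xxx.py may organize this differently, so treat the function and variable names as illustrative only.

```python
# Minimal sketch of the torch.distributed setup expected behind
# `torchrun --nproc_per_node N train_xxx.py` (assumed pattern, not MiniMind's code).
import os

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP


def setup_ddp() -> int:
    """Join the process group launched by torchrun and return the local rank."""
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    if os.environ.get("RANK") is not None:   # set by torchrun, absent in plain `python train_xxx.py`
        dist.init_process_group(backend="nccl")
        torch.cuda.set_device(local_rank)
    return local_rank


if __name__ == "__main__":
    local_rank = setup_ddp()
    model = torch.nn.Linear(512, 512).to(f"cuda:{local_rank}")  # stand-in for the MiniMind model
    if dist.is_initialized():
        model = DDP(model, device_ids=[local_rank])
    # ... training loop ...
    if dist.is_initialized():
        dist.destroy_process_group()
```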
diff --git a/train_full_sft.py b/train_full_sft.py
index 3c6242b..859dafc 100644
--- a/train_full_sft.py
+++ b/train_full_sft.py
@@ -123,7 +123,7 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="MiniMind Full SFT")
     parser.add_argument("--out_dir", type=str, default="out")
     parser.add_argument("--epochs", type=int, default=6)
-    parser.add_argument("--batch_size", type=int, default=128)
+    parser.add_argument("--batch_size", type=int, default=32)
     parser.add_argument("--learning_rate", type=float, default=5e-5)
     parser.add_argument("--device", type=str, default="cuda:0" if torch.cuda.is_available() else "cpu")
     parser.add_argument("--dtype", type=str, default="bfloat16")
diff --git a/train_pretrain.py b/train_pretrain.py
index afd59ec..a4465b6 100644
--- a/train_pretrain.py
+++ b/train_pretrain.py
@@ -120,7 +120,7 @@ if __name__ == "__main__":
     parser.add_argument("--out_dir", type=str, default="out")
     # 若要以最快速度实现zero则epochs设置为1轮;否则应当利用有限的数据训练2~6个epochs。
     parser.add_argument("--epochs", type=int, default=1)
-    parser.add_argument("--batch_size", type=int, default=128)
+    parser.add_argument("--batch_size", type=int, default=32)
     parser.add_argument("--learning_rate", type=float, default=5e-4)
     parser.add_argument("--device", type=str, default="cuda:0" if torch.cuda.is_available() else "cpu")
     parser.add_argument("--dtype", type=str, default="bfloat16")
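The commit lowers the default `--batch_size` in both training scripts from 128 to 32, which cuts per-GPU activation memory roughly fourfold. If the old effective batch size is still wanted, gradient accumulation is the usual compensation. The sketch below only illustrates that idea with a hypothetical toy model; it is not taken from the repository's training loop.

```python
# Illustrative sketch: keep an effective batch of 128 samples while stepping
# on micro-batches of 32 (the new default) via gradient accumulation.
import torch
from torch import nn

micro_batch_size = 32            # new --batch_size default in this commit
old_batch_size = 128             # previous default
accumulation_steps = old_batch_size // micro_batch_size  # = 4

model = nn.Linear(16, 1)         # hypothetical stand-in for the MiniMind model
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)
loss_fn = nn.MSELoss()

# Dummy data: 8 micro-batches of 32 samples each.
data = [(torch.randn(micro_batch_size, 16), torch.randn(micro_batch_size, 1)) for _ in range(8)]

optimizer.zero_grad()
for step, (x, y) in enumerate(data, start=1):
    loss = loss_fn(model(x), y) / accumulation_steps  # scale so gradients average over 128 samples
    loss.backward()
    if step % accumulation_steps == 0:                # one optimizer step per 4 micro-batches
        optimizer.step()
        optimizer.zero_grad()
```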