update wandb monitor

gongjy 2024-09-23 22:14:52 +08:00
parent 15f8242ba7
commit 235b6c6fd3
5 changed files with 26 additions and 24 deletions

View File

@@ -73,7 +73,8 @@ def train_epoch(epoch, wandb, accumulation_steps=8):
                 loss.item() * accumulation_steps,
                 optimizer.param_groups[-1]['lr'],
                 spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60))
-        if wandb != None:
+        if (use_wandb is not None) and (not ddp or dist.get_rank() == 0):
             wandb.log({"loss": loss.item() * accumulation_steps,
                        "lr": optimizer.param_groups[-1]['lr'],
                        "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60})
@@ -124,6 +125,7 @@ def init_distributed_mode():
     DEVICE = f"cuda:{ddp_local_rank}"
     torch.cuda.set_device(DEVICE)
 # torchrun --nproc_per_node 2 1-pretrain.py
 # I/O
 if __name__ == "__main__":
@@ -143,7 +145,7 @@ if __name__ == "__main__":
     torch.manual_seed(1337)
     device_type = device if "cuda" in device else "cpu"
-    use_wandb = True  # whether to use wandb
+    use_wandb = False  # whether to use wandb
     wandb_project = "MiniMind-Pretrain"
     wandb_run_name = f"MiniMind-Pretrain-Epoch-{epochs}-BatchSize-{batch_size}-LearningRate-{learning_rate}"
     if use_wandb:
@@ -152,7 +154,6 @@ if __name__ == "__main__":
     else:
         wandb = None
     ctx = (
         nullcontext()
         if device_type == "cpu"
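Across all three training scripts the commit keeps the same setup: wandb is imported and initialized only when use_wandb is True, and the name wandb is bound to None otherwise, so the logging guard shown above works unchanged. A minimal sketch of that setup, with placeholder hyperparameter values standing in for the script's own configuration:

```python
# Placeholder hyperparameters for the sketch; the real values come from the script's own setup.
epochs, batch_size, learning_rate = 1, 32, 2e-4

use_wandb = False  # the commit flips the default from True to False
wandb_project = "MiniMind-Pretrain"
wandb_run_name = f"MiniMind-Pretrain-Epoch-{epochs}-BatchSize-{batch_size}-LearningRate-{learning_rate}"

if use_wandb:
    import wandb  # imported lazily so wandb stays an optional dependency
    wandb.init(project=wandb_project, name=wandb_run_name)
else:
    wandb = None  # later code only checks the handle for None before calling wandb.log
```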

View File

@@ -85,8 +85,10 @@ def train_epoch(epoch, wandb):
                 loss,
                 optimizer.param_groups[-1]['lr'],
                 spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60))
-        if use_wandb != None:
-            wandb.log({"loss": loss, "lr": optimizer.param_groups[-1]['lr'],
+        if (use_wandb is not None) and (not ddp or dist.get_rank() == 0):
+            wandb.log({"loss": loss,
+                       "lr": optimizer.param_groups[-1]['lr'],
                        "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60})
         if (step + 1) % 1000 == 0 and (not ddp or dist.get_rank() == 0):
@@ -161,11 +163,12 @@ if __name__ == "__main__":
     torch.manual_seed(1337)
     device_type = device if "cuda" in device else "cpu"
-    use_wandb = True  # whether to use wandb
+    use_wandb = False  # whether to use wandb
     wandb_project = "MiniMind-Full-SFT"
     wandb_run_name = f"MiniMind-Full-SFT-Epoch-{epochs}-BatchSize-{batch_size}-LearningRate-{learning_rate}"
     if use_wandb:
         import wandb
         wandb.init(project=wandb_project, name=wandb_run_name)
     else:
         wandb = None
@@ -219,5 +222,5 @@ if __name__ == "__main__":
         model = DistributedDataParallel(model, device_ids=[ddp_local_rank])
     # training loop
-    for epoch in range(epochs,wandb):
-        train_epoch(epoch)
+    for epoch in range(epochs):
+        train_epoch(epoch, wandb)
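The full-SFT script now threads the wandb handle (the module when logging is enabled, None otherwise) through train_epoch instead of relying on a global, so the same training function runs with or without logging. A minimal sketch of that call pattern, with a stub train_epoch standing in for the real loop:

```python
def train_epoch(epoch, wandb):
    # Stub standing in for the real training loop: the handle is either the
    # wandb module or None, and the function treats both the same way.
    if wandb is not None:
        wandb.log({"epoch": epoch})


wandb = None  # or the imported module after wandb.init(...), as in the hunk above
epochs = 1    # placeholder; the script reads this from its own configuration

for epoch in range(epochs):
    train_epoch(epoch, wandb)
```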

View File

@@ -72,7 +72,8 @@ def train_epoch(epoch, wandb):
                 loss.item(),
                 optimizer.param_groups[-1]['lr'],
                 spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60))
-        if use_wandb != None:
+        if use_wandb is not None:
             wandb.log({"loss": loss.item(), "lr": optimizer.param_groups[-1]['lr'],
                        "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60})
@@ -91,8 +92,8 @@ def find_all_linear_names(model):
 def init_model():
-    model_name_or_path = "./minimind"
-    tokenizer_name_or_path = "./minimind"
+    model_name_or_path = "./minimind-v1-small"
+    tokenizer_name_or_path = "./minimind-v1-small"
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, trust_remote_code=True, use_fast=False)
     model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True).to(device)
@@ -131,11 +132,12 @@ if __name__ == "__main__":
     torch.manual_seed(1337)
     device_type = device if "cuda" in device else "cpu"
-    use_wandb = True  # whether to use wandb
-    wandb_project = "MiniMind-LoRA"
-    wandb_run_name = f"MiniMind-LoRA-Epoch-{epochs}-BatchSize-{batch_size}-LearningRate-{learning_rate}"
+    use_wandb = False  # whether to use wandb
+    wandb_project = "MiniMind-LoRA-SFT"
+    wandb_run_name = f"MiniMind-LoRA-SFT-Epoch-{epochs}-BatchSize-{batch_size}-LearningRate-{learning_rate}"
     if use_wandb:
         import wandb
         wandb.init(project=wandb_project, name=wandb_run_name)
     else:
         wandb = None
@@ -150,7 +152,7 @@ if __name__ == "__main__":
     model, tokenizer = init_model()
     # -----init dataloader------
-    df = pd.read_csv('./dataset/sft_data.csv')
+    df = pd.read_csv('./dataset/sft_data_single.csv')
     df = df.sample(frac=1.0)
     train_ds = SFTDataset(df, tokenizer, max_length=max_seq_len)
     train_loader = DataLoader(
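For the LoRA script the commit only swaps file locations: the base model and tokenizer now load from ./minimind-v1-small and the SFT data comes from ./dataset/sft_data_single.csv. A minimal, self-contained sketch of that loading step, assuming those paths exist locally:

```python
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name_or_path = "./minimind-v1-small"       # was "./minimind" before this commit
tokenizer_name_or_path = "./minimind-v1-small"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, trust_remote_code=True, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True)

# Single-turn SFT data, shuffled before building the dataset (paths as in the diff).
df = pd.read_csv('./dataset/sft_data_single.csv')
df = df.sample(frac=1.0)
```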

View File

@@ -69,10 +69,9 @@ https://github.com/user-attachments/assets/88b98128-636e-43bc-a419-b1b1403c2055
 - Public MiniMind model code (including Dense and MoE models), the full pipeline code for Pretrain, SFT instruction fine-tuning, LoRA fine-tuning, and DPO preference optimization, along with datasets and their sources.
 - Compatible with popular frameworks such as `transformers`, `accelerate`, `trl`, and `peft`.
-- Training supports single-machine single-GPU and single-machine multi-GPU (DDP, DeepSpeed) setups. Training can be stopped at any point and resumed from any point.
+- Training supports single-machine single-GPU and single-machine multi-GPU (DDP, DeepSpeed) setups, with wandb visualizing the training process. Training can be stopped at any point and resumed from any point.
 - Code for testing the model on the Ceval dataset.
 - A basic OpenAI-API-compatible chat interface, making it easy to integrate into third-party Chat UIs (FastGPT, Open-WebUI, etc.).
-- Use wandb to visualize the training process.

 We hope this open-source project helps LLM beginners get started quickly!

View File

@@ -75,13 +75,10 @@ The project includes:
 - Public MiniMind model code (including Dense and MoE models), code for Pretrain, SFT instruction fine-tuning, LoRA
   fine-tuning, and DPO preference optimization, along with datasets and sources.
 - Compatibility with popular frameworks such as `transformers`, `accelerate`, `trl`, and `peft`.
-- Training support for single-GPU and multi-GPU setups(DDP、DeepSpeed). The training process allows for stopping and
-  resuming at any
-  point.
+- Training support for single-GPU and multi-GPU setups (DDP, DeepSpeed), using wandb to visualize the training process. Training allows for stopping and resuming at any point.
 - Code for testing the model on the Ceval dataset.
 - Implementation of a basic chat interface compatible with OpenAI's API, facilitating integration into third-party Chat
   UIs (such as FastGPT, Open-WebUI, etc.).
-- Use wandb to visualize the training process.

 We hope this open-source project helps LLM beginners get started quickly!