Merge pull request #44 from iomgaa-ycz/wandb

Fix wandb bug & add argparse
jingyaogong 2024-09-24 14:10:05 +08:00 committed by GitHub
commit 13105cfa0c
5 changed files with 219 additions and 218 deletions

View File

@@ -1,5 +1,6 @@
import os
import platform
import argparse
import time
import math
import warnings
@@ -23,67 +24,66 @@ def Logger(content):
def get_lr(it, all):
warmup_iters = 0
warmup_iters = args.warmup_iters
lr_decay_iters = all
min_lr = learning_rate / 10
min_lr = args.learning_rate / 10
if it < warmup_iters:
return learning_rate * it / warmup_iters
return args.learning_rate * it / warmup_iters
if it > lr_decay_iters:
return min_lr
decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
assert 0 <= decay_ratio <= 1
coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
return min_lr + coeff * (learning_rate - min_lr)
return min_lr + coeff * (args.learning_rate - min_lr)
def train_epoch(epoch, wandb, accumulation_steps=8):
def train_epoch(epoch, wandb):
start_time = time.time()
for step, (X, Y) in enumerate(train_loader):
X = X.to(device)
Y = Y.to(device)
X = X.to(args.device)
Y = Y.to(args.device)
lr = get_lr(epoch * iter_per_epoch + step, epochs * iter_per_epoch)
lr = get_lr(epoch * iter_per_epoch + step, args.epochs * iter_per_epoch)
for param_group in optimizer.param_groups:
param_group['lr'] = lr
with ctx:
out = model(X, Y)
loss = out.last_loss / accumulation_steps
loss = out.last_loss / args.accumulation_steps
scaler.scale(loss).backward()
if (step + 1) % accumulation_steps == 0:
if (step + 1) % args.accumulation_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
scaler.step(optimizer)
scaler.update()
optimizer.zero_grad(set_to_none=True)
if step % 100 == 0:
if step % args.log_interval == 0:
spend_time = time.time() - start_time
Logger(
'Epoch:[{}/{}]({}/{}) loss:{:.3f} lr:{:.7f} epoch_Time:{}min:'.format(
epoch,
epochs,
args.epochs,
step,
iter_per_epoch,
loss.item() * accumulation_steps,
loss.item() * args.accumulation_steps,
optimizer.param_groups[-1]['lr'],
spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60))
if (wandb is not None) and (not ddp or dist.get_rank() == 0):
wandb.log({"loss": loss.item() * accumulation_steps,
wandb.log({"loss": loss.item() * args.accumulation_steps,
"lr": optimizer.param_groups[-1]['lr'],
"epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60})
if (step + 1) % 1000 == 0 and (not ddp or dist.get_rank() == 0):
if (step + 1) % args.save_interval == 0 and (not ddp or dist.get_rank() == 0):
model.eval()
# torch.save(model.state_dict(), '{}/iter_{}.pth'.format(save_dir, int(step + epoch * iter_per_epoch)))
moe_path = '_moe' if lm_config.use_moe else ''
ckp = f'{save_dir}/pretrain_{lm_config.dim}{moe_path}.pth'
ckp = f'{args.save_dir}/pretrain_{lm_config.dim}{moe_path}.pth'
if isinstance(model, torch.nn.parallel.DistributedDataParallel):
state_dict = model.module.state_dict()
@@ -98,17 +98,8 @@ def init_model():
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
# model init
model = Transformer(lm_config).to(device)
model = Transformer(lm_config).to(args.device)
moe_path = '_moe' if lm_config.use_moe else ''
# ckp = f'{save_dir}/pretrain_{lm_config.dim}{moe_path}.pth'
#
# state_dict = torch.load(ckp, map_location=device)
# unwanted_prefix = '_orig_mod.'
# for k, v in list(state_dict.items()):
# if k.startswith(unwanted_prefix):
# state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
# model.load_state_dict(state_dict, strict=False)
Logger(f'Total LLM parameters: {count_parameters(model) / 1e6:.3f} million')
return model
@@ -127,79 +118,79 @@ def init_distributed_mode():
# torchrun --nproc_per_node 2 1-pretrain.py
# I/O
if __name__ == "__main__":
# -----------------------------------------------------------------------------
parser = argparse.ArgumentParser(description="MiniMind Pretraining")
parser.add_argument("--out_dir", type=str, default="out", help="Output directory")
parser.add_argument("--epochs", type=int, default=20, help="Number of epochs")
parser.add_argument("--batch_size", type=int, default=64, help="Batch size")
parser.add_argument("--learning_rate", type=float, default=2e-4, help="Learning rate")
parser.add_argument("--device", type=str, default="cuda:0" if torch.cuda.is_available() else "cpu", help="Device to use")
parser.add_argument("--dtype", type=str, default="bfloat16", help="Data type")
parser.add_argument("--use_wandb", action="store_true", help="Use Weights & Biases")
parser.add_argument("--wandb_project", type=str, default="MiniMind-Pretrain", help="Weights & Biases project name")
parser.add_argument("--num_workers", type=int, default=8, help="Number of workers for data loading")
parser.add_argument("--data_path", type=str, default="./dataset/pretrain_data.bin", help="Path to training data")
parser.add_argument("--ddp", action="store_true", help="Use DistributedDataParallel")
parser.add_argument("--accumulation_steps", type=int, default=8, help="Gradient accumulation steps")
parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping threshold")
parser.add_argument("--warmup_iters", type=int, default=0, help="Number of warmup iterations")
parser.add_argument("--log_interval", type=int, default=100, help="Logging interval")
parser.add_argument("--save_interval", type=int, default=1000, help="Model saving interval")
args = parser.parse_args()
lm_config = LMConfig()
max_seq_len = lm_config.max_seq_len
out_dir = 'out'
epochs = 20
batch_size = 64
learning_rate = 2e-4
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
dtype = 'bfloat16'
save_dir = os.path.join(out_dir)
os.makedirs(save_dir, exist_ok=True)
os.makedirs(out_dir, exist_ok=True)
tokens_per_iter = batch_size * max_seq_len
args.save_dir = os.path.join(args.out_dir)
os.makedirs(args.save_dir, exist_ok=True)
os.makedirs(args.out_dir, exist_ok=True)
tokens_per_iter = args.batch_size * max_seq_len
torch.manual_seed(1337)
device_type = device if "cuda" in device else "cpu"
device_type = "cuda" if "cuda" in args.device else "cpu"
use_wandb = False # whether to use wandb
wandb_project = "MiniMind-Pretrain"
wandb_run_name = f"MiniMind-Pretrain-Epoch-{epochs}-BatchSize-{batch_size}-LearningRate-{learning_rate}"
if use_wandb:
import wandb
wandb.init(project=wandb_project, name=wandb_run_name)
else:
wandb = None
args.wandb_run_name = f"MiniMind-Pretrain-Epoch-{args.epochs}-BatchSize-{args.batch_size}-LearningRate-{args.learning_rate}"
ctx = nullcontext() if device_type == "cpu" else torch.cuda.amp.autocast()
ctx = (
nullcontext()
if device_type == "cpu"
else torch.cuda.amp.autocast()
)
ddp = int(os.environ.get("RANK", -1)) != -1 # is this a ddp run?
ddp_local_rank, DEVICE = 0, "cuda:0"
if ddp:
init_distributed_mode()
device = torch.device(DEVICE)
# -----------------------------------------------------------------------------
args.device = torch.device(DEVICE)
# -----init dataloader------
data_path_list = ['./dataset/pretrain_data.bin']
if args.use_wandb and (not ddp or ddp_local_rank == 0):
import wandb
wandb.init(project=args.wandb_project, name=args.wandb_run_name)
else:
wandb = None
data_path_list = [args.data_path]
train_ds = PretrainDataset(data_path_list, max_length=max_seq_len, memmap=True)
train_sampler = DistributedSampler(train_ds) if ddp else None
num_workers = 8 # adjust according to the number of CPU cores on your system
train_loader = DataLoader(
train_ds,
batch_size=batch_size,
batch_size=args.batch_size,
pin_memory=True,
drop_last=False,
shuffle=False,
num_workers=num_workers,
num_workers=args.num_workers,
sampler=train_sampler
)
# init model
model = init_model()
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == dtype))
# optimizer
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# compile the model
scaler = torch.cuda.amp.GradScaler(enabled=(args.dtype == args.dtype))
optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
if False and platform.system() != 'Windows' and float(torch.__version__.split('.')[0]) >= 2:
Logger("compiling the model... (takes a ~minute)")
unoptimized_model = model
model = torch.compile(model)
if ddp:
# Ignore the freqs_cis buffer so that DDP does not broadcast it at
# construction time since NCCL does not support ComplexFloat
model._ddp_params_and_buffers_to_ignore = {"pos_cis"}
model = DistributedDataParallel(model, device_ids=[ddp_local_rank])
# training loop
iter_per_epoch = len(train_loader)
for epoch in range(epochs):
for epoch in range(args.epochs):
train_epoch(epoch, wandb)
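
The core of the wandb fix is the rank gating: wandb is imported and initialized only on rank 0 (or in a single-process run), so a multi-process `torchrun` launch no longer opens duplicate runs, and downstream code simply checks `wandb is not None`. A minimal sketch of the pattern, with an illustrative stand-in for the argparse namespace built above (the `wandb` package is only needed when `use_wandb` is enabled):

```python
import os
from types import SimpleNamespace

# illustrative stand-in for the argparse namespace used by the scripts
args = SimpleNamespace(use_wandb=True, wandb_project="MiniMind-Pretrain",
                       wandb_run_name="MiniMind-Pretrain-Epoch-20-BatchSize-64-LearningRate-0.0002")

ddp = int(os.environ.get("RANK", -1)) != -1            # torchrun sets RANK; a plain `python` run does not
ddp_local_rank = int(os.environ.get("LOCAL_RANK", 0))  # rank of this process on the local node

if args.use_wandb and (not ddp or ddp_local_rank == 0):
    import wandb                                       # imported lazily, only where it is actually used
    wandb.init(project=args.wandb_project, name=args.wandb_run_name)
else:
    wandb = None                                       # later code logs only `if wandb is not None`
```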

View File

@@ -1,5 +1,6 @@
import os
import platform
import argparse
import time
import math
import warnings
@@ -12,7 +13,6 @@ from contextlib import nullcontext
from torch import optim
from torch.nn.parallel import DistributedDataParallel
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import DataLoader, DistributedSampler
from transformers import AutoTokenizer, AutoModel
from model.model import Transformer
@@ -28,28 +28,27 @@ def Logger(content):
def get_lr(it, all):
warmup_iters = 0
warmup_iters = args.warmup_iters
lr_decay_iters = all
min_lr = learning_rate / epochs
min_lr = args.learning_rate / 10
if it < warmup_iters:
return learning_rate * it / warmup_iters
return args.learning_rate * it / warmup_iters
if it > lr_decay_iters:
return min_lr
decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
assert 0 <= decay_ratio <= 1
coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
return min_lr + coeff * (learning_rate - min_lr)
return min_lr + coeff * (args.learning_rate - min_lr)
# ------------------------------------------------------------------------------
def train_epoch(epoch, wandb):
start_time = time.time()
for step, (X, Y, loss_mask) in enumerate(train_loader):
X = X.to(device)
Y = Y.to(device)
loss_mask = loss_mask.to(device)
lr = get_lr(epoch * iter_per_epoch + step, epochs * iter_per_epoch)
X = X.to(args.device)
Y = Y.to(args.device)
loss_mask = loss_mask.to(args.device)
lr = get_lr(epoch * iter_per_epoch + step, args.epochs * iter_per_epoch)
for param_group in optimizer.param_groups:
param_group['lr'] = lr
@@ -59,30 +58,26 @@ def train_epoch(epoch, wandb):
loss_mask = loss_mask.view(-1)
loss = torch.sum(loss * loss_mask) / loss_mask.sum()
# Backward pass
scaler.scale(loss).backward()
# Unscale gradients and clip them
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
if (step + 1) % args.accumulation_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
# Update parameters
scaler.step(optimizer)
scaler.update()
scaler.step(optimizer)
scaler.update()
# Zero the gradients
optimizer.zero_grad(set_to_none=True)
optimizer.zero_grad(set_to_none=True)
# print training logs
if step % 100 == 0:
if step % args.log_interval == 0:
spend_time = time.time() - start_time
Logger(
'Epoch:[{}/{}]({}/{}) loss:{:.3f} lr:{:.8f} epoch_Time:{}min:'.format(
'Epoch:[{}/{}]({}/{}) loss:{:.3f} lr:{:.7f} epoch_Time:{}min:'.format(
epoch,
epochs,
args.epochs,
step,
iter_per_epoch,
loss,
loss.item(),
optimizer.param_groups[-1]['lr'],
spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60))
@@ -91,11 +86,11 @@ def train_epoch(epoch, wandb):
"lr": optimizer.param_groups[-1]['lr'],
"epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60})
if (step + 1) % 1000 == 0 and (not ddp or dist.get_rank() == 0):
if (step + 1) % args.save_interval == 0 and (not ddp or dist.get_rank() == 0):
model.eval()
# torch.save(model.state_dict(), '{}/sft_iter_{}.pth'.format(save_dir, int(step + epoch * iter_per_epoch)))
moe_path = '_moe' if lm_config.use_moe else ''
ckp = f'{save_dir}/full_sft_{lm_config.dim}{moe_path}.pth'
ckp = f'{args.save_dir}/full_sft_{lm_config.dim}{moe_path}.pth'
if isinstance(model, torch.nn.parallel.DistributedDataParallel):
state_dict = model.module.state_dict()
else:
@@ -105,7 +100,7 @@ def train_epoch(epoch, wandb):
model.train()
def init_model(lm_config):
def init_model():
tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
model_from = 1 # 1: load from local weights, 2: load via transformers
@@ -116,7 +111,7 @@ def init_model(lm_config):
model = Transformer(lm_config)
moe_path = '_moe' if lm_config.use_moe else ''
ckp = f'./out/pretrain_{lm_config.dim}{moe_path}.pth'
state_dict = torch.load(ckp, map_location=device)
state_dict = torch.load(ckp, map_location=args.device)
unwanted_prefix = '_orig_mod.'
for k, v in list(state_dict.items()):
if k.startswith(unwanted_prefix):
@@ -126,7 +121,7 @@ def init_model(lm_config):
model = AutoModel.from_pretrained('./minimind', trust_remote_code=True)
Logger(f'Total LLM parameters: {count_parameters(model) / 1e6:.3f} million')
model = model.to(device)
model = model.to(args.device)
return model, tokenizer
@@ -143,84 +138,78 @@ def init_distributed_mode():
torch.cuda.set_device(DEVICE)
# I/O
if __name__ == "__main__":
# -----------------------------------------------------------------------------
parser = argparse.ArgumentParser(description="MiniMind Full SFT")
parser.add_argument("--out_dir", type=str, default="out", help="Output directory")
parser.add_argument("--epochs", type=int, default=19, help="Number of epochs")
parser.add_argument("--batch_size", type=int, default=40, help="Batch size")
parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate")
parser.add_argument("--device", type=str, default="cuda:0" if torch.cuda.is_available() else "cpu", help="Device to use")
parser.add_argument("--dtype", type=str, default="bfloat16", help="Data type")
parser.add_argument("--use_wandb", action="store_true", help="Use Weights & Biases")
parser.add_argument("--wandb_project", type=str, default="MiniMind-Full-SFT", help="Weights & Biases project name")
parser.add_argument("--num_workers", type=int, default=8, help="Number of workers for data loading")
parser.add_argument("--ddp", action="store_true", help="Use DistributedDataParallel")
parser.add_argument("--accumulation_steps", type=int, default=1, help="Gradient accumulation steps")
parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping threshold")
parser.add_argument("--warmup_iters", type=int, default=0, help="Number of warmup iterations")
parser.add_argument("--log_interval", type=int, default=100, help="Logging interval")
parser.add_argument("--save_interval", type=int, default=1000, help="Model saving interval")
args = parser.parse_args()
lm_config = LMConfig()
max_seq_len = lm_config.max_seq_len
out_dir = 'out'
epochs = 19
gradient_accumulation_steps = 1
batch_size = 40
learning_rate = 1e-4
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
dtype = 'bfloat16'
# dtype = 'float16'
save_dir = os.path.join(out_dir)
os.makedirs(save_dir, exist_ok=True)
tokens_per_iter = gradient_accumulation_steps * batch_size * max_seq_len
os.makedirs(out_dir, exist_ok=True)
args.save_dir = os.path.join(args.out_dir)
os.makedirs(args.save_dir, exist_ok=True)
os.makedirs(args.out_dir, exist_ok=True)
tokens_per_iter = args.batch_size * max_seq_len
torch.manual_seed(1337)
device_type = device if "cuda" in device else "cpu"
device_type = "cuda" if "cuda" in args.device else "cpu"
use_wandb = False # whether to use wandb
wandb_project = "MiniMind-Full-SFT"
wandb_run_name = f"MiniMind-Full-SFT-Epoch-{epochs}-BatchSize-{batch_size}-LearningRate-{learning_rate}"
if use_wandb:
import wandb
args.wandb_run_name = f"MiniMind-Full-SFT-Epoch-{args.epochs}-BatchSize-{args.batch_size}-LearningRate-{args.learning_rate}"
wandb.init(project=wandb_project, name=wandb_run_name)
else:
wandb = None
ctx = (
nullcontext()
if device_type == "cpu"
else torch.cuda.amp.autocast()
)
### ddp config
ctx = nullcontext() if device_type == "cpu" else torch.cuda.amp.autocast()
ddp = int(os.environ.get("RANK", -1)) != -1 # is this a ddp run?
ddp_local_rank, DEVICE = 0, "cuda:0"
if ddp:
init_distributed_mode()
device = torch.device(DEVICE)
# -----------------------------------------------------------------------------
args.device = torch.device(DEVICE)
if args.use_wandb and (not ddp or ddp_local_rank == 0):
import wandb
wandb.init(project=args.wandb_project, name=args.wandb_run_name)
else:
wandb = None
model, tokenizer = init_model()
model, tokenizer = init_model(lm_config)
# -----init dataloader------
df = pd.read_csv('./dataset/sft_data_single.csv')
df = df.sample(frac=1.0)
train_ds = SFTDataset(df, tokenizer, max_length=max_seq_len)
train_sampler = DistributedSampler(train_ds) if ddp else None
train_loader = DataLoader(
train_ds,
batch_size=batch_size,
pin_memory=False,
batch_size=args.batch_size,
pin_memory=True,
drop_last=False,
shuffle=False,
num_workers=8,
num_workers=args.num_workers,
sampler=train_sampler
)
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == dtype))
# optimizer
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scaler = torch.cuda.amp.GradScaler(enabled=(args.dtype == args.dtype))
optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
iter_per_epoch = len(train_loader)
# compile the model
if False and not lm_config.use_moe and platform.system() != 'Windows' and float(
torch.__version__.split('.')[0]) >= 2:
if False and not lm_config.use_moe and platform.system() != 'Windows' and float(torch.__version__.split('.')[0]) >= 2:
Logger("compiling the model... (takes a ~minute)")
unoptimized_model = model
model = torch.compile(model) # requires PyTorch 2.0
model = torch.compile(model)
if ddp:
# Ignore the pos_cis buffer so that DDP does not broadcast it at
# construction time since NCCL does not support ComplexFloat
model._ddp_params_and_buffers_to_ignore = {"pos_cis"}
model = DistributedDataParallel(model, device_ids=[ddp_local_rank])
# training loop
for epoch in range(epochs):
iter_per_epoch = len(train_loader)
for epoch in range(args.epochs):
train_epoch(epoch, wandb)
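
The SFT loop above now uses the same stepping pattern as the pretrain script: gradients accumulate over `accumulation_steps` micro-batches, and the optimizer is unscaled, clipped, and stepped only at the end of each window (the pretrain and LoRA scripts additionally divide the loss by the same factor so the update averages over the window). A minimal runnable sketch of that pattern, with toy stand-ins for the model and data:

```python
import torch
from torch import nn, optim

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# toy stand-ins so the loop runs end to end; the real scripts use Transformer + PretrainDataset/SFTDataset
model = nn.Linear(16, 16).to(device)
optimizer = optim.Adam(model.parameters(), lr=2e-4)
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())
train_loader = [(torch.randn(4, 16), torch.randn(4, 16)) for _ in range(32)]

accumulation_steps, grad_clip = 8, 1.0                   # the scripts read these from args

for step, (X, Y) in enumerate(train_loader):
    X, Y = X.to(device), Y.to(device)
    loss = nn.functional.mse_loss(model(X), Y) / accumulation_steps  # average over the window
    scaler.scale(loss).backward()                        # gradients keep accumulating across micro-batches
    if (step + 1) % accumulation_steps == 0:
        scaler.unscale_(optimizer)                       # so clipping sees the true gradient norm
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad(set_to_none=True)
```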

View File

@@ -1,5 +1,6 @@
import os
import platform
import argparse
import time
import math
import warnings
@@ -16,32 +17,36 @@ from torch.utils.data import DataLoader
from model.LMConfig import LMConfig
from model.dataset import SFTDataset
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore')
def get_lr(it):
warmup_iters = 1000
lr_decay_iters = 80000
min_lr = 1e-5
def Logger(content):
print(content)
def get_lr(it, all):
warmup_iters = args.warmup_iters
lr_decay_iters = all
min_lr = args.learning_rate / 10
if it < warmup_iters:
return learning_rate * it / warmup_iters
return args.learning_rate * it / warmup_iters
if it > lr_decay_iters:
return min_lr
decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
assert 0 <= decay_ratio <= 1
coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
return min_lr + coeff * (learning_rate - min_lr)
return min_lr + coeff * (args.learning_rate - min_lr)
# ------------------------------------------------------------------------------
def train_epoch(epoch, wandb):
start_time = time.time()
for step, (X, Y, loss_mask) in enumerate(train_loader):
X = X.to(device)
Y = Y.to(device)
loss_mask = loss_mask.to(device)
lr = get_lr(epoch * iter_per_epoch + step)
X = X.to(args.device)
Y = Y.to(args.device)
loss_mask = loss_mask.to(args.device)
lr = get_lr(epoch * iter_per_epoch + step, args.epochs * iter_per_epoch)
for param_group in optimizer.param_groups:
param_group['lr'] = lr
@@ -50,33 +55,38 @@ def train_epoch(epoch, wandb):
loss = F.cross_entropy(logits.view(-1, logits.size(-1)), Y.view(-1), ignore_index=0, reduction='none')
loss_mask = loss_mask.view(-1)
loss = torch.sum(loss * loss_mask) / loss_mask.sum()
loss = loss / args.accumulation_steps
scaler.scale(loss).backward()
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
if (step + 1) % args.accumulation_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
scaler.step(optimizer)
scaler.update()
scaler.step(optimizer)
scaler.update()
optimizer.zero_grad(set_to_none=True)
optimizer.zero_grad(set_to_none=True)
if step % 100 == 0:
if step % args.log_interval == 0:
spend_time = time.time() - start_time
print(
Logger(
'Epoch:[{}/{}]({}/{}) loss:{:.3f} lr:{:.7f} epoch_Time:{}min:'.format(
epoch,
epochs,
args.epochs,
step,
iter_per_epoch,
loss.item(),
loss.item() * args.accumulation_steps,
optimizer.param_groups[-1]['lr'],
spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60))
if wandb is not None:
wandb.log({"loss": loss.item(), "lr": optimizer.param_groups[-1]['lr'],
wandb.log({"loss": loss.item() * args.accumulation_steps,
"lr": optimizer.param_groups[-1]['lr'],
"epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60})
if (step + 1) % args.save_interval == 0:
model.save_pretrained(args.save_dir)
def find_all_linear_names(model):
cls = torch.nn.Linear
@@ -95,7 +105,7 @@ def init_model():
model_name_or_path = "./minimind-v1-small"
tokenizer_name_or_path = "./minimind-v1-small"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, trust_remote_code=True, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True).to(device)
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True).to(args.device)
target_modules = find_all_linear_names(model)
peft_config = LoraConfig(
@@ -108,74 +118,70 @@ def init_model():
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
model = model.to(device)
model = model.to(args.device)
return model, tokenizer
# I/O
if __name__ == "__main__":
# -----------------------------------------------------------------------------
parser = argparse.ArgumentParser(description="MiniMind LoRA Fine-tuning")
parser.add_argument("--out_dir", type=str, default="out", help="Output directory")
parser.add_argument("--epochs", type=int, default=20, help="Number of epochs")
parser.add_argument("--batch_size", type=int, default=16, help="Batch size")
parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate")
parser.add_argument("--device", type=str, default="cuda:0" if torch.cuda.is_available() else "cpu", help="Device to use")
parser.add_argument("--dtype", type=str, default="bfloat16", help="Data type")
parser.add_argument("--use_wandb", action="store_true", help="Use Weights & Biases")
parser.add_argument("--wandb_project", type=str, default="MiniMind-LoRA", help="Weights & Biases project name")
parser.add_argument("--num_workers", type=int, default=0, help="Number of workers for data loading")
parser.add_argument("--accumulation_steps", type=int, default=1, help="Gradient accumulation steps")
parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping threshold")
parser.add_argument("--warmup_iters", type=int, default=1000, help="Number of warmup iterations")
parser.add_argument("--log_interval", type=int, default=100, help="Logging interval")
parser.add_argument("--save_interval", type=int, default=1000, help="Model saving interval")
args = parser.parse_args()
lm_config = LMConfig()
max_seq_len = lm_config.max_seq_len
out_dir = 'out'
epochs = 20
gradient_accumulation_steps = 1
batch_size = 16
learning_rate = 1e-4
weight_decay = 1e-1
device = 'cuda:0'
dtype = 'bfloat16'
save_dir = os.path.join(out_dir)
os.makedirs(save_dir, exist_ok=True)
tokens_per_iter = gradient_accumulation_steps * batch_size * max_seq_len
os.makedirs(out_dir, exist_ok=True)
args.save_dir = os.path.join(args.out_dir)
os.makedirs(args.save_dir, exist_ok=True)
os.makedirs(args.out_dir, exist_ok=True)
tokens_per_iter = args.batch_size * max_seq_len
torch.manual_seed(1337)
device_type = device if "cuda" in device else "cpu"
device_type = "cuda" if "cuda" in args.device else "cpu"
use_wandb = False # whether to use wandb
wandb_project = "MiniMind-LoRA-SFT"
wandb_run_name = f"MiniMind-LoRA-SFT-Epoch-{epochs}-BatchSize-{batch_size}-LearningRate-{learning_rate}"
if use_wandb:
args.wandb_run_name = f"MiniMind-LoRA-Epoch-{args.epochs}-BatchSize-{args.batch_size}-LearningRate-{args.learning_rate}"
ctx = nullcontext() if device_type == "cpu" else torch.cuda.amp.autocast()
if args.use_wandb:
import wandb
wandb.init(project=wandb_project, name=wandb_run_name)
wandb.init(project=args.wandb_project, name=args.wandb_run_name)
else:
wandb = None
ctx = (
nullcontext()
if device_type == "cpu"
else torch.cuda.amp.autocast()
)
# -----------------------------------------------------------------------------
model, tokenizer = init_model()
# -----init dataloader------
df = pd.read_csv('./dataset/sft_data_single.csv')
df = pd.read_csv('./dataset/sft_data.csv')
df = df.sample(frac=1.0)
train_ds = SFTDataset(df, tokenizer, max_length=max_seq_len)
train_loader = DataLoader(
train_ds,
batch_size=batch_size,
pin_memory=False,
batch_size=args.batch_size,
pin_memory=True,
drop_last=False,
shuffle=False,
num_workers=0,
num_workers=args.num_workers,
)
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
# optimizer
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
iter_per_epoch = len(train_loader)
# compile the model
scaler = torch.cuda.amp.GradScaler(enabled=(args.dtype == 'float16'))
optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
if False and platform.system() != 'Windows' and float(torch.__version__.split('.')[0]) >= 2:
print("compiling the model... (takes a ~minute)")
Logger("compiling the model... (takes a ~minute)")
unoptimized_model = model
model = torch.compile(model)
raw_model = model
# training loop
for epoch in range(epochs):
iter_per_epoch = len(train_loader)
for epoch in range(args.epochs):
train_epoch(epoch, wandb)
model.save_pretrained('minimind')
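
With this change all three training scripts share the same schedule: an optional linear warmup followed by cosine decay down to one tenth of the peak learning rate. A consolidated sketch of the post-change `get_lr`, using standalone parameter names in place of the `args` fields:

```python
import math

def cosine_lr(it, total_iters, peak_lr, warmup_iters=0):
    """Linear warmup to peak_lr, then cosine decay to peak_lr / 10."""
    min_lr = peak_lr / 10
    if it < warmup_iters:                        # warmup phase (skipped when warmup_iters == 0)
        return peak_lr * it / warmup_iters
    if it > total_iters:                         # past the schedule: hold the floor
        return min_lr
    decay_ratio = (it - warmup_iters) / (total_iters - warmup_iters)
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))   # goes 1 -> 0 over the decay window
    return min_lr + coeff * (peak_lr - min_lr)

# e.g. halfway through a 1000-iteration run at peak_lr=2e-4 the rate sits midway between peak and floor:
assert abs(cosine_lr(500, 1000, 2e-4) - 1.1e-4) < 1e-12
```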

View File

@@ -213,6 +213,14 @@ streamlit run fast_inference.py
deepspeed --master_port 29500 --num_gpus=N 3-full_sft.py
```
* Record the training process
```bash
torchrun --nproc_per_node N 1-pretrain.py --use_wandb
# and
python 1-pretrain.py --use_wandb
```
Adding the `--use_wandb` flag records the training run; once training finishes, the run can be reviewed on the wandb website. The project name and run name can be set via the `wandb_project` and `wandb_run_name` parameters.
# 📌 Data sources
- 🤖 Tokenizer: in NLP, a tokenizer works like a dictionary, mapping words from natural language to numbers such as 0, 1, 36; the number can be thought of as the word's page number in that "dictionary".

View File

@@ -234,6 +234,13 @@ git clone https://github.com/jingyaogong/minimind.git
# and
deepspeed --master_port 29500 --num_gpus=N 3-full_sft.py
```
* Record the training process
```bash
torchrun --nproc_per_node N 1-pretrain.py --use_wandb
# and
python 1-pretrain.py --use_wandb
```
Adding the `--use_wandb` flag records the training run; once training is complete, you can review it on the wandb website. The project name and run name can be set by modifying the `wandb_project` and `wandb_run_name` parameters.
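
For reference, the wandb-related switches this PR adds look like the following (copied in spirit from the pretraining script; the SFT and LoRA scripts add the same flags with their own project defaults). Note that `wandb_run_name` is composed from the other hyper-parameters inside the script rather than accepted on the command line:

```python
import argparse

parser = argparse.ArgumentParser(description="MiniMind Pretraining")
parser.add_argument("--epochs", type=int, default=20)
parser.add_argument("--batch_size", type=int, default=64)
parser.add_argument("--learning_rate", type=float, default=2e-4)
parser.add_argument("--use_wandb", action="store_true", help="Use Weights & Biases")
parser.add_argument("--wandb_project", type=str, default="MiniMind-Pretrain",
                    help="Weights & Biases project name")
args = parser.parse_args()

# the run name is derived from the other flags, so it changes automatically with them
args.wandb_run_name = (f"MiniMind-Pretrain-Epoch-{args.epochs}"
                       f"-BatchSize-{args.batch_size}-LearningRate-{args.learning_rate}")
```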
# 📌 Data sources