From 29d6cd9cd28a48fb063763099b94db030cd35d5d Mon Sep 17 00:00:00 2001
From: gongjy <2474590974@qq.com>
Date: Sun, 9 Feb 2025 23:52:50 +0800
Subject: [PATCH] update scripts file

---
 0-eval_pretrain.py           | 153 -------------------------
 1-pretrain.py                | 209 ---------------------------------
 2-eval_chat.py               | 186 ------------------------------
 3-full_sft.py                | 216 -----------------------------------
 4-lora_sft.py                | 188 ------------------------------
 5-dpo_train.py               |  48 --------
 scripts/data_process.py      | 185 ------------------------------
 scripts/load_test_dataset.py |  97 ----------------
 8 files changed, 1282 deletions(-)
 delete mode 100644 0-eval_pretrain.py
 delete mode 100644 1-pretrain.py
 delete mode 100644 2-eval_chat.py
 delete mode 100644 3-full_sft.py
 delete mode 100644 4-lora_sft.py
 delete mode 100644 5-dpo_train.py
 delete mode 100644 scripts/data_process.py
 delete mode 100644 scripts/load_test_dataset.py

diff --git a/0-eval_pretrain.py b/0-eval_pretrain.py
deleted file mode 100644
index d54e505..0000000
--- a/0-eval_pretrain.py
+++ /dev/null
@@ -1,153 +0,0 @@
-import random
-import time
-
-import numpy as np
-import torch
-import warnings
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from model.model import Transformer
-from model.LMConfig import LMConfig
-
-warnings.filterwarnings('ignore')
-
-
-def count_parameters(model):
-    return sum(p.numel() for p in model.parameters() if p.requires_grad)
-
-
-def init_model(lm_config):
-    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
-    model_from = 1  # 1: load local .pth weights, 2: load via transformers
-
-    if model_from == 1:
-        moe_path = '_moe' if lm_config.use_moe else ''
-        ckp = f'./out/pretrain_{lm_config.dim}{moe_path}.pth'
-
-        model = Transformer(lm_config)
-        state_dict = torch.load(ckp, map_location=device)
-
-        # strip the unwanted compile-time prefix
-        unwanted_prefix = '_orig_mod.'
-        for k, v in list(state_dict.items()):
-            if k.startswith(unwanted_prefix):
-                state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
-
-        for k, v in list(state_dict.items()):
-            if 'mask' in k:
-                del state_dict[k]
-
-        # load the cleaned state dict into the model
-        model.load_state_dict(state_dict, strict=False)
-    else:
-        model = AutoModelForCausalLM.from_pretrained('minimind', trust_remote_code=True)
-    model = model.to(device)
-
-    print(f'模型参数: {count_parameters(model) / 1e6} 百万 = {count_parameters(model) / 1e9} B (Billion)')
-    return model, tokenizer
-
-
-def setup_seed(seed):
-    random.seed(seed)  # seed Python's RNG
-    np.random.seed(seed)  # seed NumPy's RNG
-    torch.manual_seed(seed)  # seed PyTorch's CPU RNG
-    torch.cuda.manual_seed(seed)  # seed the current GPU (if any)
-    torch.cuda.manual_seed_all(seed)  # seed all GPUs (if any)
-    torch.backends.cudnn.deterministic = True  # force deterministic convolution algorithms
-    torch.backends.cudnn.benchmark = False  # disable cuDNN auto-tuning to avoid nondeterminism
-
-
-if __name__ == "__main__":
-    # -----------------------------------------------------------------------------
-    out_dir = 'out'
-    start = ""
-    temperature = 0.7
-    top_k = 8
-    setup_seed(1337)
-    # device = 'cpu'
-    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
-    dtype = 'bfloat16'
-    max_seq_len = 512
-    lm_config = LMConfig()
-    lm_config.max_seq_len = max_seq_len
-    # -----------------------------------------------------------------------------
-
-    model, tokenizer = init_model(lm_config)
-    model = model.eval()
-    # int(input('输入0自动测试,输入1问题测试:'))
-    answer_way = 0
-    stream = True
-
-    prompt_datas = [
-        '椭圆和圆的区别',
-        '中国关于马克思主义基本原理',
-        '人类大脑的主要功能是',
-        '万有引力是',
-        '世界上人口最多的国家是',
-        'DNA的全称是',
-        '数学中π的值大约是',
-        '世界上最高的山峰是',
-        '太阳系中最大的行星是',
-        '二氧化碳的化学分子式是',
-        '地球上最大的动物是',
-        '地球自转一圈大约需要',
-        '杭州市的美食有',
-        '江苏省的最好的大学',
-    ]
-
-    qa_index = 0
-    while True:
-        start = time.time()
-        if answer_way == 1:
-            # run generation
-            prompt = input('用户:')
-        else:
-            if qa_index >= len(prompt_datas):
-                break
-            prompt = prompt_datas[qa_index]
-            print('问题:', prompt)
-            qa_index += 1
-
-        prompt = tokenizer.bos_token + prompt
-        x = tokenizer(prompt).data['input_ids']
-        x = (torch.tensor(x, dtype=torch.long, device=device)[None, ...])
-
-        with torch.no_grad():
-            res_y = model.generate(x, tokenizer.eos_token_id, max_new_tokens=max_seq_len, temperature=temperature,
-                                   top_k=top_k, stream=stream)
-            print('回答:', end='')
-            try:
-                y = next(res_y)
-            except StopIteration:
-                print("No answer")
-                continue
-
-            history_idx = 0
-            while y is not None:
-                answer = tokenizer.decode(y[0].tolist())
-                if answer and answer[-1] == '�':
-                    try:
-                        y = next(res_y)
-                    except StopIteration:
-                        break
-                    continue
-                # print(answer)
-                if not len(answer):
-                    try:
-                        y = next(res_y)
-                    except StopIteration:
-                        break
-                    continue
-
-                print(answer[history_idx:], end='', flush=True)
-                try:
-                    y = next(res_y)
-                except StopIteration:
-                    break
-                history_idx = len(answer)
-                if not stream:
-                    break
-
-            print('\n')
-
-        end = time.time()
-        print(end - start, 's')
diff --git a/1-pretrain.py b/1-pretrain.py
deleted file mode 100644
index 175c655..0000000
--- a/1-pretrain.py
+++ /dev/null
@@ -1,209 +0,0 @@
-import os
-import platform
-import argparse
-import time
-import math
-import warnings
-
-import pandas as pd
-import torch
-import torch.distributed as dist
-from torch import optim
-from torch.nn.parallel import DistributedDataParallel
-from torch.optim.lr_scheduler import CosineAnnealingLR
-from torch.utils.data import DataLoader, DistributedSampler
-from contextlib import nullcontext
-
-from transformers import AutoTokenizer
-
-from model.model import Transformer
-from model.LMConfig import LMConfig
-from model.dataset import PretrainDataset
-
-warnings.filterwarnings('ignore')
-
-
-def Logger(content):
-    if not ddp or dist.get_rank() == 0:
-        print(content)
-
-
-def get_lr(it, total):
-    warmup_iters = args.warmup_iters
-    lr_decay_iters = total
-    min_lr = args.learning_rate / 10
-
-    if it < warmup_iters:
-        return args.learning_rate * it / warmup_iters
-    if it > lr_decay_iters:
-        return min_lr
-    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
-    assert 0 <= decay_ratio <= 1
-    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
-    return min_lr + coeff * (args.learning_rate - min_lr)
-
-
-def train_epoch(epoch, wandb):
-    start_time = time.time()
-    for step, (X, Y, loss_mask) in enumerate(train_loader):
-        X = X.to(args.device)
-        Y = Y.to(args.device)
-        loss_mask = loss_mask.to(args.device)
-
-        lr = get_lr(epoch * iter_per_epoch + step, args.epochs * iter_per_epoch)
-        for param_group in optimizer.param_groups:
-            param_group['lr'] = lr
-
-        with ctx:
-            out = model(X, Y)
-            loss = out.last_loss / args.accumulation_steps
-            loss_mask = loss_mask.view(-1)
-            loss = torch.sum(loss * loss_mask) / loss_mask.sum()
-
-        scaler.scale(loss).backward()
-
-        if (step + 1) % args.accumulation_steps == 0:
-            scaler.unscale_(optimizer)
-            torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
-
-            scaler.step(optimizer)
-            scaler.update()
-
-            optimizer.zero_grad(set_to_none=True)
-
-        if step % args.log_interval == 0:
-            spend_time = time.time() - start_time
-            Logger(
-                'Epoch:[{}/{}]({}/{}) loss:{:.3f} lr:{:.7f} epoch_Time:{}min:'.format(
-                    epoch,
-                    args.epochs,
-                    step,
-                    iter_per_epoch,
-                    loss.item() * args.accumulation_steps,
-                    optimizer.param_groups[-1]['lr'],
-                    spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60))
-
-            if (wandb is not None) and (not ddp or dist.get_rank() == 0):
-                wandb.log({"loss": loss.item() * args.accumulation_steps,
-                           "lr": optimizer.param_groups[-1]['lr'],
-                           "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60})
-
-        if (step + 1) % args.save_interval == 0 and (not ddp or dist.get_rank() == 0):
-            model.eval()
-            moe_path = '_moe' if lm_config.use_moe else ''
-            ckp = f'{args.save_dir}/pretrain_{lm_config.dim}{moe_path}.pth'
-
-            if isinstance(model, torch.nn.parallel.DistributedDataParallel):
-                state_dict = model.module.state_dict()
-            else:
-                state_dict = model.state_dict()
-
-            torch.save(state_dict, ckp)
-            model.train()
-
-
-def init_model():
-    def count_parameters(model):
-        return sum(p.numel() for p in model.parameters() if p.requires_grad)
-
-    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
-
-    model = Transformer(lm_config).to(args.device)
-    # moe_path = '_moe' if lm_config.use_moe else ''
-
-    Logger(f'LLM总参数量:{count_parameters(model) / 1e6:.3f} 百万')
-    return model, tokenizer
-
-
-def init_distributed_mode():
-    if not ddp: return
-    global ddp_local_rank, DEVICE
-
-    dist.init_process_group(backend="nccl")
-    ddp_rank = int(os.environ["RANK"])
-    ddp_local_rank = int(os.environ["LOCAL_RANK"])
-    ddp_world_size = int(os.environ["WORLD_SIZE"])
-    DEVICE = f"cuda:{ddp_local_rank}"
-    torch.cuda.set_device(DEVICE)
-
-
-# torchrun --nproc_per_node 2 1-pretrain.py
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="MiniMind Pretraining")
-    parser.add_argument("--out_dir", type=str, default="out", help="Output directory")
-    parser.add_argument("--epochs", type=int, default=20, help="Number of epochs")
-    parser.add_argument("--batch_size", type=int, default=64, help="Batch size")
-    parser.add_argument("--learning_rate", type=float, default=2e-4, help="Learning rate")
-    parser.add_argument("--device", type=str, default="cuda:0" if torch.cuda.is_available() else "cpu",
-                        help="Device to use")
-    parser.add_argument("--dtype", type=str, default="bfloat16", help="Data type")
-    parser.add_argument("--use_wandb", action="store_true", help="Use Weights & Biases")
-    parser.add_argument("--wandb_project", type=str, default="MiniMind-Pretrain", help="Weights & Biases project name")
-    parser.add_argument("--num_workers", type=int, default=1, help="Number of workers for data loading")
-    parser.add_argument("--data_path", type=str, default="./dataset/pretrain_data.csv", help="Path to training data")
-    parser.add_argument("--ddp", action="store_true", help="Use DistributedDataParallel")
-    parser.add_argument("--accumulation_steps", type=int, default=8, help="Gradient accumulation steps")
-    parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping threshold")
-    parser.add_argument("--warmup_iters", type=int, default=0, help="Number of warmup iterations")
-    parser.add_argument("--log_interval", type=int, default=100, help="Logging interval")
-    parser.add_argument("--save_interval", type=int, default=1000, help="Model saving interval")
-    parser.add_argument('--local_rank', type=int, default=-1, help='local rank for distributed training')
-
-    args = parser.parse_args()
-
-    lm_config = LMConfig()
-    max_seq_len = lm_config.max_seq_len
-    args.save_dir = os.path.join(args.out_dir)
-    os.makedirs(args.save_dir, exist_ok=True)
-    os.makedirs(args.out_dir, exist_ok=True)
-    tokens_per_iter = args.batch_size * max_seq_len
-    torch.manual_seed(1337)
-    device_type = "cuda" if "cuda" in args.device else "cpu"
-
-    args.wandb_run_name = f"MiniMind-Pretrain-Epoch-{args.epochs}-BatchSize-{args.batch_size}-LearningRate-{args.learning_rate}"
-
-    ctx = nullcontext() if device_type == "cpu" else torch.cuda.amp.autocast()
-
-    ddp = int(os.environ.get("RANK", -1)) != -1  # is this a ddp run?
-    ddp_local_rank, DEVICE = 0, "cuda:0"
-    if ddp:
-        init_distributed_mode()
-        args.device = torch.device(DEVICE)
-
-    if args.use_wandb and (not ddp or ddp_local_rank == 0):
-        import wandb
-
-        wandb.init(project=args.wandb_project, name=args.wandb_run_name)
-    else:
-        wandb = None
-
-    model, tokenizer = init_model()
-    df = pd.read_csv(args.data_path)
-    df = df.sample(frac=1.0)
-    train_ds = PretrainDataset(df, tokenizer, max_length=max_seq_len)
-    train_sampler = DistributedSampler(train_ds) if ddp else None
-    train_loader = DataLoader(
-        train_ds,
-        batch_size=args.batch_size,
-        pin_memory=True,
-        drop_last=False,
-        shuffle=False,
-        num_workers=args.num_workers,
-        sampler=train_sampler
-    )
-
-    scaler = torch.cuda.amp.GradScaler(enabled=(args.dtype in ['float16', 'bfloat16']))
-    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
-
-    if False and platform.system() != 'Windows' and float(torch.__version__.split('.')[0]) >= 2:
-        Logger("compiling the model... (takes a ~minute)")
-        unoptimized_model = model
-        model = torch.compile(model)
-
-    if ddp:
-        model._ddp_params_and_buffers_to_ignore = {"pos_cis"}
-        model = DistributedDataParallel(model, device_ids=[ddp_local_rank])
-
-    iter_per_epoch = len(train_loader)
-    for epoch in range(args.epochs):
-        train_epoch(epoch, wandb)
diff --git a/2-eval_chat.py b/2-eval_chat.py
deleted file mode 100644
index 6579d4f..0000000
--- a/2-eval_chat.py
+++ /dev/null
@@ -1,186 +0,0 @@
-import random
-import time
-
-import numpy as np
-import torch
-import warnings
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from model.model import Transformer
-from model.LMConfig import LMConfig
-
-warnings.filterwarnings('ignore')
-
-
-def count_parameters(model):
-    return sum(p.numel() for p in model.parameters() if p.requires_grad)
-
-
-def init_model(lm_config):
-    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
-    model_from = 1  # 1: load local .pth weights, 2: load via transformers
-
-    if model_from == 1:
-        moe_path = '_moe' if lm_config.use_moe else ''
-        # ckp = f'./out/multi_chat/full_sft_{lm_config.dim}{moe_path}.pth'
-
-        model = Transformer(lm_config)
-        # state_dict = torch.load(ckp, map_location=device)
-
-        # # strip the unwanted compile-time prefix
-        # unwanted_prefix = '_orig_mod.'
-        # for k, v in list(state_dict.items()):
-        #     if k.startswith(unwanted_prefix):
-        #         state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
-        #
-        # for k, v in list(state_dict.items()):
-        #     if 'mask' in k:
-        #         del state_dict[k]
-        #
-        # # load the cleaned state dict into the model
-        # model.load_state_dict(state_dict, strict=False)
-    else:
-        model = AutoModelForCausalLM.from_pretrained('./minimind-v1-small', trust_remote_code=True)
-    model = model.to(device)
-
-    print(f'模型参数: {count_parameters(model) / 1e6} 百万 = {count_parameters(model) / 1e9} B (Billion)')
-    return model, tokenizer
-
-
-def setup_seed(seed):
-    random.seed(seed)  # seed Python's RNG
-    np.random.seed(seed)  # seed NumPy's RNG
-    torch.manual_seed(seed)  # seed PyTorch's CPU RNG
-    torch.cuda.manual_seed(seed)  # seed the current GPU (if any)
-    torch.cuda.manual_seed_all(seed)  # seed all GPUs (if any)
-    torch.backends.cudnn.deterministic = True  # force deterministic convolution algorithms
-    torch.backends.cudnn.benchmark = False  # disable cuDNN auto-tuning to avoid nondeterminism
-
-
-if __name__ == "__main__":
-    # -----------------------------------------------------------------------------
-    out_dir = 'out'
-    start = ""
-    temperature = 0.1
-    top_k = 16
-    # device = 'cpu'
-    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
-    dtype = 'bfloat16'
-    max_seq_len = 1 * 1024
-    lm_config = LMConfig()
-    lm_config.max_seq_len = max_seq_len
-    # whether to carry chat history (the model was not trained on multi-turn data, so a longer context adds little)
-    contain_history_chat = False
-    # -----------------------------------------------------------------------------
-
-    model, tokenizer = init_model(lm_config)
-
-    model = model.eval()
-    # push to the Hugging Face Hub
-    # model.push_to_hub("minimind")
-    # tokenizer.push_to_hub("minimind")
-
-    # answer_way = int(input('输入0自动测试,输入1问题测试:'))
-    answer_way = 0
-    stream = True
-
-    prompt_datas = [
-        '你叫什么名字',
-        '你是谁',
-        '中国有哪些比较好的大学?',
-        '全世界最好的大学是什么?',
-        '你知道光速是多少吗?',
-        '你知道长江吗?',
-        '人类的血液主要由哪些成分组成?',
-        '第一颗人造卫星是哪个国家发射的?',
-        '你知道杭州有什么美食吗?',
-        '你知道泰山在哪里吗?',
-        '地球上最大的动物是什么?',
-        '地球自转一圈大约需要多少时间?',
-        '人类最早使用的金属是什么?',
-        '水的化学分子式是什么?',
-        '大气层中含量最多的气体是什么?',
-        '世界上最高的山峰是什么?',
-        '你知道世界上最深的海沟是什么吗?',
-        '最早发明印刷术的是哪个国家?',
-        '万有引力是谁提出的?',
-        '光合作用的主要原理是什么?',
-        '你知道大熊猫的主要食物是什么吗?',
-        '海水为什么是咸的?',
-        '我们平时喝的牛奶主要含有什么营养成分?',
-        '一星期有多少天?'
-    ]
-
-    messages_origin = []
-    messages = messages_origin
-
-    i = 0
-    while i < len(prompt_datas):
-        # Generate a random seed
-        # random_seed = random.randint(0, 2 ** 32 - 1)
-        # setup_seed(random_seed)
-        if not contain_history_chat:
-            messages = messages_origin.copy()
-
-        if answer_way == 1:
-            prompt = input('[Q]: ')
-        else:
-            prompt = prompt_datas[i]
-            print(f'[Q]: {prompt}')
-            i += 1
-
-        prompt = '请问,' + prompt
-        messages.append({"role": "user", "content": prompt})
-
-        # print(messages)
-        new_prompt = tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
-        )[-(max_seq_len - 1):]
-
-        x = tokenizer(new_prompt).data['input_ids']
-        x = (torch.tensor(x, dtype=torch.long, device=device)[None, ...])
-
-        answer = new_prompt
-
-        with torch.no_grad():
-            res_y = model.generate(x, tokenizer.eos_token_id, max_new_tokens=max_seq_len, temperature=temperature,
-                                   top_k=top_k, stream=stream)
-            print('[A]: ', end='')
-            try:
-                y = next(res_y)
-            except StopIteration:
-                print("No answer")
-                continue
-
-            history_idx = 0
-            while y is not None:
-                answer = tokenizer.decode(y[0].tolist())
-                if answer and answer[-1] == '�':
-                    try:
-                        y = next(res_y)
-                    except StopIteration:
-                        break
-                    continue
-                # print(answer)
-                if not len(answer):
-                    try:
-                        y = next(res_y)
-                    except StopIteration:
-                        break
-                    continue
-
-                print(answer[history_idx:], end='', flush=True)
-                try:
-                    y = next(res_y)
-                except StopIteration:
-                    break
-                history_idx = len(answer)
-                if not stream:
-                    break
-
-            print('\n')
-
-        if contain_history_chat:
-            assistant_answer = answer.replace(new_prompt, "")
-            messages.append({"role": "assistant", "content": assistant_answer})
diff --git a/3-full_sft.py b/3-full_sft.py
deleted file mode 100644
index d8aa768..0000000
--- a/3-full_sft.py
+++ /dev/null
@@ -1,216 +0,0 @@
-import os
-import platform
-import argparse
-import time
-import math
-import warnings
-
-import pandas as pd
-import torch
-import torch.nn.functional as F
-import torch.distributed as dist
-from contextlib import nullcontext
-
-from torch import optim
-from torch.nn.parallel import DistributedDataParallel
-from torch.utils.data import DataLoader, DistributedSampler
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from model.model import Transformer
-from model.LMConfig import LMConfig
-from model.dataset import SFTDataset
-
-warnings.filterwarnings('ignore')
-
-
-def Logger(content):
-    if not ddp or dist.get_rank() == 0:
-        print(content)
-
-
-def get_lr(it, total):
-    warmup_iters = args.warmup_iters
-    lr_decay_iters = total
-    min_lr = args.learning_rate / 10
-
-    if it < warmup_iters:
-        return args.learning_rate * it / warmup_iters
-    if it > lr_decay_iters:
-        return min_lr
-    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
-    assert 0 <= decay_ratio <= 1
-    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
-    return min_lr + coeff * (args.learning_rate - min_lr)
-
-
-def train_epoch(epoch, wandb):
-    start_time = time.time()
-    for step, (X, Y, loss_mask) in enumerate(train_loader):
-        X = X.to(args.device)
-        Y = Y.to(args.device)
-        loss_mask = loss_mask.to(args.device)
-        lr = get_lr(epoch * iter_per_epoch + step, args.epochs * iter_per_epoch)
-        for param_group in optimizer.param_groups:
-            param_group['lr'] = lr
-
-        with ctx:
-            logits = model(X, Y).logits
-            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), Y.view(-1), ignore_index=0, reduction='none')
-            loss_mask = loss_mask.view(-1)
-            loss = torch.sum(loss * loss_mask) / loss_mask.sum()
-
-        scaler.scale(loss).backward()
-
-        if (step + 1) % args.accumulation_steps == 0:
-            scaler.unscale_(optimizer)
-            torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
-
-            scaler.step(optimizer)
-            scaler.update()
-
-            optimizer.zero_grad(set_to_none=True)
-
-        if step % args.log_interval == 0:
-            spend_time = time.time() - start_time
-            Logger(
-                'Epoch:[{}/{}]({}/{}) loss:{:.3f} lr:{:.7f} epoch_Time:{}min:'.format(
-                    epoch,
-                    args.epochs,
-                    step,
-                    iter_per_epoch,
-                    loss.item(),
-                    optimizer.param_groups[-1]['lr'],
-                    spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60))
-
-            if (wandb is not None) and (not ddp or dist.get_rank() == 0):
-                wandb.log({"loss": loss.item(),
-                           "lr": optimizer.param_groups[-1]['lr'],
-                           "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60})
-
-        if (step + 1) % args.save_interval == 0 and (not ddp or dist.get_rank() == 0):
-            model.eval()
-            moe_path = '_moe' if lm_config.use_moe else ''
-            ckp = f'{args.save_dir}/full_sft_{lm_config.dim}{moe_path}.pth'
-
-            if isinstance(model, torch.nn.parallel.DistributedDataParallel):
-                state_dict = model.module.state_dict()
-            else:
-                state_dict = model.state_dict()
-
-            torch.save(state_dict, ckp)
-            model.train()
-
-
-def init_model():
-    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
-    model_from = 1  # 1: load local .pth weights, 2: load via transformers
-
-    def count_parameters(model):
-        return sum(p.numel() for p in model.parameters() if p.requires_grad)
-
-    if model_from == 1:
-        model = Transformer(lm_config)
-        moe_path = '_moe' if lm_config.use_moe else ''
-        ckp = f'./out/pretrain_{lm_config.dim}{moe_path}.pth'
-        state_dict = torch.load(ckp, map_location=args.device)
-        unwanted_prefix = '_orig_mod.'
-        for k, v in list(state_dict.items()):
-            if k.startswith(unwanted_prefix):
-                state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
-        model.load_state_dict(state_dict, strict=False)
-    else:
-        model = AutoModelForCausalLM.from_pretrained('./minimind-v1-small', trust_remote_code=True)
-
-    Logger(f'LLM总参数量:{count_parameters(model) / 1e6:.3f} 百万')
-    model = model.to(args.device)
-
-    return model, tokenizer
-
-
-def init_distributed_mode():
-    if not ddp: return
-    global ddp_local_rank, DEVICE
-
-    dist.init_process_group(backend="nccl")
-    ddp_rank = int(os.environ["RANK"])
-    ddp_local_rank = int(os.environ["LOCAL_RANK"])
-    ddp_world_size = int(os.environ["WORLD_SIZE"])
-    DEVICE = f"cuda:{ddp_local_rank}"
-    torch.cuda.set_device(DEVICE)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="MiniMind Full SFT")
-    parser.add_argument("--out_dir", type=str, default="out", help="Output directory")
-    parser.add_argument("--epochs", type=int, default=19, help="Number of epochs")
-    parser.add_argument("--batch_size", type=int, default=32, help="Batch size")
-    parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate")
-    parser.add_argument("--device", type=str, default="cuda:0" if torch.cuda.is_available() else "cpu", help="Device to use")
-    parser.add_argument("--dtype", type=str, default="bfloat16", help="Data type")
-    parser.add_argument("--use_wandb", action="store_true", help="Use Weights & Biases")
-    parser.add_argument("--wandb_project", type=str, default="MiniMind-Full-SFT", help="Weights & Biases project name")
-    parser.add_argument("--num_workers", type=int, default=1, help="Number of workers for data loading")
-    parser.add_argument("--ddp", action="store_true", help="Use DistributedDataParallel")
-    parser.add_argument("--accumulation_steps", type=int, default=1, help="Gradient accumulation steps")
-    parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping threshold")
-    parser.add_argument("--warmup_iters", type=int, default=0, help="Number of warmup iterations")
-    parser.add_argument("--log_interval", type=int, default=100, help="Logging interval")
-    parser.add_argument("--save_interval", type=int, default=1000, help="Model saving interval")
-    parser.add_argument('--local_rank', type=int, default=-1, help='local rank for distributed training')
-
-    args = parser.parse_args()
-
-    lm_config = LMConfig()
-    max_seq_len = lm_config.max_seq_len
-    args.save_dir = os.path.join(args.out_dir)
-    os.makedirs(args.save_dir, exist_ok=True)
-    os.makedirs(args.out_dir, exist_ok=True)
-    tokens_per_iter = args.batch_size * max_seq_len
-    torch.manual_seed(1337)
-    device_type = "cuda" if "cuda" in args.device else "cpu"
-
-    args.wandb_run_name = f"MiniMind-Full-SFT-Epoch-{args.epochs}-BatchSize-{args.batch_size}-LearningRate-{args.learning_rate}"
-
-    ctx = nullcontext() if device_type == "cpu" else torch.cuda.amp.autocast()
-    ddp = int(os.environ.get("RANK", -1)) != -1  # is this a ddp run?
-    ddp_local_rank, DEVICE = 0, "cuda:0"
-    if ddp:
-        init_distributed_mode()
-        args.device = torch.device(DEVICE)
-
-    if args.use_wandb and (not ddp or ddp_local_rank == 0):
-        import wandb
-        wandb.init(project=args.wandb_project, name=args.wandb_run_name)
-    else:
-        wandb = None
-
-    model, tokenizer = init_model()
-
-    df = pd.read_csv('./dataset/sft_data_single.csv')
-    df = df.sample(frac=1.0)
-    train_ds = SFTDataset(df, tokenizer, max_length=max_seq_len)
-    train_sampler = DistributedSampler(train_ds) if ddp else None
-    train_loader = DataLoader(
-        train_ds,
-        batch_size=args.batch_size,
-        pin_memory=True,
-        drop_last=False,
-        shuffle=False,
-        num_workers=args.num_workers,
-        sampler=train_sampler
-    )
-
-    scaler = torch.cuda.amp.GradScaler(enabled=(args.dtype in ['float16', 'bfloat16']))
-    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
-
-    if False and not lm_config.use_moe and platform.system() != 'Windows' and float(torch.__version__.split('.')[0]) >= 2:
-        Logger("compiling the model... (takes a ~minute)")
-        unoptimized_model = model
-        model = torch.compile(model)
-
-    if ddp:
-        model._ddp_params_and_buffers_to_ignore = {"pos_cis"}
-        model = DistributedDataParallel(model, device_ids=[ddp_local_rank])
-
-    iter_per_epoch = len(train_loader)
-    for epoch in range(args.epochs):
-        train_epoch(epoch, wandb)
diff --git a/4-lora_sft.py b/4-lora_sft.py
deleted file mode 100644
index 482844d..0000000
--- a/4-lora_sft.py
+++ /dev/null
@@ -1,188 +0,0 @@
-import os
-import platform
-import argparse
-import time
-import math
-import warnings
-import torch
-import pandas as pd
-import torch.nn.functional as F
-from contextlib import nullcontext
-
-from torch import optim
-from transformers import AutoTokenizer
-from transformers import AutoModelForCausalLM
-from peft import get_peft_model, LoraConfig, TaskType
-from torch.utils.data import DataLoader
-from model.LMConfig import LMConfig
-from model.dataset import SFTDataset
-from model.model import Transformer
-
-warnings.filterwarnings('ignore')
-
-
-def Logger(content):
-    print(content)
-
-
-def get_lr(it, total):
-    warmup_iters = args.warmup_iters
-    lr_decay_iters = total
-    min_lr = args.learning_rate / 10
-
-    if it < warmup_iters:
-        return args.learning_rate * it / warmup_iters
-    if it > lr_decay_iters:
-        return min_lr
-    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
-    assert 0 <= decay_ratio <= 1
-    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
-    return min_lr + coeff * (args.learning_rate - min_lr)
-
-
-def train_epoch(epoch, wandb):
-    start_time = time.time()
-    for step, (X, Y, loss_mask) in enumerate(train_loader):
-        X = X.to(args.device)
-        Y = Y.to(args.device)
-        loss_mask = loss_mask.to(args.device)
-
-        lr = get_lr(epoch * iter_per_epoch + step, args.epochs * iter_per_epoch)
-        for param_group in optimizer.param_groups:
-            param_group['lr'] = lr
-
-        with ctx:
-            logits = model(X, Y).logits
-            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), Y.view(-1), ignore_index=0, reduction='none')
-            loss_mask = loss_mask.view(-1)
-            loss = torch.sum(loss * loss_mask) / loss_mask.sum()
-            loss = loss / args.accumulation_steps
-
-        scaler.scale(loss).backward()
-
-        if (step + 1) % args.accumulation_steps == 0:
-            scaler.unscale_(optimizer)
-            torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
-
-            scaler.step(optimizer)
-            scaler.update()
-
-            optimizer.zero_grad(set_to_none=True)
-
-        if step % args.log_interval == 0:
-            spend_time = time.time() - start_time
-            Logger(
-                'Epoch:[{}/{}]({}/{}) loss:{:.3f} lr:{:.7f} epoch_Time:{}min:'.format(
-                    epoch,
-                    args.epochs,
-                    step,
-                    iter_per_epoch,
-                    loss.item() * args.accumulation_steps,
-                    optimizer.param_groups[-1]['lr'],
-                    spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60))
-            if wandb is not None:
-                wandb.log({"loss": loss.item() * args.accumulation_steps,
-                           "lr": optimizer.param_groups[-1]['lr'],
-                           "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60})
-
-        if (step + 1) % args.save_interval == 0:
-            model.save_pretrained(args.save_dir)
-
-
-def find_linear_with_keys(model, keys=["wq", "wk"]):
-    cls = torch.nn.Linear
-    linear_names = []
-    for name, module in model.named_modules():
-        if isinstance(module, cls):
-            for key in keys:
-                if key in name:
-                    linear_names.append(name)
-                    break
-    return linear_names
-
-
-def init_model():
-    model_name_or_path = "./minimind-v1-small"
-    tokenizer_name_or_path = "./minimind-v1-small"
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, trust_remote_code=True, use_fast=False)
-    model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True).to(args.device)
-
-    target_modules = find_linear_with_keys(model)
-    peft_config = LoraConfig(
-        r=8,
-        target_modules=target_modules
-    )
-    model = get_peft_model(model, peft_config)
-    model.print_trainable_parameters()
-    model = model.to(args.device)
-    return model, tokenizer
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="MiniMind LoRA Fine-tuning")
-    parser.add_argument("--out_dir", type=str, default="out", help="Output directory")
-    parser.add_argument("--epochs", type=int, default=20, help="Number of epochs")
-    parser.add_argument("--batch_size", type=int, default=32, help="Batch size")
-    parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate")
-    parser.add_argument("--device", type=str, default="cuda:0" if torch.cuda.is_available() else "cpu",
-                        help="Device to use")
-    parser.add_argument("--dtype", type=str, default="bfloat16", help="Data type")
-    parser.add_argument("--use_wandb", action="store_true", help="Use Weights & Biases")
-    parser.add_argument("--wandb_project", type=str, default="MiniMind-LoRA", help="Weights & Biases project name")
-    parser.add_argument("--num_workers", type=int, default=1, help="Number of workers for data loading")
-    parser.add_argument("--accumulation_steps", type=int, default=1, help="Gradient accumulation steps")
-    parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping threshold")
-    parser.add_argument("--warmup_iters", type=int, default=1000, help="Number of warmup iterations")
-    parser.add_argument("--log_interval", type=int, default=100, help="Logging interval")
-    parser.add_argument("--save_interval", type=int, default=1000, help="Model saving interval")
-
-    args = parser.parse_args()
-
-    lm_config = LMConfig()
-    max_seq_len = lm_config.max_seq_len
-    args.save_dir = os.path.join(args.out_dir)
-    os.makedirs(args.save_dir, exist_ok=True)
-    os.makedirs(args.out_dir, exist_ok=True)
-    tokens_per_iter = args.batch_size * max_seq_len
-    torch.manual_seed(1337)
-    device_type = "cuda" if "cuda" in args.device else "cpu"
-
-    args.wandb_run_name = f"MiniMind-LoRA-Epoch-{args.epochs}-BatchSize-{args.batch_size}-LearningRate-{args.learning_rate}"
-
-    ctx = nullcontext() if device_type == "cpu" else torch.cuda.amp.autocast()
-
-    if args.use_wandb:
-        import wandb
-
-        wandb.init(project=args.wandb_project, name=args.wandb_run_name)
-    else:
-        wandb = None
-
-    model, tokenizer = init_model()
-
-    df = pd.read_csv('./dataset/sft_data_single.csv')
-    df = df.sample(frac=1.0)
-    train_ds = SFTDataset(df, tokenizer, max_length=max_seq_len)
-    train_loader = DataLoader(
-        train_ds,
-        batch_size=args.batch_size,
-        pin_memory=True,
-        drop_last=False,
-        shuffle=False,
-        num_workers=args.num_workers,
-    )
-
-    scaler = torch.cuda.amp.GradScaler(enabled=(args.dtype in ['float16', 'bfloat16']))
-    optimizer = optim.Adam(
-        filter(lambda p: p.requires_grad, model.parameters()),
-        lr=args.learning_rate
-    )
-
-    if False and platform.system() != 'Windows' and float(torch.__version__.split('.')[0]) >= 2:
-        Logger("compiling the model... (takes a ~minute)")
-        unoptimized_model = model
-        model = torch.compile(model)
-
-    iter_per_epoch = len(train_loader)
-    for epoch in range(args.epochs):
-        train_epoch(epoch, wandb)
diff --git a/5-dpo_train.py b/5-dpo_train.py
deleted file mode 100644
index ef17e2c..0000000
--- a/5-dpo_train.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import os
-import warnings
-
-os.environ['CUDA_VISIBLE_DEVICES'] = '0'
-from transformers import TrainingArguments, AutoModelForCausalLM, AutoTokenizer
-from trl import DPOConfig, DPOTrainer
-from datasets import load_dataset
-
-warnings.filterwarnings('ignore')
-
-
-def init_model():
-    device = 'cuda:0'
-    # load the policy model and tokenizer (the reference model is created by the trainer)
-    model_name_or_path = "minimind-v1"
-    tokenizer_name_or_path = "minimind-v1"
-    model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True)
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, trust_remote_code=True, use_fast=False)
-    tokenizer.pad_token = tokenizer.eos_token
-    model = model.to(device)
-    return model, tokenizer
-
-
-if __name__ == '__main__':
-    model, tokenizer = init_model()
-    training_config = DPOConfig(
-        output_dir="./minimind_dpo",
-        per_device_train_batch_size=1,
-        remove_unused_columns=False,
-        report_to="none",
-        save_steps=2000,
-        learning_rate=4e-5
-    )
-
-    dataset_path = './dataset/dpo/train_data.json'
-    train_dataset = load_dataset('json', data_files=dataset_path)
-
-    dpo_trainer = DPOTrainer(
-        model,
-        ref_model=None,
-        args=training_config,
-        beta=0.1,
-        train_dataset=train_dataset['train'],
-        tokenizer=tokenizer,
-        max_length=512,
-        max_prompt_length=512
-    )
-    dpo_trainer.train()
diff --git a/scripts/data_process.py b/scripts/data_process.py
deleted file mode 100644
index c820bbe..0000000
--- a/scripts/data_process.py
+++ /dev/null
@@ -1,185 +0,0 @@
-import csv
-import glob
-import os
-import re
-import json
-import jsonlines
-import pandas as pd
-from tqdm import tqdm
-
-bos_token = "<s>"
-eos_token = "</s>"
-
-
-def pretrain_process():
-    # input and output paths
-    input_dir = '../CCI3-HQ/data'
-    output_file = '../dataset/pretrain_data_hq.csv'
-    jsonl_files = glob.glob(os.path.join(input_dir, 'part_*.jsonl'))
-    total_lines = 0
-    print("正在计算总行数...")
-    for file in jsonl_files:
-        with open(file, 'r', encoding='utf-8') as f:
-            for _ in f:
-                total_lines += 1
-    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
-        writer = csv.writer(csvfile)
-        writer.writerow(['text', 'score'])  # write the header row
-        for jsonl_file in jsonl_files:
-            with open(jsonl_file, 'r', encoding='utf-8') as f:
-                for line in tqdm(f, desc=f'处理 {os.path.basename(jsonl_file)}', total=total_lines, unit='行',
-                                 leave=False):
-                    try:
-                        data = json.loads(line)
-                        text = data.get('text', '')
-                        score = data.get('score', 0)
-                        if len(text) <= 512 and score > 3.5:
-                            writer.writerow([text, score])
-                    except json.JSONDecodeError:
-                        continue
-    print(f"筛选完成,结果已保存到 {output_file}")
-
-
-def sft_process():
-    sft_file_name = 'sft_data.csv'
-
-    def process_and_write_data(data):
-        q_lst, a_lst, history_lst = [], [], []
-        for per in data:
-            history, q, a = per['history'], per['q'], per['a']
-            if not q or not a:
-                continue
-            history_len = sum(len(s) for s in history)
-            message_len = history_len + len(q) + len(a)
-            if message_len < 70 or message_len > 512:
-                continue
-            q_lst.append(q)
-            a_lst.append(a)
-            history_lst.append(history)
-
-        df = pd.DataFrame({'history': history_lst, 'q': q_lst, 'a': a_lst})
-        df.to_csv(f'../dataset/{sft_file_name}',
-                  mode='a', header=False, index=False,
-                  lineterminator='\r\n', escapechar='\\',
-                  encoding='utf-8')
-
-    chunk_size = 1000
-    data = []
-    with open(f'../dataset/{sft_file_name}', 'w', encoding='utf-8') as f:
-        f.write('history,q,a\n')
-
-    # sft_path = ['/root/shared-nvme/sft_data_zh.jsonl', '/root/shared-nvme/sft_data_en.jsonl']
-    sft_path = ['/root/shared-nvme/sft_data_en.jsonl']
-    chunk_num = 0
-    for path in sft_path:
-        with jsonlines.open(path) as reader:
-            for idx, obj in enumerate(reader):
-                try:
-                    data.append({
-                        'history': obj.get('history', ''),
-                        'q': obj.get('input', '') + obj.get('q', ''),
-                        'a': obj.get('output', '') + obj.get('a', '')
-                    })
-
-                    if len(data) >= chunk_size:
-                        chunk_num += 1
-                        process_and_write_data(data)
-                        data = []
-                        if chunk_num % 100 == 0:
-                            print(f'chunk:{chunk_num} process end')
-                except jsonlines.InvalidLineError as e:
-                    print(f"Skipping invalid JSON line {idx + 1}: {e}")
-                    continue
-
-    if data:
-        process_and_write_data(data)
-        data = []
-
-
-def rl_process():
-    # by default only the Chinese preference data is used (recommended)
-    input_paths = [
-        # "../dataset/dpo_en.json",
-        "../dataset/dpo_zh.json"
-    ]
-    output_path = "../dataset/dpo_data.jsonl"  # output as .jsonl
-    all_converted = []
-
-    for input_path in input_paths:
-        with open(input_path, "r", encoding="utf-8") as f:
-            data = json.load(f)  # data is likely a list
-
-        for item in data:
-            new_data = {
-                "chosen": [],
-                "rejected": []
-            }
-            for turn in item["conversations"]:
-                role = "user" if turn["from"] == "human" else "assistant"
-                message = {"role": role, "content": turn["value"]}
-                new_data["chosen"].append(message)
-                new_data["rejected"].append(message)
-            new_data["chosen"].append({
-                "role": "assistant",
-                "content": item["chosen"]["value"]
-            })
-            new_data["rejected"].append({
-                "role": "assistant",
-                "content": item["rejected"]["value"]
-            })
-            all_converted.append(new_data)
-
-    with open(output_path, "w", encoding="utf-8") as f:
-        for item in all_converted:
-            f.write(json.dumps(item, ensure_ascii=False) + "\n")
-
-
-def lora_dataset():
-    import json
-    import csv
-
-    # read the JSON file
-    with open('../dataset/Chinese-medical-dialogue.json', 'r', encoding='utf-8') as f:
-        data = json.load(f)
-
-    # build the CSV rows
-    csv_data = []
-    for item in data:
-        # extract input and output and strip surrounding whitespace
-        q = item['input'].strip()
-        a = item['output'].strip()
-
-        # keep only pairs whose combined length fits the limit
-        if len(q) + len(a) < 160:
-            csv_data.append({
-                'history': '[]',
-                'q': q,
-                'a': a
-            })
-
-    # write the CSV file
-    with open('../dataset/medical_sft.csv', 'w', newline='', encoding='utf-8') as csvfile:
-        fieldnames = ['history', 'q', 'a']
-        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
-
-        writer.writeheader()
-        writer.writerows(csv_data)
-
-    print(f'转换完成,共处理 {len(csv_data)} 条有效数据')
-
-
-if __name__ == "__main__":
-    ################
-    # 1: pretrain
-    # 2: sft
-    # 3: RL, 4: lora
-    ################
-    process_type = 4
-
-    if process_type == 1:
-        pretrain_process()
-    if process_type == 2:
-        sft_process()
-    if process_type == 3:
-        rl_process()
-    if process_type == 4:
-        lora_dataset()
diff --git a/scripts/load_test_dataset.py b/scripts/load_test_dataset.py
deleted file mode 100644
index f186f46..0000000
--- a/scripts/load_test_dataset.py
+++ /dev/null
@@ -1,97 +0,0 @@
-# from datasets import load_dataset
-#
-# dataset_paths = [
-#     ['ceval/ceval-exam',
-#      ['computer_network', 'operating_system', 'computer_architecture', 'college_programming', 'college_physics',
-#       'college_chemistry', 'advanced_mathematics', 'probability_and_statistics', 'discrete_mathematics',
-#       'electrical_engineer', 'metrology_engineer', 'high_school_mathematics', 'high_school_physics',
-#       'high_school_chemistry', 'high_school_biology',
-#       'middle_school_mathematics', 'middle_school_biology',
-#       'middle_school_physics', 'middle_school_chemistry', 'veterinary_medicine', 'college_economics',
-#       'business_administration', 'marxism', 'mao_zedong_thought', 'education_science', 'teacher_qualification',
-#       'high_school_politics', 'high_school_geography', 'middle_school_politics', 'middle_school_geography',
-#       'modern_chinese_history', 'ideological_and_moral_cultivation', 'logic', 'law', 'chinese_language_and_literature',
-#       'art_studies', 'professional_tour_guide', 'legal_professional', 'high_school_chinese', 'high_school_history',
-#       'middle_school_history', 'civil_servant', 'sports_science', 'plant_protection', 'basic_medicine',
-#       'clinical_medicine', 'urban_and_rural_planner', 'accountant', 'fire_engineer',
-#       'environmental_impact_assessment_engineer', 'tax_accountant', 'physician']],  # ceval*
-#     ['haonan-li/cmmlu', [
-#         'agronomy', 'anatomy', 'ancient_chinese', 'arts', 'astronomy', 'business_ethics',
-#         'chinese_civil_service_exam', 'chinese_driving_rule', 'chinese_food_culture',
-#         'chinese_foreign_policy', 'chinese_history', 'chinese_literature',
-#         'chinese_teacher_qualification', 'clinical_knowledge', 'college_actuarial_science',
-#         'college_education', 'college_engineering_hydrology', 'college_law',
-#         'college_mathematics', 'college_medical_statistics', 'college_medicine',
-#         'computer_science', 'computer_security', 'conceptual_physics',
-#         'construction_project_management', 'economics', 'education', 'electrical_engineering',
-#         'elementary_chinese', 'elementary_commonsense', 'elementary_information_and_technology',
-#         'elementary_mathematics', 'ethnology', 'food_science', 'genetics', 'global_facts',
-#         'high_school_biology', 'high_school_chemistry', 'high_school_geography',
-#         'high_school_mathematics', 'high_school_physics', 'high_school_politics',
-#         'human_sexuality', 'international_law', 'journalism', 'jurisprudence',
-#         'legal_and_moral_basis', 'logical', 'machine_learning', 'management', 'marketing',
-#         'marxist_theory', 'modern_chinese', 'nutrition', 'philosophy', 'professional_accounting',
-#         'professional_law', 'professional_medicine', 'professional_psychology',
-#         'public_relations', 'security_study', 'sociology', 'sports_science',
-#         'traditional_chinese_medicine', 'virology', 'world_history', 'world_religions'
-#     ]],  # cmmlu*
-#     ['tyouisen/aclue',
-#      ['polysemy_resolution', 'poetry_sentiment_analysis', 'named_entity_recognition', 'basic_ancient_chinese',
-#       'poetry_context_prediction', 'sentence_segmentation', 'couplet_prediction', 'poetry_appreciate',
-#       'ancient_chinese_culture', 'ancient_phonetics', 'homographic_character_resolution', 'ancient_literature',
-#       'ancient_medical', 'poetry_quality_assessment', 'reading_comprehension']],  # aclue
-#     ['juletxara/mgsm', ['zh']],  # mgsm_direct_zh
-#     ['openbookqa', ['main']],  # openbookqa
-#     ['ZoneTwelve/tmmluplus',
-#      ['dentistry', 'traditional_chinese_medicine_clinical_medicine', 'clinical_psychology', 'technical',
-#       'culinary_skills', 'mechanical', 'logic_reasoning', 'real_estate', 'general_principles_of_law', 'finance_banking',
-#       'anti_money_laundering', 'ttqav2', 'marketing_management', 'business_management', 'organic_chemistry',
-#       'advance_chemistry', 'physics', 'secondary_physics', 'human_behavior', 'national_protection', 'jce_humanities',
-#       'politic_science', 'agriculture', 'official_document_management', 'financial_analysis', 'pharmacy',
-#       'educational_psychology', 'statistics_and_machine_learning', 'management_accounting', 'introduction_to_law',
-#       'computer_science', 'veterinary_pathology', 'accounting', 'fire_science', 'optometry', 'insurance_studies',
-#       'pharmacology', 'taxation', 'education_(profession_level)', 'economics', 'veterinary_pharmacology',
-#       'nautical_science', 'occupational_therapy_for_psychological_disorders', 'trust_practice', 'geography_of_taiwan',
-#       'physical_education', 'auditing', 'administrative_law', 'basic_medical_science', 'macroeconomics', 'trade',
-#       'chinese_language_and_literature', 'tve_design', 'junior_science_exam', 'junior_math_exam', 'junior_chinese_exam',
-#       'junior_social_studies', 'tve_mathematics', 'tve_chinese_language', 'tve_natural_sciences', 'junior_chemistry',
-#       'music', 'education', 'three_principles_of_people', 'taiwanese_hokkien', 'engineering_math', 'linear_algebra']]
-#     # tmmluplus
-#
-# ]
-#
-# for dataset_path in dataset_paths:
-#     for dataset_name in dataset_path[1]:
-#         datasets = load_dataset(dataset_path[0], dataset_name, cache_dir='./test_dataset_cache')
-#
-# """
-# export HF_HUB_OFFLINE=1 && lm_eval --model hf --model_args pretrained=/xxx/minimind/minimind-v2-small/,device=cuda,dtype=auto --tasks ceval* --batch_size 8 --trust_remote_code
-# """
-"""
-$env:HF_HUB_OFFLINE=1; lm_eval --model hf --model_args pretrained=../minimind-v2-small/,device=cuda,dtype=auto --tasks ceval* --batch_size 8 --trust_remote_code
-"""
-
-import subprocess
-
-# the lm-eval command to run (Windows cmd syntax; the PowerShell variant is shown in the docstring above)
-command = (
-    'set HF_HUB_OFFLINE=1 & '
-    'lm_eval --model hf --model_args pretrained=../minimind-v2-small/,device=cuda,dtype=auto '
-    '--tasks ceval* --batch_size 8 --trust_remote_code'
-)
-
-# run the command via subprocess
-try:
-    process = subprocess.run(
-        command,
-        shell=True,
-        check=True,
-        text=True,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-    )
-    # print the command's output
-    print("STDOUT:", process.stdout)
-    print("STDERR:", process.stderr)
-except subprocess.CalledProcessError as e:
-    print(f"命令执行失败,返回码: {e.returncode}")
-    print("STDERR:", e.stderr)
\ No newline at end of file