diff --git a/0-eval_pretrain.py b/0-eval_pretrain.py
index edb3ad8..570d42d 100644
--- a/0-eval_pretrain.py
+++ b/0-eval_pretrain.py
@@ -69,18 +69,11 @@ if __name__ == "__main__":
     max_seq_len = 512
     lm_config = LMConfig()
     lm_config.max_seq_len = max_seq_len
-    # Whether to carry chat history (the current model is too weak; a longer history context mostly produces gibberish)
-    contain_history_chat = False
     # -----------------------------------------------------------------------------

     model, tokenizer = init_model(lm_config)
-
     model = model.eval()
-    # Push to Hugging Face
-    # model.push_to_hub("minimind")
-    # tokenizer.push_to_hub("minimind")
-
-    # answer_way = int(input('Enter 0 for automatic testing, 1 for manual questions: '))
+    # int(input('Enter 0 for automatic testing, 1 for manual questions: '))
    answer_way = 0
    stream = True

@@ -101,15 +94,9 @@ if __name__ == "__main__":
         'The best university in Jiangsu Province',
     ]

-    messages_origin = []
-    messages = messages_origin
-
     qa_index = 0
     while True:
         start = time.time()
-        if not contain_history_chat:
-            messages = messages_origin.copy()
-
         if answer_way == 1:
             # run generation
             prompt = input('User: ')
@@ -120,20 +107,9 @@
             print('Question:', prompt)
             qa_index += 1

-        messages.append({"role": "user", "content": prompt})
-
-        # print(messages)
-        new_prompt = tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
-        )[-(max_seq_len - 1):]
-
         x = tokenizer(prompt).data['input_ids']
         x = (torch.tensor(x, dtype=torch.long, device=device)[None, ...])
-        answer = new_prompt
-
         with torch.no_grad():
             res_y = model.generate(x, tokenizer.eos_token_id, max_new_tokens=max_seq_len, temperature=temperature,
                                    top_k=top_k, stream=stream)
@@ -172,8 +148,5 @@
             print('\n')

-        if contain_history_chat:
-            assistant_answer = answer.replace(new_prompt, "")
-            messages.append({"role": "assistant", "content": assistant_answer})
-
         end = time.time()
-        print(end - start,'s')
+        print(end - start, 's')
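With the chat template and history handling gone, evaluation is strictly single-turn: each question is encoded and answered independently. A minimal sketch of the flow that remains (illustrative only; `model`, `tokenizer`, `device`, `max_seq_len`, `temperature`, `top_k`, and `stream` are assumed to be set up exactly as in the script):

```python
import torch

# The raw prompt is tokenized directly: no chat template, no history.
prompt = 'What is the difference between an ellipse and a circle?'
ids = tokenizer(prompt).data['input_ids']
x = torch.tensor(ids, dtype=torch.long, device=device)[None, ...]  # shape (1, seq_len)

with torch.no_grad():
    # With stream=True, res_y is consumed incrementally by the script's print loop.
    res_y = model.generate(x, tokenizer.eos_token_id, max_new_tokens=max_seq_len,
                           temperature=temperature, top_k=top_k, stream=stream)
```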
diff --git a/1-pretrain.py b/1-pretrain.py
index 6cce4f0..20fdf42 100644
--- a/1-pretrain.py
+++ b/1-pretrain.py
@@ -4,6 +4,8 @@ import argparse
 import time
 import math
 import warnings
+
+import pandas as pd
 import torch
 import torch.distributed as dist
 from torch import optim
@@ -11,6 +13,9 @@ from torch.nn.parallel import DistributedDataParallel
 from torch.optim.lr_scheduler import CosineAnnealingLR
 from torch.utils.data import DataLoader, DistributedSampler
 from contextlib import nullcontext
+
+from transformers import AutoTokenizer
+
 from model.model import Transformer
 from model.LMConfig import LMConfig
 from model.dataset import PretrainDataset
@@ -98,11 +103,13 @@ def init_model():
     def count_parameters(model):
         return sum(p.numel() for p in model.parameters() if p.requires_grad)

+    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
+
     model = Transformer(lm_config).to(args.device)
-    moe_path = '_moe' if lm_config.use_moe else ''
+    # moe_path = '_moe' if lm_config.use_moe else ''

     Logger(f'Total LLM parameters: {count_parameters(model) / 1e6:.3f} million')
-    return model
+    return model, tokenizer


 def init_distributed_mode():
@@ -122,14 +129,15 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="MiniMind Pretraining")
     parser.add_argument("--out_dir", type=str, default="out", help="Output directory")
     parser.add_argument("--epochs", type=int, default=20, help="Number of epochs")
-    parser.add_argument("--batch_size", type=int, default=32, help="Batch size")
+    parser.add_argument("--batch_size", type=int, default=48, help="Batch size")
     parser.add_argument("--learning_rate", type=float, default=2e-4, help="Learning rate")
-    parser.add_argument("--device", type=str, default="cuda:0" if torch.cuda.is_available() else "cpu", help="Device to use")
+    parser.add_argument("--device", type=str, default="cuda:0" if torch.cuda.is_available() else "cpu",
+                        help="Device to use")
     parser.add_argument("--dtype", type=str, default="bfloat16", help="Data type")
     parser.add_argument("--use_wandb", action="store_true", help="Use Weights & Biases")
     parser.add_argument("--wandb_project", type=str, default="MiniMind-Pretrain", help="Weights & Biases project name")
     parser.add_argument("--num_workers", type=int, default=8, help="Number of workers for data loading")
-    parser.add_argument("--data_path", type=str, default="./dataset/pretrain_data.bin", help="Path to training data")
+    parser.add_argument("--data_path", type=str, default="./dataset/pretrain_data.csv", help="Path to training data")
     parser.add_argument("--ddp", action="store_true", help="Use DistributedDataParallel")
     parser.add_argument("--accumulation_steps", type=int, default=8, help="Gradient accumulation steps")
     parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping threshold")
@@ -160,12 +168,15 @@ if __name__ == "__main__":

     if args.use_wandb and (not ddp or ddp_local_rank == 0):
         import wandb
+
         wandb.init(project=args.wandb_project, name=args.wandb_run_name)
     else:
         wandb = None

-    data_path_list = [args.data_path]
-    train_ds = PretrainDataset(data_path_list, max_length=max_seq_len, memmap=True)
+    model, tokenizer = init_model()
+    df = pd.read_csv(args.data_path)
+    df = df.sample(frac=1.0)
+    train_ds = PretrainDataset(df, tokenizer, max_length=max_seq_len)
     train_sampler = DistributedSampler(train_ds) if ddp else None
     train_loader = DataLoader(
         train_ds,
@@ -177,8 +188,6 @@ if __name__ == "__main__":
         sampler=train_sampler
     )

-    model = init_model()
-
     scaler = torch.cuda.amp.GradScaler(enabled=(args.dtype in ['float16', 'bfloat16']))
     optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
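The training entry point now builds the dataset from raw text at load time instead of a pre-tokenized memmap. A quick smoke test of the new CSV → dataset → loader path (a sketch; it assumes the script's defaults, i.e. `max_seq_len = 512`, the bundled tokenizer, and the repo root as working directory):

```python
import pandas as pd
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

from model.dataset import PretrainDataset

tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
df = pd.read_csv('./dataset/pretrain_data.csv')
df = df.sample(frac=1.0)  # shuffle whole documents; iloc in the dataset is positional, so no reset_index is needed

train_ds = PretrainDataset(df, tokenizer, max_length=512)
loader = DataLoader(train_ds, batch_size=2)

X, Y = next(iter(loader))
print(X.shape, Y.shape)  # expected: torch.Size([2, 511]) torch.Size([2, 511])
```

Note that the shuffle now permutes whole documents, whereas the old `.bin` path sliced one long token stream into fixed 512-token rows, which could cut a document across row boundaries.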
diff --git a/README.md b/README.md
index 265b5a0..d57023a 100644
--- a/README.md
+++ b/README.md
@@ -78,7 +78,18 @@ https://github.com/user-attachments/assets/88b98128-636e-43bc-a419-b1b1403c2055

 ### 👉**Recent Updates**

 <details close>
-<summary> <b>2024-09-17 (new🎉)</b> </summary>
+<summary> <b>2024-09-27</b> </summary>
+
+- 09-27: Updated the preprocessing of the pretrain dataset; to preserve text integrity, preprocessing into the .bin training format was dropped (at a slight cost in training speed).
+
+- The preprocessed pretrain file is now named pretrain_data.csv.
+
+- Removed some redundant code.
+
+</details>
+
+<details close>
+<summary> <b>2024-09-17</b> </summary>

 - Updated the minimind-v1-moe model
diff --git a/README_en.md b/README_en.md
index a0fa5a4..fd0f86f 100644
--- a/README_en.md
+++ b/README_en.md
@@ -84,6 +84,17 @@ We hope this open-source project helps LLM beginners get started quickly!

 ### 👉**Recent Updates**

+<details close>
+<summary> <b>2024-09-27</b> </summary>
+
+- 09-27: Updated the preprocessing of the pretrain dataset; to preserve text integrity, preprocessing into the .bin training format was dropped (at a slight cost in training speed).
+
+- The preprocessed pretrain file is now named pretrain_data.csv.
+
+- Removed some redundant code.
+
+</details>
+
 <details close>
 <summary> <b>2024-09-17 (new🎉)</b> </summary>

diff --git a/data_process.py b/data_process.py
index 047ff0e..bea9371 100644
--- a/data_process.py
+++ b/data_process.py
@@ -1,3 +1,4 @@
+import csv
 import itertools
 import re
 import json
@@ -12,110 +13,31 @@ from datasets import load_dataset

 bos_token = "<s>"
 eos_token = "</s>"

-# pretrain
-def process_wiki_clean():
-    with open('./dataset/clean-wikipedia-cn.json', 'r', encoding='utf-8') as f_read:
-        data = [ujson.loads(line) for line in f_read]
-        data_len = len(data)
-        doc_ids = []
-        for idx, line in enumerate(data):
-            text = line['response']
-            text_id = tokenizer(f'{bos_token}{text}{eos_token}').data['input_ids']
-            if len(text_id) > 5:
-                doc_ids += text_id
-            if idx % (int(data_len / 20)) == 0:
-                print(f"[{idx}/{data_len}] {text}")
-        arr = np.array(doc_ids, dtype=np.uint16)
-        with open('./dataset/clean-wikipedia-cn.bin', 'wb') as f:
-            f.write(arr.tobytes())
-
-# pretrain
-def process_other():
-    data = []
-
-    with open('./dataset/alpaca_gpt4_data_zh.json', 'r', encoding='utf-8') as f:
-        data_ = json.load(f)
-        data += data_
-
-    with open('./dataset/alpaca_data_zh_51k.json', 'r', encoding='utf-8') as f:
-        data_ = json.load(f)
-        data += data_
-
-    doc_ids = []
-    for idx, per in enumerate(data):
-        q = per['instruction']
-        i = per['input']
-        a = per['output']
-        q = q + i
-        if len(q) < 10 or len(a) < 5:
-            continue
-        if len(q) > 256 or len(a) > 256:
-            continue
-        text_id = tokenizer(f'{bos_token}{q},{a}{eos_token}').data['input_ids']
-        if len(text_id) > 5:
-            doc_ids += text_id
-        if idx % 50000 == 0:
-            print(idx, len(data))
-
-    arr = np.array(doc_ids, dtype=np.uint16)
-    with open('./dataset/clean_other.bin', 'wb') as f:
-        f.write(arr.tobytes())
-
-
-def process_seq_monkey(chunk_size=50000):
-    doc_ids = []
+def pretrain_process(chunk_size=50000):
     chunk_idx = 0
     with jsonlines.open('./dataset/mobvoi_seq_monkey_general_open_corpus.jsonl') as reader:
-        while True:
-            chunk = list(itertools.islice(reader, chunk_size))
-            if not chunk:
-                break
+        with open('./dataset/pretrain_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
+            writer = csv.writer(csvfile)
+            writer.writerow(['text'])

-            for idx, obj in enumerate(chunk):
-                try:
-                    content = obj.get('text', '')
-                    if len(content) > 512:
+            while True:
+                chunk = list(itertools.islice(reader, chunk_size))
+                if not chunk:
+                    break
+
+                for idx, obj in enumerate(chunk):
+                    try:
+                        content = obj.get('text', '')
+                        if len(content) > 512:
+                            continue
+                        writer.writerow([content])
+                    except UnicodeDecodeError as e:
+                        print(f"Skipping invalid line {chunk_idx * chunk_size + idx + 1}: {e}")
                         continue
-                    text_id = tokenizer(f'{bos_token}{content}{eos_token}').data['input_ids']
-                    doc_ids += text_id
-                except UnicodeDecodeError as e:
-                    print(f"Skipping invalid line {chunk_idx * chunk_size + idx + 1}: {e}")
-                    continue
-
-            chunk_idx += 1
-            print(f"Processed chunk {chunk_idx} with {chunk_size} lines")
-
-            if len(doc_ids) > 1000000:
-                arr = np.array(doc_ids, dtype=np.uint16)
-                with open(f'./dataset/clean_seq_monkey.bin', 'ab') as f:
-                    f.write(arr.tobytes())
-                doc_ids = []
-
-    if doc_ids:
-        arr = np.array(doc_ids, dtype=np.uint16)
-        with open(f'./dataset/clean_seq_monkey.bin', 'ab') as f:
-            f.write(arr.tobytes())
-
-
-def pretrain_process():
-    # process_wiki_clean()
-    process_seq_monkey()
-
-    data_path_list = [
-        # './dataset/clean-wikipedia-cn.bin',
-        './dataset/clean_seq_monkey.bin'
-    ]
-    data_lst = []
-    for data_path in data_path_list:
-        with open(data_path, 'rb') as f:
-            data = np.fromfile(f, dtype=np.uint16)
-            data_lst.append(data)
-    arr = np.concatenate(data_lst)
-    print(arr.shape)
-    with open('./dataset/pretrain_data.bin', 'wb') as f:
-        f.write(arr.tobytes())
+                chunk_idx += 1
+                print('chunk:', ((chunk_idx - 1) * chunk_size, chunk_idx * chunk_size), 'process end')

@@ -186,6 +108,7 @@ def sft_process(contain_history=False):
             process_and_write_data(data)
             data = []

+
 def rl_process():
     ################
     # Dataset
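Why CSV preserves text integrity where the flat token dump could not: `csv` quotes fields containing newlines or commas, so each document survives the round trip as a single row. A self-contained illustration (`demo.csv` is a throwaway file, not part of the repo):

```python
import csv

import pandas as pd

doc = 'first line\nsecond line, with a comma'

# csv.writer quotes the embedded newline and comma automatically.
with open('demo.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['text'])
    writer.writerow([doc])

# pandas parses the quoted field back into one intact document.
df = pd.read_csv('demo.csv')
assert df.loc[0, 'text'] == doc
```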
diff --git a/model/dataset.py b/model/dataset.py
index ef58956..2e417f6 100644
--- a/model/dataset.py
+++ b/model/dataset.py
@@ -13,37 +13,28 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"


 class PretrainDataset(Dataset):
-    def __init__(self, data_path_lst, max_length=512, memmap=False):
+    def __init__(self, df, tokenizer, max_length=512):
         super().__init__()
-        #
-        if memmap:
-            with open(data_path_lst[0], 'r') as f:
-                nbytes = f.seek(0, 2)
-                flen = f.tell() // np.dtype('uint16').itemsize
-            self.data = np.memmap(data_path_lst[0], dtype=np.dtype('uint16'), shape=(flen // max_length, max_length))
-        else:
-            data_lst = []
-            for data_path in data_path_lst:
-                with open(data_path, 'rb') as f:
-                    data = np.fromfile(f, dtype=np.uint16)
-                    data_lst.append(data)
-            data = np.concatenate(data_lst)
-            data = data[:max_length * int(len(data) / max_length)]
-            # np.random.shuffle(data)
-            self.data = data.reshape(-1, max_length)
-        #
-        print("memmap:{} train data.shape:{}".format(memmap, self.data.shape))
-        print("downloading finished.....")
+        self.df = df
+        self.tokenizer = tokenizer
+        self.max_length = max_length
+        self.padding = 0

     def __len__(self):
-        return self.data.shape[0]
+        return self.df.shape[0]

     def __getitem__(self, index: int):
         #
-        sample = self.data[index]
-        X = np.array(sample[:-1]).astype(np.int64)
-        Y = np.array(sample[1:]).astype(np.int64)
+        sample = self.df.iloc[index]
+        text = f"{self.tokenizer.bos_token}{str(sample['text'])}{self.tokenizer.eos_token}"
+        input_id = self.tokenizer(text).data['input_ids'][:self.max_length]
+        # Pad the part that falls short of the maximum length
+        padding_len = self.max_length - len(input_id)
+        input_id = input_id + [self.padding] * padding_len
+        input_id = np.array(input_id)
+        X = np.array(input_id[:-1]).astype(np.int64)
+        Y = np.array(input_id[1:]).astype(np.int64)

         return torch.from_numpy(X), torch.from_numpy(Y)

@@ -56,7 +47,7 @@ class SFTDataset(Dataset):
         self.answer_max_len = answer_max_len
         #
         self.tokenizer = tokenizer
-        self.padding = 0  # self.tokenizer.special_tokens['<pad>']
+        self.padding = 0
         self.bos_id = self.tokenizer('<s>assistant').data['input_ids']

     def __len__(self):
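For clarity, this is what the new `__getitem__` produces on toy data (made-up token ids; pad id 0 mirrors `self.padding`):

```python
import numpy as np

max_length = 8
input_id = [1, 57, 912, 34, 2]  # <s> ... </s>, already truncated to max_length
input_id = input_id + [0] * (max_length - len(input_id))  # right-pad with id 0

X = np.array(input_id[:-1]).astype(np.int64)  # [  1,  57, 912,  34,   2,   0,   0]
Y = np.array(input_id[1:]).astype(np.int64)   # [ 57, 912,  34,   2,   0,   0,   0]
```

Padded positions appear in `Y` as well; whether they are masked out of the loss is left to the training loop.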