Update data preprocessing methods

gongjy 2024-09-27 16:19:30 +08:00
parent d57037624b
commit a8ae342775
6 changed files with 80 additions and 162 deletions

View File

@@ -69,18 +69,11 @@ if __name__ == "__main__":
     max_seq_len = 512
     lm_config = LMConfig()
     lm_config.max_seq_len = max_seq_len
-    # Whether the conversation carries chat history (the current model is too weak; a longer history context mostly yields gibberish)
-    contain_history_chat = False
     # -----------------------------------------------------------------------------
     model, tokenizer = init_model(lm_config)
     model = model.eval()
-    # Push to Hugging Face
-    # model.push_to_hub("minimind")
-    # tokenizer.push_to_hub("minimind")
-    # answer_way = int(input('Enter 0 for automatic testing, 1 to type questions'))
+    # int(input('Enter 0 for automatic testing, 1 to type questions'))
     answer_way = 0
     stream = True
@@ -101,15 +94,9 @@ if __name__ == "__main__":
         '江苏省的最好的大学',
     ]
-    messages_origin = []
-    messages = messages_origin
 
     qa_index = 0
     while True:
         start = time.time()
-        if not contain_history_chat:
-            messages = messages_origin.copy()
         if answer_way == 1:
             # run generation
             prompt = input('用户:')
@@ -120,20 +107,9 @@ if __name__ == "__main__":
             print('问题:', prompt)
             qa_index += 1
 
-        messages.append({"role": "user", "content": prompt})
-        # print(messages)
-        new_prompt = tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
-        )[-(max_seq_len - 1):]
-
         x = tokenizer(prompt).data['input_ids']
         x = (torch.tensor(x, dtype=torch.long, device=device)[None, ...])
-        answer = new_prompt
 
         with torch.no_grad():
             res_y = model.generate(x, tokenizer.eos_token_id, max_new_tokens=max_seq_len, temperature=temperature,
                                    top_k=top_k, stream=stream)
@@ -172,8 +148,5 @@ if __name__ == "__main__":
             print('\n')
 
-        if contain_history_chat:
-            assistant_answer = answer.replace(new_prompt, "")
-            messages.append({"role": "assistant", "content": assistant_answer})
-
         end = time.time()
-        print(end - start,'s')
+        print(end - start, 's')
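With chat history removed, the evaluation script now feeds the raw prompt straight to the tokenizer instead of building a chat template. A minimal sketch of the resulting single-turn flow, assuming the script's `init_model`/`lm_config` are in scope and `model.generate` has the signature shown in this diff (sampling values are illustrative):

```python
# Sketch of the simplified single-turn inference path (assumed names from this diff).
import time
import torch

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model, tokenizer = init_model(lm_config)   # helper defined in the script
model = model.eval()

prompt = '你叫什么名字'                      # any test question
start = time.time()
x = tokenizer(prompt).data['input_ids']
x = torch.tensor(x, dtype=torch.long, device=device)[None, ...]  # add batch dim
with torch.no_grad():
    # stream=True yields tokens incrementally; decoding is omitted here
    res_y = model.generate(x, tokenizer.eos_token_id, max_new_tokens=512,
                           temperature=0.7, top_k=16, stream=True)
print(time.time() - start, 's')
```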

View File

@@ -4,6 +4,8 @@ import argparse
 import time
 import math
 import warnings
+import pandas as pd
 import torch
 import torch.distributed as dist
 from torch import optim
@@ -11,6 +13,9 @@ from torch.nn.parallel import DistributedDataParallel
 from torch.optim.lr_scheduler import CosineAnnealingLR
 from torch.utils.data import DataLoader, DistributedSampler
 from contextlib import nullcontext
+from transformers import AutoTokenizer
 from model.model import Transformer
 from model.LMConfig import LMConfig
 from model.dataset import PretrainDataset
@@ -98,11 +103,13 @@ def init_model():
     def count_parameters(model):
         return sum(p.numel() for p in model.parameters() if p.requires_grad)
 
+    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
     model = Transformer(lm_config).to(args.device)
-    moe_path = '_moe' if lm_config.use_moe else ''
+    # moe_path = '_moe' if lm_config.use_moe else ''
     Logger(f'LLM总参数量:{count_parameters(model) / 1e6:.3f} 百万')
-    return model
+    return model, tokenizer
 
 
 def init_distributed_mode():
@@ -122,14 +129,15 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="MiniMind Pretraining")
     parser.add_argument("--out_dir", type=str, default="out", help="Output directory")
     parser.add_argument("--epochs", type=int, default=20, help="Number of epochs")
-    parser.add_argument("--batch_size", type=int, default=32, help="Batch size")
+    parser.add_argument("--batch_size", type=int, default=48, help="Batch size")
     parser.add_argument("--learning_rate", type=float, default=2e-4, help="Learning rate")
-    parser.add_argument("--device", type=str, default="cuda:0" if torch.cuda.is_available() else "cpu", help="Device to use")
+    parser.add_argument("--device", type=str, default="cuda:0" if torch.cuda.is_available() else "cpu",
+                        help="Device to use")
     parser.add_argument("--dtype", type=str, default="bfloat16", help="Data type")
     parser.add_argument("--use_wandb", action="store_true", help="Use Weights & Biases")
     parser.add_argument("--wandb_project", type=str, default="MiniMind-Pretrain", help="Weights & Biases project name")
     parser.add_argument("--num_workers", type=int, default=8, help="Number of workers for data loading")
-    parser.add_argument("--data_path", type=str, default="./dataset/pretrain_data.bin", help="Path to training data")
+    parser.add_argument("--data_path", type=str, default="./dataset/pretrain_data.csv", help="Path to training data")
     parser.add_argument("--ddp", action="store_true", help="Use DistributedDataParallel")
     parser.add_argument("--accumulation_steps", type=int, default=8, help="Gradient accumulation steps")
     parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping threshold")
@@ -160,12 +168,15 @@ if __name__ == "__main__":
     if args.use_wandb and (not ddp or ddp_local_rank == 0):
         import wandb
 
         wandb.init(project=args.wandb_project, name=args.wandb_run_name)
     else:
         wandb = None
 
-    data_path_list = [args.data_path]
-    train_ds = PretrainDataset(data_path_list, max_length=max_seq_len, memmap=True)
+    model, tokenizer = init_model()
+    df = pd.read_csv(args.data_path)
+    df = df.sample(frac=1.0)
+    train_ds = PretrainDataset(df, tokenizer, max_length=max_seq_len)
     train_sampler = DistributedSampler(train_ds) if ddp else None
     train_loader = DataLoader(
         train_ds,
@@ -177,8 +188,6 @@ if __name__ == "__main__":
         sampler=train_sampler
     )
 
-    model = init_model()
-
     scaler = torch.cuda.amp.GradScaler(enabled=(args.dtype in ['float16', 'bfloat16']))
     optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
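The training script's data path is now: read the CSV with pandas, shuffle it once, and hand the DataFrame plus tokenizer to `PretrainDataset`. A standalone sketch of that pipeline, with paths and hyperparameters taken from the defaults in this diff:

```python
# Sketch of the new CSV-based data pipeline (paths/hyperparameters from the diff defaults).
import pandas as pd
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from model.dataset import PretrainDataset

tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
df = pd.read_csv('./dataset/pretrain_data.csv')
df = df.sample(frac=1.0)  # shuffle all rows once up front

train_ds = PretrainDataset(df, tokenizer, max_length=512)
train_loader = DataLoader(train_ds, batch_size=48, shuffle=False, num_workers=8)

for X, Y in train_loader:
    # X: input token ids; Y: the same sequence shifted left by one position
    print(X.shape, Y.shape)  # e.g. torch.Size([48, 511]) for both
    break
```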

View File

@@ -78,7 +78,18 @@ https://github.com/user-attachments/assets/88b98128-636e-43bc-a419-b1b1403c2055
 ### 👉**Recent Updates**
 
 <details close>
-<summary> <b>2024-09-17 (new🎉)</b> </summary>
+<summary> <b>2024-09-27</b> </summary>
+
+- 09-27: Updated the pretrain dataset preprocessing; to preserve text integrity, the pretokenized .bin training format was dropped (at a slight cost in training speed).
+- The preprocessed pretrain file is now named pretrain_data.csv.
+- Removed some redundant code.
+
+</details>
+
+<details close>
+<summary> <b>2024-09-17</b> </summary>
 
 - Updated the minimind-v1-moe model

View File

@@ -84,6 +84,17 @@ We hope this open-source project helps LLM beginners get started quickly!
 ### 👉**Recent Updates**
 
+<details close>
+<summary> <b>2024-09-27</b> </summary>
+
+- 09-27: Updated the pretrain dataset preprocessing to preserve text integrity; preprocessing into a pretokenized .bin format has been dropped (slightly sacrificing training speed).
+- The preprocessed pretrain data file is now named pretrain_data.csv.
+- Removed some redundant code.
+
+</details>
+
 <details close>
 <summary> <b>2024-09-17 (new🎉)</b> </summary>
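Both changelog entries describe the same format change: raw text is now stored in a single-column CSV rather than a pretokenized .bin file. A quick way to inspect the resulting file (the `text` column name comes from the preprocessing script in this commit):

```python
# Peek at the pretrain CSV produced by data preprocessing.
import pandas as pd

df = pd.read_csv('./dataset/pretrain_data.csv')
print(df.columns.tolist())        # expected: ['text']
print(len(df), 'rows')
print(df['text'].iloc[0][:80])    # first 80 chars of the first sample
```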

View File

@@ -1,3 +1,4 @@
+import csv
 import itertools
 import re
 import json
@@ -12,110 +13,31 @@ from datasets import load_dataset
 bos_token = "<s>"
 eos_token = "</s>"
 
-# pretrain
-def process_wiki_clean():
-    with open('./dataset/clean-wikipedia-cn.json', 'r', encoding='utf-8') as f_read:
-        data = [ujson.loads(line) for line in f_read]
-        data_len = len(data)
-        doc_ids = []
-        for idx, line in enumerate(data):
-            text = line['response']
-            text_id = tokenizer(f'{bos_token}{text}{eos_token}').data['input_ids']
-            if len(text_id) > 5:
-                doc_ids += text_id
-            if idx % (int(data_len / 20)) == 0:
-                print(f"[{idx}/{data_len}] {text}")
-        arr = np.array(doc_ids, dtype=np.uint16)
-        with open('./dataset/clean-wikipedia-cn.bin', 'wb') as f:
-            f.write(arr.tobytes())
-
-# pretrain
-def process_other():
-    data = []
-    with open('./dataset/alpaca_gpt4_data_zh.json', 'r', encoding='utf-8') as f:
-        data_ = json.load(f)
-        data += data_
-    with open('./dataset/alpaca_data_zh_51k.json', 'r', encoding='utf-8') as f:
-        data_ = json.load(f)
-        data += data_
-    doc_ids = []
-    for idx, per in enumerate(data):
-        q = per['instruction']
-        i = per['input']
-        a = per['output']
-        q = q + i
-        if len(q) < 10 or len(a) < 5:
-            continue
-        if len(q) > 256 or len(a) > 256:
-            continue
-        text_id = tokenizer(f'{bos_token}{q}{a}{eos_token}').data['input_ids']
-        if len(text_id) > 5:
-            doc_ids += text_id
-        if idx % 50000 == 0:
-            print(idx, len(data))
-    arr = np.array(doc_ids, dtype=np.uint16)
-    with open('./dataset/clean_other.bin', 'wb') as f:
-        f.write(arr.tobytes())
-
-def process_seq_monkey(chunk_size=50000):
-    doc_ids = []
+def pretrain_process(chunk_size=50000):
     chunk_idx = 0
     with jsonlines.open('./dataset/mobvoi_seq_monkey_general_open_corpus.jsonl') as reader:
-        while True:
-            chunk = list(itertools.islice(reader, chunk_size))
-            if not chunk:
-                break
-            for idx, obj in enumerate(chunk):
-                try:
-                    content = obj.get('text', '')
-                    if len(content) > 512:
-                        continue
-                    text_id = tokenizer(f'{bos_token}{content}{eos_token}').data['input_ids']
-                    doc_ids += text_id
-                except UnicodeDecodeError as e:
-                    print(f"Skipping invalid line {chunk_idx * chunk_size + idx + 1}: {e}")
-                    continue
-            chunk_idx += 1
-            print(f"Processed chunk {chunk_idx} with {chunk_size} lines")
-            if len(doc_ids) > 1000000:
-                arr = np.array(doc_ids, dtype=np.uint16)
-                with open(f'./dataset/clean_seq_monkey.bin', 'ab') as f:
-                    f.write(arr.tobytes())
-                doc_ids = []
-    if doc_ids:
-        arr = np.array(doc_ids, dtype=np.uint16)
-        with open(f'./dataset/clean_seq_monkey.bin', 'ab') as f:
-            f.write(arr.tobytes())
-
-def pretrain_process():
-    # process_wiki_clean()
-    process_seq_monkey()
-
-    data_path_list = [
-        # './dataset/clean-wikipedia-cn.bin',
-        './dataset/clean_seq_monkey.bin'
-    ]
-    data_lst = []
-    for data_path in data_path_list:
-        with open(data_path, 'rb') as f:
-            data = np.fromfile(f, dtype=np.uint16)
-            data_lst.append(data)
-    arr = np.concatenate(data_lst)
-    print(arr.shape)
-    with open('./dataset/pretrain_data.bin', 'wb') as f:
-        f.write(arr.tobytes())
+        with open('./dataset/pretrain_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
+            writer = csv.writer(csvfile)
+            writer.writerow(['text'])
+            while True:
+                chunk = list(itertools.islice(reader, chunk_size))
+                if not chunk:
+                    break
+                for idx, obj in enumerate(chunk):
+                    try:
+                        content = obj.get('text', '')
+                        if len(content) > 512:
+                            continue
+                        writer.writerow([content])
+                    except UnicodeDecodeError as e:
+                        print(f"Skipping invalid line {chunk_idx * chunk_size + idx + 1}: {e}")
+                        continue
+                chunk_idx += 1
+                print('chunk:', ((chunk_idx - 1) * chunk_size, chunk_idx * chunk_size), 'process end')
 
 def sft_process(contain_history=False):
@@ -186,6 +108,7 @@ def sft_process(contain_history=False):
             process_and_write_data(data)
             data = []
 
 
 def rl_process():
     ################
     # Dataset
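The rewritten `pretrain_process` streams each JSONL row straight to the CSV writer via chunked `itertools.islice` reads, so memory stays flat and no token ids are accumulated. A hypothetical driver, appended to data_process.py (the output path ./dataset/pretrain_data.csv is hard-coded inside the function):

```python
# Hypothetical driver for the rewritten preprocessing entry point.
if __name__ == "__main__":
    pretrain_process(chunk_size=50000)
    # Afterwards, ./dataset/pretrain_data.csv holds one raw-text sample per
    # row under a single 'text' column; tokenization now happens lazily in
    # PretrainDataset.__getitem__ at training time.
```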

View File

@@ -13,37 +13,28 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
 class PretrainDataset(Dataset):
-    def __init__(self, data_path_lst, max_length=512, memmap=False):
+    def __init__(self, df, tokenizer, max_length=512):
         super().__init__()
-        #
-        if memmap:
-            with open(data_path_lst[0], 'r') as f:
-                nbytes = f.seek(0, 2)
-                flen = f.tell() // np.dtype('uint16').itemsize
-            self.data = np.memmap(data_path_lst[0], dtype=np.dtype('uint16'), shape=(flen // max_length, max_length))
-        else:
-            data_lst = []
-            for data_path in data_path_lst:
-                with open(data_path, 'rb') as f:
-                    data = np.fromfile(f, dtype=np.uint16)
-                    data_lst.append(data)
-            data = np.concatenate(data_lst)
-            data = data[:max_length * int(len(data) / max_length)]
-            # np.random.shuffle(data)
-            self.data = data.reshape(-1, max_length)
-        #
-        print("memmap:{} train data.shape:{}".format(memmap, self.data.shape))
-        print("downloading finished.....")
+        self.df = df
+        self.tokenizer = tokenizer
+        self.max_length = max_length
+        self.padding = 0
 
     def __len__(self):
-        return self.data.shape[0]
+        return self.df.shape[0]
 
     def __getitem__(self, index: int):
-        sample = self.data[index]
-        X = np.array(sample[:-1]).astype(np.int64)
-        Y = np.array(sample[1:]).astype(np.int64)
+        sample = self.df.iloc[index]
+        text = f"{self.tokenizer.bos_token}{str(sample['text'])}{self.tokenizer.eos_token}"
+        input_id = self.tokenizer(text).data['input_ids'][:self.max_length]
+        # pad out the remainder when the sequence is shorter than max_length
+        padding_len = self.max_length - len(input_id)
+        input_id = input_id + [self.padding] * padding_len
+        input_id = np.array(input_id)
+        X = np.array(input_id[:-1]).astype(np.int64)
+        Y = np.array(input_id[1:]).astype(np.int64)
         return torch.from_numpy(X), torch.from_numpy(Y)
@@ -56,7 +47,7 @@ class SFTDataset(Dataset):
         self.answer_max_len = answer_max_len
         #
         self.tokenizer = tokenizer
-        self.padding = 0  # self.tokenizer.special_tokens['<pad>']
+        self.padding = 0
         self.bos_id = self.tokenizer('<s>assistant').data['input_ids']
 
     def __len__(self):
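The X/Y construction in the new `PretrainDataset.__getitem__` is a standard next-token shift: both views come from the same padded id sequence, offset by one position. A small worked example with dummy token ids and padding id 0, as in the class (no tokenizer needed):

```python
# Demonstrates the pad-then-shift logic of PretrainDataset.__getitem__.
import numpy as np
import torch

max_length = 8
input_id = [1, 57, 102, 6, 2]                              # <s> ... </s> as dummy ids
input_id = input_id + [0] * (max_length - len(input_id))   # pad with id 0
input_id = np.array(input_id)

X = np.array(input_id[:-1]).astype(np.int64)  # model input
Y = np.array(input_id[1:]).astype(np.int64)   # target: next token at each step
print(torch.from_numpy(X))  # tensor([  1,  57, 102,   6,   2,   0,   0])
print(torch.from_numpy(Y))  # tensor([ 57, 102,   6,   2,   0,   0,   0])
```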