Update data preprocessing methods
parent d57037624b
commit a8ae342775
@@ -69,18 +69,11 @@ if __name__ == "__main__":
     max_seq_len = 512
     lm_config = LMConfig()
     lm_config.max_seq_len = max_seq_len
-    # Whether the chat carries conversation history (the current model is too weak; a longer history context mostly produces incoherent output)
-    contain_history_chat = False
     # -----------------------------------------------------------------------------

     model, tokenizer = init_model(lm_config)

     model = model.eval()
-    # Push to huggingface
-    # model.push_to_hub("minimind")
-    # tokenizer.push_to_hub("minimind")
-
-    # answer_way = int(input('Enter 0 for automatic testing, 1 for interactive questions: '))
+    # int(input('Enter 0 for automatic testing, 1 for interactive questions: '))
     answer_way = 0
     stream = True

@@ -101,15 +94,9 @@ if __name__ == "__main__":
         '江苏省的最好的大学',
     ]

-    messages_origin = []
-    messages = messages_origin
-
    qa_index = 0
    while True:
        start = time.time()
-        if not contain_history_chat:
-            messages = messages_origin.copy()
-
        if answer_way == 1:
            # run generation
            prompt = input('用户:')
@@ -120,20 +107,9 @@ if __name__ == "__main__":
            print('问题:', prompt)
            qa_index += 1

-        messages.append({"role": "user", "content": prompt})
-
-        # print(messages)
-        new_prompt = tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
-        )[-(max_seq_len - 1):]

        x = tokenizer(prompt).data['input_ids']
        x = (torch.tensor(x, dtype=torch.long, device=device)[None, ...])

-        answer = new_prompt
-
        with torch.no_grad():
            res_y = model.generate(x, tokenizer.eos_token_id, max_new_tokens=max_seq_len, temperature=temperature,
                                   top_k=top_k, stream=stream)
@@ -172,8 +148,5 @@ if __name__ == "__main__":

            print('\n')

-            if contain_history_chat:
-                assistant_answer = answer.replace(new_prompt, "")
-                messages.append({"role": "assistant", "content": assistant_answer})
            end = time.time()
            print(end - start, 's')
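Net effect of the hunks above: the chat-history plumbing is gone, and the prompt goes to the tokenizer directly instead of through `apply_chat_template`. A condensed sketch of the remaining generation path; names follow the hunks, while `temperature` and `top_k` are hypothetical values standing in for settings defined elsewhere in the script:

```python
# Condensed post-commit generation path (sketch, not the full script).
import torch

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
temperature = 0.7  # hypothetical; the real value is set elsewhere in the script
top_k = 16         # hypothetical

max_seq_len = 512
stream = True
model, tokenizer = init_model(lm_config)  # as in the hunk above
model = model.eval()

prompt = '江苏省的最好的大学'  # one of the script's built-in test questions
x = tokenizer(prompt).data['input_ids']
x = torch.tensor(x, dtype=torch.long, device=device)[None, ...]

with torch.no_grad():
    res_y = model.generate(x, tokenizer.eos_token_id, max_new_tokens=max_seq_len,
                           temperature=temperature, top_k=top_k, stream=stream)
```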
@@ -4,6 +4,8 @@ import argparse
 import time
 import math
 import warnings

+import pandas as pd
 import torch
 import torch.distributed as dist
 from torch import optim
@@ -11,6 +13,9 @@ from torch.nn.parallel import DistributedDataParallel
 from torch.optim.lr_scheduler import CosineAnnealingLR
 from torch.utils.data import DataLoader, DistributedSampler
 from contextlib import nullcontext

+from transformers import AutoTokenizer

 from model.model import Transformer
 from model.LMConfig import LMConfig
 from model.dataset import PretrainDataset
@@ -98,11 +103,13 @@ def init_model():
     def count_parameters(model):
         return sum(p.numel() for p in model.parameters() if p.requires_grad)

+    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
+
     model = Transformer(lm_config).to(args.device)
-    moe_path = '_moe' if lm_config.use_moe else ''
+    # moe_path = '_moe' if lm_config.use_moe else ''

     Logger(f'LLM总参数量:{count_parameters(model) / 1e6:.3f} 百万')
-    return model
+    return model, tokenizer


 def init_distributed_mode():
@@ -122,14 +129,15 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="MiniMind Pretraining")
    parser.add_argument("--out_dir", type=str, default="out", help="Output directory")
    parser.add_argument("--epochs", type=int, default=20, help="Number of epochs")
-    parser.add_argument("--batch_size", type=int, default=32, help="Batch size")
+    parser.add_argument("--batch_size", type=int, default=48, help="Batch size")
    parser.add_argument("--learning_rate", type=float, default=2e-4, help="Learning rate")
-    parser.add_argument("--device", type=str, default="cuda:0" if torch.cuda.is_available() else "cpu", help="Device to use")
+    parser.add_argument("--device", type=str, default="cuda:0" if torch.cuda.is_available() else "cpu",
+                        help="Device to use")
    parser.add_argument("--dtype", type=str, default="bfloat16", help="Data type")
    parser.add_argument("--use_wandb", action="store_true", help="Use Weights & Biases")
    parser.add_argument("--wandb_project", type=str, default="MiniMind-Pretrain", help="Weights & Biases project name")
    parser.add_argument("--num_workers", type=int, default=8, help="Number of workers for data loading")
-    parser.add_argument("--data_path", type=str, default="./dataset/pretrain_data.bin", help="Path to training data")
+    parser.add_argument("--data_path", type=str, default="./dataset/pretrain_data.csv", help="Path to training data")
    parser.add_argument("--ddp", action="store_true", help="Use DistributedDataParallel")
    parser.add_argument("--accumulation_steps", type=int, default=8, help="Gradient accumulation steps")
    parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping threshold")
@@ -160,12 +168,15 @@ if __name__ == "__main__":

    if args.use_wandb and (not ddp or ddp_local_rank == 0):
        import wandb

        wandb.init(project=args.wandb_project, name=args.wandb_run_name)
    else:
        wandb = None

-    data_path_list = [args.data_path]
-    train_ds = PretrainDataset(data_path_list, max_length=max_seq_len, memmap=True)
+    model, tokenizer = init_model()
+    df = pd.read_csv(args.data_path)
+    df = df.sample(frac=1.0)
+    train_ds = PretrainDataset(df, tokenizer, max_length=max_seq_len)
    train_sampler = DistributedSampler(train_ds) if ddp else None
    train_loader = DataLoader(
        train_ds,

@@ -177,8 +188,6 @@ if __name__ == "__main__":
        sampler=train_sampler
    )

-    model = init_model()
-
    scaler = torch.cuda.amp.GradScaler(enabled=(args.dtype in ['float16', 'bfloat16']))
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
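Taken together, the pretraining entry point now loads raw text instead of a token-id memmap. A minimal sketch of the post-commit data path, assuming the paths, defaults, and `PretrainDataset` signature shown in the hunks above:

```python
# Minimal sketch of the new CSV-based loading path (illustration, not the full script).
import pandas as pd
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

from model.dataset import PretrainDataset

max_seq_len = 512  # assumed; the script takes it from its LMConfig
tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

df = pd.read_csv('./dataset/pretrain_data.csv')  # single 'text' column
df = df.sample(frac=1.0)                         # shuffle whole rows, each text kept intact
train_ds = PretrainDataset(df, tokenizer, max_length=max_seq_len)
train_loader = DataLoader(train_ds, batch_size=48, num_workers=8)

X, Y = next(iter(train_loader))
print(X.shape, Y.shape)  # torch.Size([48, 511]) torch.Size([48, 511])
```

Tokenization now happens lazily in `__getitem__`, which is the "slight cost in training speed" the README notes below mention.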
README.md
@@ -78,7 +78,18 @@ https://github.com/user-attachments/assets/88b98128-636e-43bc-a419-b1b1403c2055
 ### 👉**Recent Updates**

 <details close>
-<summary> <b>2024-09-17 (new🎉)</b> </summary>
+<summary> <b>2024-09-27</b> </summary>
+
+- 09-27: Updated the pretrain dataset preprocessing. To preserve text integrity, pre-tokenizing into a .bin training file has been dropped (at a slight cost in training speed).
+
+- The preprocessed pretrain file is now named pretrain_data.csv.
+
+- Removed some redundant code.
+
+</details>
+
+<details close>
+<summary> <b>2024-09-17</b> </summary>

 - Updated the minimind-v1-moe model

README_en.md
@@ -84,6 +84,17 @@ We hope this open-source project helps LLM beginners get started quickly!

 ### 👉**Recent Updates**

+<details close>
+<summary> <b>2024-09-27</b> </summary>
+
+- 09-27: Updated the pretrain dataset preprocessing. To preserve text integrity, pre-tokenizing into a .bin training file has been dropped (at a slight cost in training speed).
+
+- The preprocessed pretrain file is now named pretrain_data.csv.
+
+- Removed some redundant code.
+
+</details>
+
 <details close>
 <summary> <b>2024-09-17 (new🎉)</b> </summary>

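Both README entries describe the same switch: instead of pre-tokenizing the corpus into a flat .bin of uint16 token ids, preprocessing now writes one raw-text row per document, and tokenization happens at training time. A hedged usage sketch, assuming `pretrain_process()` is importable as defined in the diff below and the seq-monkey corpus file is present under ./dataset/:

```python
# Regenerate and sanity-check the new pretrain_data.csv (sketch).
import pandas as pd

from data_process import pretrain_process

pretrain_process(chunk_size=50000)  # writes ./dataset/pretrain_data.csv

df = pd.read_csv('./dataset/pretrain_data.csv')
print(len(df))                      # number of text rows
print(df['text'].str.len().max())   # <= 512, per the length filter in the diff
```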
@@ -1,3 +1,4 @@
+import csv
 import itertools
 import re
 import json
@@ -12,62 +13,15 @@ from datasets import load_dataset
 bos_token = "<s>"
 eos_token = "</s>"

-# pretrain
-def process_wiki_clean():
-    with open('./dataset/clean-wikipedia-cn.json', 'r', encoding='utf-8') as f_read:
-        data = [ujson.loads(line) for line in f_read]
-        data_len = len(data)
-        doc_ids = []
-        for idx, line in enumerate(data):
-            text = line['response']
-            text_id = tokenizer(f'{bos_token}{text}{eos_token}').data['input_ids']
-            if len(text_id) > 5:
-                doc_ids += text_id
-            if idx % (int(data_len / 20)) == 0:
-                print(f"[{idx}/{data_len}] {text}")
-        arr = np.array(doc_ids, dtype=np.uint16)
-        with open('./dataset/clean-wikipedia-cn.bin', 'wb') as f:
-            f.write(arr.tobytes())
-
-# pretrain
-def process_other():
-    data = []
-
-    with open('./dataset/alpaca_gpt4_data_zh.json', 'r', encoding='utf-8') as f:
-        data_ = json.load(f)
-        data += data_
-
-    with open('./dataset/alpaca_data_zh_51k.json', 'r', encoding='utf-8') as f:
-        data_ = json.load(f)
-        data += data_
-
-    doc_ids = []
-    for idx, per in enumerate(data):
-        q = per['instruction']
-        i = per['input']
-        a = per['output']
-        q = q + i
-        if len(q) < 10 or len(a) < 5:
-            continue
-        if len(q) > 256 or len(a) > 256:
-            continue
-        text_id = tokenizer(f'{bos_token}{q},{a}{eos_token}').data['input_ids']
-        if len(text_id) > 5:
-            doc_ids += text_id
-        if idx % 50000 == 0:
-            print(idx, len(data))
-
-    arr = np.array(doc_ids, dtype=np.uint16)
-    with open('./dataset/clean_other.bin', 'wb') as f:
-        f.write(arr.tobytes())
-
-
-def process_seq_monkey(chunk_size=50000):
-    doc_ids = []
+def pretrain_process(chunk_size=50000):
     chunk_idx = 0

     with jsonlines.open('./dataset/mobvoi_seq_monkey_general_open_corpus.jsonl') as reader:
+        with open('./dataset/pretrain_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
+            writer = csv.writer(csvfile)
+            writer.writerow(['text'])
             while True:
                 chunk = list(itertools.islice(reader, chunk_size))
                 if not chunk:
@@ -78,44 +32,12 @@ def process_seq_monkey(chunk_size=50000):
                     content = obj.get('text', '')
                     if len(content) > 512:
                         continue
-                    text_id = tokenizer(f'{bos_token}{content}{eos_token}').data['input_ids']
-                    doc_ids += text_id
+                    writer.writerow([content])
                 except UnicodeDecodeError as e:
                     print(f"Skipping invalid line {chunk_idx * chunk_size + idx + 1}: {e}")
                     continue

                 chunk_idx += 1
-                print(f"Processed chunk {chunk_idx} with {chunk_size} lines")
+                print('chunk:', ((chunk_idx - 1) * chunk_size, chunk_idx * chunk_size), 'process end')

-    if len(doc_ids) > 1000000:
-        arr = np.array(doc_ids, dtype=np.uint16)
-        with open(f'./dataset/clean_seq_monkey.bin', 'ab') as f:
-            f.write(arr.tobytes())
-        doc_ids = []
-
-    if doc_ids:
-        arr = np.array(doc_ids, dtype=np.uint16)
-        with open(f'./dataset/clean_seq_monkey.bin', 'ab') as f:
-            f.write(arr.tobytes())
-
-
-def pretrain_process():
-    # process_wiki_clean()
-    process_seq_monkey()
-
-    data_path_list = [
-        # './dataset/clean-wikipedia-cn.bin',
-        './dataset/clean_seq_monkey.bin'
-    ]
-    data_lst = []
-    for data_path in data_path_list:
-        with open(data_path, 'rb') as f:
-            data = np.fromfile(f, dtype=np.uint16)
-            data_lst.append(data)
-    arr = np.concatenate(data_lst)
-    print(arr.shape)
-    with open('./dataset/pretrain_data.bin', 'wb') as f:
-        f.write(arr.tobytes())
-
-
 def sft_process(contain_history=False):

@@ -186,6 +108,7 @@ def sft_process(contain_history=False):
         process_and_write_data(data)
         data = []

+
 def rl_process():
     ################
     # Dataset
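A note on why CSV serves the "text integrity" goal from the commit message: the old path concatenated all token ids and sliced them into fixed 512-token rows, routinely splitting documents mid-text, whereas the new path stores each document as one CSV row, with `csv.writer` quoting embedded commas and newlines so every row survives a round trip. A small self-contained illustration (the strings are hypothetical):

```python
import csv
import io

# Hypothetical documents containing characters that would corrupt a naive text dump.
docs = ['first doc, with a comma', 'second doc\nwith a newline']

buf = io.StringIO()
writer = csv.writer(buf)
writer.writerow(['text'])  # header row, as written by pretrain_process
for doc in docs:
    writer.writerow([doc])

buf.seek(0)
rows = [row[0] for row in csv.reader(buf)][1:]  # drop the header
assert rows == docs  # each document round-trips intact
```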
@@ -13,37 +13,28 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"


 class PretrainDataset(Dataset):
-    def __init__(self, data_path_lst, max_length=512, memmap=False):
+    def __init__(self, df, tokenizer, max_length=512):
         super().__init__()
-        #
-        if memmap:
-            with open(data_path_lst[0], 'r') as f:
-                nbytes = f.seek(0, 2)
-                flen = f.tell() // np.dtype('uint16').itemsize
-            self.data = np.memmap(data_path_lst[0], dtype=np.dtype('uint16'), shape=(flen // max_length, max_length))
-        else:
-            data_lst = []
-            for data_path in data_path_lst:
-                with open(data_path, 'rb') as f:
-                    data = np.fromfile(f, dtype=np.uint16)
-                    data_lst.append(data)
-            data = np.concatenate(data_lst)
-            data = data[:max_length * int(len(data) / max_length)]
-            # np.random.shuffle(data)
-            self.data = data.reshape(-1, max_length)
-        #
-        print("memmap:{} train data.shape:{}".format(memmap, self.data.shape))
-        print("downloading finished.....")
+        self.df = df
+        self.tokenizer = tokenizer
+        self.max_length = max_length
+        self.padding = 0

     def __len__(self):
-        return self.data.shape[0]
+        return self.df.shape[0]

     def __getitem__(self, index: int):
         #
-        sample = self.data[index]
-        X = np.array(sample[:-1]).astype(np.int64)
-        Y = np.array(sample[1:]).astype(np.int64)
+        sample = self.df.iloc[index]
+        text = f"{self.tokenizer.bos_token}{str(sample['text'])}{self.tokenizer.eos_token}"
+        input_id = self.tokenizer(text).data['input_ids'][:self.max_length]
+        # pad the remainder up to max_length
+        padding_len = self.max_length - len(input_id)
+        input_id = input_id + [self.padding] * padding_len
+
+        input_id = np.array(input_id)
+        X = np.array(input_id[:-1]).astype(np.int64)
+        Y = np.array(input_id[1:]).astype(np.int64)
         return torch.from_numpy(X), torch.from_numpy(Y)
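The rewritten `__getitem__` pads each tokenized text up to `max_length` and then builds the next-token-prediction pair with a one-position shift. A tiny worked example with hypothetical token ids:

```python
import numpy as np

max_length = 8
input_id = [1, 34, 56, 78, 2]  # hypothetical ids for "<s> ... </s>"
input_id = input_id + [0] * (max_length - len(input_id))  # pad with id 0

arr = np.array(input_id)
X = arr[:-1].astype(np.int64)  # model input : [ 1, 34, 56, 78,  2,  0,  0]
Y = arr[1:].astype(np.int64)   # next tokens : [34, 56, 78,  2,  0,  0,  0]
```

Note that at the dataset level the padded positions flow into Y as ordinary targets; any loss masking would have to happen in the training loop, which this diff does not show.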
@@ -56,7 +47,7 @@ class SFTDataset(Dataset):
         self.answer_max_len = answer_max_len
         #
         self.tokenizer = tokenizer
-        self.padding = 0  # self.tokenizer.special_tokens['<pad>']
+        self.padding = 0
         self.bos_id = self.tokenizer('<s>assistant').data['input_ids']

     def __len__(self):