update scripts file
Parent: 58e3af0359
Commit: 29d6cd9cd2

@@ -1,153 +0,0 @@
import random
import time

import numpy as np
import torch
import warnings
from transformers import AutoTokenizer, AutoModelForCausalLM
from model.model import Transformer
from model.LMConfig import LMConfig

warnings.filterwarnings('ignore')


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def init_model(lm_config):
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
    model_from = 1  # 1: load local .pth weights, 2: load via transformers

    if model_from == 1:
        moe_path = '_moe' if lm_config.use_moe else ''
        ckp = f'./out/pretrain_{lm_config.dim}{moe_path}.pth'

        model = Transformer(lm_config)
        state_dict = torch.load(ckp, map_location=device)

        # Strip the prefix that torch.compile adds to parameter names
        unwanted_prefix = '_orig_mod.'
        for k, v in list(state_dict.items()):
            if k.startswith(unwanted_prefix):
                state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)

        for k, v in list(state_dict.items()):
            if 'mask' in k:
                del state_dict[k]

        # Load the cleaned state dict into the model
        model.load_state_dict(state_dict, strict=False)
    else:
        model = AutoModelForCausalLM.from_pretrained('minimind', trust_remote_code=True)
    model = model.to(device)

    print(f'Model parameters: {count_parameters(model) / 1e6} million = {count_parameters(model) / 1e9} B (billion)')
    return model, tokenizer


def setup_seed(seed):
    random.seed(seed)  # Python RNG seed
    np.random.seed(seed)  # NumPy RNG seed
    torch.manual_seed(seed)  # PyTorch RNG seed
    torch.cuda.manual_seed(seed)  # seed for the current GPU (if any)
    torch.cuda.manual_seed_all(seed)  # seed for all GPUs (if any)
    torch.backends.cudnn.deterministic = True  # force deterministic convolution algorithms
    torch.backends.cudnn.benchmark = False  # disable cuDNN auto-tuning, which is nondeterministic


if __name__ == "__main__":
    # -----------------------------------------------------------------------------
    out_dir = 'out'
    start = ""
    temperature = 0.7
    top_k = 8
    setup_seed(1337)
    # device = 'cpu'
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    dtype = 'bfloat16'
    max_seq_len = 512
    lm_config = LMConfig()
    lm_config.max_seq_len = max_seq_len
    # -----------------------------------------------------------------------------

    model, tokenizer = init_model(lm_config)
    model = model.eval()
    # int(input('Enter 0 for the automatic test, 1 for interactive questions: '))
    answer_way = 0
    stream = True

    # Chinese test prompts (the model is pretrained on Chinese data)
    prompt_datas = [
        '椭圆和圆的区别',
        '中国关于马克思主义基本原理',
        '人类大脑的主要功能是',
        '万有引力是',
        '世界上人口最多的国家是',
        'DNA的全称是',
        '数学中π的值大约是',
        '世界上最高的山峰是',
        '太阳系中最大的行星是',
        '二氧化碳的化学分子式是',
        '地球上最大的动物是',
        '地球自转一圈大约需要',
        '杭州市的美食有',
        '江苏省的最好的大学',
    ]

    qa_index = 0
    while True:
        start = time.time()
        if answer_way == 1:
            # run generation
            prompt = input('User: ')
        else:
            if qa_index >= len(prompt_datas):
                break
            prompt = prompt_datas[qa_index]
            print('Q:', prompt)
            qa_index += 1

        prompt = tokenizer.bos_token + prompt
        x = tokenizer(prompt).data['input_ids']
        x = (torch.tensor(x, dtype=torch.long, device=device)[None, ...])

        with torch.no_grad():
            res_y = model.generate(x, tokenizer.eos_token_id, max_new_tokens=max_seq_len, temperature=temperature,
                                   top_k=top_k, stream=stream)
            print('A: ', end='')
            try:
                y = next(res_y)
            except StopIteration:
                print("No answer")
                continue

            history_idx = 0
            while y is not None:
                answer = tokenizer.decode(y[0].tolist())
                if answer and answer[-1] == '\ufffd':  # U+FFFD: a token boundary split a multi-byte character
                    try:
                        y = next(res_y)
                    except StopIteration:
                        break
                    continue
                # print(answer)
                if not len(answer):
                    try:
                        y = next(res_y)
                    except StopIteration:
                        break
                    continue

                print(answer[history_idx:], end='', flush=True)
                try:
                    y = next(res_y)
                except StopIteration:
                    break
                history_idx = len(answer)
                if not stream:
                    break

            print('\n')

        end = time.time()
        print(end - start, 's')
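
Note on the streaming loop above: it re-decodes the full sequence on every step and prints only the new suffix past history_idx, skipping chunks that end in U+FFFD (an incomplete multi-byte character). A minimal sketch of the same pattern, assuming the tokenizer from this script and a hypothetical toy_stream generator standing in for model.generate(..., stream=True):

    import torch

    def toy_stream():
        ids = [1, 2, 3]                      # hypothetical token ids
        for n in range(1, len(ids) + 1):
            yield torch.tensor([ids[:n]])    # growing (1, n) tensor, like generate()

    history_idx = 0
    for y in toy_stream():
        text = tokenizer.decode(y[0].tolist())
        if text and text[-1] == '\ufffd':    # wait until the character is complete
            continue
        print(text[history_idx:], end='', flush=True)
        history_idx = len(text)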

1-pretrain.py
@@ -1,209 +0,0 @@
import os
import platform
import argparse
import time
import math
import warnings

import pandas as pd
import torch
import torch.distributed as dist
from torch import optim
from torch.nn.parallel import DistributedDataParallel
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import DataLoader, DistributedSampler
from contextlib import nullcontext

from transformers import AutoTokenizer

from model.model import Transformer
from model.LMConfig import LMConfig
from model.dataset import PretrainDataset

warnings.filterwarnings('ignore')


def Logger(content):
    # Only rank 0 prints under DDP
    if not ddp or dist.get_rank() == 0:
        print(content)


def get_lr(it, total_iters):
    # Linear warmup, then cosine decay from learning_rate down to learning_rate / 10
    warmup_iters = args.warmup_iters
    lr_decay_iters = total_iters
    min_lr = args.learning_rate / 10

    if it < warmup_iters:
        return args.learning_rate * it / warmup_iters
    if it > lr_decay_iters:
        return min_lr
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (args.learning_rate - min_lr)


def train_epoch(epoch, wandb):
    start_time = time.time()
    for step, (X, Y, loss_mask) in enumerate(train_loader):
        X = X.to(args.device)
        Y = Y.to(args.device)
        loss_mask = loss_mask.to(args.device)

        lr = get_lr(epoch * iter_per_epoch + step, args.epochs * iter_per_epoch)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        with ctx:
            out = model(X, Y)
            loss = out.last_loss / args.accumulation_steps
            loss_mask = loss_mask.view(-1)
            loss = torch.sum(loss * loss_mask) / loss_mask.sum()

        scaler.scale(loss).backward()

        if (step + 1) % args.accumulation_steps == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)

            scaler.step(optimizer)
            scaler.update()

            optimizer.zero_grad(set_to_none=True)

        if step % args.log_interval == 0:
            spend_time = time.time() - start_time
            Logger(
                'Epoch:[{}/{}]({}/{}) loss:{:.3f} lr:{:.7f} epoch_Time:{}min:'.format(
                    epoch,
                    args.epochs,
                    step,
                    iter_per_epoch,
                    loss.item() * args.accumulation_steps,
                    optimizer.param_groups[-1]['lr'],
                    spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60))

            if (wandb is not None) and (not ddp or dist.get_rank() == 0):
                wandb.log({"loss": loss.item() * args.accumulation_steps,
                           "lr": optimizer.param_groups[-1]['lr'],
                           "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60})

        if (step + 1) % args.save_interval == 0 and (not ddp or dist.get_rank() == 0):
            model.eval()
            moe_path = '_moe' if lm_config.use_moe else ''
            ckp = f'{args.save_dir}/pretrain_{lm_config.dim}{moe_path}.pth'

            if isinstance(model, torch.nn.parallel.DistributedDataParallel):
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()

            torch.save(state_dict, ckp)
            model.train()


def init_model():
    def count_parameters(model):
        return sum(p.numel() for p in model.parameters() if p.requires_grad)

    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')

    model = Transformer(lm_config).to(args.device)
    # moe_path = '_moe' if lm_config.use_moe else ''

    Logger(f'Total LLM parameters: {count_parameters(model) / 1e6:.3f} million')
    return model, tokenizer


def init_distributed_mode():
    if not ddp: return
    global ddp_local_rank, DEVICE

    dist.init_process_group(backend="nccl")
    ddp_rank = int(os.environ["RANK"])
    ddp_local_rank = int(os.environ["LOCAL_RANK"])
    ddp_world_size = int(os.environ["WORLD_SIZE"])
    DEVICE = f"cuda:{ddp_local_rank}"
    torch.cuda.set_device(DEVICE)


# torchrun --nproc_per_node 2 1-pretrain.py
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="MiniMind Pretraining")
    parser.add_argument("--out_dir", type=str, default="out", help="Output directory")
    parser.add_argument("--epochs", type=int, default=20, help="Number of epochs")
    parser.add_argument("--batch_size", type=int, default=64, help="Batch size")
    parser.add_argument("--learning_rate", type=float, default=2e-4, help="Learning rate")
    parser.add_argument("--device", type=str, default="cuda:0" if torch.cuda.is_available() else "cpu",
                        help="Device to use")
    parser.add_argument("--dtype", type=str, default="bfloat16", help="Data type")
    parser.add_argument("--use_wandb", action="store_true", help="Use Weights & Biases")
    parser.add_argument("--wandb_project", type=str, default="MiniMind-Pretrain", help="Weights & Biases project name")
    parser.add_argument("--num_workers", type=int, default=1, help="Number of workers for data loading")
    parser.add_argument("--data_path", type=str, default="./dataset/pretrain_data.csv", help="Path to training data")
    parser.add_argument("--ddp", action="store_true", help="Use DistributedDataParallel")
    parser.add_argument("--accumulation_steps", type=int, default=8, help="Gradient accumulation steps")
    parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping threshold")
    parser.add_argument("--warmup_iters", type=int, default=0, help="Number of warmup iterations")
    parser.add_argument("--log_interval", type=int, default=100, help="Logging interval")
    parser.add_argument("--save_interval", type=int, default=1000, help="Model saving interval")
    parser.add_argument('--local_rank', type=int, default=-1, help='local rank for distributed training')

    args = parser.parse_args()

    lm_config = LMConfig()
    max_seq_len = lm_config.max_seq_len
    args.save_dir = os.path.join(args.out_dir)
    os.makedirs(args.save_dir, exist_ok=True)
    os.makedirs(args.out_dir, exist_ok=True)
    tokens_per_iter = args.batch_size * max_seq_len
    torch.manual_seed(1337)
    device_type = "cuda" if "cuda" in args.device else "cpu"

    args.wandb_run_name = f"MiniMind-Pretrain-Epoch-{args.epochs}-BatchSize-{args.batch_size}-LearningRate-{args.learning_rate}"

    ctx = nullcontext() if device_type == "cpu" else torch.cuda.amp.autocast()

    ddp = int(os.environ.get("RANK", -1)) != -1  # is this a ddp run?
    ddp_local_rank, DEVICE = 0, "cuda:0"
    if ddp:
        init_distributed_mode()
        args.device = torch.device(DEVICE)

    if args.use_wandb and (not ddp or ddp_local_rank == 0):
        import wandb

        wandb.init(project=args.wandb_project, name=args.wandb_run_name)
    else:
        wandb = None

    model, tokenizer = init_model()
    df = pd.read_csv(args.data_path)
    df = df.sample(frac=1.0)  # shuffle the corpus
    train_ds = PretrainDataset(df, tokenizer, max_length=max_seq_len)
    train_sampler = DistributedSampler(train_ds) if ddp else None
    train_loader = DataLoader(
        train_ds,
        batch_size=args.batch_size,
        pin_memory=True,
        drop_last=False,
        shuffle=False,
        num_workers=args.num_workers,
        sampler=train_sampler
    )

    scaler = torch.cuda.amp.GradScaler(enabled=(args.dtype in ['float16', 'bfloat16']))
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

    # torch.compile is deliberately disabled here (guarded by `False`)
    if False and platform.system() != 'Windows' and float(torch.__version__.split('.')[0]) >= 2:
        Logger("compiling the model... (takes a ~minute)")
        unoptimized_model = model
        model = torch.compile(model)

    if ddp:
        model._ddp_params_and_buffers_to_ignore = {"pos_cis"}
        model = DistributedDataParallel(model, device_ids=[ddp_local_rank])

    iter_per_epoch = len(train_loader)
    for epoch in range(args.epochs):
        train_epoch(epoch, wandb)
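
Note on the schedule: get_lr implements linear warmup followed by cosine decay down to learning_rate / 10. A self-contained sketch with this script's defaults (learning_rate=2e-4, warmup_iters=0) over a hypothetical 10,000-iteration run:

    import math

    def cosine_lr(it, total_iters, base_lr=2e-4, warmup_iters=0):
        min_lr = base_lr / 10
        if it < warmup_iters:
            return base_lr * it / warmup_iters
        if it > total_iters:
            return min_lr
        ratio = (it - warmup_iters) / (total_iters - warmup_iters)
        coeff = 0.5 * (1.0 + math.cos(math.pi * ratio))   # goes 1 -> 0 over the run
        return min_lr + coeff * (base_lr - min_lr)

    for it in (0, 2500, 5000, 7500, 10000):
        print(it, f'{cosine_lr(it, 10000):.2e}')
    # 0 2.00e-04, 2500 1.74e-04, 5000 1.10e-04, 7500 4.64e-05, 10000 2.00e-05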

2-eval_chat.py
@@ -1,186 +0,0 @@
import random
import time

import numpy as np
import torch
import warnings
from transformers import AutoTokenizer, AutoModelForCausalLM
from model.model import Transformer
from model.LMConfig import LMConfig

warnings.filterwarnings('ignore')


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def init_model(lm_config):
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
    model_from = 1  # 1: load local .pth weights, 2: load via transformers

    if model_from == 1:
        moe_path = '_moe' if lm_config.use_moe else ''
        # ckp = f'./out/multi_chat/full_sft_{lm_config.dim}{moe_path}.pth'

        model = Transformer(lm_config)
        # state_dict = torch.load(ckp, map_location=device)

        # # Strip the prefix that torch.compile adds to parameter names
        # unwanted_prefix = '_orig_mod.'
        # for k, v in list(state_dict.items()):
        #     if k.startswith(unwanted_prefix):
        #         state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
        #
        # for k, v in list(state_dict.items()):
        #     if 'mask' in k:
        #         del state_dict[k]
        #
        # # Load the cleaned state dict into the model
        # model.load_state_dict(state_dict, strict=False)
    else:
        model = AutoModelForCausalLM.from_pretrained('./minimind-v1-small', trust_remote_code=True)
    model = model.to(device)

    print(f'Model parameters: {count_parameters(model) / 1e6} million = {count_parameters(model) / 1e9} B (billion)')
    return model, tokenizer


def setup_seed(seed):
    random.seed(seed)  # Python RNG seed
    np.random.seed(seed)  # NumPy RNG seed
    torch.manual_seed(seed)  # PyTorch RNG seed
    torch.cuda.manual_seed(seed)  # seed for the current GPU (if any)
    torch.cuda.manual_seed_all(seed)  # seed for all GPUs (if any)
    torch.backends.cudnn.deterministic = True  # force deterministic convolution algorithms
    torch.backends.cudnn.benchmark = False  # disable cuDNN auto-tuning, which is nondeterministic


if __name__ == "__main__":
    # -----------------------------------------------------------------------------
    out_dir = 'out'
    start = ""
    temperature = 0.1
    top_k = 16
    # device = 'cpu'
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    dtype = 'bfloat16'
    max_seq_len = 1 * 1024
    lm_config = LMConfig()
    lm_config.max_seq_len = max_seq_len
    # Whether to carry the chat history (the current model was not trained on multi-turn
    # data, so a longer history adds essentially no new question-answering ability)
    contain_history_chat = False
    # -----------------------------------------------------------------------------

    model, tokenizer = init_model(lm_config)

    model = model.eval()
    # Push to Hugging Face
    # model.push_to_hub("minimind")
    # tokenizer.push_to_hub("minimind")

    # answer_way = int(input('Enter 0 for the automatic test, 1 for interactive questions: '))
    answer_way = 0
    stream = True

    # Chinese test questions (the model is trained on Chinese data)
    prompt_datas = [
        '你叫什么名字',
        '你是谁',
        '中国有哪些比较好的大学?',
        '全世界最好的大学是什么?',
        '你知道光速是多少吗?',
        '你知道长江吗?',
        '人类的血液主要由哪些成分组成?',
        '第一颗人造卫星是哪个国家发射的?',
        '你知道杭州有什么美食吗?',
        '你知道泰山在哪里吗?',
        '地球上最大的动物是什么?',
        '地球自转一圈大约需要多少时间?',
        '人类最早使用的金属是什么?',
        '水的化学分子式是什么?',
        '大气层中含量最多的气体是什么?',
        '世界上最高的山峰是什么?',
        '你知道世界上最深的海沟是什么吗?',
        '最早发明印刷术的是哪个国家?',
        '万有引力是谁提出的?',
        '光合作用的主要原理是什么?',
        '你知道大熊猫的主要食物是什么吗?',
        '海水为什么是咸的?',
        '我们平时喝的牛奶主要含有什么营养成分?',
        '一星期有多少天?'
    ]

    messages_origin = []
    messages = messages_origin

    i = 0
    while i < len(prompt_datas):
        # Generate a random seed
        # random_seed = random.randint(0, 2 ** 32 - 1)
        # setup_seed(random_seed)
        if not contain_history_chat:
            messages = messages_origin.copy()

        if answer_way == 1:
            prompt = input('[Q]: ')
        else:
            prompt = prompt_datas[i]
            print(f'[Q]: {prompt}')
            i += 1

        prompt = '请问,' + prompt  # prepend a polite "May I ask," in Chinese
        messages.append({"role": "user", "content": prompt})

        # print(messages)
        new_prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )[-(max_seq_len - 1):]

        x = tokenizer(new_prompt).data['input_ids']
        x = (torch.tensor(x, dtype=torch.long, device=device)[None, ...])

        answer = new_prompt

        with torch.no_grad():
            res_y = model.generate(x, tokenizer.eos_token_id, max_new_tokens=max_seq_len, temperature=temperature,
                                   top_k=top_k, stream=stream)
            print('[A]: ', end='')
            try:
                y = next(res_y)
            except StopIteration:
                print("No answer")
                continue

            history_idx = 0
            while y is not None:
                answer = tokenizer.decode(y[0].tolist())
                if answer and answer[-1] == '\ufffd':  # U+FFFD: a token boundary split a multi-byte character
                    try:
                        y = next(res_y)
                    except StopIteration:
                        break
                    continue
                # print(answer)
                if not len(answer):
                    try:
                        y = next(res_y)
                    except StopIteration:
                        break
                    continue

                print(answer[history_idx:], end='', flush=True)
                try:
                    y = next(res_y)
                except StopIteration:
                    break
                history_idx = len(answer)
                if not stream:
                    break

            print('\n')

        if contain_history_chat:
            assistant_answer = answer.replace(new_prompt, "")
            messages.append({"role": "assistant", "content": assistant_answer})
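
Note on the prompt construction: the chat loop builds an OpenAI-style messages list and renders it with the tokenizer's chat template before tokenizing. The exact markup comes from the chat_template shipped with minimind_tokenizer, so the rendered string in this sketch is purely illustrative:

    messages = [{"role": "user", "content": "请问,你叫什么名字"}]
    new_prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # e.g. something like '<s>user\n请问,你叫什么名字</s>\n<s>assistant\n', depending
    # on the template; the script then keeps only the last max_seq_len - 1 characters
    # and tokenizes as usual.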

3-full_sft.py
@@ -1,216 +0,0 @@
import os
import platform
import argparse
import time
import math
import warnings

import pandas as pd
import torch
import torch.nn.functional as F
import torch.distributed as dist
from contextlib import nullcontext

from torch import optim
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DataLoader, DistributedSampler
from transformers import AutoTokenizer, AutoModelForCausalLM
from model.model import Transformer
from model.LMConfig import LMConfig
from model.dataset import SFTDataset

warnings.filterwarnings('ignore')


def Logger(content):
    # Only rank 0 prints under DDP
    if not ddp or dist.get_rank() == 0:
        print(content)


def get_lr(it, total_iters):
    # Linear warmup, then cosine decay from learning_rate down to learning_rate / 10
    warmup_iters = args.warmup_iters
    lr_decay_iters = total_iters
    min_lr = args.learning_rate / 10

    if it < warmup_iters:
        return args.learning_rate * it / warmup_iters
    if it > lr_decay_iters:
        return min_lr
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (args.learning_rate - min_lr)


def train_epoch(epoch, wandb):
    start_time = time.time()
    for step, (X, Y, loss_mask) in enumerate(train_loader):
        X = X.to(args.device)
        Y = Y.to(args.device)
        loss_mask = loss_mask.to(args.device)
        lr = get_lr(epoch * iter_per_epoch + step, args.epochs * iter_per_epoch)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        with ctx:
            logits = model(X, Y).logits
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), Y.view(-1), ignore_index=0, reduction='none')
            loss_mask = loss_mask.view(-1)
            loss = torch.sum(loss * loss_mask) / loss_mask.sum()

        scaler.scale(loss).backward()

        if (step + 1) % args.accumulation_steps == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)

            scaler.step(optimizer)
            scaler.update()

            optimizer.zero_grad(set_to_none=True)

        if step % args.log_interval == 0:
            spend_time = time.time() - start_time
            Logger(
                'Epoch:[{}/{}]({}/{}) loss:{:.3f} lr:{:.7f} epoch_Time:{}min:'.format(
                    epoch,
                    args.epochs,
                    step,
                    iter_per_epoch,
                    loss.item(),
                    optimizer.param_groups[-1]['lr'],
                    spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60))

            if (wandb is not None) and (not ddp or dist.get_rank() == 0):
                wandb.log({"loss": loss.item(),
                           "lr": optimizer.param_groups[-1]['lr'],
                           "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60})

        if (step + 1) % args.save_interval == 0 and (not ddp or dist.get_rank() == 0):
            model.eval()
            moe_path = '_moe' if lm_config.use_moe else ''
            ckp = f'{args.save_dir}/full_sft_{lm_config.dim}{moe_path}.pth'

            if isinstance(model, torch.nn.parallel.DistributedDataParallel):
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()

            torch.save(state_dict, ckp)
            model.train()


def init_model():
    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
    model_from = 1  # 1: load local pretrain weights, 2: load via transformers

    def count_parameters(model):
        return sum(p.numel() for p in model.parameters() if p.requires_grad)

    if model_from == 1:
        model = Transformer(lm_config)
        moe_path = '_moe' if lm_config.use_moe else ''
        ckp = f'./out/pretrain_{lm_config.dim}{moe_path}.pth'
        state_dict = torch.load(ckp, map_location=args.device)
        unwanted_prefix = '_orig_mod.'
        for k, v in list(state_dict.items()):
            if k.startswith(unwanted_prefix):
                state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
        model.load_state_dict(state_dict, strict=False)
    else:
        model = AutoModelForCausalLM.from_pretrained('./minimind-v1-small', trust_remote_code=True)

    Logger(f'Total LLM parameters: {count_parameters(model) / 1e6:.3f} million')
    model = model.to(args.device)

    return model, tokenizer


def init_distributed_mode():
    if not ddp: return
    global ddp_local_rank, DEVICE

    dist.init_process_group(backend="nccl")
    ddp_rank = int(os.environ["RANK"])
    ddp_local_rank = int(os.environ["LOCAL_RANK"])
    ddp_world_size = int(os.environ["WORLD_SIZE"])
    DEVICE = f"cuda:{ddp_local_rank}"
    torch.cuda.set_device(DEVICE)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="MiniMind Full SFT")
    parser.add_argument("--out_dir", type=str, default="out", help="Output directory")
    parser.add_argument("--epochs", type=int, default=19, help="Number of epochs")
    parser.add_argument("--batch_size", type=int, default=32, help="Batch size")
    parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate")
    parser.add_argument("--device", type=str, default="cuda:0" if torch.cuda.is_available() else "cpu", help="Device to use")
    parser.add_argument("--dtype", type=str, default="bfloat16", help="Data type")
    parser.add_argument("--use_wandb", action="store_true", help="Use Weights & Biases")
    parser.add_argument("--wandb_project", type=str, default="MiniMind-Full-SFT", help="Weights & Biases project name")
    parser.add_argument("--num_workers", type=int, default=1, help="Number of workers for data loading")
    parser.add_argument("--ddp", action="store_true", help="Use DistributedDataParallel")
    parser.add_argument("--accumulation_steps", type=int, default=1, help="Gradient accumulation steps")
    parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping threshold")
    parser.add_argument("--warmup_iters", type=int, default=0, help="Number of warmup iterations")
    parser.add_argument("--log_interval", type=int, default=100, help="Logging interval")
    parser.add_argument("--save_interval", type=int, default=1000, help="Model saving interval")
    parser.add_argument('--local_rank', type=int, default=-1, help='local rank for distributed training')

    args = parser.parse_args()

    lm_config = LMConfig()
    max_seq_len = lm_config.max_seq_len
    args.save_dir = os.path.join(args.out_dir)
    os.makedirs(args.save_dir, exist_ok=True)
    os.makedirs(args.out_dir, exist_ok=True)
    tokens_per_iter = args.batch_size * max_seq_len
    torch.manual_seed(1337)
    device_type = "cuda" if "cuda" in args.device else "cpu"

    args.wandb_run_name = f"MiniMind-Full-SFT-Epoch-{args.epochs}-BatchSize-{args.batch_size}-LearningRate-{args.learning_rate}"

    ctx = nullcontext() if device_type == "cpu" else torch.cuda.amp.autocast()
    ddp = int(os.environ.get("RANK", -1)) != -1  # is this a ddp run?
    ddp_local_rank, DEVICE = 0, "cuda:0"
    if ddp:
        init_distributed_mode()
        args.device = torch.device(DEVICE)

    if args.use_wandb and (not ddp or ddp_local_rank == 0):
        import wandb
        wandb.init(project=args.wandb_project, name=args.wandb_run_name)
    else:
        wandb = None

    model, tokenizer = init_model()

    df = pd.read_csv('./dataset/sft_data_single.csv')
    df = df.sample(frac=1.0)  # shuffle the samples
    train_ds = SFTDataset(df, tokenizer, max_length=max_seq_len)
    train_sampler = DistributedSampler(train_ds) if ddp else None
    train_loader = DataLoader(
        train_ds,
        batch_size=args.batch_size,
        pin_memory=True,
        drop_last=False,
        shuffle=False,
        num_workers=args.num_workers,
        sampler=train_sampler
    )

    scaler = torch.cuda.amp.GradScaler(enabled=(args.dtype in ['float16', 'bfloat16']))
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

    # torch.compile is deliberately disabled here (guarded by `False`)
    if False and not lm_config.use_moe and platform.system() != 'Windows' and float(torch.__version__.split('.')[0]) >= 2:
        Logger("compiling the model... (takes a ~minute)")
        unoptimized_model = model
        model = torch.compile(model)

    if ddp:
        model._ddp_params_and_buffers_to_ignore = {"pos_cis"}
        model = DistributedDataParallel(model, device_ids=[ddp_local_rank])

    iter_per_epoch = len(train_loader)
    for epoch in range(args.epochs):
        train_epoch(epoch, wandb)
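
Note on the SFT loss: it is token-level cross-entropy with reduction='none', re-weighted by loss_mask so that only answer tokens contribute, while ignore_index=0 additionally drops padding positions. A self-contained sketch with dummy shapes:

    import torch
    import torch.nn.functional as F

    B, T, V = 2, 4, 10                              # batch, sequence length, vocab
    logits = torch.randn(B, T, V)
    Y = torch.randint(1, V, (B, T))                 # target ids (0 is padding)
    loss_mask = torch.tensor([[0., 0., 1., 1.],
                              [0., 1., 1., 1.]])    # 1 = assistant/answer token

    loss = F.cross_entropy(logits.view(-1, V), Y.view(-1),
                           ignore_index=0, reduction='none')
    loss = torch.sum(loss * loss_mask.view(-1)) / loss_mask.sum()
    print(loss.item())  # mean loss over the masked (answer) positions only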

4-lora_sft.py
@@ -1,188 +0,0 @@
import os
import platform
import argparse
import time
import math
import warnings
import torch
import pandas as pd
import torch.nn.functional as F
from contextlib import nullcontext

from torch import optim
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from peft import get_peft_model, LoraConfig, TaskType
from torch.utils.data import DataLoader
from model.LMConfig import LMConfig
from model.dataset import SFTDataset
from model.model import Transformer

warnings.filterwarnings('ignore')


def Logger(content):
    print(content)


def get_lr(it, total_iters):
    # Linear warmup, then cosine decay from learning_rate down to learning_rate / 10
    warmup_iters = args.warmup_iters
    lr_decay_iters = total_iters
    min_lr = args.learning_rate / 10

    if it < warmup_iters:
        return args.learning_rate * it / warmup_iters
    if it > lr_decay_iters:
        return min_lr
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (args.learning_rate - min_lr)


def train_epoch(epoch, wandb):
    start_time = time.time()
    for step, (X, Y, loss_mask) in enumerate(train_loader):
        X = X.to(args.device)
        Y = Y.to(args.device)
        loss_mask = loss_mask.to(args.device)

        lr = get_lr(epoch * iter_per_epoch + step, args.epochs * iter_per_epoch)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        with ctx:
            logits = model(X, Y).logits
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), Y.view(-1), ignore_index=0, reduction='none')
            loss_mask = loss_mask.view(-1)
            loss = torch.sum(loss * loss_mask) / loss_mask.sum()
            loss = loss / args.accumulation_steps

        scaler.scale(loss).backward()

        if (step + 1) % args.accumulation_steps == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)

            scaler.step(optimizer)
            scaler.update()

            optimizer.zero_grad(set_to_none=True)

        if step % args.log_interval == 0:
            spend_time = time.time() - start_time
            Logger(
                'Epoch:[{}/{}]({}/{}) loss:{:.3f} lr:{:.7f} epoch_Time:{}min:'.format(
                    epoch,
                    args.epochs,
                    step,
                    iter_per_epoch,
                    loss.item() * args.accumulation_steps,
                    optimizer.param_groups[-1]['lr'],
                    spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60))
            if wandb is not None:
                wandb.log({"loss": loss.item() * args.accumulation_steps,
                           "lr": optimizer.param_groups[-1]['lr'],
                           "epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60})

        if (step + 1) % args.save_interval == 0:
            model.save_pretrained(args.save_dir)


def find_linear_with_keys(model, keys=["wq", "wk"]):
    # Collect the qualified names of all nn.Linear modules whose name contains one of `keys`
    cls = torch.nn.Linear
    linear_names = []
    for name, module in model.named_modules():
        if isinstance(module, cls):
            for key in keys:
                if key in name:
                    linear_names.append(name)
                    break
    return linear_names


def init_model():
    model_name_or_path = "./minimind-v1-small"
    tokenizer_name_or_path = "./minimind-v1-small"
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, trust_remote_code=True, use_fast=False)
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True).to(args.device)

    target_modules = find_linear_with_keys(model)
    peft_config = LoraConfig(
        r=8,
        target_modules=target_modules
    )
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    model = model.to(args.device)
    return model, tokenizer


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="MiniMind LoRA Fine-tuning")
    parser.add_argument("--out_dir", type=str, default="out", help="Output directory")
    parser.add_argument("--epochs", type=int, default=20, help="Number of epochs")
    parser.add_argument("--batch_size", type=int, default=32, help="Batch size")
    parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate")
    parser.add_argument("--device", type=str, default="cuda:0" if torch.cuda.is_available() else "cpu",
                        help="Device to use")
    parser.add_argument("--dtype", type=str, default="bfloat16", help="Data type")
    parser.add_argument("--use_wandb", action="store_true", help="Use Weights & Biases")
    parser.add_argument("--wandb_project", type=str, default="MiniMind-LoRA", help="Weights & Biases project name")
    parser.add_argument("--num_workers", type=int, default=1, help="Number of workers for data loading")
    parser.add_argument("--accumulation_steps", type=int, default=1, help="Gradient accumulation steps")
    parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping threshold")
    parser.add_argument("--warmup_iters", type=int, default=1000, help="Number of warmup iterations")
    parser.add_argument("--log_interval", type=int, default=100, help="Logging interval")
    parser.add_argument("--save_interval", type=int, default=1000, help="Model saving interval")

    args = parser.parse_args()

    lm_config = LMConfig()
    max_seq_len = lm_config.max_seq_len
    args.save_dir = os.path.join(args.out_dir)
    os.makedirs(args.save_dir, exist_ok=True)
    os.makedirs(args.out_dir, exist_ok=True)
    tokens_per_iter = args.batch_size * max_seq_len
    torch.manual_seed(1337)
    device_type = "cuda" if "cuda" in args.device else "cpu"

    args.wandb_run_name = f"MiniMind-LoRA-Epoch-{args.epochs}-BatchSize-{args.batch_size}-LearningRate-{args.learning_rate}"

    ctx = nullcontext() if device_type == "cpu" else torch.cuda.amp.autocast()

    if args.use_wandb:
        import wandb

        wandb.init(project=args.wandb_project, name=args.wandb_run_name)
    else:
        wandb = None

    model, tokenizer = init_model()

    df = pd.read_csv('./dataset/sft_data_single.csv')
    df = df.sample(frac=1.0)  # shuffle the samples
    train_ds = SFTDataset(df, tokenizer, max_length=max_seq_len)
    train_loader = DataLoader(
        train_ds,
        batch_size=args.batch_size,
        pin_memory=True,
        drop_last=False,
        shuffle=False,
        num_workers=args.num_workers,
    )

    scaler = torch.cuda.amp.GradScaler(enabled=(args.dtype in ['float16', 'bfloat16']))
    optimizer = optim.Adam(
        filter(lambda p: p.requires_grad, model.parameters()),  # train only the LoRA adapters
        lr=args.learning_rate
    )

    # torch.compile is deliberately disabled here (guarded by `False`)
    if False and platform.system() != 'Windows' and float(torch.__version__.split('.')[0]) >= 2:
        Logger("compiling the model... (takes a ~minute)")
        unoptimized_model = model
        model = torch.compile(model)

    iter_per_epoch = len(train_loader)
    for epoch in range(args.epochs):
        train_epoch(epoch, wandb)
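
Note on adapter placement: find_linear_with_keys returns the qualified names of every nn.Linear whose name contains 'wq' or 'wk' (the attention query/key projections in this codebase), and those names become the LoRA target_modules. A sketch on a hypothetical toy module:

    import torch.nn as nn

    class ToyAttention(nn.Module):
        def __init__(self):
            super().__init__()
            self.wq = nn.Linear(8, 8)
            self.wk = nn.Linear(8, 8)
            self.wv = nn.Linear(8, 8)

    print(find_linear_with_keys(ToyAttention()))  # ['wq', 'wk']
    # Only these layers receive trainable low-rank adapters; everything else stays
    # frozen, which is why the Adam optimizer above filters on p.requires_grad.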

@@ -1,48 +0,0 @@
import os
import warnings

os.environ['CUDA_VISIBLE_DEVICES'] = '0'
from transformers import TrainingArguments, AutoModelForCausalLM, AutoTokenizer
from trl import DPOConfig, DPOTrainer
from datasets import load_dataset

warnings.filterwarnings('ignore')


def init_model():
    device = 'cuda:0'
    # Do model patching and add fast LoRA weights
    model_name_or_path = "minimind-v1"
    tokenizer_name_or_path = "minimind-v1"
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, trust_remote_code=True, use_fast=False)
    tokenizer.pad_token = tokenizer.eos_token
    model = model.to(device)
    return model, tokenizer


if __name__ == '__main__':
    model, tokenizer = init_model()
    training_config = DPOConfig(
        output_dir="./minimind_dpo",
        per_device_train_batch_size=1,
        remove_unused_columns=False,
        report_to="none",
        save_steps=2000,
        learning_rate=4e-5
    )

    dataset_path = './dataset/dpo/train_data.json'
    train_dataset = load_dataset('json', data_files=dataset_path)

    dpo_trainer = DPOTrainer(
        model,
        ref_model=None,  # with ref_model=None, trl uses a frozen copy of `model` as the reference
        args=training_config,
        beta=0.1,
        train_dataset=train_dataset['train'],
        tokenizer=tokenizer,
        max_length=512,
        max_prompt_length=512
    )
    dpo_trainer.train()
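
Note on the data: DPOTrainer consumes paired preference records, and the rl_process helper later in this commit emits one JSON object per example with parallel chosen/rejected conversations. A representative record (values illustrative; whether this nested conversational format is accepted directly depends on the trl version):

    {
        "chosen": [
            {"role": "user", "content": "你好"},
            {"role": "assistant", "content": "preferred reply"}
        ],
        "rejected": [
            {"role": "user", "content": "你好"},
            {"role": "assistant", "content": "dispreferred reply"}
        ]
    }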

@@ -1,185 +0,0 @@
import csv
import glob
import os
import re
import json
import jsonlines
import pandas as pd
from tqdm import tqdm

bos_token = "<s>"
eos_token = "</s>"


def pretrain_process():
    # Input and output paths
    input_dir = '../CCI3-HQ/data'
    output_file = '../dataset/pretrain_data_hq.csv'
    jsonl_files = glob.glob(os.path.join(input_dir, 'part_*.jsonl'))
    total_lines = 0
    print("Counting total lines...")
    for file in jsonl_files:
        with open(file, 'r', encoding='utf-8') as f:
            for _ in f:
                total_lines += 1
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['text', 'score'])  # write the header
        for jsonl_file in jsonl_files:
            with open(jsonl_file, 'r', encoding='utf-8') as f:
                for line in tqdm(f, desc=f'Processing {os.path.basename(jsonl_file)}', total=total_lines, unit='lines',
                                 leave=False):
                    try:
                        data = json.loads(line)
                        text = data.get('text', '')
                        score = data.get('score', 0)
                        # Keep only short, high-quality samples
                        if len(text) <= 512 and score > 3.5:
                            writer.writerow([text, score])
                    except json.JSONDecodeError:
                        continue
    print(f"Filtering complete; results saved to {output_file}")


def sft_process():
    sft_file_name = 'sft_data.csv'

    def process_and_write_data(data):
        q_lst, a_lst, history_lst = [], [], []
        for per in data:
            history, q, a = per['history'], per['q'], per['a']
            if not q or not a:
                continue
            history_len = sum(len(s) for s in history)
            message_len = history_len + len(q) + len(a)
            if message_len < 70 or message_len > 512:
                continue
            q_lst.append(q)
            a_lst.append(a)
            history_lst.append(history)

        df = pd.DataFrame({'history': history_lst, 'q': q_lst, 'a': a_lst})
        df.to_csv(f'../dataset/{sft_file_name}',
                  mode='a', header=False, index=False,
                  lineterminator='\r\n', escapechar='\\', encoding='utf-8')

    chunk_size = 1000
    data = []
    with open(f'../dataset/{sft_file_name}', 'w', encoding='utf-8') as f:
        f.write('history,q,a\n')

    # sft_path = ['/root/shared-nvme/sft_data_zh.jsonl', '/root/shared-nvme/sft_data_en.jsonl']
    sft_path = ['/root/shared-nvme/sft_data_en.jsonl']
    chunk_num = 0
    for path in sft_path:
        with jsonlines.open(path) as reader:
            for idx, obj in enumerate(reader):
                try:
                    data.append({
                        'history': obj.get('history', ''),
                        'q': obj.get('input', '') + obj.get('q', ''),
                        'a': obj.get('output', '') + obj.get('a', '')
                    })

                    if len(data) >= chunk_size:
                        chunk_num += 1
                        process_and_write_data(data)
                        data = []
                        if chunk_num % 100 == 0:
                            print(f'chunk:{chunk_num} process end')
                except jsonlines.InvalidLineError as e:
                    print(f"Skipping invalid JSON line {idx + 1}: {e}")
                    continue

    if data:
        process_and_write_data(data)
        data = []


def rl_process():
    # Preference data: by default only the Chinese set is used (recommended)
    input_paths = [
        # "../dataset/dpo_en.json",
        "../dataset/dpo_zh.json"
    ]
    output_path = "../dataset/dpo_data.jsonl"  # output extension changed to .jsonl
    all_converted = []

    for input_path in input_paths:
        with open(input_path, "r", encoding="utf-8") as f:
            data = json.load(f)  # data is likely a list

        for item in data:
            new_data = {
                "chosen": [],
                "rejected": []
            }
            # The shared conversation prefix goes into both branches
            for turn in item["conversations"]:
                role = "user" if turn["from"] == "human" else "assistant"
                message = {"role": role, "content": turn["value"]}
                new_data["chosen"].append(message)
                new_data["rejected"].append(message)
            # Then each branch gets its own final assistant reply
            new_data["chosen"].append({
                "role": "assistant",
                "content": item["chosen"]["value"]
            })
            new_data["rejected"].append({
                "role": "assistant",
                "content": item["rejected"]["value"]
            })
            all_converted.append(new_data)

    with open(output_path, "w", encoding="utf-8") as f:
        for item in all_converted:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")


def lora_dataset():
    import json
    import csv

    # Read the JSON file
    with open('../dataset/Chinese-medical-dialogue.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Build the CSV rows
    csv_data = []
    for item in data:
        # Extract input/output and strip surrounding whitespace
        q = item['input'].strip()
        a = item['output'].strip()

        # Keep only short enough samples
        if len(q) + len(a) < 160:
            csv_data.append({
                'history': '[]',
                'q': q,
                'a': a
            })

    # Write the CSV file
    with open('../dataset/medical_sft.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['history', 'q', 'a']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        writer.writerows(csv_data)

    print(f'Conversion complete; {len(csv_data)} valid records processed')


if __name__ == "__main__":
    ################
    # 1: pretrain
    # 2: sft
    # 3: RL
    # 4: LoRA dataset
    ################
    process_type = 4

    if process_type == 1:
        pretrain_process()
    if process_type == 2:
        sft_process()
    if process_type == 3:
        rl_process()
    if process_type == 4:
        lora_dataset()
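
Note on the output format: sft_process and lora_dataset both emit flat CSVs with the header history,q,a, where history is a serialized list of earlier turns (always '[]' in lora_dataset) and q/a are the question and answer. An illustrative row, with hypothetical content:

    history,q,a
    [],感冒了怎么办,多喝水、注意休息,症状加重请及时就医。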

@@ -1,97 +0,0 @@
# from datasets import load_dataset
#
# dataset_paths = [
#     ['ceval/ceval-exam',
#      ['computer_network', 'operating_system', 'computer_architecture', 'college_programming', 'college_physics',
#       'college_chemistry', 'advanced_mathematics', 'probability_and_statistics', 'discrete_mathematics',
#       'electrical_engineer', 'metrology_engineer', 'high_school_mathematics', 'high_school_physics',
#       'high_school_chemistry', 'high_school_biology', 'middle_school_mathematics', 'middle_school_biology',
#       'middle_school_physics', 'middle_school_chemistry', 'veterinary_medicine', 'college_economics',
#       'business_administration', 'marxism', 'mao_zedong_thought', 'education_science', 'teacher_qualification',
#       'high_school_politics', 'high_school_geography', 'middle_school_politics', 'middle_school_geography',
#       'modern_chinese_history', 'ideological_and_moral_cultivation', 'logic', 'law', 'chinese_language_and_literature',
#       'art_studies', 'professional_tour_guide', 'legal_professional', 'high_school_chinese', 'high_school_history',
#       'middle_school_history', 'civil_servant', 'sports_science', 'plant_protection', 'basic_medicine',
#       'clinical_medicine', 'urban_and_rural_planner', 'accountant', 'fire_engineer',
#       'environmental_impact_assessment_engineer', 'tax_accountant', 'physician']],  # ceval*
#     ['haonan-li/cmmlu', [
#         'agronomy', 'anatomy', 'ancient_chinese', 'arts', 'astronomy', 'business_ethics',
#         'chinese_civil_service_exam', 'chinese_driving_rule', 'chinese_food_culture',
#         'chinese_foreign_policy', 'chinese_history', 'chinese_literature',
#         'chinese_teacher_qualification', 'clinical_knowledge', 'college_actuarial_science',
#         'college_education', 'college_engineering_hydrology', 'college_law',
#         'college_mathematics', 'college_medical_statistics', 'college_medicine',
#         'computer_science', 'computer_security', 'conceptual_physics',
#         'construction_project_management', 'economics', 'education', 'electrical_engineering',
#         'elementary_chinese', 'elementary_commonsense', 'elementary_information_and_technology',
#         'elementary_mathematics', 'ethnology', 'food_science', 'genetics', 'global_facts',
#         'high_school_biology', 'high_school_chemistry', 'high_school_geography',
#         'high_school_mathematics', 'high_school_physics', 'high_school_politics',
#         'human_sexuality', 'international_law', 'journalism', 'jurisprudence',
#         'legal_and_moral_basis', 'logical', 'machine_learning', 'management', 'marketing',
#         'marxist_theory', 'modern_chinese', 'nutrition', 'philosophy', 'professional_accounting',
#         'professional_law', 'professional_medicine', 'professional_psychology',
#         'public_relations', 'security_study', 'sociology', 'sports_science',
#         'traditional_chinese_medicine', 'virology', 'world_history', 'world_religions'
#     ]],  # cmmlu*
#     ['tyouisen/aclue',
#      ['polysemy_resolution', 'poetry_sentiment_analysis', 'named_entity_recognition', 'basic_ancient_chinese',
#       'poetry_context_prediction', 'sentence_segmentation', 'couplet_prediction', 'poetry_appreciate',
#       'ancient_chinese_culture', 'ancient_phonetics', 'homographic_character_resolution', 'ancient_literature',
#       'ancient_medical', 'poetry_quality_assessment', 'reading_comprehension']],  # aclue
#     ['juletxara/mgsm', ['zh']],  # mgsm_direct_zh
#     ['openbookqa', ['main']],  # openbookqa
#     ['ZoneTwelve/tmmluplus',
#      ['dentistry', 'traditional_chinese_medicine_clinical_medicine', 'clinical_psychology', 'technical',
#       'culinary_skills', 'mechanical', 'logic_reasoning', 'real_estate', 'general_principles_of_law', 'finance_banking',
#       'anti_money_laundering', 'ttqav2', 'marketing_management', 'business_management', 'organic_chemistry',
#       'advance_chemistry', 'physics', 'secondary_physics', 'human_behavior', 'national_protection', 'jce_humanities',
#       'politic_science', 'agriculture', 'official_document_management', 'financial_analysis', 'pharmacy',
#       'educational_psychology', 'statistics_and_machine_learning', 'management_accounting', 'introduction_to_law',
#       'computer_science', 'veterinary_pathology', 'accounting', 'fire_science', 'optometry', 'insurance_studies',
#       'pharmacology', 'taxation', 'education_(profession_level)', 'economics', 'veterinary_pharmacology',
#       'nautical_science', 'occupational_therapy_for_psychological_disorders', 'trust_practice', 'geography_of_taiwan',
#       'physical_education', 'auditing', 'administrative_law', 'basic_medical_science', 'macroeconomics', 'trade',
#       'chinese_language_and_literature', 'tve_design', 'junior_science_exam', 'junior_math_exam', 'junior_chinese_exam',
#       'junior_social_studies', 'tve_mathematics', 'tve_chinese_language', 'tve_natural_sciences', 'junior_chemistry',
#       'music', 'education', 'three_principles_of_people', 'taiwanese_hokkien', 'engineering_math', 'linear_algebra']]
#     # tmmluplus
#
# ]
#
# for dataset_path in dataset_paths:
#     for dataset_name in dataset_path[1]:
#         datasets = load_dataset(dataset_path[0], dataset_name, cache_dir='./test_dataset_cache')
#
# """
# export HF_HUB_OFFLINE=1 && lm_eval --model hf --model_args pretrained=/xxx/minimind/minimind-v2-small/,device=cuda,dtype=auto --tasks ceval* --batch_size 8 --trust_remote_code
# """
"""
$env:HF_HUB_OFFLINE=1; lm_eval --model hf --model_args pretrained=../minimind-v2-small/,device=cuda,dtype=auto --tasks ceval* --batch_size 8 --trust_remote_code
"""

import subprocess

# Define the command to run (Windows cmd syntax)
command = (
    'set HF_HUB_OFFLINE=1 & '
    'lm_eval --model hf --model_args pretrained=../minimind-v2-small/,device=cuda,dtype=auto '
    '--tasks ceval* --batch_size 8 --trust_remote_code'
)

# Run the command via subprocess
try:
    process = subprocess.run(
        command,
        shell=True,
        check=True,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # Print the command's output
    print("STDOUT:", process.stdout)
    print("STDERR:", process.stderr)
except subprocess.CalledProcessError as e:
    print(f"Command failed with return code {e.returncode}")
    print("STDERR:", e.stderr)