update scripts file

gongjy 2025-02-09 23:52:50 +08:00
parent 58e3af0359
commit 29d6cd9cd2
8 changed files with 0 additions and 1282 deletions

View File

@@ -1,153 +0,0 @@
import random
import time
import numpy as np
import torch
import warnings
from transformers import AutoTokenizer, AutoModelForCausalLM
from model.model import Transformer
from model.LMConfig import LMConfig
warnings.filterwarnings('ignore')
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
def init_model(lm_config):
tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
model_from = 1  # 1: load from local weights; 2: load via transformers
if model_from == 1:
moe_path = '_moe' if lm_config.use_moe else ''
ckp = f'./out/pretrain_{lm_config.dim}{moe_path}.pth'
model = Transformer(lm_config)
state_dict = torch.load(ckp, map_location=device)
# strip the unwanted prefix
unwanted_prefix = '_orig_mod.'
for k, v in list(state_dict.items()):
if k.startswith(unwanted_prefix):
state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
for k, v in list(state_dict.items()):
if 'mask' in k:
del state_dict[k]
# load the weights into the model
model.load_state_dict(state_dict, strict=False)
else:
model = AutoModelForCausalLM.from_pretrained('minimind', trust_remote_code=True)
model = model.to(device)
print(f'Model parameters: {count_parameters(model) / 1e6} million = {count_parameters(model) / 1e9} B (billion)')
return model, tokenizer
def setup_seed(seed):
random.seed(seed)  # seed Python's RNG
np.random.seed(seed)  # seed NumPy's RNG
torch.manual_seed(seed)  # seed PyTorch's RNG
torch.cuda.manual_seed(seed)  # seed the current GPU (if any)
torch.cuda.manual_seed_all(seed)  # seed all GPUs (if any)
torch.backends.cudnn.deterministic = True  # force deterministic cuDNN convolution algorithms
torch.backends.cudnn.benchmark = False  # disable cuDNN autotuning to avoid nondeterminism
if __name__ == "__main__":
# -----------------------------------------------------------------------------
out_dir = 'out'
start = ""
temperature = 0.7
top_k = 8
setup_seed(1337)
# device = 'cpu'
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
dtype = 'bfloat16'
max_seq_len = 512
lm_config = LMConfig()
lm_config.max_seq_len = max_seq_len
# -----------------------------------------------------------------------------
model, tokenizer = init_model(lm_config)
model = model.eval()
# int(input('Enter 0 for automatic testing, 1 for interactive testing'))
answer_way = 0
stream = True
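# Test prompts (kept in Chinese; the model is trained on Chinese data)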
prompt_datas = [
'椭圆和圆的区别',
'中国关于马克思主义基本原理',
'人类大脑的主要功能是',
'万有引力是',
'世界上人口最多的国家是',
'DNA的全称是',
'数学中π的值大约是',
'世界上最高的山峰是',
'太阳系中最大的行星是',
'二氧化碳的化学分子式是',
'地球上最大的动物是',
'地球自转一圈大约需要',
'杭州市的美食有',
'江苏省的最好的大学',
]
qa_index = 0
while True:
start = time.time()
if answer_way == 1:
# run generation
prompt = input('User: ')
else:
if qa_index >= len(prompt_datas):
break
prompt = prompt_datas[qa_index]
print('Question:', prompt)
qa_index += 1
prompt = tokenizer.bos_token + prompt
x = tokenizer(prompt).data['input_ids']
x = (torch.tensor(x, dtype=torch.long, device=device)[None, ...])
with torch.no_grad():
res_y = model.generate(x, tokenizer.eos_token_id, max_new_tokens=max_seq_len, temperature=temperature,
top_k=top_k, stream=stream)
print('Answer: ', end='')
try:
y = next(res_y)
except StopIteration:
print("No answer")
continue
history_idx = 0
while y is not None:
answer = tokenizer.decode(y[0].tolist())
# skip chunks that are empty or end in the Unicode replacement character
# '\ufffd' (a partially decoded multi-byte character while streaming)
if not answer or answer[-1] == '\ufffd':
try:
y = next(res_y)
except StopIteration:
break
continue
print(answer[history_idx:], end='', flush=True)
try:
y = next(res_y)
except StopIteration:
break
history_idx = len(answer)
if not stream:
break
print('\n')
end = time.time()
print(end - start, 's')
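For reference, the streaming consumption above can be written more compactly by iterating the generator directly. A minimal sketch, assuming `model.generate` yields progressively longer token tensors when `stream=True` (as the loop above implies), with `model`, `tokenizer`, and `x` prepared as in this script:

import torch

def stream_print(model, tokenizer, x, **gen_kwargs):
    # Print only the newly decoded suffix of each partial sequence.
    printed = 0
    with torch.no_grad():
        for y in model.generate(x, tokenizer.eos_token_id, stream=True, **gen_kwargs):
            text = tokenizer.decode(y[0].tolist())
            if not text or text[-1] == '\ufffd':
                continue  # wait until a complete multi-byte character is decoded
            print(text[printed:], end='', flush=True)
            printed = len(text)
    print()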

View File

@@ -1,209 +0,0 @@
import os
import platform
import argparse
import time
import math
import warnings
import pandas as pd
import torch
import torch.distributed as dist
from torch import optim
from torch.nn.parallel import DistributedDataParallel
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import DataLoader, DistributedSampler
from contextlib import nullcontext
from transformers import AutoTokenizer
from model.model import Transformer
from model.LMConfig import LMConfig
from model.dataset import PretrainDataset
warnings.filterwarnings('ignore')
def Logger(content):
if not ddp or dist.get_rank() == 0:
print(content)
def get_lr(it, total_iters):
warmup_iters = args.warmup_iters
lr_decay_iters = total_iters
min_lr = args.learning_rate / 10
if it < warmup_iters:
return args.learning_rate * it / warmup_iters
if it > lr_decay_iters:
return min_lr
decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
assert 0 <= decay_ratio <= 1
coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
return min_lr + coeff * (args.learning_rate - min_lr)
def train_epoch(epoch, wandb):
start_time = time.time()
for step, (X, Y, loss_mask) in enumerate(train_loader):
X = X.to(args.device)
Y = Y.to(args.device)
loss_mask = loss_mask.to(args.device)
lr = get_lr(epoch * iter_per_epoch + step, args.epochs * iter_per_epoch)
for param_group in optimizer.param_groups:
param_group['lr'] = lr
with ctx:
out = model(X, Y)
loss = out.last_loss / args.accumulation_steps
loss_mask = loss_mask.view(-1)
loss = torch.sum(loss * loss_mask) / loss_mask.sum()
scaler.scale(loss).backward()
if (step + 1) % args.accumulation_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
scaler.step(optimizer)
scaler.update()
optimizer.zero_grad(set_to_none=True)
if step % args.log_interval == 0:
spend_time = time.time() - start_time
Logger(
'Epoch:[{}/{}]({}/{}) loss:{:.3f} lr:{:.7f} epoch_Time:{}min'.format(
epoch,
args.epochs,
step,
iter_per_epoch,
loss.item() * args.accumulation_steps,
optimizer.param_groups[-1]['lr'],
spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60))
if (wandb is not None) and (not ddp or dist.get_rank() == 0):
wandb.log({"loss": loss.item() * args.accumulation_steps,
"lr": optimizer.param_groups[-1]['lr'],
"epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60})
if (step + 1) % args.save_interval == 0 and (not ddp or dist.get_rank() == 0):
model.eval()
moe_path = '_moe' if lm_config.use_moe else ''
ckp = f'{args.save_dir}/pretrain_{lm_config.dim}{moe_path}.pth'
if isinstance(model, torch.nn.parallel.DistributedDataParallel):
state_dict = model.module.state_dict()
else:
state_dict = model.state_dict()
torch.save(state_dict, ckp)
model.train()
def init_model():
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
model = Transformer(lm_config).to(args.device)
# moe_path = '_moe' if lm_config.use_moe else ''
Logger(f'Total LLM parameters: {count_parameters(model) / 1e6:.3f} million')
return model, tokenizer
def init_distributed_mode():
if not ddp: return
global ddp_local_rank, DEVICE
dist.init_process_group(backend="nccl")
ddp_rank = int(os.environ["RANK"])
ddp_local_rank = int(os.environ["LOCAL_RANK"])
ddp_world_size = int(os.environ["WORLD_SIZE"])
DEVICE = f"cuda:{ddp_local_rank}"
torch.cuda.set_device(DEVICE)
# torchrun --nproc_per_node 2 1-pretrain.py
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="MiniMind Pretraining")
parser.add_argument("--out_dir", type=str, default="out", help="Output directory")
parser.add_argument("--epochs", type=int, default=20, help="Number of epochs")
parser.add_argument("--batch_size", type=int, default=64, help="Batch size")
parser.add_argument("--learning_rate", type=float, default=2e-4, help="Learning rate")
parser.add_argument("--device", type=str, default="cuda:0" if torch.cuda.is_available() else "cpu",
help="Device to use")
parser.add_argument("--dtype", type=str, default="bfloat16", help="Data type")
parser.add_argument("--use_wandb", action="store_true", help="Use Weights & Biases")
parser.add_argument("--wandb_project", type=str, default="MiniMind-Pretrain", help="Weights & Biases project name")
parser.add_argument("--num_workers", type=int, default=1, help="Number of workers for data loading")
parser.add_argument("--data_path", type=str, default="./dataset/pretrain_data.csv", help="Path to training data")
parser.add_argument("--ddp", action="store_true", help="Use DistributedDataParallel")
parser.add_argument("--accumulation_steps", type=int, default=8, help="Gradient accumulation steps")
parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping threshold")
parser.add_argument("--warmup_iters", type=int, default=0, help="Number of warmup iterations")
parser.add_argument("--log_interval", type=int, default=100, help="Logging interval")
parser.add_argument("--save_interval", type=int, default=1000, help="Model saving interval")
parser.add_argument('--local_rank', type=int, default=-1, help='local rank for distributed training')
args = parser.parse_args()
lm_config = LMConfig()
max_seq_len = lm_config.max_seq_len
args.save_dir = args.out_dir
os.makedirs(args.save_dir, exist_ok=True)
os.makedirs(args.out_dir, exist_ok=True)
tokens_per_iter = args.batch_size * max_seq_len
torch.manual_seed(1337)
device_type = "cuda" if "cuda" in args.device else "cpu"
args.wandb_run_name = f"MiniMind-Pretrain-Epoch-{args.epochs}-BatchSize-{args.batch_size}-LearningRate-{args.learning_rate}"
ctx = nullcontext() if device_type == "cpu" else torch.cuda.amp.autocast()
ddp = int(os.environ.get("RANK", -1)) != -1 # is this a ddp run?
ddp_local_rank, DEVICE = 0, "cuda:0"
if ddp:
init_distributed_mode()
args.device = torch.device(DEVICE)
if args.use_wandb and (not ddp or ddp_local_rank == 0):
import wandb
wandb.init(project=args.wandb_project, name=args.wandb_run_name)
else:
wandb = None
model, tokenizer = init_model()
df = pd.read_csv(args.data_path)
df = df.sample(frac=1.0)
train_ds = PretrainDataset(df, tokenizer, max_length=max_seq_len)
train_sampler = DistributedSampler(train_ds) if ddp else None
train_loader = DataLoader(
train_ds,
batch_size=args.batch_size,
pin_memory=True,
drop_last=False,
shuffle=False,
num_workers=args.num_workers,
sampler=train_sampler
)
scaler = torch.cuda.amp.GradScaler(enabled=(args.dtype in ['float16', 'bfloat16']))
optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
if False and platform.system() != 'Windows' and float(torch.__version__.split('.')[0]) >= 2:  # torch.compile path intentionally disabled
Logger("compiling the model... (takes a ~minute)")
unoptimized_model = model
model = torch.compile(model)
if ddp:
model._ddp_params_and_buffers_to_ignore = {"pos_cis"}
model = DistributedDataParallel(model, device_ids=[ddp_local_rank])
iter_per_epoch = len(train_loader)
for epoch in range(args.epochs):
train_epoch(epoch, wandb)
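As a sanity check on the warmup-plus-cosine schedule implemented by `get_lr`, the same curve can be traced standalone; the numbers below assume this script's defaults (`learning_rate=2e-4`, `warmup_iters=0`) over a hypothetical 1000-iteration run:

import math

def cosine_lr(it, total_iters, lr=2e-4, warmup_iters=0):
    # Mirrors get_lr above: linear warmup, then cosine decay to lr / 10.
    min_lr = lr / 10
    if it < warmup_iters:
        return lr * it / warmup_iters
    if it > total_iters:
        return min_lr
    decay_ratio = (it - warmup_iters) / (total_iters - warmup_iters)
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (lr - min_lr)

for it in (0, 250, 500, 750, 1000):
    print(it, f'{cosine_lr(it, 1000):.2e}')  # 2.00e-04 at 0, 1.10e-04 at 500, 2.00e-05 at 1000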

View File

@@ -1,186 +0,0 @@
import random
import time
import numpy as np
import torch
import warnings
from transformers import AutoTokenizer, AutoModelForCausalLM
from model.model import Transformer
from model.LMConfig import LMConfig
warnings.filterwarnings('ignore')
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
def init_model(lm_config):
tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
model_from = 1  # 1: load from local weights; 2: load via transformers
if model_from == 1:
moe_path = '_moe' if lm_config.use_moe else ''
# ckp = f'./out/multi_chat/full_sft_{lm_config.dim}{moe_path}.pth'
model = Transformer(lm_config)
# state_dict = torch.load(ckp, map_location=device)
# # strip the unwanted prefix
# unwanted_prefix = '_orig_mod.'
# for k, v in list(state_dict.items()):
# if k.startswith(unwanted_prefix):
# state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
#
# for k, v in list(state_dict.items()):
# if 'mask' in k:
# del state_dict[k]
#
# # load the weights into the model
# model.load_state_dict(state_dict, strict=False)
else:
model = AutoModelForCausalLM.from_pretrained('./minimind-v1-small', trust_remote_code=True)
model = model.to(device)
print(f'Model parameters: {count_parameters(model) / 1e6} million = {count_parameters(model) / 1e9} B (billion)')
return model, tokenizer
def setup_seed(seed):
random.seed(seed)  # seed Python's RNG
np.random.seed(seed)  # seed NumPy's RNG
torch.manual_seed(seed)  # seed PyTorch's RNG
torch.cuda.manual_seed(seed)  # seed the current GPU (if any)
torch.cuda.manual_seed_all(seed)  # seed all GPUs (if any)
torch.backends.cudnn.deterministic = True  # force deterministic cuDNN convolution algorithms
torch.backends.cudnn.benchmark = False  # disable cuDNN autotuning to avoid nondeterminism
if __name__ == "__main__":
# -----------------------------------------------------------------------------
out_dir = 'out'
start = ""
temperature = 0.1
top_k = 16
# device = 'cpu'
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
dtype = 'bfloat16'
max_seq_len = 1 * 1024
lm_config = LMConfig()
lm_config.max_seq_len = max_seq_len
# Whether the chat carries conversation history (the current model was not trained on multi-turn dialogue data, so adding more history context will not yield new Q&A ability)
contain_history_chat = False
# -----------------------------------------------------------------------------
model, tokenizer = init_model(lm_config)
model = model.eval()
# Push to the Hugging Face Hub
# model.push_to_hub("minimind")
# tokenizer.push_to_hub("minimind")
# answer_way = int(input('Enter 0 for automatic testing, 1 for interactive testing'))
answer_way = 0
stream = True
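# Test prompts (kept in Chinese; the model is trained on Chinese data)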
prompt_datas = [
'你叫什么名字',
'你是谁',
'中国有哪些比较好的大学?',
'全世界最好的大学是什么?',
'你知道光速是多少吗?',
'你知道长江吗?',
'人类的血液主要由哪些成分组成?',
'第一颗人造卫星是哪个国家发射的?',
'你知道杭州有什么美食吗?',
'你知道泰山在哪里吗?',
'地球上最大的动物是什么?',
'地球自转一圈大约需要多少时间?',
'人类最早使用的金属是什么?',
'水的化学分子式是什么?',
'大气层中含量最多的气体是什么?',
'世界上最高的山峰是什么?',
'你知道世界上最深的海沟是什么吗?',
'最早发明印刷术的是哪个国家?',
'万有引力是谁提出的?',
'光合作用的主要原理是什么?',
'你知道大熊猫的主要食物是什么吗?',
'海水为什么是咸的?',
'我们平时喝的牛奶主要含有什么营养成分?',
'一星期有多少天?'
]
messages_origin = []
messages = messages_origin
i = 0
while i < len(prompt_datas):
# Generate a random seed
# random_seed = random.randint(0, 2 ** 32 - 1)
# setup_seed(random_seed)
if not contain_history_chat:
messages = messages_origin.copy()
if answer_way == 1:
prompt = input('[Q]: ')
else:
prompt = prompt_datas[i]
print(f'[Q]: {prompt}')
i += 1
prompt = '请问,' + prompt
messages.append({"role": "user", "content": prompt})
# print(messages)
new_prompt = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)[-(max_seq_len - 1):]
x = tokenizer(new_prompt).data['input_ids']
x = (torch.tensor(x, dtype=torch.long, device=device)[None, ...])
answer = new_prompt
with torch.no_grad():
res_y = model.generate(x, tokenizer.eos_token_id, max_new_tokens=max_seq_len, temperature=temperature,
top_k=top_k, stream=stream)
print('[A]: ', end='')
try:
y = next(res_y)
except StopIteration:
print("No answer")
continue
history_idx = 0
while y is not None:
answer = tokenizer.decode(y[0].tolist())
# skip chunks that are empty or end in the Unicode replacement character
# '\ufffd' (a partially decoded multi-byte character while streaming)
if not answer or answer[-1] == '\ufffd':
try:
y = next(res_y)
except StopIteration:
break
continue
print(answer[history_idx:], end='', flush=True)
try:
y = next(res_y)
except StopIteration:
break
history_idx = len(answer)
if not stream:
break
print('\n')
if contain_history_chat:
assistant_answer = answer.replace(new_prompt, "")
messages.append({"role": "assistant", "content": assistant_answer})
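To make the prompt construction above concrete: `apply_chat_template` renders the message list to a single string, and the `[-(max_seq_len - 1):]` slice truncates by characters, not tokens. A small sketch using the tokenizer path from this script:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
max_seq_len = 1024  # as configured above
messages = [{"role": "user", "content": "请问,你是谁"}]
new_prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,  # append the assistant header so the model answers next
)
print(new_prompt[-(max_seq_len - 1):])  # keep only the trailing characters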

View File

@@ -1,216 +0,0 @@
import os
import platform
import argparse
import time
import math
import warnings
import pandas as pd
import torch
import torch.nn.functional as F
import torch.distributed as dist
from contextlib import nullcontext
from torch import optim
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DataLoader, DistributedSampler
from transformers import AutoTokenizer, AutoModelForCausalLM
from model.model import Transformer
from model.LMConfig import LMConfig
from model.dataset import SFTDataset
warnings.filterwarnings('ignore')
def Logger(content):
if not ddp or dist.get_rank() == 0:
print(content)
def get_lr(it, total_iters):
warmup_iters = args.warmup_iters
lr_decay_iters = total_iters
min_lr = args.learning_rate / 10
if it < warmup_iters:
return args.learning_rate * it / warmup_iters
if it > lr_decay_iters:
return min_lr
decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
assert 0 <= decay_ratio <= 1
coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
return min_lr + coeff * (args.learning_rate - min_lr)
def train_epoch(epoch, wandb):
start_time = time.time()
for step, (X, Y, loss_mask) in enumerate(train_loader):
X = X.to(args.device)
Y = Y.to(args.device)
loss_mask = loss_mask.to(args.device)
lr = get_lr(epoch * iter_per_epoch + step, args.epochs * iter_per_epoch)
for param_group in optimizer.param_groups:
param_group['lr'] = lr
with ctx:
logits = model(X, Y).logits
loss = F.cross_entropy(logits.view(-1, logits.size(-1)), Y.view(-1), ignore_index=0, reduction='none')
loss_mask = loss_mask.view(-1)
loss = torch.sum(loss * loss_mask) / loss_mask.sum()
scaler.scale(loss).backward()
if (step + 1) % args.accumulation_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
scaler.step(optimizer)
scaler.update()
optimizer.zero_grad(set_to_none=True)
if step % args.log_interval == 0:
spend_time = time.time() - start_time
Logger(
'Epoch:[{}/{}]({}/{}) loss:{:.3f} lr:{:.7f} epoch_Time:{}min'.format(
epoch,
args.epochs,
step,
iter_per_epoch,
loss.item(),
optimizer.param_groups[-1]['lr'],
spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60))
if (wandb is not None) and (not ddp or dist.get_rank() == 0):
wandb.log({"loss": loss.item(),
"lr": optimizer.param_groups[-1]['lr'],
"epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60})
if (step + 1) % args.save_interval == 0 and (not ddp or dist.get_rank() == 0):
model.eval()
moe_path = '_moe' if lm_config.use_moe else ''
ckp = f'{args.save_dir}/full_sft_{lm_config.dim}{moe_path}.pth'
if isinstance(model, torch.nn.parallel.DistributedDataParallel):
state_dict = model.module.state_dict()
else:
state_dict = model.state_dict()
torch.save(state_dict, ckp)
model.train()
def init_model():
tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
model_from = 1  # 1: load from local weights; 2: load via transformers
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
if model_from == 1:
model = Transformer(lm_config)
moe_path = '_moe' if lm_config.use_moe else ''
ckp = f'./out/pretrain_{lm_config.dim}{moe_path}.pth'
state_dict = torch.load(ckp, map_location=args.device)
unwanted_prefix = '_orig_mod.'
for k, v in list(state_dict.items()):
if k.startswith(unwanted_prefix):
state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
model.load_state_dict(state_dict, strict=False)
else:
model = AutoModelForCausalLM.from_pretrained('./minimind-v1-small', trust_remote_code=True)
Logger(f'Total LLM parameters: {count_parameters(model) / 1e6:.3f} million')
model = model.to(args.device)
return model, tokenizer
def init_distributed_mode():
if not ddp: return
global ddp_local_rank, DEVICE
dist.init_process_group(backend="nccl")
ddp_rank = int(os.environ["RANK"])
ddp_local_rank = int(os.environ["LOCAL_RANK"])
ddp_world_size = int(os.environ["WORLD_SIZE"])
DEVICE = f"cuda:{ddp_local_rank}"
torch.cuda.set_device(DEVICE)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="MiniMind Full SFT")
parser.add_argument("--out_dir", type=str, default="out", help="Output directory")
parser.add_argument("--epochs", type=int, default=19, help="Number of epochs")
parser.add_argument("--batch_size", type=int, default=32, help="Batch size")
parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate")
parser.add_argument("--device", type=str, default="cuda:0" if torch.cuda.is_available() else "cpu", help="Device to use")
parser.add_argument("--dtype", type=str, default="bfloat16", help="Data type")
parser.add_argument("--use_wandb", action="store_true", help="Use Weights & Biases")
parser.add_argument("--wandb_project", type=str, default="MiniMind-Full-SFT", help="Weights & Biases project name")
parser.add_argument("--num_workers", type=int, default=1, help="Number of workers for data loading")
parser.add_argument("--ddp", action="store_true", help="Use DistributedDataParallel")
parser.add_argument("--accumulation_steps", type=int, default=1, help="Gradient accumulation steps")
parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping threshold")
parser.add_argument("--warmup_iters", type=int, default=0, help="Number of warmup iterations")
parser.add_argument("--log_interval", type=int, default=100, help="Logging interval")
parser.add_argument("--save_interval", type=int, default=1000, help="Model saving interval")
parser.add_argument('--local_rank', type=int, default=-1, help='local rank for distributed training')
args = parser.parse_args()
lm_config = LMConfig()
max_seq_len = lm_config.max_seq_len
args.save_dir = args.out_dir
os.makedirs(args.save_dir, exist_ok=True)
os.makedirs(args.out_dir, exist_ok=True)
tokens_per_iter = args.batch_size * max_seq_len
torch.manual_seed(1337)
device_type = "cuda" if "cuda" in args.device else "cpu"
args.wandb_run_name = f"MiniMind-Full-SFT-Epoch-{args.epochs}-BatchSize-{args.batch_size}-LearningRate-{args.learning_rate}"
ctx = nullcontext() if device_type == "cpu" else torch.cuda.amp.autocast()
ddp = int(os.environ.get("RANK", -1)) != -1 # is this a ddp run?
ddp_local_rank, DEVICE = 0, "cuda:0"
if ddp:
init_distributed_mode()
args.device = torch.device(DEVICE)
if args.use_wandb and (not ddp or ddp_local_rank == 0):
import wandb
wandb.init(project=args.wandb_project, name=args.wandb_run_name)
else:
wandb = None
model, tokenizer = init_model()
df = pd.read_csv('./dataset/sft_data_single.csv')
df = df.sample(frac=1.0)
train_ds = SFTDataset(df, tokenizer, max_length=max_seq_len)
train_sampler = DistributedSampler(train_ds) if ddp else None
train_loader = DataLoader(
train_ds,
batch_size=args.batch_size,
pin_memory=True,
drop_last=False,
shuffle=False,
num_workers=args.num_workers,
sampler=train_sampler
)
scaler = torch.cuda.amp.GradScaler(enabled=(args.dtype in ['float16', 'bfloat16']))
optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
if False and not lm_config.use_moe and platform.system() != 'Windows' and float(torch.__version__.split('.')[0]) >= 2:  # torch.compile path intentionally disabled
Logger("compiling the model... (takes a ~minute)")
unoptimized_model = model
model = torch.compile(model)
if ddp:
model._ddp_params_and_buffers_to_ignore = {"pos_cis"}
model = DistributedDataParallel(model, device_ids=[ddp_local_rank])
iter_per_epoch = len(train_loader)
for epoch in range(args.epochs):
train_epoch(epoch, wandb)
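The loss in `train_epoch` is a masked token-level cross-entropy: per-token losses are zeroed where `loss_mask` is 0 (prompt and padding positions) and averaged over the remaining answer tokens. A self-contained sketch with toy shapes (all tensors here are illustrative):

import torch
import torch.nn.functional as F

logits = torch.randn(2, 4, 10)           # (batch, seq_len, vocab)
targets = torch.randint(0, 10, (2, 4))   # token ids
loss_mask = torch.tensor([[0, 0, 1, 1],
                          [0, 1, 1, 1]], dtype=torch.float)  # 1 = answer token

loss = F.cross_entropy(logits.view(-1, logits.size(-1)),
                       targets.view(-1), reduction='none')
loss = torch.sum(loss * loss_mask.view(-1)) / loss_mask.sum()
print(loss)  # scalar averaged over masked-in tokens only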

View File

@@ -1,188 +0,0 @@
import os
import platform
import argparse
import time
import math
import warnings
import torch
import pandas as pd
import torch.nn.functional as F
from contextlib import nullcontext
from torch import optim
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from peft import get_peft_model, LoraConfig, TaskType
from torch.utils.data import DataLoader
from model.LMConfig import LMConfig
from model.dataset import SFTDataset
from model.model import Transformer
warnings.filterwarnings('ignore')
def Logger(content):
print(content)
def get_lr(it, total_iters):
warmup_iters = args.warmup_iters
lr_decay_iters = total_iters
min_lr = args.learning_rate / 10
if it < warmup_iters:
return args.learning_rate * it / warmup_iters
if it > lr_decay_iters:
return min_lr
decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
assert 0 <= decay_ratio <= 1
coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
return min_lr + coeff * (args.learning_rate - min_lr)
def train_epoch(epoch, wandb):
start_time = time.time()
for step, (X, Y, loss_mask) in enumerate(train_loader):
X = X.to(args.device)
Y = Y.to(args.device)
loss_mask = loss_mask.to(args.device)
lr = get_lr(epoch * iter_per_epoch + step, args.epochs * iter_per_epoch)
for param_group in optimizer.param_groups:
param_group['lr'] = lr
with ctx:
logits = model(X, Y).logits
loss = F.cross_entropy(logits.view(-1, logits.size(-1)), Y.view(-1), ignore_index=0, reduction='none')
loss_mask = loss_mask.view(-1)
loss = torch.sum(loss * loss_mask) / loss_mask.sum()
loss = loss / args.accumulation_steps
scaler.scale(loss).backward()
if (step + 1) % args.accumulation_steps == 0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
scaler.step(optimizer)
scaler.update()
optimizer.zero_grad(set_to_none=True)
if step % args.log_interval == 0:
spend_time = time.time() - start_time
Logger(
'Epoch:[{}/{}]({}/{}) loss:{:.3f} lr:{:.7f} epoch_Time:{}min'.format(
epoch,
args.epochs,
step,
iter_per_epoch,
loss.item() * args.accumulation_steps,
optimizer.param_groups[-1]['lr'],
spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60))
if wandb is not None:
wandb.log({"loss": loss.item() * args.accumulation_steps,
"lr": optimizer.param_groups[-1]['lr'],
"epoch_Time": spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60})
if (step + 1) % args.save_interval == 0:
model.save_pretrained(args.save_dir)
def find_linear_with_keys(model, keys=["wq", "wk"]):
cls = torch.nn.Linear
linear_names = []
for name, module in model.named_modules():
if isinstance(module, cls):
for key in keys:
if key in name:
linear_names.append(name)
break
return linear_names
def init_model():
model_name_or_path = "./minimind-v1-small"
tokenizer_name_or_path = "./minimind-v1-small"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, trust_remote_code=True, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True).to(args.device)
target_modules = find_linear_with_keys(model)
peft_config = LoraConfig(
r=8,
target_modules=target_modules
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
model = model.to(args.device)
return model, tokenizer
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="MiniMind LoRA Fine-tuning")
parser.add_argument("--out_dir", type=str, default="out", help="Output directory")
parser.add_argument("--epochs", type=int, default=20, help="Number of epochs")
parser.add_argument("--batch_size", type=int, default=32, help="Batch size")
parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate")
parser.add_argument("--device", type=str, default="cuda:0" if torch.cuda.is_available() else "cpu",
help="Device to use")
parser.add_argument("--dtype", type=str, default="bfloat16", help="Data type")
parser.add_argument("--use_wandb", action="store_true", help="Use Weights & Biases")
parser.add_argument("--wandb_project", type=str, default="MiniMind-LoRA", help="Weights & Biases project name")
parser.add_argument("--num_workers", type=int, default=1, help="Number of workers for data loading")
parser.add_argument("--accumulation_steps", type=int, default=1, help="Gradient accumulation steps")
parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping threshold")
parser.add_argument("--warmup_iters", type=int, default=1000, help="Number of warmup iterations")
parser.add_argument("--log_interval", type=int, default=100, help="Logging interval")
parser.add_argument("--save_interval", type=int, default=1000, help="Model saving interval")
args = parser.parse_args()
lm_config = LMConfig()
max_seq_len = lm_config.max_seq_len
args.save_dir = args.out_dir
os.makedirs(args.save_dir, exist_ok=True)
os.makedirs(args.out_dir, exist_ok=True)
tokens_per_iter = args.batch_size * max_seq_len
torch.manual_seed(1337)
device_type = "cuda" if "cuda" in args.device else "cpu"
args.wandb_run_name = f"MiniMind-LoRA-Epoch-{args.epochs}-BatchSize-{args.batch_size}-LearningRate-{args.learning_rate}"
ctx = nullcontext() if device_type == "cpu" else torch.cuda.amp.autocast()
if args.use_wandb:
import wandb
wandb.init(project=args.wandb_project, name=args.wandb_run_name)
else:
wandb = None
model, tokenizer = init_model()
df = pd.read_csv('./dataset/sft_data_single.csv')
df = df.sample(frac=1.0)
train_ds = SFTDataset(df, tokenizer, max_length=max_seq_len)
train_loader = DataLoader(
train_ds,
batch_size=args.batch_size,
pin_memory=True,
drop_last=False,
shuffle=False,
num_workers=args.num_workers,
)
scaler = torch.cuda.amp.GradScaler(enabled=(args.dtype in ['float16', 'bfloat16']))
optimizer = optim.Adam(
filter(lambda p: p.requires_grad, model.parameters()),
lr=args.learning_rate
)
if False and platform.system() != 'Windows' and float(torch.__version__.split('.')[0]) >= 2:  # torch.compile path intentionally disabled
Logger("compiling the model... (takes a ~minute)")
unoptimized_model = model
model = torch.compile(model)
iter_per_epoch = len(train_loader)
for epoch in range(args.epochs):
train_epoch(epoch, wandb)
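A toy check of `find_linear_with_keys`, assuming the function defined above is in scope; the module layout is made up for illustration, while `wq`/`wk` are the keys the script targets by default:

import torch

class Block(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.wq = torch.nn.Linear(8, 8)  # matched by key 'wq'
        self.wk = torch.nn.Linear(8, 8)  # matched by key 'wk'
        self.wo = torch.nn.Linear(8, 8)  # not matched

toy = torch.nn.ModuleList([Block(), Block()])
print(find_linear_with_keys(toy))  # -> ['0.wq', '0.wk', '1.wq', '1.wk']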

View File

@@ -1,48 +0,0 @@
import os
import warnings
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
from transformers import TrainingArguments, AutoModelForCausalLM, AutoTokenizer
from trl import DPOConfig, DPOTrainer
from datasets import load_dataset
warnings.filterwarnings('ignore')
def init_model():
device = 'cuda:0'
# Load the policy model; with ref_model=None, DPOTrainer creates the frozen reference copy itself
model_name_or_path = "minimind-v1"
tokenizer_name_or_path = "minimind-v1"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, trust_remote_code=True, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token
model = model.to(device)
return model, tokenizer
if __name__ == '__main__':
model, tokenizer = init_model()
training_config = DPOConfig(
output_dir="./minimind_dpo",
per_device_train_batch_size=1,
remove_unused_columns=False,
report_to="none",
save_steps=2000,
learning_rate=4e-5
)
dataset_path = './dataset/dpo/train_data.json'
train_dataset = load_dataset('json', data_files=dataset_path)
dpo_trainer = DPOTrainer(
model,
ref_model=None,
args=training_config,
beta=0.1,
train_dataset=train_dataset['train'],
tokenizer=tokenizer,
max_length=512,
max_prompt_length=512
)
dpo_trainer.train()
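For context, each record in `./dataset/dpo/train_data.json` is expected to follow the conversational preference layout below (inferred from the `rl_process` converter in `data_process.py`; the strings are placeholders):

record = {
    "chosen": [
        {"role": "user", "content": "..."},
        {"role": "assistant", "content": "preferred answer"},
    ],
    "rejected": [
        {"role": "user", "content": "..."},
        {"role": "assistant", "content": "dispreferred answer"},
    ],
}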

View File

@@ -1,185 +0,0 @@
import csv
import glob
import os
import re
import json
import jsonlines
import pandas as pd
from tqdm import tqdm
bos_token = "<s>"
eos_token = "</s>"
def pretrain_process():
# input and output paths
input_dir = '../CCI3-HQ/data'
output_file = '../dataset/pretrain_data_hq.csv'
jsonl_files = glob.glob(os.path.join(input_dir, 'part_*.jsonl'))
# count lines per file so each progress bar gets the correct total
line_counts = {}
print("Counting total lines...")
for file in jsonl_files:
with open(file, 'r', encoding='utf-8') as f:
line_counts[file] = sum(1 for _ in f)
with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
writer = csv.writer(csvfile)
writer.writerow(['text', 'score'])  # header row
for jsonl_file in jsonl_files:
with open(jsonl_file, 'r', encoding='utf-8') as f:
for line in tqdm(f, desc=f'Processing {os.path.basename(jsonl_file)}', total=line_counts[jsonl_file],
unit='', leave=False):
try:
data = json.loads(line)
text = data.get('text', '')
score = data.get('score', 0)
# keep short, high-scoring samples only
if len(text) <= 512 and score > 3.5:
writer.writerow([text, score])
except json.JSONDecodeError:
continue
print(f"Filtering complete; results saved to {output_file}")
def sft_process():
sft_file_name = 'sft_data.csv'
def process_and_write_data(data):
q_lst, a_lst, history_lst = [], [], []
for per in data:
history, q, a = per['history'], per['q'], per['a']
if not q or not a:
continue
history_len = sum(len(s) for s in history)
message_len = history_len + len(q) + len(a)
if message_len < 70 or message_len > 512:
continue
q_lst.append(q)
a_lst.append(a)
history_lst.append(history)
df = pd.DataFrame({'history': history_lst, 'q': q_lst, 'a': a_lst})
df.to_csv(f'../dataset/{sft_file_name}',
mode='a', header=False, index=False,
lineterminator='\r\n', escapechar='\\', encoding='utf-8')
chunk_size = 1000
data = []
with open(f'../dataset/{sft_file_name}', 'w', encoding='utf-8') as f:
f.write('history,q,a\n')
# sft_path = ['/root/shared-nvme/sft_data_zh.jsonl', '/root/shared-nvme/sft_data_en.jsonl']
sft_path = ['/root/shared-nvme/sft_data_en.jsonl']
chunk_num = 0
for path in sft_path:
with jsonlines.open(path) as reader:
for idx, obj in enumerate(reader):
try:
data.append({
'history': obj.get('history', ''),
'q': obj.get('input', '') + obj.get('q', ''),
'a': obj.get('output', '') + obj.get('a', '')
})
if len(data) >= chunk_size:
chunk_num += 1
process_and_write_data(data)
data = []
if chunk_num % 100 == 0:
print(f'chunk {chunk_num} processed')
except jsonlines.InvalidLineError as e:
print(f"Skipping invalid JSON line {idx + 1}: {e}")
continue
if data:
process_and_write_data(data)
data = []
def rl_process():
# Preference data defaults to Chinese only (recommended)
input_paths = [
# "../dataset/dpo_en.json",
"../dataset/dpo_zh.json"
]
output_path = "../dataset/dpo_data.jsonl"  # output file extension changed to .jsonl
all_converted = []
for input_path in input_paths:
with open(input_path, "r", encoding="utf-8") as f:
data = json.load(f) # data is likely a list
for item in data:
new_data = {
"chosen": [],
"rejected": []
}
for turn in item["conversations"]:
role = "user" if turn["from"] == "human" else "assistant"
message = {"role": role, "content": turn["value"]}
new_data["chosen"].append(message)
new_data["rejected"].append(message)
new_data["chosen"].append({
"role": "assistant",
"content": item["chosen"]["value"]
})
new_data["rejected"].append({
"role": "assistant",
"content": item["rejected"]["value"]
})
all_converted.append(new_data)
with open(output_path, "w", encoding="utf-8") as f:
for item in all_converted:
f.write(json.dumps(item, ensure_ascii=False) + "\n")
def lora_dataset():
import json
import csv
# read the source JSON file
with open('../dataset/Chinese-medical-dialogue.json', 'r', encoding='utf-8') as f:
data = json.load(f)
# build the CSV rows
csv_data = []
for item in data:
# take input/output and strip surrounding whitespace
q = item['input'].strip()
a = item['output'].strip()
# keep only pairs within the length limit
if len(q) + len(a) < 160:
csv_data.append({
'history': '[]',
'q': q,
'a': a
})
# write the CSV file
with open('../dataset/medical_sft.csv', 'w', newline='', encoding='utf-8') as csvfile:
fieldnames = ['history', 'q', 'a']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(csv_data)
print(f'Conversion complete; {len(csv_data)} valid records processed')
if __name__ == "__main__":
################
# 1: pretrain
# 2: sft
# 3: RL (DPO preference data)
# 4: lora dataset
################
process_type = 4
if process_type == 1:
pretrain_process()
if process_type == 2:
sft_process()
if process_type == 3:
rl_process()
if process_type == 4:
lora_dataset()
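A quick, optional sanity check after running any of the converters above (paths assumed to match the outputs written by this script):

import pandas as pd

df = pd.read_csv('../dataset/medical_sft.csv')  # or sft_data.csv / pretrain_data_hq.csv
print(df.shape)    # row and column counts
print(df.head(3))  # inspect a few rows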

View File

@@ -1,97 +0,0 @@
# from datasets import load_dataset
#
# dataset_paths = [
# ['ceval/ceval-exam',
# ['computer_network', 'operating_system', 'computer_architecture', 'college_programming', 'college_physics',
# 'college_chemistry', 'advanced_mathematics', 'probability_and_statistics', 'discrete_mathematics',
# 'electrical_engineer', 'metrology_engineer', 'high_school_mathematics', 'high_school_physics',
# 'high_school_chemistry', 'high_school_biology', 'middle_school_mathematics', 'middle_school_biology',
# 'middle_school_physics', 'middle_school_chemistry', 'veterinary_medicine', 'college_economics',
# 'business_administration', 'marxism', 'mao_zedong_thought', 'education_science', 'teacher_qualification',
# 'high_school_politics', 'high_school_geography', 'middle_school_politics', 'middle_school_geography',
# 'modern_chinese_history', 'ideological_and_moral_cultivation', 'logic', 'law', 'chinese_language_and_literature',
# 'art_studies', 'professional_tour_guide', 'legal_professional', 'high_school_chinese', 'high_school_history',
# 'middle_school_history', 'civil_servant', 'sports_science', 'plant_protection', 'basic_medicine',
# 'clinical_medicine', 'urban_and_rural_planner', 'accountant', 'fire_engineer',
# 'environmental_impact_assessment_engineer', 'tax_accountant', 'physician']], # ceval*
# ['haonan-li/cmmlu', [
# 'agronomy', 'anatomy', 'ancient_chinese', 'arts', 'astronomy', 'business_ethics',
# 'chinese_civil_service_exam', 'chinese_driving_rule', 'chinese_food_culture',
# 'chinese_foreign_policy', 'chinese_history', 'chinese_literature',
# 'chinese_teacher_qualification', 'clinical_knowledge', 'college_actuarial_science',
# 'college_education', 'college_engineering_hydrology', 'college_law',
# 'college_mathematics', 'college_medical_statistics', 'college_medicine',
# 'computer_science', 'computer_security', 'conceptual_physics',
# 'construction_project_management', 'economics', 'education', 'electrical_engineering',
# 'elementary_chinese', 'elementary_commonsense', 'elementary_information_and_technology',
# 'elementary_mathematics', 'ethnology', 'food_science', 'genetics', 'global_facts',
# 'high_school_biology', 'high_school_chemistry', 'high_school_geography',
# 'high_school_mathematics', 'high_school_physics', 'high_school_politics',
# 'human_sexuality', 'international_law', 'journalism', 'jurisprudence',
# 'legal_and_moral_basis', 'logical', 'machine_learning', 'management', 'marketing',
# 'marxist_theory', 'modern_chinese', 'nutrition', 'philosophy', 'professional_accounting',
# 'professional_law', 'professional_medicine', 'professional_psychology',
# 'public_relations', 'security_study', 'sociology', 'sports_science',
# 'traditional_chinese_medicine', 'virology', 'world_history', 'world_religions'
# ]], # cmmlu*
# ['tyouisen/aclue',
# ['polysemy_resolution', 'poetry_sentiment_analysis', 'named_entity_recognition', 'basic_ancient_chinese',
# 'poetry_context_prediction', 'sentence_segmentation', 'couplet_prediction', 'poetry_appreciate',
# 'ancient_chinese_culture', 'ancient_phonetics', 'homographic_character_resolution', 'ancient_literature',
# 'ancient_medical', 'poetry_quality_assessment', 'reading_comprehension']], # aclue
# ['juletxara/mgsm', ['zh']], # mgsm_direct_zh
# ['openbookqa', ['main']], # openbookqa
# ['ZoneTwelve/tmmluplus',
# ['dentistry', 'traditional_chinese_medicine_clinical_medicine', 'clinical_psychology', 'technical',
# 'culinary_skills', 'mechanical', 'logic_reasoning', 'real_estate', 'general_principles_of_law', 'finance_banking',
# 'anti_money_laundering', 'ttqav2', 'marketing_management', 'business_management', 'organic_chemistry',
# 'advance_chemistry', 'physics', 'secondary_physics', 'human_behavior', 'national_protection', 'jce_humanities',
# 'politic_science', 'agriculture', 'official_document_management', 'financial_analysis', 'pharmacy',
# 'educational_psychology', 'statistics_and_machine_learning', 'management_accounting', 'introduction_to_law',
# 'computer_science', 'veterinary_pathology', 'accounting', 'fire_science', 'optometry', 'insurance_studies',
# 'pharmacology', 'taxation', 'education_(profession_level)', 'economics', 'veterinary_pharmacology',
# 'nautical_science', 'occupational_therapy_for_psychological_disorders', 'trust_practice', 'geography_of_taiwan',
# 'physical_education', 'auditing', 'administrative_law', 'basic_medical_science', 'macroeconomics', 'trade',
# 'chinese_language_and_literature', 'tve_design', 'junior_science_exam', 'junior_math_exam', 'junior_chinese_exam',
# 'junior_social_studies', 'tve_mathematics', 'tve_chinese_language', 'tve_natural_sciences', 'junior_chemistry',
# 'music', 'education', 'three_principles_of_people', 'taiwanese_hokkien', 'engineering_math', 'linear_algebra']]
# # tmmluplus
#
# ]
#
# for dataset_path in dataset_paths:
# for dataset_name in dataset_path[1]:
# datasets = load_dataset(dataset_path[0], dataset_name, cache_dir='./test_dataset_cache')
#
# """
# export HF_HUB_OFFLINE=1 && lm_eval --model hf --model_args pretrained=/xxx/minimind/minimind-v2-small/,device=cuda,dtype=auto --tasks ceval* --batch_size 8 --trust_remote_code
# """
"""
$env:HF_HUB_OFFLINE=1; lm_eval --model hf --model_args pretrained=../minimind-v2-small/,device=cuda,dtype=auto --tasks ceval* --batch_size 8 --trust_remote_code
"""
import subprocess
# the command to run (cmd.exe syntax; see the PowerShell variant above)
command = (
'set HF_HUB_OFFLINE=1 & '
'lm_eval --model hf --model_args pretrained=../minimind-v2-small/,device=cuda,dtype=auto '
'--tasks ceval* --batch_size 8 --trust_remote_code'
)
# run the command with subprocess
try:
process = subprocess.run(
command,
shell=True,
check=True,
text=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
# print the command's output
print("STDOUT:", process.stdout)
print("STDERR:", process.stderr)
except subprocess.CalledProcessError as e:
print(f"Command failed with return code {e.returncode}")
print("STDERR:", e.stderr)
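Note that `set VAR=1 & ...` is cmd.exe syntax and `$env:VAR=1; ...` is PowerShell; a portable sketch passes the variable through the child process environment instead (same lm_eval flags as above, assumed unchanged):

import os
import subprocess

env = {**os.environ, "HF_HUB_OFFLINE": "1"}  # avoid Hub lookups, as in the shell variants
result = subprocess.run(
    ["lm_eval", "--model", "hf",
     "--model_args", "pretrained=../minimind-v2-small/,device=cuda,dtype=auto",
     "--tasks", "ceval*", "--batch_size", "8", "--trust_remote_code"],
    check=True, text=True, capture_output=True, env=env,
)
print("STDOUT:", result.stdout)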