diff --git a/1-pretrain.py b/1-pretrain.py
index a7f4a5b..dcf3f06 100644
--- a/1-pretrain.py
+++ b/1-pretrain.py
@@ -99,7 +99,7 @@ def init_model():
 
     # model init
     model = Transformer(lm_config).to(device)
-    # moe_path = '_moe' if lm_config.use_moe else ''
+    moe_path = '_moe' if lm_config.use_moe else ''
     # ckp = f'{save_dir}/pretrain_{lm_config.dim}{moe_path}.pth'
     #
     # state_dict = torch.load(ckp, map_location=device)
diff --git a/2-eval.py b/2-eval.py
index 5297db8..5abdace 100644
--- a/2-eval.py
+++ b/2-eval.py
@@ -21,7 +21,7 @@ def init_model(lm_config):
 
     if model_from == 1:
         moe_path = '_moe' if lm_config.use_moe else ''
-        ckp = f'./out/single_chat/full_sft_{lm_config.dim}{moe_path}.pth'
+        ckp = f'./out/full_sft_{lm_config.dim}{moe_path}.pth'
 
         model = Transformer(lm_config)
         state_dict = torch.load(ckp, map_location=device)
diff --git a/3-full_sft.py b/3-full_sft.py
index 2097faf..4ea9cee 100644
--- a/3-full_sft.py
+++ b/3-full_sft.py
@@ -108,17 +108,15 @@ def init_model(lm_config):
         return sum(p.numel() for p in model.parameters() if p.requires_grad)
 
     if model_from == 1:
+        model = Transformer(lm_config)
         moe_path = '_moe' if lm_config.use_moe else ''
         ckp = f'./out/pretrain_{lm_config.dim}{moe_path}.pth'
-
-        model = Transformer(lm_config)
-        # state_dict = torch.load(ckp, map_location=device)
-        #
-        # unwanted_prefix = '_orig_mod.'
-        # for k, v in list(state_dict.items()):
-        #     if k.startswith(unwanted_prefix):
-        #         state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
-        # model.load_state_dict(state_dict, strict=False)
+        state_dict = torch.load(ckp, map_location=device)
+        unwanted_prefix = '_orig_mod.'
+        for k, v in list(state_dict.items()):
+            if k.startswith(unwanted_prefix):
+                state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
+        model.load_state_dict(state_dict, strict=False)
     else:
         model = AutoModel.from_pretrained('./minimind', trust_remote_code=True)
 
@@ -148,7 +146,7 @@ if __name__ == "__main__":
     out_dir = 'out'
     epochs = 19
    gradient_accumulation_steps = 1
-    batch_size = 80
+    batch_size = 50
     learning_rate = 2e-4
     device = 'cuda:0'
     dtype = 'bfloat16'
@@ -175,7 +173,7 @@ if __name__ == "__main__":
 
     model, tokenizer = init_model(lm_config)
     # -----init dataloader------
-    df = pd.read_csv('./dataset/sft_data_single.csv')
+    df = pd.read_csv('./dataset/sft_data_multi.csv')
     df = df.sample(frac=1.0)
     train_ds = SFTDataset(df, tokenizer, max_length=max_seq_len)
     train_sampler = DistributedSampler(train_ds) if ddp else None
diff --git a/model/mistral_tokenizer/tokenizer.model b/model/mistral_tokenizer/tokenizer.model
new file mode 100644
index 0000000..85c0803
Binary files /dev/null and b/model/mistral_tokenizer/tokenizer.model differ
diff --git a/model/mistral_tokenizer/tokenizer_config.json b/model/mistral_tokenizer/tokenizer_config.json
new file mode 100644
index 0000000..eb538f9
--- /dev/null
+++ b/model/mistral_tokenizer/tokenizer_config.json
@@ -0,0 +1,44 @@
+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+ "model_max_length": 1000000000000000019884624838656, + "pad_token": null, + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ 'user\\n' + content + '\\nassistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '' + '\\n' }}{% endif %}{% endfor %}" +}