Print 10 tokens during pretraining for easier observation
This commit is contained in:
parent 2797b76939
commit d701003f8a
@@ -14,7 +14,7 @@
     },
     {
       "id": 1,
-      "content": "<s>",
+      "content": "<|im_start|>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -23,7 +23,7 @@
     },
     {
       "id": 2,
-      "content": "</s>",
+      "content": "<|im_end|>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -56,8 +56,8 @@
     "ignore_merges": false,
     "vocab": {
       "<unk>": 0,
-      "<s>": 1,
-      "</s>": 2,
+      "<|im_start|>": 1,
+      "<|im_end|>": 2,
      "!": 3,
      "\"": 4,
      "#": 5,
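A quick sanity check one could run after this remapping (a sketch, not part of the commit; it assumes the models/minimind_tokenizer directory added later in this commit and the Hugging Face transformers loader):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./models/minimind_tokenizer")

# After this change, ids 1 and 2 decode to the ChatML-style markers
# instead of <s> / </s>.
assert tok.convert_ids_to_tokens(1) == "<|im_start|>"
assert tok.convert_ids_to_tokens(2) == "<|im_end|>"
print(tok.bos_token, tok.eos_token)  # -> <|im_start|> <|im_end|>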
@@ -12,7 +12,7 @@
       "special": true
     },
     "1": {
-      "content": "<s>",
+      "content": "<|im_start|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -20,7 +20,7 @@
       "special": true
     },
     "2": {
-      "content": "</s>",
+      "content": "<|im_end|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -29,9 +29,9 @@
     }
   },
   "additional_special_tokens": [],
-  "bos_token": "<s>",
+  "bos_token": "<|im_start|>",
   "clean_up_tokenization_spaces": false,
-  "eos_token": "</s>",
+  "eos_token": "<|im_end|>",
   "legacy": true,
   "model_max_length": 32768,
   "pad_token": "<unk>",
@@ -39,5 +39,5 @@
   "spaces_between_special_tokens": false,
   "tokenizer_class": "PreTrainedTokenizerFast",
   "unk_token": "<unk>",
-  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{{ '<s>system\\n' + system_message + '</s>\\n' }}{% else %}{{ '<s>system\\n你是 MiniMind,是一个有用的人工智能助手。</s>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<s>user\\n' + content + '</s>\\n<s>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' + '\\n' }}{% endif %}{% endfor %}"
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% else %}{{ '<|im_start|>system\\n你是 MiniMind,是一个有用的人工智能助手。<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}"
 }
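As an illustration (not part of the diff), the updated template can be exercised through the standard transformers chat-template API; the directory path is assumed from the file listing later in this commit:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./models/minimind_tokenizer")
messages = [{"role": "user", "content": "你好"}]

# The new template wraps each turn in <|im_start|>role ... <|im_end|>
# (ChatML style) and injects a default MiniMind system prompt when
# none is supplied.
print(tok.apply_chat_template(messages, tokenize=False))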
File diff suppressed because one or more lines are too long
@@ -583,16 +583,19 @@ class MiniMindLM(PreTrainedModel):
         return res
 
     def _stream(self, input_ids, eos_token_id, max_new_tokens, temperature, top_p, rp, **args):
-        start, first_seq, past_kvs = input_ids.shape[1], True, None
-        while input_ids.shape[1] < max_new_tokens - 1:
-            if first_seq:
-                out, first_seq = self(input_ids, **args), False
-            else:
-                out = self(input_ids[:, -1:],
-                           start_pos=input_ids.shape[1] - 1, **args)
-            logits, past_kvs = out.logits[:, -1, :], out.past_key_values
+        start = input_ids.shape[1]
+        for _ in range(max_new_tokens):
+            # Pass the full input_ids every step; no KV cache is used
+            out = self(input_ids, **args)
+            logits = out.logits[:, -1, :]  # logits at the last position
+
+            # Repetition penalty
             logits[:, list(set(input_ids.tolist()[0]))] /= rp
+
+            # Temperature scaling
             logits /= (temperature + 1e-9)
+
+            # Top-p sampling
             if top_p is not None and top_p < 1.0:
                 sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
                 sorted_probs = F.softmax(sorted_logits, dim=-1)
@@ -602,8 +605,14 @@ class MiniMindLM(PreTrainedModel):
             sorted_indices_to_remove[:, 0] = False
             indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
             logits[indices_to_remove] = -float('Inf')
+
+            # Sample the next token
             input_ids_next = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
             input_ids = torch.cat((input_ids, input_ids_next), dim=1)
+
+            # Yield the newly generated suffix
             yield input_ids[:, start:]
+
+            # Stop once the EOS token is generated
             if input_ids_next.item() == eos_token_id:
                 break
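One subtlety in the repetition-penalty line above, which the identical `_stream` rewrite below inherits: dividing raw logits by `rp` only suppresses tokens whose logit is positive; a negative logit divided by `rp > 1` moves toward zero and becomes more probable. A sign-aware (CTRL-style) variant is sketched here for comparison only; it is an assumption of this note, not part of the commit:

import torch

def apply_repetition_penalty(logits: torch.Tensor, seen_ids: torch.Tensor, rp: float) -> torch.Tensor:
    # Sign-aware penalty: divide positive logits, multiply negative ones,
    # so previously seen tokens always become less likely.
    score = logits.gather(1, seen_ids)
    score = torch.where(score > 0, score / rp, score * rp)
    return logits.scatter(1, seen_ids, score)

# Usage sketch: seen_ids is a [1, n] tensor of unique, already generated ids.
logits = torch.randn(1, 6400)
seen_ids = torch.tensor([[1, 2, 3]])
logits = apply_repetition_penalty(logits, seen_ids, rp=1.1)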
@@ -453,16 +453,19 @@ class MiniMindLM(PreTrainedModel):
         return res
 
     def _stream(self, input_ids, eos_token_id, max_new_tokens, temperature, top_p, rp, **args):
-        start, first_seq, past_kvs = input_ids.shape[1], True, None
-        while input_ids.shape[1] < max_new_tokens - 1:
-            if first_seq:
-                out, first_seq = self(input_ids, **args), False
-            else:
-                out = self(input_ids[:, -1:],
-                           start_pos=input_ids.shape[1] - 1, **args)
-            logits, past_kvs = out.logits[:, -1, :], out.past_key_values
+        start = input_ids.shape[1]
+        for _ in range(max_new_tokens):
+            # Pass the full input_ids every step; no KV cache is used
+            out = self(input_ids, **args)
+            logits = out.logits[:, -1, :]  # logits at the last position
+
+            # Repetition penalty
             logits[:, list(set(input_ids.tolist()[0]))] /= rp
+
+            # Temperature scaling
             logits /= (temperature + 1e-9)
+
+            # Top-p sampling
             if top_p is not None and top_p < 1.0:
                 sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
                 sorted_probs = F.softmax(sorted_logits, dim=-1)
@@ -472,8 +475,14 @@ class MiniMindLM(PreTrainedModel):
             sorted_indices_to_remove[:, 0] = False
             indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
             logits[indices_to_remove] = -float('Inf')
+
+            # Sample the next token
             input_ids_next = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
             input_ids = torch.cat((input_ids, input_ids_next), dim=1)
+
+            # Yield the newly generated suffix
             yield input_ids[:, start:]
+
+            # Stop once the EOS token is generated
             if input_ids_next.item() == eos_token_id:
                 break
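For illustration, a minimal way to consume the rewritten `_stream` generator (a hypothetical sketch: the `model` and `tokenizer` instances, the prompt, and the sampling values are assumptions, not part of the commit):

import torch

prompt_ids = tokenizer("Why is the sky blue?", return_tensors="pt").input_ids

with torch.no_grad():
    # Each yield is the full generated suffix so far, shape [1, n_new];
    # with no KV cache, every step recomputes the whole sequence.
    for chunk in model._stream(prompt_ids, tokenizer.eos_token_id,
                               max_new_tokens=64, temperature=0.7,
                               top_p=0.9, rp=1.05):
        print(tokenizer.decode(chunk[0]), end="\r")
print()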
models/minimind_tokenizer/tokenizer.json (new file, 12603 lines)
File diff suppressed because it is too large.

models/minimind_tokenizer/tokenizer_config.json (new file, 43 lines)
@@ -0,0 +1,43 @@
+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<|im_start|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "legacy": true,
+  "model_max_length": 32768,
+  "pad_token": "<unk>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "unk_token": "<unk>",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% else %}{{ '<|im_start|>system\\n你是 MiniMind,是一个有用的人工智能助手。<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}"
+}
models/minimind_tokenizer/vocab.json (new file, 1 line)
File diff suppressed because one or more lines are too long.
@@ -18,21 +18,20 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
     --out_dir "out" \
     --epochs 3 \
     --embedding_epoch 2 \
-    --batch_size 48 \
-    --learning_rate 2e-4 \
+    --batch_size 64 \
+    --learning_rate 8e-5 \
     --dtype bfloat16 \
     --use_swanlab \
     --swanlab_project "MiniMind-Pretrain" \
     --num_workers 1 \
-    --accumulation_steps 32 \
-    --grad_clip 1.0 \
+    --accumulation_steps 16 \
+    --grad_clip 0.5 \
     --warmup_iters 0 \
     --log_interval 100 \
     --save_interval 10000 \
     --dim 1024 \
-    --n_layers 18 \
+    --n_layers 48 \
     --max_seq_len 512 \
     --use_moe False \
     --data_path "./dataset/stable/merged_pretrain.jsonl" \
     --profile \
     --profile_interval 10 \
@@ -44,4 +43,4 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
     --cluster_cache_path "./cache/cluster_tokens_single.pt" \
     --memory_monitor_interval 10 \
     --model_type "model_original" \
-    --model_size 814.724
+    --model_size 538
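A back-of-envelope reading of the new settings (an illustration, not part of the commit; the GPU count is assumed from CUDA_VISIBLE_DEVICES): despite the larger per-GPU micro-batch, the effective optimizer batch shrinks.

# Old: 4 GPUs x 48 micro-batch x 32 accumulation = 6144 sequences per step
# New: 4 GPUs x 64 micro-batch x 16 accumulation = 4096 sequences per step
gpus, batch_size, accumulation_steps, max_seq_len = 4, 64, 16, 512

effective_batch = gpus * batch_size * accumulation_steps
tokens_per_step = effective_batch * max_seq_len
print(effective_batch, tokens_per_step)  # 4096 sequences, 2097152 tokens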
@@ -685,6 +685,47 @@ def train_epoch(epoch, accelerator, model, train_loader, optimizer, scheduler, a
                        f"Bwd: {backward_time/args.log_interval:.2f}ms, "
                        f"Optim: {optimizer_time/args.log_interval:.2f}ms, "
                        f"Iter Time: {iter_time:.2f}ms", accelerator)
+
+                # Print a generated text sample
+                try:
+                    # Randomly pick one sample from the batch
+                    random_idx = torch.randint(0, X.size(0), (1,)).item()
+                    sample_input = X[random_idx:random_idx+1]  # [1, seq_len]
+
+                    # Use the leading part as the prompt (e.g. the first half)
+                    prompt_len = min(sample_input.size(1) // 2, sample_input.size(1) - 10)
+                    prompt_input = sample_input[:, :prompt_len]
+
+                    # Generate 10 tokens
+                    unwrapped_model = accelerator.unwrap_model(model)
+                    unwrapped_model.eval()  # switch to eval mode
+
+                    with torch.no_grad():
+                        generated = unwrapped_model.generate(
+                            prompt_input,
+                            max_new_tokens=10,
+                            temperature=0.7,
+                            top_p=0.9,
+                            eos_token_id=tokenizer.eos_token_id,
+                            pad_token_id=tokenizer.pad_token_id
+                        )
+
+                    # Decode to human-readable text
+                    original_text = tokenizer.decode(sample_input[0], skip_special_tokens=True)
+                    prompt_text = tokenizer.decode(prompt_input[0], skip_special_tokens=True)
+                    generated_text = tokenizer.decode(generated[0], skip_special_tokens=True)
+                    new_tokens_text = generated_text[len(prompt_text):]
+
+                    Logger(f"Generated text sample:", accelerator)
+                    Logger(f"  Original text: {original_text[:100]}...", accelerator)
+                    Logger(f"  Prompt: {prompt_text[-50:]}", accelerator)
+                    Logger(f"  Generated continuation: {new_tokens_text}", accelerator)
+
+                    unwrapped_model.train()  # restore training mode
+
+                except Exception as e:
+                    Logger(f"Failed to generate text sample: {e}", accelerator)
+
                 # Reset events so the next measurement starts from zero
                 data_start = torch.cuda.Event(enable_timing=True)
                 data_end = torch.cuda.Event(enable_timing=True)
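A caveat on `new_tokens_text = generated_text[len(prompt_text):]` above: text-level slicing can misalign when decode() merges whitespace or multi-byte pieces across the prompt boundary, i.e. when decoding is not prefix-stable. A token-level alternative (a sketch, not part of the commit; the helper name is hypothetical):

def decode_new_tokens(tokenizer, generated_ids, prompt_len: int) -> str:
    # Slice at the token level, then decode only the continuation;
    # immune to decode() behaving differently at the prompt boundary.
    return tokenizer.decode(generated_ids[prompt_len:], skip_special_tokens=True)

# e.g. new_tokens_text = decode_new_tokens(tokenizer, generated[0], prompt_input.size(1))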