update train_tokenizer
This commit is contained in:
parent
7c67ba0b92
commit
1240829c89
@ -122,36 +122,25 @@ def eval_tokenizer():
|
|||||||
|
|
||||||
messages = [
|
messages = [
|
||||||
{"role": "system", "content": "你是一个优秀的聊天机器人,总是给我正确的回应!"},
|
{"role": "system", "content": "你是一个优秀的聊天机器人,总是给我正确的回应!"},
|
||||||
{"role": "user", "content": '是椭圆形的'},
|
{"role": "user", "content": '你来自哪里?'},
|
||||||
{"role": "assistant", "content": '456'},
|
{"role": "assistant", "content": '我来自地球'}
|
||||||
{"role": "user", "content": '456'},
|
|
||||||
{"role": "assistant", "content": '789'}
|
|
||||||
]
|
]
|
||||||
new_prompt = tokenizer.apply_chat_template(
|
new_prompt = tokenizer.apply_chat_template(
|
||||||
messages,
|
messages,
|
||||||
tokenize=False
|
tokenize=False
|
||||||
)
|
)
|
||||||
|
|
||||||
print(new_prompt)
|
print(new_prompt)
|
||||||
# 获取词汇表大小(不包括特殊符号)
|
|
||||||
print('tokenizer词表大小:', tokenizer.vocab_size)
|
|
||||||
|
|
||||||
# 获取实际词汇表长度(包括特殊符号)
|
# 获取实际词汇表长度(包括特殊符号)
|
||||||
actual_vocab_size = len(tokenizer)
|
actual_vocab_size = len(tokenizer)
|
||||||
print('qwen实际词表长度:', actual_vocab_size)
|
print('tokenizer实际词表长度:', actual_vocab_size)
|
||||||
|
|
||||||
new_prompt = 'wenjie,椭圆和⚪的关系是什么呢?因为明天下午要带家人去下医院,所以申请上午在家办公,因为明天下午要带家人去下医院,所以申请上午在家办公,因为明天下午要带家人去下医院,所以申请上午在家办公,下午请半天假~@LWJWe '
|
|
||||||
print(new_prompt)
|
|
||||||
model_inputs = tokenizer(new_prompt)
|
model_inputs = tokenizer(new_prompt)
|
||||||
|
print('encoder长度:', len(model_inputs['input_ids']))
|
||||||
|
|
||||||
print(model_inputs)
|
input_ids = model_inputs['input_ids']
|
||||||
print('长度:', len(model_inputs['input_ids']))
|
response = tokenizer.decode(input_ids)
|
||||||
|
print('decoder和原始文本是否一致:', response == new_prompt)
|
||||||
input_ids_ = model_inputs['input_ids']
|
|
||||||
|
|
||||||
response = tokenizer.decode(input_ids_)
|
|
||||||
print(response, end='')
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
# train_tokenizer()
|
# train_tokenizer()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user