update train_tokenizer

2024-11-06 17:48:33 +08:00 · 2024-11-06 17:48:33 +08:00 · 1240829c89
commit 1240829c89
parent 7c67ba0b92
1 changed files with 7 additions and 18 deletions
--- a/train_tokenizer.py
+++ b/train_tokenizer.py
@@ -122,36 +122,25 @@ def eval_tokenizer():

    messages = [
        {"role": "system", "content": "你是一个优秀的聊天机器人，总是给我正确的回应！"},
-        {"role": "user", "content": '是椭圆形的'},
-        {"role": "assistant", "content": '456'},
-        {"role": "user", "content": '456'},
-        {"role": "assistant", "content": '789'}
+        {"role": "user", "content": '你来自哪里？'},
+        {"role": "assistant", "content": '我来自地球'}
    ]
    new_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False
    )
-
    print(new_prompt)
-    # 获取词汇表大小（不包括特殊符号）
-    print('tokenizer词表大小：', tokenizer.vocab_size)

    # 获取实际词汇表长度（包括特殊符号）
    actual_vocab_size = len(tokenizer)
-    print('qwen实际词表长度：', actual_vocab_size)
+    print('tokenizer实际词表长度：', actual_vocab_size)

-    new_prompt = 'wenjie，椭圆和⚪的关系是什么呢？因为明天下午要带家人去下医院，所以申请上午在家办公，因为明天下午要带家人去下医院，所以申请上午在家办公，因为明天下午要带家人去下医院，所以申请上午在家办公，下午请半天假~@LWJWe '
-    print(new_prompt)
    model_inputs = tokenizer(new_prompt)
+    print('encoder长度：', len(model_inputs['input_ids']))

-    print(model_inputs)
-    print('长度：', len(model_inputs['input_ids']))
-
-    input_ids_ = model_inputs['input_ids']
-
-    response = tokenizer.decode(input_ids_)
-    print(response, end='')
-
+    input_ids = model_inputs['input_ids']
+    response = tokenizer.decode(input_ids)
+    print('decoder和原始文本是否一致：', response == new_prompt)

 def main():
    # train_tokenizer()