From 6fb569abd8490ca1299e250b2714dadcb33cf764 Mon Sep 17 00:00:00 2001 From: Yu Chengzhang Date: Mon, 23 Sep 2024 18:34:58 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E4=BA=86process=5Fseq=5Fmonk?= =?UTF-8?q?ey=E5=86=99=E5=85=A5=E9=94=99=E8=AF=AF=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- data_process.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data_process.py b/data_process.py index 2663ef1..9c03628 100644 --- a/data_process.py +++ b/data_process.py @@ -89,7 +89,7 @@ def process_seq_monkey(chunk_size=50000): if len(doc_ids) > 1000000: arr = np.array(doc_ids, dtype=np.uint16) - with open(f'./dataset/clean_seq_monkey.bin', 'wb') as f: + with open(f'./dataset/clean_seq_monkey.bin', 'ab') as f: f.write(arr.tobytes()) doc_ids = [] @@ -220,7 +220,7 @@ if __name__ == "__main__": # 2: sft # 3: RL ################ - process_type = 3 + process_type = 1 if process_type == 1: pretrain_process()