From 6fb569abd8490ca1299e250b2714dadcb33cf764 Mon Sep 17 00:00:00 2001
From: Yu Chengzhang <iomgaaycz@gmail.com>
Date: Mon, 23 Sep 2024 18:34:58 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E4=BA=86process=5Fseq=5Fmonk?=
 =?UTF-8?q?ey=E5=86=99=E5=85=A5=E9=94=99=E8=AF=AF=E7=9A=84bug?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

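process_seq_monkey() writes doc_ids to ./dataset/clean_seq_monkey.bin
whenever the buffer grows past 1,000,000 ids and then resets it. The
file was opened with 'wb', which truncates it on every open, so each
write discarded what earlier writes had stored. Open it with 'ab' so
that successive writes accumulate.

Note: in append mode an output file left over from a previous run is
extended rather than replaced, so remove it before re-running.

Also change the default process_type in the __main__ block from 3 to 1,
which selects pretrain_process().

---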
 data_process.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data_process.py b/data_process.py
index 2663ef1..9c03628 100644
--- a/data_process.py
+++ b/data_process.py
@@ -89,7 +89,7 @@ def process_seq_monkey(chunk_size=50000):
 
             if len(doc_ids) > 1000000:
                 arr = np.array(doc_ids, dtype=np.uint16)
-                with open(f'./dataset/clean_seq_monkey.bin', 'wb') as f:
+                with open(f'./dataset/clean_seq_monkey.bin', 'ab') as f:
                     f.write(arr.tobytes())
                 doc_ids = []
 
@@ -220,7 +220,7 @@ if __name__ == "__main__":
     # 2: sft
     # 3: RL
     ################
-    process_type = 3
+    process_type = 1
 
     if process_type == 1:
         pretrain_process()