From 6fb569abd8490ca1299e250b2714dadcb33cf764 Mon Sep 17 00:00:00 2001
From: Yu Chengzhang <iomgaaycz@gmail.com>
Date: Mon, 23 Sep 2024 18:34:58 +0800
Subject: [PATCH 1/2] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E4=BA=86process=5Fseq=5F?=
 =?UTF-8?q?monkey=E5=86=99=E5=85=A5=E9=94=99=E8=AF=AF=E7=9A=84bug?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 data_process.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data_process.py b/data_process.py
index 2663ef1..9c03628 100644
--- a/data_process.py
+++ b/data_process.py
@@ -89,7 +89,7 @@ def process_seq_monkey(chunk_size=50000):
 
             if len(doc_ids) > 1000000:
                 arr = np.array(doc_ids, dtype=np.uint16)
-                with open(f'./dataset/clean_seq_monkey.bin', 'wb') as f:
+                with open(f'./dataset/clean_seq_monkey.bin', 'ab') as f:
                     f.write(arr.tobytes())
                 doc_ids = []
 
@@ -220,7 +220,7 @@ if __name__ == "__main__":
     # 2: sft
     # 3: RL
     ################
-    process_type = 3
+    process_type = 1
 
     if process_type == 1:
         pretrain_process()

From 0fa4d17d268c949f301460c926b3913fb95100ee Mon Sep 17 00:00:00 2001
From: Yu Chengzhang <iomgaaycz@gmail.com>
Date: Mon, 23 Sep 2024 18:36:04 +0800
Subject: [PATCH 2/2] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E4=BA=86=E5=BF=BD?=
 =?UTF-8?q?=E8=A7=86=E5=88=97=E8=A1=A8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..09f33a7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/model/__pycache__
+/dataset
\ No newline at end of file