From 6fb569abd8490ca1299e250b2714dadcb33cf764 Mon Sep 17 00:00:00 2001 From: Yu Chengzhang Date: Mon, 23 Sep 2024 18:34:58 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E4=BA=86process=5Fseq=5F?= =?UTF-8?q?monkey=E5=86=99=E5=85=A5=E9=94=99=E8=AF=AF=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- data_process.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data_process.py b/data_process.py index 2663ef1..9c03628 100644 --- a/data_process.py +++ b/data_process.py @@ -89,7 +89,7 @@ def process_seq_monkey(chunk_size=50000): if len(doc_ids) > 1000000: arr = np.array(doc_ids, dtype=np.uint16) - with open(f'./dataset/clean_seq_monkey.bin', 'wb') as f: + with open(f'./dataset/clean_seq_monkey.bin', 'ab') as f: f.write(arr.tobytes()) doc_ids = [] @@ -220,7 +220,7 @@ if __name__ == "__main__": # 2: sft # 3: RL ################ - process_type = 3 + process_type = 1 if process_type == 1: pretrain_process() From 0fa4d17d268c949f301460c926b3913fb95100ee Mon Sep 17 00:00:00 2001 From: Yu Chengzhang Date: Mon, 23 Sep 2024 18:36:04 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E4=BA=86=E5=BF=BD?= =?UTF-8?q?=E8=A7=86=E5=88=97=E8=A1=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..09f33a7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/model/__pycache__ +/dataset \ No newline at end of file