Merge pull request #42 from iomgaa-ycz/wandb

修复data_process.py文件追加的bug
This commit is contained in:
jingyaogong 2024-09-23 20:08:13 +08:00 committed by GitHub
commit 5f8279f661
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 4 additions and 2 deletions

2
.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
+/model/__pycache__
+/dataset

View File

@@ -89,7 +89,7 @@ def process_seq_monkey(chunk_size=50000):
 if len(doc_ids) > 1000000:
 arr = np.array(doc_ids, dtype=np.uint16)
-with open(f'./dataset/clean_seq_monkey.bin', 'wb') as f:
+with open(f'./dataset/clean_seq_monkey.bin', 'ab') as f:
 f.write(arr.tobytes())
 doc_ids = []
@@ -220,7 +220,7 @@ if __name__ == "__main__":
 # 2: sft
 # 3: RL
 ################
-process_type = 3
+process_type = 1
 if process_type == 1:
 pretrain_process()