From 42bd06e55decc3701e0dce66be770d9ad336207b Mon Sep 17 00:00:00 2001 From: gongjy <2474590974@qq.com> Date: Wed, 23 Oct 2024 12:25:45 +0800 Subject: [PATCH] data_process: document escapechar workaround for to_csv; run sft_process with contain_history=True --- data_process.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/data_process.py b/data_process.py index be1a852..4d67dd3 100644 --- a/data_process.py +++ b/data_process.py @@ -75,7 +75,10 @@ def sft_process(contain_history=False): # 创建DataFrame并追加到CSV文件 df = pd.DataFrame({'history': history_lst, 'q': q_lst, 'a': a_lst}) + # 1、默认 df.to_csv(f'./dataset/{file_name}', mode='a', header=False, index=False, lineterminator='\r\n') + # 2、若遇到数据 `_csv.Error: need to escape, but no escapechar set` 问题,可加 escapechar='\\' 参数: + # df.to_csv(f'./dataset/{file_name}', mode='a', header=False, index=False, lineterminator='\r\n', escapechar='\\') chunk_size = 1000 # 每次处理的记录数 data = [] @@ -148,6 +151,6 @@ if __name__ == "__main__": if process_type == 1: pretrain_process() if process_type == 2: - sft_process(contain_history=False) + sft_process(contain_history=True) if process_type == 3: rl_process()