# Source listing metadata: 28 lines, 906 B, Python
"""Build a predicate vocabulary file from a processed JSON dataset.

Loads the JSON document at ``input_path``, gathers every distinct
``predicate`` value found in the ``target`` triples of its items, and writes
the result as a JSON array to ``output_path``.
"""
import json
from collections import Counter

input_path = '/home/rwkv/RWKV-TS/RETRO_TEST/extract/processed_trex_data.json'
output_path = '/home/rwkv/RWKV-TS/RETRO_TEST/extract/predicate_vocab.json'


def collect_predicates(data):
    """Return the distinct predicates found in *data*, in a stable order.

    Args:
        data: An iterable of items. Only ``dict`` items whose ``'target'``
            value is a ``list`` are inspected; within that list only ``dict``
            entries that have a ``'predicate'`` key contribute. Everything
            else is skipped.

    Returns:
        A list with one entry per distinct predicate value, ordered by type
        name and then by string form, so the same input always yields the
        same output.

    Raises:
        TypeError: If a predicate value is unhashable (for example a JSON
            object or array), because predicates are deduplicated in a set.
    """
    predicates = set()
    for item in data:
        # Non-dict items cannot carry a 'target' field; skip them instead of
        # failing on the subscript below.
        if not isinstance(item, dict):
            continue
        triples = item.get('target')
        if not isinstance(triples, list):
            continue
        for triple in triples:
            if isinstance(triple, dict) and 'predicate' in triple:
                predicates.add(triple['predicate'])
    # Set iteration order for strings varies between interpreter runs (hash
    # randomization), so sort to keep the vocabulary file reproducible.
    # Grouping by type name first keeps mixed-type values sortable.
    return sorted(predicates, key=lambda p: (type(p).__name__, str(p)))


def main():
    """Read ``input_path``, write the vocabulary to ``output_path``, report."""
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    predicate_list = collect_predicates(data)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(predicate_list, f, ensure_ascii=False, indent=2)

    print(f'已统计{len(predicate_list)}个谓词,保存到 {output_path}')


if __name__ == '__main__':
    main()