Minimind/stat_predicate_vocab.py
2025-07-05 03:03:43 +00:00

28 lines
906 B
Python

import json
from collections import Counter
# Build a predicate vocabulary: read the input JSON, collect every distinct
# 'predicate' value found inside the records' 'target' lists, and write the
# result to the output path as a JSON array.
input_path = '/home/rwkv/RWKV-TS/RETRO_TEST/extract/processed_trex_data.json'
output_path = '/home/rwkv/RWKV-TS/RETRO_TEST/extract/predicate_vocab.json'

with open(input_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# A single set deduplicates across all records, so no per-record set is needed.
predicate_set = set()
for item in data:
    # Skip records that are not dicts: a string record containing the text
    # "target" would pass an `in` check and then fail on subscripting.
    if not isinstance(item, dict):
        continue
    triples = item.get('target')
    # Only a list-valued 'target' is scanned; a missing key yields None here.
    if not isinstance(triples, list):
        continue
    predicate_set.update(
        triple['predicate']
        for triple in triples
        if isinstance(triple, dict) and 'predicate' in triple
    )

# Set iteration order varies between runs (string hashes are randomized), so
# sort to make the vocabulary file reproducible. key=str keeps the sort from
# raising if the predicates are not all of one comparable type.
predicate_list = sorted(predicate_set, key=str)

with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(predicate_list, f, ensure_ascii=False, indent=2)

print(f'已统计{len(predicate_list)}个谓词,保存到 {output_path}')