"""Collect the distinct predicates found in a JSON dataset and save them.

The input file is expected to hold an iterable of records. Each record that is
a dict with a list under ``'target'`` contributes the ``'predicate'`` value of
every dict entry in that list. The distinct predicates are written to the
output file as a JSON array.
"""
import json
from collections import Counter  # currently unused; kept as in the original

input_path = '/home/rwkv/RWKV-TS/RETRO_TEST/extract/processed_trex_data.json'
output_path = '/home/rwkv/RWKV-TS/RETRO_TEST/extract/predicate_vocab.json'


def collect_predicates(data):
    """Return the distinct predicates found in *data* in a stable order.

    Args:
        data: Iterable of records. Only dict records whose ``'target'`` value
            is a list are inspected; within that list only dict entries that
            have a ``'predicate'`` key contribute. Everything else is skipped.

    Returns:
        A list of the unique predicate values, sorted so that the result does
        not depend on set iteration order.

    Raises:
        TypeError: If a predicate value is unhashable (e.g. a list or dict).
    """
    predicates = set()
    for item in data:
        # Apply the same defensive type check to records as to triples.
        if not isinstance(item, dict):
            continue
        triples = item.get('target')
        if not isinstance(triples, list):
            continue
        # The global set already deduplicates, so no per-item set is needed.
        predicates.update(
            triple['predicate']
            for triple in triples
            if isinstance(triple, dict) and 'predicate' in triple
        )
    # Set iteration order for strings varies between interpreter runs, which
    # would make the saved vocabulary order unreproducible. Sort it; the
    # type-name component keeps the sort from failing on mixed value types.
    return sorted(predicates, key=lambda p: (type(p).__name__, str(p)))


def main():
    """Read the input JSON, extract the predicate vocabulary, and save it."""
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    predicate_list = collect_predicates(data)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(predicate_list, f, ensure_ascii=False, indent=2)

    print(f'已统计{len(predicate_list)}个谓词,保存到 {output_path}')


if __name__ == '__main__':
    main()