#!/usr/bin/env python3
"""Collect the level-1 and level-2 department sets of every case in dataset/bbb.json.

The script loads the dataset, counts how often each department occurs, prints
a human-readable report and stores the same statistics as JSON.
"""

import json
from pathlib import Path
from collections import Counter, defaultdict


def load_dataset(file_path: str) -> list:
    """Load the dataset stored as a JSON array in *file_path*.

    Args:
        file_path: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The parsed list of case records, or an empty list when the file
        cannot be read, is not valid JSON, or does not contain a top-level
        JSON array. The failure reason is printed in every such case.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError, RecursionError) as e:
        # OSError: missing/unreadable file. ValueError: covers both
        # json.JSONDecodeError and UnicodeDecodeError. RecursionError:
        # pathologically nested JSON.
        print(f"加载文件 {file_path} 时出错: {e}")
        return []

    if not isinstance(data, list):
        # Callers iterate over the result and expect one record per item;
        # any other top-level JSON type would fail later in a confusing way.
        print(f"加载文件 {file_path} 时出错: 顶层 JSON 不是列表")
        return []
    return data


def _clean_department(value) -> str:
    """Return *value* without surrounding whitespace, or '' if it is not a string."""
    return value.strip() if isinstance(value, str) else ''


def analyze_departments(data: list) -> dict:
    """Count department occurrences and build the level-1 to level-2 mapping.

    Records that are not dicts are ignored for counting purposes (they still
    contribute to ``total_cases``). A department field that is missing, null,
    not a string, or blank after stripping is treated as absent.

    Args:
        data: List of case records; each record is expected to be a dict
            with the keys '一级科室' and '二级科室'.

    Returns:
        A dict with the keys:
            level1_counter: Counter of level-1 department names.
            level2_counter: Counter of level-2 department names.
            level1_to_level2: level-1 name -> sorted list of the level-2
                names seen together with it.
            level1_to_level2_counts: level-1 name -> {level-2 name: number
                of cases carrying exactly that pair}.
            total_cases: ``len(data)``.
            unique_level1: number of distinct level-1 names.
            unique_level2: number of distinct level-2 names.
    """
    level1_counter = Counter()
    level2_counter = Counter()
    # level-1 name -> Counter of the level-2 names recorded alongside it.
    pair_counts = defaultdict(Counter)

    for case in data:
        if not isinstance(case, dict):
            continue

        level1 = _clean_department(case.get('一级科室'))
        level2 = _clean_department(case.get('二级科室'))

        if level1:
            level1_counter[level1] += 1
        if level2:
            level2_counter[level2] += 1
        if level1 and level2:
            pair_counts[level1][level2] += 1

    return {
        'level1_counter': level1_counter,
        'level2_counter': level2_counter,
        # Sorted so the report and the saved JSON are identical across runs
        # (set iteration order of strings changes with hash randomization).
        'level1_to_level2': {k: sorted(v) for k, v in pair_counts.items()},
        'level1_to_level2_counts': {k: dict(v) for k, v in pair_counts.items()},
        'total_cases': len(data),
        'unique_level1': len(level1_counter),
        'unique_level2': len(level2_counter),
    }


def _print_distribution(title: str, counter: Counter, total: int) -> None:
    """Print one department distribution, most frequent department first."""
    print(title)
    print("-" * 40)
    ranked = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    for dept, count in ranked:
        # Guard against a hand-built stats dict that reports zero cases.
        percentage = (count / total) * 100 if total else 0.0
        print(f" {dept}: {count} 例 ({percentage:.1f}%)")
    print()


def print_statistics(stats: dict):
    """Print the statistics report to standard output.

    Args:
        stats: Dict in the shape returned by ``analyze_departments``. The
            key 'level1_to_level2_counts' is optional; without it the
            mapping section falls back to the global level-2 counts.
    """
    print("=" * 60)
    print("DATASET 科室统计报告")
    print("=" * 60)
    print(f"总病例数: {stats['total_cases']}")
    print(f"一级科室种类数: {stats['unique_level1']}")
    print(f"二级科室种类数: {stats['unique_level2']}")
    print()

    total = stats['total_cases']
    _print_distribution("一级科室分布:", stats['level1_counter'], total)
    _print_distribution("二级科室分布:", stats['level2_counter'], total)

    print("一级科室 → 二级科室映射:")
    print("-" * 40)
    pair_counts = stats.get('level1_to_level2_counts', {})
    for level1, level2_list in sorted(stats['level1_to_level2'].items()):
        print(f" {level1}:")
        per_level1 = pair_counts.get(level1, {})
        for level2 in sorted(level2_list):
            # Prefer the count of this exact (level-1, level-2) pair: the
            # global level-2 count is too large whenever the same level-2
            # name occurs under more than one level-1 department.
            count = per_level1.get(level2, stats['level2_counter'][level2])
            print(f" - {level2}: {count} 例")
        print()


def save_statistics(stats: dict, output_file: str):
    """Write the statistics to *output_file* as UTF-8 encoded JSON.

    Missing parent directories of *output_file* are created first.

    Args:
        stats: Dict in the shape returned by ``analyze_departments``.
        output_file: Destination path of the JSON report.

    Raises:
        OSError: If the directory or the file cannot be created or written.
    """
    save_data = {
        '一级科室列表': sorted(stats['level1_counter']),
        '二级科室列表': sorted(stats['level2_counter']),
        '一级科室计数': dict(stats['level1_counter']),
        '二级科室计数': dict(stats['level2_counter']),
        '一级科室到二级科室映射': stats['level1_to_level2'],
        '统计信息': {
            '总病例数': stats['total_cases'],
            '一级科室种类数': stats['unique_level1'],
            '二级科室种类数': stats['unique_level2'],
        },
    }

    # The target directory does not necessarily exist yet.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(save_data, f, ensure_ascii=False, indent=2)


def main():
    """Load the dataset, print the report and save the statistics as JSON."""
    # Both paths are resolved relative to the directory above this script's
    # own directory.
    project_root = Path(__file__).parent.parent
    dataset_file = project_root / "dataset" / "bbb.json"
    output_file = project_root / "analysis" / "dataset_department_statistics.json"

    print(f"正在加载数据集: {dataset_file}")
    data = load_dataset(str(dataset_file))

    if not data:
        print("无法加载数据集")
        return

    print(f"成功加载 {len(data)} 个病例")

    stats = analyze_departments(data)
    print_statistics(stats)

    save_statistics(stats, str(output_file))
    print(f"统计结果已保存到: {output_file}")

    # Additionally print the plain department lists.
    print("\n" + "=" * 60)
    print("科室列表(纯文本格式)")
    print("=" * 60)

    print("一级科室集合:")
    for dept in sorted(stats['level1_counter']):
        print(f" '{dept}'")

    print("\n二级科室集合:")
    for dept in sorted(stats['level2_counter']):
        print(f" '{dept}'")


if __name__ == "__main__":
    main()