#!/usr/bin/env python3
"""
Case data extractor.

For every workflow log (a JSON Lines file, one event object per line) this
module extracts the original ``case_data``, the full doctor/patient
conversation, and the final generated medical information (chief complaint,
history of present illness, past history).
"""

import json
import os
import re
from pathlib import Path
from typing import Dict, List, Any, Optional

# Agents whose ``output_data.doctor_question`` counts as a doctor utterance.
_QUESTION_AGENTS = ('inquirer', 'prompter')

# Output field name -> key inside a ``step_end`` event's ``step_result``.
_FINAL_INFO_FIELDS = {
    "chief_complaint": "updated_chief_complaint",
    "hpi": "updated_hpi",
    "ph": "updated_ph",
}

# Errors that mean "the file could not be read as UTF-8 text".
_READ_ERRORS = (OSError, UnicodeError)


def _parse_event(line: str) -> Optional[Dict[str, Any]]:
    """Parse one JSONL line; return the event dict, or None if unusable.

    Lines that are blank, are not valid JSON, or decode to something other
    than a JSON object are reported as ``None`` so callers can skip them.
    """
    try:
        event = json.loads(line.strip())
    except json.JSONDecodeError:
        return None
    return event if isinstance(event, dict) else None


def _case_intro(case_data: Any) -> Dict[str, Any]:
    """Return the ``病案介绍`` section of *case_data*, or ``{}`` if absent/malformed."""
    if not isinstance(case_data, dict):
        return {}
    intro = case_data.get("病案介绍")
    return intro if isinstance(intro, dict) else {}


def _conversation_lines(conversation: str) -> List[str]:
    """Split a conversation string on newlines; an empty conversation has no lines."""
    # "".split('\n') yields [''], which would count an empty conversation as
    # one line, so the empty case is handled explicitly.
    return conversation.split('\n') if conversation else []


def extract_case_data(workflow_file: Path) -> Dict[str, Any]:
    """
    Extract the original case data from a workflow file.

    Only the first line is inspected; it is expected to be the
    ``workflow_start`` event carrying a ``case_data`` object.

    Args:
        workflow_file: Path to the workflow JSONL file.

    Returns:
        The ``case_data`` dict, or ``{}`` when the file is empty or
        unreadable, the first line is not a ``workflow_start`` event, or
        ``case_data`` is missing or not an object.
    """
    try:
        with open(workflow_file, 'r', encoding='utf-8') as f:
            first_line = f.readline()
    except _READ_ERRORS as e:
        print(f"读取文件 {workflow_file} 时出错: {e}")
        return {}

    first_step = _parse_event(first_line)
    if first_step is None or first_step.get('event_type') != 'workflow_start':
        return {}

    case_data = first_step.get('case_data')
    # Always hand back a dict so callers can chain .get() safely.
    return case_data if isinstance(case_data, dict) else {}


def extract_conversation_history(workflow_file: Path) -> str:
    """
    Extract the full conversation transcript.

    Patient utterances come from ``patient_response`` events (``message``);
    doctor utterances come from ``agent_execution`` events of the
    ``inquirer``/``prompter`` agents (``output_data.doctor_question``).
    Empty utterances and malformed lines are skipped.

    Args:
        workflow_file: Path to the workflow JSONL file.

    Returns:
        The utterances in file order, one per entry, joined with newlines
        and prefixed with ``患者: `` or ``医生: ``. Returns ``""`` when the file
        cannot be read or holds no utterances.
    """
    conversation_parts: List[str] = []
    try:
        with open(workflow_file, 'r', encoding='utf-8') as f:
            for line in f:
                step = _parse_event(line)
                if step is None:
                    continue

                event_type = step.get('event_type')
                if event_type == 'patient_response':
                    patient_response = step.get('message', '')
                    if patient_response:
                        conversation_parts.append(f"患者: {patient_response}")
                elif event_type == 'agent_execution':
                    if step.get('agent_name', '') not in _QUESTION_AGENTS:
                        continue
                    output_data = step.get('output_data')
                    # output_data may be null or missing; one such event must
                    # not discard the rest of the transcript.
                    if not isinstance(output_data, dict):
                        continue
                    doctor_question = output_data.get('doctor_question', '')
                    if doctor_question:
                        conversation_parts.append(f"医生: {doctor_question}")
    except _READ_ERRORS as e:
        print(f"提取对话历史时出错: {e}")
        return ""

    return '\n'.join(conversation_parts)


def extract_final_medical_info(workflow_file: Path) -> Dict[str, str]:
    """
    Extract the final generated medical information.

    ``step_end`` events are scanned from the end of the file backwards. Each
    of the three fields takes the most recent non-empty value found in a
    ``step_result``; the scan stops once all three are filled.

    Args:
        workflow_file: Path to the workflow JSONL file.

    Returns:
        Dict with keys ``chief_complaint``, ``hpi`` and ``ph``. A field that
        is never found, or a file that cannot be read, yields ``""``.
    """
    final_info: Dict[str, Any] = {field: "" for field in _FINAL_INFO_FIELDS}
    try:
        with open(workflow_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except _READ_ERRORS as e:
        print(f"提取最终医疗信息时出错: {e}")
        return final_info

    for line in reversed(lines):
        step = _parse_event(line)
        if step is None or step.get('event_type') != 'step_end':
            continue
        step_result = step.get('step_result')
        if not isinstance(step_result, dict):
            continue

        for field, key in _FINAL_INFO_FIELDS.items():
            # Newest events are visited first, so a field that is already
            # filled must not be replaced by an older value.
            if final_info[field]:
                continue
            value = step_result.get(key)
            if value:
                final_info[field] = value

        if all(final_info.values()):
            break

    return final_info


def extract_case_summary(workflow_file: Path) -> Dict[str, Any]:
    """
    Build the complete summary for one case.

    Args:
        workflow_file: Path to the workflow JSONL file.

    Returns:
        Dict with ``case_id`` (the file stem), ``case_data``,
        ``conversation_history``, ``final_medical_info`` and ``metadata``.
        ``metadata.total_turns`` is the number of newline-separated lines in
        the transcript, so an utterance that itself contains a newline is
        counted more than once.
    """
    case_data = extract_case_data(workflow_file)
    conversation = extract_conversation_history(workflow_file)
    final_info = extract_final_medical_info(workflow_file)

    return {
        "case_id": workflow_file.stem,
        "case_data": case_data,
        "conversation_history": conversation,
        "final_medical_info": final_info,
        "metadata": {
            "total_turns": len(_conversation_lines(conversation)),
            "file_path": str(workflow_file),
            "has_case_data": bool(case_data),
            "has_conversation": bool(conversation),
            "has_final_info": any(final_info.values()),
        },
    }


def process_all_cases(data_dir: str, output_dir: str) -> None:
    """
    Process every ``workflow_*.jsonl`` file in a directory.

    Writes one ``<stem>_summary.json`` per case plus two aggregate files,
    ``all_cases_summary.json`` (everything) and ``simple_summary.json``
    (key fields only), into *output_dir*, which is created if needed.

    Args:
        data_dir: Directory containing the workflow JSONL files.
        output_dir: Directory that receives the JSON output.
    """
    data_path = Path(data_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # glob() order is arbitrary; sort so the output is reproducible.
    workflow_files = sorted(data_path.glob("workflow_*.jsonl"))

    if not workflow_files:
        print(f"在 {data_dir} 中未找到工作流文件")
        return

    all_cases = []
    failed_cases = []

    for workflow_file in workflow_files:
        try:
            case_summary = extract_case_summary(workflow_file)
            all_cases.append(case_summary)

            # One standalone file per case.
            case_output_file = output_path / f"{workflow_file.stem}_summary.json"
            with open(case_output_file, 'w', encoding='utf-8') as f:
                json.dump(case_summary, f, ensure_ascii=False, indent=2)

        except Exception as e:
            # Best-effort batch: record the failure and keep going.
            print(f"处理文件 {workflow_file} 失败: {e}")
            failed_cases.append(str(workflow_file))

    # Full aggregate.
    summary = {
        "total_cases": len(all_cases),
        "failed_cases": len(failed_cases),
        "failed_files": failed_cases,
        "cases": all_cases,
    }

    with open(output_path / "all_cases_summary.json", 'w', encoding='utf-8') as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

    # Simplified aggregate (key information only).
    simple_summary = []
    for case in all_cases:
        intro = _case_intro(case["case_data"])
        simple_summary.append({
            "case_id": case["case_id"],
            "case_info": {
                "patient_name": intro.get("基本信息", ""),
                "chief_complaint": intro.get("主诉", ""),
                "diagnosis": intro.get("诊断", ""),
            },
            "final_output": case["final_medical_info"],
            "conversation_length": len(_conversation_lines(case["conversation_history"])),
            # One doctor line plus one patient line make a turn.
            "total_turns": case["metadata"]["total_turns"] // 2,
        })

    with open(output_path / "simple_summary.json", 'w', encoding='utf-8') as f:
        json.dump(simple_summary, f, ensure_ascii=False, indent=2)

    print("处理完成!")
    print(f"成功处理: {len(all_cases)} 个病例")
    print(f"失败: {len(failed_cases)} 个病例")
    print(f"输出目录: {output_path}")


def print_case_sample(case_summary: Dict[str, Any], max_conversation_lines: int = 10) -> None:
    """
    Print a human-readable sample of one case summary.

    Args:
        case_summary: A summary dict as produced by ``extract_case_summary``.
        max_conversation_lines: Maximum number of transcript lines to print.
    """
    print(f"\n=== 病例 {case_summary['case_id']} ===")

    # Basic case information, shown only when the section is present.
    case_data = case_summary['case_data']
    if isinstance(case_data, dict) and isinstance(case_data.get('病案介绍'), dict):
        case_info = case_data['病案介绍']
        print(f"患者: {case_info.get('基本信息', '未知')}")
        print(f"主诉: {case_info.get('主诉', '未提供')}")
        print(f"诊断: {case_info.get('诊断', '未提供')}")

    # Generated medical information; `or ''` guards against null values
    # before slicing.
    final_info = case_summary['final_medical_info']
    print("\n最终生成信息:")
    print(f"主诉: {final_info.get('chief_complaint', '')}")
    print(f"现病史: {(final_info.get('hpi') or '')[:100]}...")
    print(f"既往史: {(final_info.get('ph') or '')[:100]}...")

    # Transcript excerpt.
    lines = _conversation_lines(case_summary['conversation_history'])
    print(f"\n对话记录 (共{len(lines)}行):")
    for line in lines[:max_conversation_lines]:
        print(f" {line}")

    if len(lines) > max_conversation_lines:
        print(" ...")


def main():
    """Extract all cases from the default results directory and print one sample."""
    base_dir = Path(__file__).parent.parent
    data_dir = base_dir / "results" / "results0902"
    output_dir = base_dir / "analysis" / "case_extract_0902"

    if not data_dir.exists():
        print(f"数据目录不存在: {data_dir}")
        return

    print("开始提取病例数据...")
    process_all_cases(str(data_dir), str(output_dir))

    # Show the first case as an example. The pattern is restricted to
    # per-case files: "*_summary.json" would also match the aggregate files
    # all_cases_summary.json and simple_summary.json, which have a different
    # structure.
    output_path = Path(output_dir)
    summary_files = sorted(output_path.glob("workflow_*_summary.json"))

    if summary_files:
        with open(summary_files[0], 'r', encoding='utf-8') as f:
            sample_case = json.load(f)

        print_case_sample(sample_case)

    print("\n提取完成!")


if __name__ == "__main__":
    main()