#!/usr/bin/env python3
"""
Case data extractor

Extracts, for each case, the raw case_data, the full conversation transcript,
and the final generated medical information.
"""

import json
from pathlib import Path
from typing import Any, Dict
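
# The extractors below assume each workflow_*.jsonl line is one JSON event.
# A sketch of the event shapes they rely on, inferred from the extraction
# logic in this file (any fields beyond the ones read here are assumptions
# and may differ in the real logs):
#
#   {"event_type": "workflow_start", "case_data": {...}}
#   {"event_type": "patient_response", "message": "..."}
#   {"event_type": "agent_execution", "agent_name": "inquirer" | "prompter",
#    "output_data": {"doctor_question": "..."}}
#   {"event_type": "step_end", "step_result": {"updated_chief_complaint": "...",
#    "updated_hpi": "...", "updated_ph": "..."}}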


def extract_case_data(workflow_file: Path) -> Dict[str, Any]:
    """
    Extract the raw case data from a workflow file.

    Args:
        workflow_file: Path to the workflow file.

    Returns:
        The raw case data, or an empty dict if none is found.
    """
    try:
        with open(workflow_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        # The workflow_start event on the first line carries the case data.
        if lines:
            first_line = lines[0].strip()
            try:
                first_step = json.loads(first_line)
                if first_step.get('event_type') == 'workflow_start':
                    return first_step.get('case_data', {})
            except json.JSONDecodeError:
                return {}
        # Empty file, or the first event is not workflow_start.
        return {}
    except Exception as e:
        print(f"Error reading file {workflow_file}: {e}")
        return {}


def extract_conversation_history(workflow_file: Path) -> str:
    """
    Extract the full conversation transcript.

    Args:
        workflow_file: Path to the workflow file.

    Returns:
        The full conversation transcript as a single string.
    """
    try:
        with open(workflow_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        conversation_parts = []

        for line in lines:
            try:
                step = json.loads(line.strip())

                # Patient responses.
                if step.get('event_type') == 'patient_response':
                    patient_response = step.get('message', '')
                    if patient_response:
                        conversation_parts.append(f"患者: {patient_response}")

                # Doctor questions.
                elif step.get('event_type') == 'agent_execution':
                    agent_name = step.get('agent_name', '')
                    if agent_name in ['inquirer', 'prompter']:
                        output_data = step.get('output_data', {})
                        doctor_question = output_data.get('doctor_question', '')
                        if doctor_question:
                            conversation_parts.append(f"医生: {doctor_question}")

            except json.JSONDecodeError:
                continue

        return '\n'.join(conversation_parts)
    except Exception as e:
        print(f"Error extracting conversation history: {e}")
        return ""


def extract_final_medical_info(workflow_file: Path) -> Dict[str, str]:
    """
    Extract the final generated medical information
    (chief complaint, history of present illness, past history).

    Args:
        workflow_file: Path to the workflow file.

    Returns:
        A dict with the chief complaint, HPI, and past history.
    """
    try:
        with open(workflow_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        if not lines:
            return {"chief_complaint": "", "hpi": "", "ph": ""}

        # Look for the step_end events that carry the final medical information.
        chief_complaint = ""
        hpi = ""
        ph = ""

        # Walk backwards so the most recent values win; only overwrite with a
        # non-empty value so an older event cannot blank out a newer one.
        for line in reversed(lines):
            try:
                step = json.loads(line.strip())
                if step.get('event_type') == 'step_end':
                    step_result = step.get('step_result', {})
                    chief_complaint = step_result.get('updated_chief_complaint') or chief_complaint
                    hpi = step_result.get('updated_hpi') or hpi
                    ph = step_result.get('updated_ph') or ph

                    # Stop once all three have been found.
                    if chief_complaint and hpi and ph:
                        break

            except json.JSONDecodeError:
                continue

        return {
            "chief_complaint": chief_complaint,
            "hpi": hpi,
            "ph": ph
        }
    except Exception as e:
        print(f"Error extracting final medical info: {e}")
        return {"chief_complaint": "", "hpi": "", "ph": ""}


def extract_case_summary(workflow_file: Path) -> Dict[str, Any]:
    """
    Build the full summary for a single case.

    Args:
        workflow_file: Path to the workflow file.

    Returns:
        A dict containing all extracted information for the case.
    """
    case_data = extract_case_data(workflow_file)
    conversation = extract_conversation_history(workflow_file)
    final_info = extract_final_medical_info(workflow_file)

    return {
        "case_id": workflow_file.stem,
        "case_data": case_data,
        "conversation_history": conversation,
        "final_medical_info": final_info,
        "metadata": {
            "total_turns": len(conversation.split('\n')) if conversation else 0,
            "file_path": str(workflow_file),
            "has_case_data": bool(case_data),
            "has_conversation": bool(conversation),
            "has_final_info": any(final_info.values())
        }
    }
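
# Illustrative shape of the dict returned by extract_case_summary; the values
# and the case_id below are placeholders, not real data:
#
#   {
#       "case_id": "workflow_0001",                 # hypothetical file stem
#       "case_data": {"病案介绍": {...}},
#       "conversation_history": "患者: ...\n医生: ...",
#       "final_medical_info": {"chief_complaint": "...", "hpi": "...", "ph": "..."},
#       "metadata": {"total_turns": 12, "file_path": "...", "has_case_data": True,
#                    "has_conversation": True, "has_final_info": True}
#   }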


def process_all_cases(data_dir: str, output_dir: str) -> None:
    """
    Process every workflow file in the data directory.

    Args:
        data_dir: Directory containing the workflow files.
        output_dir: Directory to write the extracted summaries to.
    """
    data_path = Path(data_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    workflow_files = list(data_path.glob("workflow_*.jsonl"))

    if not workflow_files:
        print(f"No workflow files found in {data_dir}")
        return

    all_cases = []
    failed_cases = []

    for workflow_file in workflow_files:
        try:
            case_summary = extract_case_summary(workflow_file)
            all_cases.append(case_summary)

            # Write a separate file for each case.
            case_output_file = output_path / f"{workflow_file.stem}_summary.json"
            with open(case_output_file, 'w', encoding='utf-8') as f:
                json.dump(case_summary, f, ensure_ascii=False, indent=2)

        except Exception as e:
            print(f"Failed to process file {workflow_file}: {e}")
            failed_cases.append(str(workflow_file))

    # Write the combined summary file.
    summary = {
        "total_cases": len(all_cases),
        "failed_cases": len(failed_cases),
        "failed_files": failed_cases,
        "cases": all_cases
    }

    with open(output_path / "all_cases_summary.json", 'w', encoding='utf-8') as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

    # Write a condensed summary containing only the key fields.
    simple_summary = []
    for case in all_cases:
        simple_case = {
            "case_id": case["case_id"],
            "case_info": {
                "patient_name": case["case_data"].get("病案介绍", {}).get("基本信息", ""),
                "chief_complaint": case["case_data"].get("病案介绍", {}).get("主诉", ""),
                "diagnosis": case["case_data"].get("病案介绍", {}).get("诊断", "")
            },
            "final_output": case["final_medical_info"],
            "conversation_length": len(case["conversation_history"].split('\n')),
            "total_turns": case["metadata"]["total_turns"] // 2  # one doctor + one patient exchange counts as a turn
        }
        simple_summary.append(simple_case)

    with open(output_path / "simple_summary.json", 'w', encoding='utf-8') as f:
        json.dump(simple_summary, f, ensure_ascii=False, indent=2)

    print("Processing complete!")
    print(f"Successfully processed: {len(all_cases)} cases")
    print(f"Failed: {len(failed_cases)} cases")
    print(f"Output directory: {output_path}")


def print_case_sample(case_summary: Dict[str, Any], max_conversation_lines: int = 10) -> None:
    """
    Print a sample of the extracted information for one case.

    Args:
        case_summary: The case summary.
        max_conversation_lines: Maximum number of conversation lines to print.
    """
    print(f"\n=== Case {case_summary['case_id']} ===")

    # Basic case information.
    case_data = case_summary['case_data']
    if case_data and '病案介绍' in case_data:
        case_info = case_data['病案介绍']
        print(f"Patient: {case_info.get('基本信息', 'unknown')}")
        print(f"Chief complaint: {case_info.get('主诉', 'not provided')}")
        print(f"Diagnosis: {case_info.get('诊断', 'not provided')}")

    # Generated medical information.
    final_info = case_summary['final_medical_info']
    print("\nFinal generated information:")
    print(f"Chief complaint: {final_info.get('chief_complaint', '')}")
    print(f"HPI: {final_info.get('hpi', '')[:100]}...")
    print(f"Past history: {final_info.get('ph', '')[:100]}...")

    # Conversation excerpt.
    conversation = case_summary['conversation_history']
    lines = conversation.split('\n')
    print(f"\nConversation transcript ({len(lines)} lines):")
    for line in lines[:max_conversation_lines]:
        print(f"  {line}")
    if len(lines) > max_conversation_lines:
        print("  ...")


def main():
    """Main entry point."""
    base_dir = Path(__file__).parent.parent
    data_dir = base_dir / "results" / "results0902"
    output_dir = base_dir / "analysis" / "case_extract_0902"

    if not data_dir.exists():
        print(f"Data directory does not exist: {data_dir}")
        return

    print("Starting case data extraction...")
    process_all_cases(str(data_dir), str(output_dir))

    # Show the first case as an example.
    output_path = Path(output_dir)
    summary_files = list(output_path.glob("*_summary.json"))

    if summary_files:
        with open(summary_files[0], 'r', encoding='utf-8') as f:
            sample_case = json.load(f)
        print_case_sample(sample_case)

    print("\nExtraction complete!")


if __name__ == "__main__":
    main()