#!/usr/bin/env python3
"""
Case data extractor

Extracts, for each case, the raw case_data, the full conversation transcript,
and the final generated medical information.
"""

import json
from pathlib import Path
from typing import Any, Dict
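
# The extractors below assume each workflow_*.jsonl line is one JSON event.
# A sketch of the event shapes they rely on, inferred from the extraction
# logic in this file (any fields beyond the ones read here are assumptions
# and may differ in the real logs):
#
#   {"event_type": "workflow_start", "case_data": {...}}
#   {"event_type": "patient_response", "message": "..."}
#   {"event_type": "agent_execution", "agent_name": "inquirer" | "prompter",
#    "output_data": {"doctor_question": "..."}}
#   {"event_type": "step_end", "step_result": {"updated_chief_complaint": "...",
#    "updated_hpi": "...", "updated_ph": "..."}}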


def extract_case_data(workflow_file: Path) -> Dict[str, Any]:
    """
    Extract the raw case data from a workflow file.

    Args:
        workflow_file: Path to the workflow file.

    Returns:
        The raw case data, or an empty dict if none is found.
    """
    try:
        with open(workflow_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        # The workflow_start event on the first line carries the case data.
        if lines:
            first_line = lines[0].strip()
            try:
                first_step = json.loads(first_line)
                if first_step.get('event_type') == 'workflow_start':
                    return first_step.get('case_data', {})
            except json.JSONDecodeError:
                return {}
        # Empty file, or the first event is not workflow_start.
        return {}
    except Exception as e:
        print(f"Error reading file {workflow_file}: {e}")
        return {}


def extract_conversation_history(workflow_file: Path) -> str:
    """
    Extract the full conversation transcript.

    Args:
        workflow_file: Path to the workflow file.

    Returns:
        The full conversation transcript as a single string.
    """
    try:
        with open(workflow_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        conversation_parts = []

        for line in lines:
            try:
                step = json.loads(line.strip())

                # Patient responses.
                if step.get('event_type') == 'patient_response':
                    patient_response = step.get('message', '')
                    if patient_response:
                        conversation_parts.append(f"患者: {patient_response}")

                # Doctor questions.
                elif step.get('event_type') == 'agent_execution':
                    agent_name = step.get('agent_name', '')
                    if agent_name in ['inquirer', 'prompter']:
                        output_data = step.get('output_data', {})
                        doctor_question = output_data.get('doctor_question', '')
                        if doctor_question:
                            conversation_parts.append(f"医生: {doctor_question}")

            except json.JSONDecodeError:
                continue

        return '\n'.join(conversation_parts)
    except Exception as e:
        print(f"Error extracting conversation history: {e}")
        return ""


def extract_final_medical_info(workflow_file: Path) -> Dict[str, str]:
    """
    Extract the final generated medical information
    (chief complaint, history of present illness, past history).

    Args:
        workflow_file: Path to the workflow file.

    Returns:
        A dict with the chief complaint, HPI, and past history.
    """
    try:
        with open(workflow_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        if not lines:
            return {"chief_complaint": "", "hpi": "", "ph": ""}

        # Look for the step_end events that carry the final medical information.
        chief_complaint = ""
        hpi = ""
        ph = ""

        # Walk backwards so the most recent values win; only overwrite with a
        # non-empty value so an older event cannot blank out a newer one.
        for line in reversed(lines):
            try:
                step = json.loads(line.strip())
                if step.get('event_type') == 'step_end':
                    step_result = step.get('step_result', {})
                    chief_complaint = step_result.get('updated_chief_complaint') or chief_complaint
                    hpi = step_result.get('updated_hpi') or hpi
                    ph = step_result.get('updated_ph') or ph

                    # Stop once all three have been found.
                    if chief_complaint and hpi and ph:
                        break

            except json.JSONDecodeError:
                continue

        return {
            "chief_complaint": chief_complaint,
            "hpi": hpi,
            "ph": ph
        }
    except Exception as e:
        print(f"Error extracting final medical info: {e}")
        return {"chief_complaint": "", "hpi": "", "ph": ""}


def extract_case_summary(workflow_file: Path) -> Dict[str, Any]:
    """
    Build the full summary for a single case.

    Args:
        workflow_file: Path to the workflow file.

    Returns:
        A dict containing all extracted information for the case.
    """
    case_data = extract_case_data(workflow_file)
    conversation = extract_conversation_history(workflow_file)
    final_info = extract_final_medical_info(workflow_file)

    return {
        "case_id": workflow_file.stem,
        "case_data": case_data,
        "conversation_history": conversation,
        "final_medical_info": final_info,
        "metadata": {
            "total_turns": len(conversation.split('\n')) if conversation else 0,
            "file_path": str(workflow_file),
            "has_case_data": bool(case_data),
            "has_conversation": bool(conversation),
            "has_final_info": any(final_info.values())
        }
    }
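
# Illustrative shape of the dict returned by extract_case_summary; the values
# and the case_id below are placeholders, not real data:
#
#   {
#       "case_id": "workflow_0001",                 # hypothetical file stem
#       "case_data": {"病案介绍": {...}},
#       "conversation_history": "患者: ...\n医生: ...",
#       "final_medical_info": {"chief_complaint": "...", "hpi": "...", "ph": "..."},
#       "metadata": {"total_turns": 12, "file_path": "...", "has_case_data": True,
#                    "has_conversation": True, "has_final_info": True}
#   }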


def process_all_cases(data_dir: str, output_dir: str) -> None:
    """
    Process every workflow file in the data directory.

    Args:
        data_dir: Directory containing the workflow files.
        output_dir: Directory to write the extracted summaries to.
    """
    data_path = Path(data_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    workflow_files = list(data_path.glob("workflow_*.jsonl"))

    if not workflow_files:
        print(f"No workflow files found in {data_dir}")
        return

    all_cases = []
    failed_cases = []

    for workflow_file in workflow_files:
        try:
            case_summary = extract_case_summary(workflow_file)
            all_cases.append(case_summary)

            # Write a separate file for each case.
            case_output_file = output_path / f"{workflow_file.stem}_summary.json"
            with open(case_output_file, 'w', encoding='utf-8') as f:
                json.dump(case_summary, f, ensure_ascii=False, indent=2)

        except Exception as e:
            print(f"Failed to process file {workflow_file}: {e}")
            failed_cases.append(str(workflow_file))

    # Write the combined summary file.
    summary = {
        "total_cases": len(all_cases),
        "failed_cases": len(failed_cases),
        "failed_files": failed_cases,
        "cases": all_cases
    }

    with open(output_path / "all_cases_summary.json", 'w', encoding='utf-8') as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

    # Write a condensed summary containing only the key fields.
    simple_summary = []
    for case in all_cases:
        simple_case = {
            "case_id": case["case_id"],
            "case_info": {
                "patient_name": case["case_data"].get("病案介绍", {}).get("基本信息", ""),
                "chief_complaint": case["case_data"].get("病案介绍", {}).get("主诉", ""),
                "diagnosis": case["case_data"].get("病案介绍", {}).get("诊断", "")
            },
            "final_output": case["final_medical_info"],
            "conversation_length": len(case["conversation_history"].split('\n')),
            "total_turns": case["metadata"]["total_turns"] // 2  # one doctor + one patient exchange counts as a turn
        }
        simple_summary.append(simple_case)

    with open(output_path / "simple_summary.json", 'w', encoding='utf-8') as f:
        json.dump(simple_summary, f, ensure_ascii=False, indent=2)

    print("Processing complete!")
    print(f"Successfully processed: {len(all_cases)} cases")
    print(f"Failed: {len(failed_cases)} cases")
    print(f"Output directory: {output_path}")


def print_case_sample(case_summary: Dict[str, Any], max_conversation_lines: int = 10) -> None:
    """
    Print a sample of the extracted information for one case.

    Args:
        case_summary: The case summary.
        max_conversation_lines: Maximum number of conversation lines to print.
    """
    print(f"\n=== Case {case_summary['case_id']} ===")

    # Basic case information.
    case_data = case_summary['case_data']
    if case_data and '病案介绍' in case_data:
        case_info = case_data['病案介绍']
        print(f"Patient: {case_info.get('基本信息', 'unknown')}")
        print(f"Chief complaint: {case_info.get('主诉', 'not provided')}")
        print(f"Diagnosis: {case_info.get('诊断', 'not provided')}")

    # Generated medical information.
    final_info = case_summary['final_medical_info']
    print("\nFinal generated information:")
    print(f"Chief complaint: {final_info.get('chief_complaint', '')}")
    print(f"HPI: {final_info.get('hpi', '')[:100]}...")
    print(f"Past history: {final_info.get('ph', '')[:100]}...")

    # Conversation excerpt.
    conversation = case_summary['conversation_history']
    lines = conversation.split('\n')
    print(f"\nConversation transcript ({len(lines)} lines):")
    for line in lines[:max_conversation_lines]:
        print(f"  {line}")
    if len(lines) > max_conversation_lines:
        print("  ...")


def main():
    """Main entry point."""
    base_dir = Path(__file__).parent.parent
    data_dir = base_dir / "results" / "results0902"
    output_dir = base_dir / "analysis" / "case_extract_0902"

    if not data_dir.exists():
        print(f"Data directory does not exist: {data_dir}")
        return

    print("Starting case data extraction...")
    process_all_cases(str(data_dir), str(output_dir))

    # Show the first case as an example.
    output_path = Path(output_dir)
    summary_files = list(output_path.glob("*_summary.json"))

    if summary_files:
        with open(summary_files[0], 'r', encoding='utf-8') as f:
            sample_case = json.load(f)
        print_case_sample(sample_case)

    print("\nExtraction complete!")


if __name__ == "__main__":
    main()