triage/analysis/case_data_extractor.py
iomgaa 7c723fbc4b 删除废弃的disease_analyst智能体模块
删除了不再使用的disease_analyst模块的所有相关文件:
- agent.py: 疾病分析智能体主逻辑
- prompt.py: 疾病分析提示模板
- response_model.py: 响应数据模型
- __init__.py: 模块初始化文件

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-03 21:44:01 +08:00

296 lines
9.6 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Case data extractor.

Extracts, for each case, the original case_data, the full doctor/patient
conversation log, and the finally generated medical information
(chief complaint, HPI, past history) from workflow JSONL files.
"""
import json
import os
import re
from pathlib import Path
from typing import Dict, List, Any, Optional
def extract_case_data(workflow_file: Path) -> Dict[str, Any]:
    """
    Extract the original case data from a workflow file.

    Only the first line matters: a valid workflow file starts with a
    ``workflow_start`` event that carries the ``case_data`` payload, so we
    read just that line instead of loading the whole file.

    Args:
        workflow_file: Path to the workflow JSONL file.

    Returns:
        The case data dict, or an empty dict when the file is missing,
        empty, malformed, or does not start with a ``workflow_start`` event.
    """
    try:
        # Read only the first line; the rest of the file is irrelevant here.
        with open(workflow_file, 'r', encoding='utf-8') as f:
            first_line = f.readline().strip()
        if first_line:
            try:
                first_step = json.loads(first_line)
                if first_step.get('event_type') == 'workflow_start':
                    return first_step.get('case_data', {})
            except json.JSONDecodeError:
                return {}
    except Exception as e:
        print(f"读取文件 {workflow_file} 时出错: {e}")
    return {}
def extract_conversation_history(workflow_file: Path) -> str:
    """
    Extract the complete doctor/patient conversation from a workflow file.

    Patient utterances come from ``patient_response`` events ('message'
    field); doctor questions come from ``agent_execution`` events of the
    ``inquirer``/``prompter`` agents ('output_data.doctor_question').

    Args:
        workflow_file: Path to the workflow JSONL file.

    Returns:
        Newline-joined transcript ("医生: ..." / "患者: ..." lines), or an
        empty string on read failure.
    """
    try:
        conversation_parts = []
        # Stream the file line by line instead of loading it all into memory.
        with open(workflow_file, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    step = json.loads(line.strip())
                except json.JSONDecodeError:
                    # Skip blank or malformed lines.
                    continue
                event_type = step.get('event_type')
                if event_type == 'patient_response':
                    patient_response = step.get('message', '')
                    if patient_response:
                        conversation_parts.append(f"患者: {patient_response}")
                elif event_type == 'agent_execution':
                    # Only these two agents produce questions shown to the patient.
                    if step.get('agent_name', '') in ('inquirer', 'prompter'):
                        doctor_question = step.get('output_data', {}).get('doctor_question', '')
                        if doctor_question:
                            conversation_parts.append(f"医生: {doctor_question}")
        return '\n'.join(conversation_parts)
    except Exception as e:
        print(f"提取对话历史时出错: {e}")
        return ""
def extract_final_medical_info(workflow_file: Path) -> Dict[str, str]:
    """
    Extract the final generated medical information from a workflow file.

    Scans the file from the end backwards through ``step_end`` events.
    Each event only overwrites a field when it actually carries the
    corresponding ``updated_*`` key; the scan stops as soon as all three
    fields are non-empty.

    Args:
        workflow_file: Path to the workflow JSONL file.

    Returns:
        Dict with keys ``chief_complaint``, ``hpi`` and ``ph`` (empty
        strings for anything not found or on read failure).
    """
    result = {"chief_complaint": "", "hpi": "", "ph": ""}
    try:
        with open(workflow_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        if not lines:
            return result
        # Map output keys to the keys used inside step_result.
        key_map = (
            ("chief_complaint", "updated_chief_complaint"),
            ("hpi", "updated_hpi"),
            ("ph", "updated_ph"),
        )
        for raw in reversed(lines):
            try:
                event = json.loads(raw.strip())
            except json.JSONDecodeError:
                continue
            if event.get('event_type') != 'step_end':
                continue
            step_result = event.get('step_result', {})
            for out_key, in_key in key_map:
                # Keep the current value when the event lacks this key.
                result[out_key] = step_result.get(in_key, result[out_key])
            if all(result.values()):
                # All three fields populated — no need to scan further back.
                break
        return result
    except Exception as e:
        print(f"提取最终医疗信息时出错: {e}")
        return {"chief_complaint": "", "hpi": "", "ph": ""}
def extract_case_summary(workflow_file: Path) -> Dict[str, Any]:
    """
    Build the complete summary for a single case.

    Combines the raw case data, the full conversation transcript and the
    final generated medical information, plus a small metadata section.

    Args:
        workflow_file: Path to the workflow JSONL file.

    Returns:
        Dict with ``case_id``, ``case_data``, ``conversation_history``,
        ``final_medical_info`` and ``metadata`` keys.
    """
    raw_case = extract_case_data(workflow_file)
    dialogue = extract_conversation_history(workflow_file)
    medical_info = extract_final_medical_info(workflow_file)
    # Number of transcript lines (0 when there was no conversation at all).
    turn_count = dialogue.count('\n') + 1 if dialogue else 0
    return {
        "case_id": workflow_file.stem,
        "case_data": raw_case,
        "conversation_history": dialogue,
        "final_medical_info": medical_info,
        "metadata": {
            "total_turns": turn_count,
            "file_path": str(workflow_file),
            "has_case_data": bool(raw_case),
            "has_conversation": bool(dialogue),
            "has_final_info": any(medical_info.values()),
        },
    }
def process_all_cases(data_dir: str, output_dir: str) -> None:
    """
    Process every workflow file in a directory and write summary JSON files.

    For each ``workflow_*.jsonl`` file a standalone ``*_summary.json`` is
    written, followed by two aggregates: ``all_cases_summary.json`` (full
    data) and ``simple_summary.json`` (key fields only).

    Args:
        data_dir: Directory containing the workflow JSONL files.
        output_dir: Output directory (created if missing).
    """
    data_path = Path(data_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    workflow_files = list(data_path.glob("workflow_*.jsonl"))
    if not workflow_files:
        print(f"{data_dir} 中未找到工作流文件")
        return
    all_cases = []
    failed_cases = []
    for workflow_file in workflow_files:
        try:
            case_summary = extract_case_summary(workflow_file)
            all_cases.append(case_summary)
            # One standalone summary file per case.
            case_output_file = output_path / f"{workflow_file.stem}_summary.json"
            with open(case_output_file, 'w', encoding='utf-8') as f:
                json.dump(case_summary, f, ensure_ascii=False, indent=2)
        except Exception as e:
            print(f"处理文件 {workflow_file} 失败: {e}")
            failed_cases.append(str(workflow_file))
    # Aggregate file containing everything that was extracted.
    summary = {
        "total_cases": len(all_cases),
        "failed_cases": len(failed_cases),
        "failed_files": failed_cases,
        "cases": all_cases
    }
    with open(output_path / "all_cases_summary.json", 'w', encoding='utf-8') as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)
    # Simplified aggregate with only the key fields per case.
    simple_summary = []
    for case in all_cases:
        case_intro = case["case_data"].get("病案介绍", {})
        conversation = case["conversation_history"]
        simple_case = {
            "case_id": case["case_id"],
            "case_info": {
                "patient_name": case_intro.get("基本信息", ""),
                "chief_complaint": case_intro.get("主诉", ""),
                "diagnosis": case_intro.get("诊断", "")
            },
            "final_output": case["final_medical_info"],
            # Fix: ''.split('\n') yields [''], so an empty conversation used
            # to be counted as 1 line; report 0 instead (consistent with the
            # total_turns metadata computed in extract_case_summary).
            "conversation_length": len(conversation.split('\n')) if conversation else 0,
            "total_turns": case["metadata"]["total_turns"] // 2  # doctor + patient = one turn
        }
        simple_summary.append(simple_case)
    with open(output_path / "simple_summary.json", 'w', encoding='utf-8') as f:
        json.dump(simple_summary, f, ensure_ascii=False, indent=2)
    print(f"处理完成!")
    print(f"成功处理: {len(all_cases)} 个病例")
    print(f"失败: {len(failed_cases)} 个病例")
    print(f"输出目录: {output_path}")
def print_case_sample(case_summary: Dict[str, Any], max_conversation_lines: int = 10) -> None:
    """
    Print a human-readable preview of one case summary to stdout.

    Args:
        case_summary: Summary dict as produced by ``extract_case_summary``.
        max_conversation_lines: Maximum conversation lines to show before
            truncating with an ellipsis.
    """
    print(f"\n=== 病例 {case_summary['case_id']} ===")
    # Basic case information, only when original case data is present.
    case_data = case_summary['case_data']
    if case_data and '病案介绍' in case_data:
        case_info = case_data['病案介绍']
        print(f"患者: {case_info.get('基本信息', '未知')}")
        print(f"主诉: {case_info.get('主诉', '未提供')}")
        print(f"诊断: {case_info.get('诊断', '未提供')}")
    # Generated medical information; long fields are truncated to 100 chars.
    final_info = case_summary['final_medical_info']
    print("\n最终生成信息:")
    print(f"主诉: {final_info.get('chief_complaint', '')}")
    print(f"现病史: {final_info.get('hpi', '')[:100]}...")
    print(f"既往史: {final_info.get('ph', '')[:100]}...")
    # Conversation preview, capped at max_conversation_lines.
    transcript = case_summary['conversation_history'].split('\n')
    print(f"\n对话记录 (共{len(transcript)}行):")
    for entry in transcript[:max_conversation_lines]:
        print(f" {entry}")
    if len(transcript) > max_conversation_lines:
        print(" ...")
def main():
    """Entry point: extract all case data and preview one sample case."""
    # Layout assumption: this script lives one level below the project base,
    # with results/ and analysis/ as siblings of its parent directory.
    base_dir = Path(__file__).parent.parent
    data_dir = base_dir / "results" / "results0902"
    output_dir = base_dir / "analysis" / "case_extract_0902"
    if not data_dir.exists():
        print(f"数据目录不存在: {data_dir}")
        return
    print("开始提取病例数据...")
    process_all_cases(str(data_dir), str(output_dir))
    # Show the first extracted case as a quick sanity check.
    summary_files = list(Path(output_dir).glob("*_summary.json"))
    if summary_files:
        with open(summary_files[0], 'r', encoding='utf-8') as f:
            print_case_sample(json.load(f))
    print("\n提取完成!")
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()