重构:优化评价器模块并统一代码规范
主要变更: - 重命名PreDiagnosisEvaluation为Evaluator,统一术语 - 调整评分范围从-5到5改为0到5,优化评价标准 - 重构代码结构,按照disease_analyst标准实现 - 添加必需的run和build_prompt函数 - 更新配置文件,支持gpt-oss:latest模型 - 完善Claude指导文档和代理规范 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
c89d6dd1c0
commit
4d08c52e53
@ -1,7 +1,7 @@
|
||||
# Claude 指导文档
|
||||
|
||||
## 身份
|
||||
你的名字为ycz copilot
|
||||
你的名字为ycz copilot,你被定义为我的专属代码助手,专门帮助你开发和维护AIM智能体系统。
|
||||
|
||||
## 要求
|
||||
1. 所有的注释必须是中文
|
||||
@ -14,4 +14,5 @@
|
||||
8. 你需要使用uv作为包管理器,虚拟环境为`.venv`
|
||||
- 你可以使用uv run xxx.py 来运行python程序
|
||||
- 你可以使用uv add xxx来添加依赖包
|
||||
9. 默认的
|
||||
9. agent的默认模型为ollama的gpt-oss:latest
|
||||
10. agent文件至少包含run函数和build_prompt函数,prompt文件只包含一个类,代码结构可以参考agent_system/disease_analyst
|
||||
|
||||
260
agent_system/evaluetor/agent.py
Normal file
260
agent_system/evaluetor/agent.py
Normal file
@ -0,0 +1,260 @@
|
||||
from typing import Dict, Any, List
|
||||
from agent_system.base import BaseAgent
|
||||
from agent_system.evaluetor.prompt import EvaluatorPrompt
|
||||
from agent_system.evaluetor.response_model import EvaluatorResult
|
||||
|
||||
|
||||
class Evaluator(BaseAgent):
    """Evaluator agent for the intelligent medical system.

    Scores an AI doctor's performance across eight core dimensions, taking
    into account both the current round and the accumulated behaviour over
    all rounds:

    1. Clinical inquiry ability
    2. Diagnostic reasoning ability
    3. Communication quality
    4. Multi-round consistency
    5. Overall professionalism
    6. Present-illness-history similarity
    7. Past-history similarity
    8. Chief-complaint similarity

    Attributes:
        model_type (str): LLM type used, defaults to ``gpt-oss:latest``.
        llm_config (dict): LLM configuration parameters.
    """

    # (case-dict key, output label) pairs consumed by the formatting
    # helpers below.  NOTE(review): the keys are the Chinese field names of
    # the "病案介绍" sub-dict of a patient case — confirm against the
    # upstream case schema.
    _PATIENT_INFO_FIELDS = (
        ("基本信息", "基本信息"),
        ("主诉", "主诉"),
        ("现病史", "现病史"),
        ("既往史", "既往史"),
    )
    _TRUE_MEDICAL_FIELDS = (
        ("主诉", "真实主诉"),
        ("现病史", "真实现病史"),
        ("既往史", "真实既往史"),
    )

    def __init__(self, model_type: str = "gpt-oss:latest", llm_config: dict = None):
        """Initialise the evaluator agent.

        Args:
            model_type (str): LLM type, defaults to ``gpt-oss:latest``.
            llm_config (dict): LLM configuration; ``None`` selects the
                default configuration.
        """
        super().__init__(
            model_type=model_type,
            description=EvaluatorPrompt.description,
            instructions=EvaluatorPrompt.instructions,
            response_model=EvaluatorResult,
            llm_config=llm_config or {},
            structured_outputs=True,
            markdown=False,
            use_cache=False
        )

    def run(self, patient_case: Dict[str, Any], current_round: int,
            all_rounds_data: List[Dict[str, Any]]) -> EvaluatorResult:
        """Run one evaluation pass.

        Builds the evaluation prompt from the patient case, the current
        round number and the full dialogue history, then delegates LLM
        inference to the base class.

        Args:
            patient_case (Dict[str, Any]): patient case information.
            current_round (int): current round number.
            all_rounds_data (List[Dict[str, Any]]): data of all rounds.

        Returns:
            EvaluatorResult: structured evaluation containing the eight
            dimension scores plus ``summary`` and ``key_suggestions``.

        Note:
            This method never raises: any failure during prompting or
            inference is logged and a fallback ``EvaluatorResult`` with
            default content is returned instead.
        """
        try:
            # Build the evaluation prompt.
            prompt = self.build_prompt(patient_case, current_round, all_rounds_data)

            # Delegate LLM inference to the base class.
            result = super().run(prompt)

            # Coerce the raw LLM output into an EvaluatorResult.
            return self._ensure_result_type(result)

        except Exception as e:
            # Evaluation failed — log the error and fall back to defaults.
            print(f"评价执行失败: {str(e)}")
            return self._get_fallback_result()

    def build_prompt(self, patient_case: Dict[str, Any], current_round: int,
                     all_rounds_data: List[Dict[str, Any]]) -> str:
        """Build the evaluation prompt.

        Assembles a concise prompt from the patient case, the ground-truth
        record (for similarity scoring), the dialogue history and an
        example output format.

        Args:
            patient_case (Dict[str, Any]): patient case information.
            current_round (int): current round number.
            all_rounds_data (List[Dict[str, Any]]): data of all rounds.

        Returns:
            str: the assembled evaluation prompt.
        """
        # Format the patient information section.
        patient_info = self._format_patient_info(patient_case)

        # Format the ground-truth record used for similarity comparison.
        true_medical_info = self._format_true_medical_info(patient_case)

        # Format the dialogue history.
        conversation_history = self._format_conversation_history(all_rounds_data)

        # Example output to guide the LLM's structured JSON response.
        example_output = EvaluatorPrompt.get_example_output()

        prompt = f"""患者病例信息:
{patient_info}

真实病历信息(用于相似度比较):
{true_medical_info}

对话历史(共{current_round}轮):
{conversation_history}

请基于以上信息,从八个维度对医疗系统进行评价,严格按照JSON格式输出。

输出格式示例:
{example_output}

请严格按照上述JSON格式输出评价结果。"""

        return prompt

    def _ensure_result_type(self, result: Any) -> EvaluatorResult:
        """Coerce an LLM result into an ``EvaluatorResult``.

        Args:
            result (Any): raw result returned by the LLM call.

        Returns:
            EvaluatorResult: the result itself if already typed, a model
            built from a dict, or the fallback result for anything else.
        """
        if isinstance(result, EvaluatorResult):
            return result
        elif isinstance(result, dict):
            return EvaluatorResult(**result)
        else:
            # Unexpected type — return the fallback result.
            return self._get_fallback_result()

    def _get_fallback_result(self) -> EvaluatorResult:
        """Build the default result used when evaluation fails.

        Returns:
            EvaluatorResult: result populated with failure placeholders.
        """
        # Imported locally to avoid widening the module-level import surface.
        from agent_system.evaluetor.response_model import EvaluationDimension

        default_dimension = EvaluationDimension(
            score=0.0,
            comment="评价失败:系统异常,无法完成评价"
        )

        return EvaluatorResult(
            clinical_inquiry=default_dimension,
            diagnostic_reasoning=default_dimension,
            communication_quality=default_dimension,
            multi_round_consistency=default_dimension,
            overall_professionalism=default_dimension,
            present_illness_similarity=default_dimension,
            past_history_similarity=default_dimension,
            chief_complaint_similarity=default_dimension,
            summary="评价失败:系统异常,无法完成评价",
            key_suggestions=["系统需要调试和修复"]
        )

    @staticmethod
    def _format_case_fields(patient_case: Dict[str, Any], fields) -> str:
        """Render selected "病案介绍" fields as ``**label**: value`` lines.

        Shared by :meth:`_format_patient_info` and
        :meth:`_format_true_medical_info`; fields absent from the case are
        skipped, and an empty string is returned when the case has no
        "病案介绍" entry.
        """
        case_info = patient_case.get('病案介绍', {})
        return '\n'.join(
            f"**{label}**: {case_info[key]}"
            for key, label in fields
            if key in case_info
        )

    def _format_patient_info(self, patient_case: Dict[str, Any]) -> str:
        """Format the patient-information section of the prompt."""
        return self._format_case_fields(patient_case, self._PATIENT_INFO_FIELDS)

    def _format_true_medical_info(self, patient_case: Dict[str, Any]) -> str:
        """Format the ground-truth record used for similarity scoring."""
        return self._format_case_fields(patient_case, self._TRUE_MEDICAL_FIELDS)

    def _format_conversation_history(self, all_rounds_data: List[Dict[str, Any]]) -> str:
        """Format the dialogue history, one section per round."""
        history_parts = []

        for i, round_data in enumerate(all_rounds_data, 1):
            history_parts.append(f"### 第{i}轮对话")

            if 'patient_response' in round_data:
                history_parts.append(f"**患者回答**: {round_data['patient_response']}")

            if 'doctor_inquiry' in round_data:
                history_parts.append(f"**医生询问**: {round_data['doctor_inquiry']}")

            if 'HPI' in round_data:
                history_parts.append(f"**现病史(HPI)**: {round_data['HPI']}")

            if 'PH' in round_data:
                history_parts.append(f"**既往史(PH)**: {round_data['PH']}")

            history_parts.append("")  # Blank line between rounds.

        return '\n'.join(history_parts)

    def evaluate_single_round(self, patient_case: Dict[str, Any],
                              round_data: Dict[str, Any]) -> EvaluatorResult:
        """Convenience wrapper for evaluating a single round.

        Args:
            patient_case (Dict[str, Any]): patient case information.
            round_data (Dict[str, Any]): data of the single round.

        Returns:
            EvaluatorResult: evaluation result for that round.
        """
        return self.run(patient_case, 1, [round_data])
|
||||
136
agent_system/evaluetor/prompt.py
Normal file
136
agent_system/evaluetor/prompt.py
Normal file
@ -0,0 +1,136 @@
|
||||
from agent_system.base import BasePrompt
|
||||
|
||||
|
||||
class EvaluatorPrompt(BasePrompt):
    """Prompt template for the evaluator agent.

    Defines the agent's role, task goals and execution instructions so the
    LLM can perform a professional, multi-dimension evaluation of the
    medical system.  All string contents below are runtime prompt text
    consumed by the LLM and are intentionally kept in Chinese.
    """

    # Agent role and goal description (runtime prompt text).
    description = (
        "你是一名专业的医疗系统评价专家,擅长对智能医疗系统进行全面、客观的多维度评价。"
        "你的主要任务是基于医疗对话记录和真实病历信息,从八个核心维度对系统表现进行评价,"
        "包括临床问诊能力、诊断推理能力、沟通表达能力、多轮一致性、整体专业性、"
        "以及现病史、既往史、主述的相似度评价。"
        "你的评价将为医疗系统的持续改进提供重要参考。"
    )

    # Execution instructions and scoring rubric (runtime prompt text):
    # the eight evaluation tasks, the 0-5 scoring rules, output
    # requirements, and an inline JSON example of the expected result.
    instructions = [
        "## 核心评价任务",
        "1. **临床问诊能力**: 评价医生的问诊技巧、信息收集能力和问题针对性",
        "2. **诊断推理能力**: 评价临床思维、推理过程和鉴别诊断能力",
        "3. **沟通表达能力**: 评价与患者的沟通质量、表达清晰度和专业性",
        "4. **多轮一致性**: 评价多轮对话的连贯性、一致性和进步性",
        "5. **整体专业性**: 评价整体的医学专业水平、风险识别和临床决策能力",
        "6. **现病史相似度**: 比较生成的现病史与真实现病史的相似度和准确性",
        "7. **既往史相似度**: 比较生成的既往史与真实既往史的相似度和准确性",
        "8. **主述相似度**: 比较生成的主述与真实主述的相似度和准确性",
        "",
        "## 评价标准",
        "- 评分范围:0到5分(0为非常差/无关/无法判断,5为非常好)",
        "- 评分原则:严格按照评分标准,重点关注临床安全性、专业性和实用性",
        "- 0分表示:表现非常差或本轮未涉及该维度或信息不足以评价",
        "- 只有在表现确实优秀、无明显不足时才给4分以上",
        "- 5分应该极少出现,只有在各方面都完美无缺时才给出",
        "",
        "## 输出要求",
        "1. **格式要求**: 严格按照 EvaluatorResult 的 JSON 结构输出,不得省略任何必需字段",
        "2. **内容质量**: 评价意见必须具体明确、具有建设性,明确指出问题和扣分原因",
        "3. **医学专业性**: 基于临床医学知识和最佳实践进行评价",
        "4. **客观公正**: 确保评价客观公正,既要指出不足也要认可优点",
        "",
        "## 示例输出格式(JSON)",
        "{",
        "  \"clinical_inquiry\": {",
        "    \"score\": 3.0,",
        "    \"comment\": \"问诊技巧良好,全面系统收集关键信息,问题高度针对性,符合临床最佳实践,仅有个别细节可提升。\"",
        "  },",
        "  \"diagnostic_reasoning\": {",
        "    \"score\": 2.0,",
        "    \"comment\": \"推理方向基本合理,考虑了主要可能性,但分析不够深入,缺乏对重要鉴别诊断的拓展。\"",
        "  },",
        "  \"communication_quality\": {",
        "    \"score\": 4.0,",
        "    \"comment\": \"表达规范,专业且通俗,沟通效果好,体现医学人文关怀,有细节可提升。\"",
        "  },",
        "  \"multi_round_consistency\": {",
        "    \"score\": 0.0,",
        "    \"comment\": \"当前仅1轮对话,无法评价多轮表现。\"",
        "  },",
        "  \"overall_professionalism\": {",
        "    \"score\": 3.0,",
        "    \"comment\": \"专业水平较高,风险识别能力强,决策合理,符合一般临床标准,但距离专家水平仍有差距。\"",
        "  },",
        "  \"present_illness_similarity\": {",
        "    \"score\": 3.0,",
        "    \"comment\": \"现病史记录基本准确,与真实现病史有一定相似度,但对病情发展过程的描述不够详细。\"",
        "  },",
        "  \"past_history_similarity\": {",
        "    \"score\": 0.0,",
        "    \"comment\": \"本轮未涉及既往史或信息不足以评价。\"",
        "  },",
        "  \"chief_complaint_similarity\": {",
        "    \"score\": 4.0,",
        "    \"comment\": \"主述记录较为准确,与真实主述相似度较高,基本涵盖主要症状,但有小偏差。\"",
        "  },",
        "  \"summary\": \"医生在问诊中表现基本合格,能够收集基本信息并进行初步整理,但在诊断推理深度、多轮对话连贯性等方面存在提升空间。\",",
        "  \"key_suggestions\": [",
        "    \"加强鉴别诊断思维的深度和广度\",",
        "    \"提升多轮对话的连贯性和一致性\",",
        "    \"完善现病史的详细记录和分析\"",
        "  ]",
        "}"
    ]

    @staticmethod
    def get_example_output() -> str:
        """Return a JSON example used to steer the LLM's structured output.

        The example mirrors the ``EvaluatorResult`` schema with placeholder
        comments; it is interpolated into the evaluation prompt.

        Returns:
            str: example output in JSON format.
        """
        return """{
    "clinical_inquiry": {
        "score": 3.0,
        "comment": "问诊技巧评价内容"
    },
    "diagnostic_reasoning": {
        "score": 2.0,
        "comment": "诊断推理能力评价内容"
    },
    "communication_quality": {
        "score": 4.0,
        "comment": "沟通表达能力评价内容"
    },
    "multi_round_consistency": {
        "score": 0.0,
        "comment": "多轮一致性评价内容"
    },
    "overall_professionalism": {
        "score": 3.0,
        "comment": "整体专业性评价内容"
    },
    "present_illness_similarity": {
        "score": 3.0,
        "comment": "现病史相似度评价内容"
    },
    "past_history_similarity": {
        "score": 0.0,
        "comment": "既往史相似度评价内容"
    },
    "chief_complaint_similarity": {
        "score": 4.0,
        "comment": "主述相似度评价内容"
    },
    "summary": "整体评价总结",
    "key_suggestions": [
        "改进建议1",
        "改进建议2",
        "改进建议3"
    ]
}"""
|
||||
65
agent_system/evaluetor/response_model.py
Normal file
65
agent_system/evaluetor/response_model.py
Normal file
@ -0,0 +1,65 @@
|
||||
from typing import List
|
||||
from pydantic import BaseModel, Field
|
||||
from agent_system.base import BaseResponseModel
|
||||
|
||||
class EvaluationDimension(BaseModel):
    """A single evaluation dimension: a bounded score plus commentary."""

    # Score for this dimension; pydantic validates the 0-5 range via ge/le.
    score: float = Field(
        ...,
        description="该维度的评分(0-5分,0分最差,5分最好)",
        ge=0.0,
        le=5.0
    )
    # Free-text justification for the score.
    comment: str = Field(
        ...,
        description="该维度的详细评价和分析"
    )
|
||||
|
||||
class EvaluatorResult(BaseResponseModel):
    """Structured evaluation result produced by the evaluator agent.

    Eight ``EvaluationDimension`` fields (five core abilities plus three
    similarity comparisons), an overall ``summary`` and a list of
    ``key_suggestions``.  Every field defaults to an explicit
    "evaluation failed" placeholder so a partially filled LLM response
    still yields a complete, self-describing result.
    """

    # Each dimension uses default_factory rather than a shared default
    # instance, so every result object gets its own fallback
    # EvaluationDimension — pydantic's recommended pattern for mutable
    # defaults, and it avoids constructing the defaults at class-definition
    # time.

    # Core evaluation dimensions (5)
    clinical_inquiry: EvaluationDimension = Field(
        default_factory=lambda: EvaluationDimension(
            score=0.0, comment="评价失败:临床问诊能力评价缺失"),
        description="临床问诊能力评价"
    )
    diagnostic_reasoning: EvaluationDimension = Field(
        default_factory=lambda: EvaluationDimension(
            score=0.0, comment="评价失败:诊断推理能力评价缺失"),
        description="诊断推理能力评价"
    )
    communication_quality: EvaluationDimension = Field(
        default_factory=lambda: EvaluationDimension(
            score=0.0, comment="评价失败:沟通表达能力评价缺失"),
        description="沟通表达能力评价"
    )
    multi_round_consistency: EvaluationDimension = Field(
        default_factory=lambda: EvaluationDimension(
            score=0.0, comment="评价失败:多轮一致性评价缺失"),
        description="多轮一致性评价"
    )
    overall_professionalism: EvaluationDimension = Field(
        default_factory=lambda: EvaluationDimension(
            score=0.0, comment="评价失败:整体专业性评价缺失"),
        description="整体专业性评价"
    )

    # Similarity evaluation dimensions (3)
    present_illness_similarity: EvaluationDimension = Field(
        default_factory=lambda: EvaluationDimension(
            score=0.0, comment="评价失败:现病史相似度评价缺失"),
        description="现病史相似度评价"
    )
    past_history_similarity: EvaluationDimension = Field(
        default_factory=lambda: EvaluationDimension(
            score=0.0, comment="评价失败:既往史相似度评价缺失"),
        description="既往史相似度评价"
    )
    chief_complaint_similarity: EvaluationDimension = Field(
        default_factory=lambda: EvaluationDimension(
            score=0.0, comment="评价失败:主述相似度评价缺失"),
        description="主述相似度评价"
    )

    # Summary and suggestions
    summary: str = Field(
        default="评价失败:整体评价总结缺失",
        description="整体评价总结"
    )
    key_suggestions: List[str] = Field(
        # default_factory so each instance gets its own list (never shared).
        default_factory=lambda: ["评价失败:关键改进建议缺失"],
        description="关键改进建议列表"
    )
|
||||
@ -15,11 +15,12 @@ LLM_CONFIG = {
|
||||
"base_url": "https://api.deepseek.com"
|
||||
}
|
||||
},
|
||||
"ollama": {
|
||||
"gpt-oss:latest": {
|
||||
"class": "Ollama",
|
||||
"params": {
|
||||
"id": "qwen2.5:latest",
|
||||
"host": "127.0.0.1"
|
||||
"id": "gpt-oss:latest",
|
||||
"host": "192.168.31.228",
|
||||
"options": {"think": False} # 关闭思考模式
|
||||
}
|
||||
},
|
||||
"deepseek-v3": {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user