feat: 优化PDF解析筛选逻辑，移除MIMIC关键词依赖并增加AI医学相关性判断

- 移除第一层MIMIC-IV关键词筛选，简化筛选流程
- 增强AI分析功能，同时判断医学相关性和任务类型
- 修改_analyze_research_task函数返回包含医学相关性和任务类型的完整结果
- 更新筛选条件：只有同时满足"医学相关"和"指定任务类型"的论文才通过筛选
- 优化相关注释和日志输出，提高代码可维护性
2025-08-26 23:06:48 +08:00 · 2025-08-26 23:06:48 +08:00 · 22c90728e5
commit 22c90728e5
parent 76c04eae4a
1 changed file with 77 additions and 83 deletions
--- a/src/parse.py
+++ b/src/parse.py
@@ -18,7 +18,11 @@ from typing import List, Dict, Optional, Tuple
 class PDFParser:
-    """PDF解析类 - 用于将PDF文件转换为Markdown格式并按任务类型筛选
+    """PDF解析类 - 用于将PDF文件转换为Markdown格式并筛选医学相关论文
    筛选机制：
    1. 医学相关性：使用AI判断论文是否属于医学、临床、生物医学等领域
    2. 任务类型：在医学相关的基础上进一步筛选指定的研究任务类型
    支持的任务类型：
    - prediction: 预测任务 (PRED_)
@@ -43,16 +47,11 @@ class PDFParser:
        # OCR API配置
        self.ocr_api_url = "http://100.106.4.14:7861/parse"
-        # AI模型API配置（用于四类任务识别：prediction/classification/time_series/correlation）
+        # AI模型API配置（用于医学相关性和四类任务识别）
        self.ai_api_url = "http://100.82.33.121:11001/v1/chat/completions"
        self.ai_model = "gpt-oss-20b"
-        # MIMIC-IV关键词配置（用于内容筛选）
+        # 注意：原来的MIMIC关键词配置已移除，现在使用AI判断医学相关性
        self.mimic_keywords = [
            "MIMIC-IV", "MIMIC 4", "MIMIC IV", "MIMIC-4",
            "Medical Information Mart Intensive Care IV",
            "MIMIC-IV dataset", "MIMIC-IV database"
        ]
        # 任务类型到前缀的映射配置
        self.task_type_prefixes = {
@@ -105,44 +104,9 @@ class PDFParser:
        logging.info(f"发现 {len(pdf_files)} 个PDF文件待处理")
        return pdf_files
-    def _check_mimic_keywords(self, output_subdir: Path) -> bool:
+    # 注意：_check_mimic_keywords函数已移除
-        """检查Markdown文件是否包含MIMIC-IV关键词
+    # 原功能：检查Markdown文件是否包含MIMIC-IV关键词
-        
+    # 移除原因：改用AI分析医学相关性，不再依赖特定关键词筛选
        Args:
            output_subdir (Path): 包含Markdown文件的输出子目录
        Returns:
            bool: 是否包含MIMIC-IV关键词
        """
        try:
            # 查找所有.md文件
            md_files = list(output_subdir.glob("*.md"))
            if not md_files:
                logging.warning(f"未找到Markdown文件进行MIMIC关键词检查: {output_subdir}")
                return False
            # 检查每个Markdown文件的内容
            for md_file in md_files:
                try:
                    with open(md_file, 'r', encoding='utf-8') as f:
                        content = f.read().lower()  # 转换为小写进行不区分大小写匹配
                    # 检查是否包含任何MIMIC-IV关键词
                    for keyword in self.mimic_keywords:
                        if keyword.lower() in content:
                            logging.info(f"发现MIMIC-IV关键词 '{keyword}' 在文件 {md_file.name}")
                            return True
                except Exception as e:
                    logging.error(f"读取Markdown文件时发生错误: {md_file.name} - {e}")
                    continue
            logging.info(f"未发现MIMIC-IV关键词: {output_subdir.name}")
            return False
        except Exception as e:
            logging.error(f"检查MIMIC关键词时发生错误: {output_subdir} - {e}")
            return False
    def _extract_introduction(self, output_subdir: Path) -> Optional[str]:
        """从Markdown文件中提取Introduction部分
@@ -210,33 +174,46 @@ class PDFParser:
            logging.error(f"提取Introduction时发生错误: {output_subdir} - {e}")
            return None
-    def _analyze_research_task(self, introduction: str) -> str:
+    def _analyze_research_task(self, introduction: str) -> Dict[str, any]:
-        """使用AI模型分析论文的研究任务类型
+        """使用AI模型分析论文的医学相关性和研究任务类型
        Args:
            introduction (str): 论文的Introduction内容
        Returns:
-            str: 任务类型 ('prediction', 'classification', 'time_series', 'correlation', 'none')
+            Dict[str, any]: 包含医学相关性和任务类型的分析结果
                - is_medical: bool，是否为医学相关论文
                - task_type: str，任务类型 ('prediction', 'classification', 'time_series', 'correlation', 'none')
                - medical_confidence: float，医学相关性置信度
                - task_confidence: float，任务类型置信度
        """
        try:
            # 构造AI分析的提示词
-            system_prompt = """你是一个医学研究专家。请分析给定的论文Introduction部分，判断该研究属于以下哪种任务类型：
+            system_prompt = """你是一个医学研究专家。请分析给定的论文Introduction部分，判断两个维度：
-1. prediction - 预测任务：预测未来事件、结局或数值（如死亡率预测、住院时长预测、疾病进展预测）
+1. 医学相关性：判断该论文是否属于医学、临床医学、生物医学、公共卫生、护理学等医学相关领域
-2. classification - 分类任务：将患者或病例分类到不同类别（如疾病诊断分类、风险等级分类、药物反应分类）
+   - 医学相关：涉及疾病、患者、临床数据、医疗干预、生物医学指标等
-3. time_series - 时间序列分析：分析随时间变化的医疗数据（如生命体征趋势分析、病情演进分析、纵向队列研究）
+   - 非医学相关：纯计算机科学、工程学、物理学、经济学等非医学领域
 4. correlation - 关联性分析：研究变量间的关系或关联（如痾病与人口特征关系、药物与副作用关联、风险因素识别）
 5. none - 不属于以上任何类型
-请以JSON格式回答，包含任务类型和置信度：
+2. 任务类型：如果是医学相关论文，进一步判断属于以下哪种任务类型：
-{\"task_type\": \"prediction\", \"confidence\": 0.85}
+   - prediction: 预测任务（预测未来事件、结局或数值，如死亡率预测、住院时长预测、疾病进展预测）
   - classification: 分类任务（将患者或病例分类到不同类别，如疾病诊断分类、风险等级分类、药物反应分类）
   - time_series: 时间序列分析（分析随时间变化的医疗数据，如生命体征趋势分析、病情演进分析、纵向队列研究）
   - correlation: 关联性分析（研究变量间的关系或关联，如疾病与人口特征关系、药物与副作用关联、风险因素识别）
   - none: 不属于以上任何类型
 请以JSON格式回答，包含所有字段：
 {\"is_medical\": true, \"task_type\": \"prediction\", \"medical_confidence\": 0.90, \"task_confidence\": 0.85}
 字段说明：
 - is_medical: 布尔值，是否为医学相关论文
 - task_type: 任务类型（prediction/classification/time_series/correlation/none）
 - medical_confidence: 医学相关性置信度（0-1之间）
 - task_confidence: 任务类型置信度（0-1之间）
 task_type必须是以下选项之一：prediction、classification、time_series、correlation、none
 confidence为0-1之间的数值，表示判断的置信度。
 只返回JSON，不要添加其他文字。"""
-            user_prompt = f"请分析以下论文Introduction，判断属于哪种任务类型：\n\n{introduction[:2000]}"  # 限制长度避免token过多
+            user_prompt = f"请分析以下论文Introduction，判断医学相关性和任务类型：\n\n{introduction[:2000]}"  # 限制长度避免token过多
            # 构造API请求数据
            api_data = {
@@ -245,7 +222,7 @@ confidence为0-1之间的数值，表示判断的置信度。
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
-                "max_tokens": 50,  # 需要返回JSON格式
+                "max_tokens": 100,  # 需要返回更复杂的JSON格式
                "temperature": 0.1  # 降低随机性
            }
@@ -264,35 +241,50 @@ confidence为0-1之间的数值，表示判断的置信度。
                try:
                    # 解析JSON响应
                    parsed_response = json.loads(ai_response)
                    is_medical = parsed_response.get('is_medical', False)
                    task_type = parsed_response.get('task_type', 'none').lower()
-                    confidence = parsed_response.get('confidence', 0.0)
+                    medical_confidence = parsed_response.get('medical_confidence', 0.0)
                    task_confidence = parsed_response.get('task_confidence', 0.0)
                    # 验证任务类型是否有效
                    valid_types = ['prediction', 'classification', 'time_series', 'correlation', 'none']
                    if task_type not in valid_types:
                        logging.warning(f"AI返回了无效的任务类型: {task_type}，使用默认值 'none'")
                        task_type = "none"
-                        confidence = 0.0
+                        task_confidence = 0.0
-                    # 只接受高置信度的结果
+                    # 检查医学相关性置信度（要求至少 0.7）
-                    if confidence < 0.7:
+                    if medical_confidence < 0.7:
-                        logging.info(f"AI分析置信度过低 ({confidence:.2f})，归类为 'none'")
+                        logging.info(f"医学相关性置信度过低 ({medical_confidence:.2f})，标记为非医学论文")
                        is_medical = False
                    # 检查任务类型置信度（要求至少 0.7）
                    if task_confidence < 0.7:
                        logging.info(f"任务类型置信度过低 ({task_confidence:.2f})，标记为 'none'")
                        task_type = "none"
-                    logging.info(f"AI分析结果: 任务类型={task_type}, 置信度={confidence:.2f}")
+                    # 构建返回结果
-                    return task_type
+                    result = {
                        'is_medical': is_medical,
                        'task_type': task_type,
                        'medical_confidence': medical_confidence,
                        'task_confidence': task_confidence
                    }
                    logging.info(f"AI分析结果: 医学相关={is_medical}({medical_confidence:.2f}), 任务类型={task_type}({task_confidence:.2f})")
                    return result
                except json.JSONDecodeError as e:
                    logging.error(f"解析AI JSON响应失败: {ai_response} - 错误: {e}")
-                    return "none"
+                    return {'is_medical': False, 'task_type': 'none', 'medical_confidence': 0.0, 'task_confidence': 0.0}
            else:
                logging.error(f"AI API调用失败，状态码: {response.status_code}")
-                return "none"
+                return {'is_medical': False, 'task_type': 'none', 'medical_confidence': 0.0, 'task_confidence': 0.0}
        except Exception as e:
            logging.error(f"AI分析研究任务时发生错误: {e}")
-            return "none"
+            return {'is_medical': False, 'task_type': 'none', 'medical_confidence': 0.0, 'task_confidence': 0.0}
    def _mark_valid_folder(self, output_subdir: Path, task_type: str) -> bool:
        """为通过筛选的文件夹添加任务类型前缀标记
@@ -541,26 +533,28 @@ confidence为0-1之间的数值，表示判断的置信度。
            # 获取解压后的文件夹路径
            output_subdir = self.markdown_dir / pdf_file.stem
-            # 第一层筛选：检查MIMIC-IV关键词
+            # AI分析研究任务（医学相关性 + 任务类型）
            logging.info(f"开始MIMIC-IV关键词筛选: {pdf_file.stem}")
            if not self._check_mimic_keywords(output_subdir):
                logging.info(f"未通过MIMIC-IV关键词筛选，跳过: {pdf_file.stem}")
                return True  # 处理成功但未通过筛选
            # 第二层筛选：AI分析研究任务
            logging.info(f"开始AI研究任务分析: {pdf_file.stem}")
            introduction = self._extract_introduction(output_subdir)
            if not introduction:
                logging.warning(f"无法提取Introduction，跳过AI分析: {pdf_file.stem}")
                return True  # 处理成功但无法进行任务分析
-            task_type = self._analyze_research_task(introduction)
+            analysis_result = self._analyze_research_task(introduction)
-            if task_type == "none":
+            is_medical = analysis_result['is_medical']
-                logging.info(f"未通过研究任务筛选 (task_type=none)，跳过: {pdf_file.stem}")
+            task_type = analysis_result['task_type']
            # 检查是否通过筛选（必须是医学相关且属于指定任务类型）
            if not is_medical:
                logging.info(f"未通过医学相关性筛选，跳过: {pdf_file.stem}")
                return True  # 处理成功但未通过筛选
-            # 两层筛选都通过，根据任务类型标记文件夹
+            if task_type == "none":
-            logging.info(f"通过所有筛选，标记为{task_type}任务论文: {pdf_file.stem}")
+                logging.info(f"未通过任务类型筛选 (task_type=none)，跳过: {pdf_file.stem}")
                return True  # 处理成功但未通过筛选
            # 通过所有筛选，根据任务类型标记文件夹
            logging.info(f"通过所有筛选，标记为{task_type}任务医学论文: {pdf_file.stem}")
            if self._mark_valid_folder(output_subdir, task_type):
                logging.info(f"论文筛选完成，已标记为{task_type}任务: {pdf_file.stem}")
            else: