feat: implement a MIMIC paper information extraction system based on the LangExtract framework

- Add the info_extractor.py entry point with command-line arguments and a test mode
- Implement the core MIMICLangExtractBuilder class in src/extractor.py
- Integrate the vllm API service (OpenAI-compatible format) for structured information extraction
- Support extraction of 5 modules: dataset, model, training, evaluation, and environment configuration
- Implement source-text grounding and interactive HTML visualization
- Add the langextract and httpx[socks] dependencies
- Support saving results into per-paper subdirectories
- Clean up the obsolete experiment_runner.py and number_extraction_models.py files
parent: 1b652502d5
commit: c4037325ed
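A minimal sketch of the programmatic flow this commit adds, mirroring main() in info_extractor.py below (the paths are the repository defaults; the max_papers cap is optional and shown only for illustration):

from src.extractor import MIMICLangExtractBuilder

# Initialize the extractor (configures the vllm endpoint internally)
builder = MIMICLangExtractBuilder()

# Build the reproduction dataset; per-paper results are also written under papers_dir
dataset = builder.build_reproduction_dataset(
    papers_dir="dataset/markdowns",
    output_file="dataset/reproduction_tasks/mimic_langextract_dataset.json",
    max_papers=5,  # optional cap, e.g. for a quick test run
)
print(dataset["metadata"]["total_papers"])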
README.md
@@ -14,6 +14,7 @@ MedResearcher is an automated experimentation platform based on user input; it provides the user with …

## Main file of each module

1. Paper crawling main file: papers_crawler.py
2. PDF parsing main file: pdf_parser.py
3. Information extraction main file: info_extractor.py
4. Experiment running main file: experiment_runner.py

## File structure

@@ -23,6 +24,7 @@ MedResearcher is an automated experimentation platform based on user input; it provides the user with …

│ └── mimic.csv               # Basic information for all MIMIC-related papers to be processed
├── papers_crawler.py         # Paper crawling main file
├── pdf_parser.py             # PDF parsing main file
├── info_extractor.py         # Information extraction main file
├── experiment_runner.py      # Experiment running main file
├── src/                      # Source code directory
│ └── utils/                  # Utility functions directory
info_extractor.py · new file · 139 lines
@@ -0,0 +1,139 @@
#!/usr/bin/env python3
"""
LangExtract-based MIMIC paper information extractor.
Extracts structured reproduction-task information from medical papers.

Author: MedResearcher project
Created: 2025-01-25
"""

import argparse
import logging
import sys

from src.extractor import MIMICLangExtractBuilder

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)


def setup_args():
    """Parse command-line arguments.

    Returns:
        argparse.Namespace: the parsed command-line arguments
    """
    parser = argparse.ArgumentParser(
        description='MIMIC paper information extraction tool - extracts structured reproduction information from medical papers with LangExtract',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Usage examples:
  %(prog)s                                      # use the default arguments
  %(prog)s --papers_dir dataset/markdowns       # specify the paper directory
  %(prog)s --output_file results/dataset.json   # specify the output file
  %(prog)s --test_mode --max_papers 5           # test mode, only process 5 papers
        '''
    )

    parser.add_argument(
        '--papers_dir',
        type=str,
        default='dataset/markdowns',
        help='directory of markdown papers (default: dataset/markdowns)'
    )

    parser.add_argument(
        '--output_file',
        type=str,
        default='dataset/reproduction_tasks/mimic_langextract_dataset.json',
        help='output dataset file path (default: dataset/reproduction_tasks/mimic_langextract_dataset.json)'
    )

    parser.add_argument(
        '--test_mode',
        action='store_true',
        help='test mode: only process a few papers for validation'
    )

    parser.add_argument(
        '--max_papers',
        type=int,
        default=None,
        help='maximum number of papers to process, for testing (default: process all papers)'
    )

    parser.add_argument(
        '--log_level',
        type=str,
        default='INFO',
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
        help='logging level (default: INFO)'
    )

    return parser.parse_args()


def main():
    """Main entry point - run the MIMIC paper information extraction task."""
    try:
        # Parse command-line arguments
        args = setup_args()

        # Set the logging level
        logging.getLogger().setLevel(getattr(logging, args.log_level))

        # Initialize the information extractor
        builder = MIMICLangExtractBuilder()

        print("=== MIMIC paper information extraction tool started ===")
        print(f"Papers directory: {args.papers_dir}")
        print(f"Output file: {args.output_file}")
        print(f"Test mode: {'yes' if args.test_mode else 'no'}")
        if args.max_papers:
            print(f"Max papers: {args.max_papers}")
        print(f"Log level: {args.log_level}")
        print("========================")

        # In test mode without --max_papers, cap at 5 papers (matches the epilog example);
        # otherwise honour --max_papers or process everything
        max_papers = args.max_papers or (5 if args.test_mode else None)

        # Build the reproduction dataset
        print("\nBuilding the MIMIC reproduction dataset...")
        dataset = builder.build_reproduction_dataset(
            papers_dir=args.papers_dir,
            output_file=args.output_file,
            max_papers=max_papers
        )

        # Summarize the results
        total_papers = dataset['metadata']['total_papers']
        successful_extractions = sum(
            1 for paper in dataset['papers'].values()
            if any(module.get('extraction_count', 0) > 0
                   for module in paper.get('modules', {}).values())
        )

        print("\n=== Build complete ===")
        print(f"Total papers: {total_papers}")
        print(f"Successful extractions: {successful_extractions}/{total_papers}")
        print(f"Success rate: {successful_extractions/total_papers*100:.1f}%")
        print(f"Results saved to: {args.output_file}")
        print(f"Interactive report: {args.output_file.replace('.json', '.html')}")
        print("===============")

        return 0

    except FileNotFoundError as e:
        print(f"Error: the specified file or directory was not found - {e}")
        return 1
    except ValueError as e:
        print(f"Error: invalid argument value - {e}")
        return 1
    except Exception as e:
        print(f"Error: unexpected exception - {e}")
        logging.exception("Details:")
        return 1


if __name__ == "__main__":
    sys.exit(main())
pyproject.toml
@@ -6,6 +6,8 @@ readme = "README.md"
requires-python = ">=3.13"
dependencies = [
    "agno>=1.7.12",
    "httpx[socks]>=0.28.1",
    "langextract>=1.0.8",
    "ollama>=0.5.3",
    "openai>=1.101.0",
    "pydantic",
src/extractor.py · new file · 809 lines
@@ -0,0 +1,809 @@
#!/usr/bin/env python3
"""
LangExtract-based MIMIC paper information extractor - core implementation.
Extracts structured reproduction-task information from medical papers.

Author: MedResearcher project
Created: 2025-01-25
"""

import langextract as lx
import textwrap
from pathlib import Path
import json
from datetime import datetime
from typing import List, Dict, Any, Optional
import logging

# Configure logging
logger = logging.getLogger(__name__)


class MIMICLangExtractBuilder:
    """LangExtract-based MIMIC paper information extractor."""

    def __init__(self):
        """Initialize the extractor and configure the vllm API service."""
        try:
            # Configure LangExtract to use the vllm API (via the OpenAI-compatible interface)
            import os
            os.environ["LANGEXTRACT_API_KEY"] = "dummy"

            # Create a ModelConfig that forces the OpenAI provider to target the vllm endpoint
            self.model_config = lx.factory.ModelConfig(
                model_id="gpt-oss-20b",  # the model name actually deployed in vllm
                provider="OpenAILanguageModel",  # force the OpenAI provider
                provider_kwargs={
                    "base_url": "http://100.82.33.121:11001/v1",  # vllm API endpoint
                    "api_key": "dummy",
                    "model_id": "gpt-oss-20b"  # make sure the correct model ID is used
                }
            )

            # Common LangExtract configuration parameters
            self.extract_config = {
                "config": self.model_config,
                "max_workers": 3,  # keep concurrency low to avoid overloading the vllm service
                "max_char_buffer": 6000,  # context length suited to medical papers
                "extraction_passes": 1,  # a single pass to avoid excessive API calls
                "temperature": 0.1,  # low temperature for consistency
                "fence_output": True,  # expect fenced (code-block) output
                "use_schema_constraints": False  # vllm may not support strict schemas
            }

            # Load the extraction configuration for every module
            self.module_configs = {
                "data": self._load_data_config(),
                "model": self._load_model_config(),
                "training": self._load_training_config(),
                "evaluation": self._load_evaluation_config(),
                "environment": self._load_environment_config()
            }

            logger.info("MIMICLangExtractBuilder initialized successfully")

        except Exception as e:
            logger.error(f"Initialization failed: {e}")
            raise

    def _load_data_config(self) -> Dict[str, Any]:
        """Load the LangExtract configuration for the data module."""
        return {
            "prompt": textwrap.dedent("""
                Extract concrete data-processing information from the medical paper. Follow these rules strictly:

                1. dataset_source: extract explicitly mentioned dataset names (e.g. "MIMIC-IV", "Stanford EHR")
                2. data_scale: extract concrete data-scale numbers (e.g. "135,483 patients", "2015-2023")
                3. preprocessing_step: extract concrete descriptions of data preprocessing steps
                4. feature_type: extract descriptions of feature types and encoding methods
                5. inclusion_criteria: extract the exact text of patient inclusion criteria
                6. exclusion_criteria: extract the exact text of patient exclusion criteria

                Use the exact text for extraction; do not paraphrase. Provide meaningful attributes for each extraction.
            """),
            "examples": [
                lx.data.ExampleData(
                    text="We analyzed 135,483 ED blood culture orders from Stanford Medicine EHR between 2015-2023. Adult patients (≥18 years) with blood culture collection in the ED were included. Patients with positive blood cultures within 14 days were excluded. Features were one-hot encoded for ML compatibility.",
                    extractions=[
                        lx.data.Extraction(
                            extraction_class="dataset_source",
                            extraction_text="Stanford Medicine EHR",
                            attributes={
                                "data_type": "electronic health records",
                                "institution": "Stanford Medicine"
                            }
                        ),
                        lx.data.Extraction(
                            extraction_class="data_scale",
                            extraction_text="135,483 ED blood culture orders",
                            attributes={
                                "sample_size": "135,483",
                                "time_range": "2015-2023",
                                "data_unit": "blood culture orders"
                            }
                        ),
                        lx.data.Extraction(
                            extraction_class="inclusion_criteria",
                            extraction_text="Adult patients (≥18 years) with blood culture collection in the ED",
                            attributes={
                                "age_limit": "≥18 years",
                                "setting": "Emergency Department",
                                "requirement": "blood culture collection"
                            }
                        ),
                        lx.data.Extraction(
                            extraction_class="exclusion_criteria",
                            extraction_text="Patients with positive blood cultures within 14 days were excluded",
                            attributes={
                                "timeframe": "within 14 days",
                                "condition": "positive blood cultures"
                            }
                        ),
                        lx.data.Extraction(
                            extraction_class="feature_type",
                            extraction_text="Features were one-hot encoded for ML compatibility",
                            attributes={
                                "encoding_method": "one-hot encoding",
                                "purpose": "ML compatibility"
                            }
                        )
                    ]
                ),
                lx.data.ExampleData(
                    text="This study utilized MIMIC-IV database, including CHARTEVENTS and LABEVENTS tables. We extracted hourly vital signs and laboratory values for ICU patients. Missing values were imputed using forward-fill method. Outliers beyond 3 standard deviations were removed.",
                    extractions=[
                        lx.data.Extraction(
                            extraction_class="dataset_source",
                            extraction_text="MIMIC-IV database",
                            attributes={
                                "data_type": "public clinical database",
                                "tables": "CHARTEVENTS, LABEVENTS"
                            }
                        ),
                        lx.data.Extraction(
                            extraction_class="preprocessing_step",
                            extraction_text="Missing values were imputed using forward-fill method",
                            attributes={
                                "method": "forward-fill",
                                "target": "missing values"
                            }
                        ),
                        lx.data.Extraction(
                            extraction_class="preprocessing_step",
                            extraction_text="Outliers beyond 3 standard deviations were removed",
                            attributes={
                                "method": "outlier removal",
                                "threshold": "3 standard deviations"
                            }
                        )
                    ]
                )
            ]
        }

    def _load_model_config(self) -> Dict[str, Any]:
        """Load the LangExtract configuration for the model module."""
        return {
            "prompt": textwrap.dedent("""
                Extract concrete machine-learning model information from the medical paper. Follow these rules strictly:

                1. model_name: extract explicitly mentioned model names (e.g. "XGBoost", "LSTM", "GPT-4")
                2. architecture_detail: extract the exact text describing the architecture
                3. hyperparameter: extract concrete hyperparameter values
                4. feature_processing: extract descriptions of feature-processing methods
                5. model_component: extract descriptions of model components or modules

                Use the exact text for extraction; do not paraphrase. Provide meaningful attributes for each extraction.
            """),
            "examples": [
                lx.data.ExampleData(
                    text="We employed XGBoost classifier with max depth of 4 and 30 boosting iterations. Class weights were used to handle imbalanced data. STELLA 1.5B model was used for text embeddings with attention-weighted average pooling.",
                    extractions=[
                        lx.data.Extraction(
                            extraction_class="model_name",
                            extraction_text="XGBoost classifier",
                            attributes={
                                "model_type": "gradient boosting",
                                "task": "classification"
                            }
                        ),
                        lx.data.Extraction(
                            extraction_class="hyperparameter",
                            extraction_text="max depth of 4 and 30 boosting iterations",
                            attributes={
                                "max_depth": "4",
                                "n_estimators": "30",
                                "parameter_type": "tree_structure"
                            }
                        ),
                        lx.data.Extraction(
                            extraction_class="model_name",
                            extraction_text="STELLA 1.5B model",
                            attributes={
                                "model_type": "pretrained language model",
                                "parameters": "1.5B",
                                "purpose": "text embeddings"
                            }
                        ),
                        lx.data.Extraction(
                            extraction_class="feature_processing",
                            extraction_text="attention-weighted average pooling",
                            attributes={
                                "technique": "pooling",
                                "method": "attention-weighted"
                            }
                        )
                    ]
                )
            ]
        }

    def _load_training_config(self) -> Dict[str, Any]:
        """Load the LangExtract configuration for the training module."""
        return {
            "prompt": textwrap.dedent("""
                Extract concrete model-training information from the medical paper. Follow these rules strictly:

                1. data_split_method: extract the exact description of the data splitting method
                2. validation_approach: extract the exact description of the validation strategy
                3. hyperparameter_tuning: extract the hyperparameter tuning method
                4. stopping_condition: extract the training stopping condition
                5. optimizer_config: extract optimizer configuration details

                Use the exact text for extraction; do not paraphrase. Provide meaningful attributes for each extraction.
            """),
            "examples": [
                lx.data.ExampleData(
                    text="Data was split temporally: training set (2015-2022), development set (2022-2023) for hyperparameter tuning, and evaluation set (2023+). Grid search was performed on the development set to optimize AUC performance. Early stopping was applied when validation loss did not improve for 10 epochs.",
                    extractions=[
                        lx.data.Extraction(
                            extraction_class="data_split_method",
                            extraction_text="Data was split temporally: training set (2015-2022), development set (2022-2023), and evaluation set (2023+)",
                            attributes={
                                "split_type": "temporal",
                                "train_period": "2015-2022",
                                "dev_period": "2022-2023",
                                "eval_period": "2023+"
                            }
                        ),
                        lx.data.Extraction(
                            extraction_class="hyperparameter_tuning",
                            extraction_text="Grid search was performed on the development set to optimize AUC performance",
                            attributes={
                                "method": "grid search",
                                "metric": "AUC",
                                "dataset": "development set"
                            }
                        ),
                        lx.data.Extraction(
                            extraction_class="stopping_condition",
                            extraction_text="Early stopping was applied when validation loss did not improve for 10 epochs",
                            attributes={
                                "method": "early stopping",
                                "patience": "10 epochs",
                                "monitor": "validation loss"
                            }
                        )
                    ]
                )
            ]
        }

    def _load_evaluation_config(self) -> Dict[str, Any]:
        """Load the LangExtract configuration for the evaluation module."""
        return {
            "prompt": textwrap.dedent("""
                Extract concrete model-evaluation information from the medical paper. Follow these rules strictly:

                1. evaluation_metric: extract specific evaluation metric names (e.g. "AUC", "F1-score", "sensitivity")
                2. baseline_comparison: extract descriptions of baseline models or methods
                3. performance_result: extract concrete numeric performance results
                4. statistical_test: extract descriptions of statistical testing methods
                5. experimental_setting: extract concrete details of the experimental setup

                Use the exact text for extraction; do not paraphrase. Provide meaningful attributes for each extraction.
            """),
            "examples": [
                lx.data.ExampleData(
                    text="The model achieved ROC-AUC of 0.85 (95% CI: 0.82-0.88) on the test set. We compared against three baselines: expert framework (manual assessment), structured-only model, and LLM-automated framework. At 90% sensitivity, our model achieved 45% specificity versus 32% for the baseline.",
                    extractions=[
                        lx.data.Extraction(
                            extraction_class="evaluation_metric",
                            extraction_text="ROC-AUC",
                            attributes={
                                "metric_type": "discriminative performance",
                                "range": "0-1"
                            }
                        ),
                        lx.data.Extraction(
                            extraction_class="performance_result",
                            extraction_text="ROC-AUC of 0.85 (95% CI: 0.82-0.88)",
                            attributes={
                                "metric": "ROC-AUC",
                                "value": "0.85",
                                "confidence_interval": "0.82-0.88",
                                "confidence_level": "95%"
                            }
                        ),
                        lx.data.Extraction(
                            extraction_class="baseline_comparison",
                            extraction_text="expert framework (manual assessment), structured-only model, and LLM-automated framework",
                            attributes={
                                "baseline_count": "3",
                                "comparison_type": "multiple baselines"
                            }
                        )
                    ]
                )
            ]
        }

    def _load_environment_config(self) -> Dict[str, Any]:
        """Load the LangExtract configuration for the environment module."""
        return {
            "prompt": textwrap.dedent("""
                Extract concrete experimental-environment information from the medical paper. Follow these rules strictly:

                1. software_library: extract specific software tool and library names
                2. hardware_resource: extract descriptions of hardware requirements
                3. data_repository: extract concrete data storage and access information
                4. code_availability: extract the exact description of code availability
                5. compliance_requirement: extract compliance and deployment requirements

                Use the exact text for extraction; do not paraphrase. Provide meaningful attributes for each extraction.
            """),
            "examples": [
                lx.data.ExampleData(
                    text="We implemented the models using Python 3.8 with scikit-learn 1.0.2 and XGBoost 1.5.0. Training was performed on NVIDIA A100 GPU with 40GB memory. Code is available at GitHub: https://github.com/HealthRex/CDSS. The study was approved by Stanford IRB.",
                    extractions=[
                        lx.data.Extraction(
                            extraction_class="software_library",
                            extraction_text="Python 3.8 with scikit-learn 1.0.2 and XGBoost 1.5.0",
                            attributes={
                                "language": "Python",
                                "version": "3.8",
                                "libraries": "scikit-learn, XGBoost"
                            }
                        ),
                        lx.data.Extraction(
                            extraction_class="hardware_resource",
                            extraction_text="NVIDIA A100 GPU with 40GB memory",
                            attributes={
                                "gpu_type": "NVIDIA A100",
                                "memory": "40GB",
                                "resource_type": "GPU"
                            }
                        ),
                        lx.data.Extraction(
                            extraction_class="code_availability",
                            extraction_text="Code is available at GitHub: https://github.com/HealthRex/CDSS",
                            attributes={
                                "platform": "GitHub",
                                "url": "https://github.com/HealthRex/CDSS",
                                "access_type": "public"
                            }
                        ),
                        lx.data.Extraction(
                            extraction_class="compliance_requirement",
                            extraction_text="The study was approved by Stanford IRB",
                            attributes={
                                "approval_type": "IRB",
                                "institution": "Stanford"
                            }
                        )
                    ]
                )
            ]
        }

    def extract_paper_modules(self, paper_content: str, paper_id: str) -> Dict[str, Any]:
        """Extract all module information from one paper with LangExtract."""

        results = {
            "paper_id": paper_id,
            "extraction_metadata": {
                "timestamp": datetime.now().isoformat(),
                "method": "langextract_with_source_grounding",
                "model": "gpt-oss-20b"
            },
            "modules": {}
        }

        # Extract each module in turn
        for module_name, config in self.module_configs.items():
            try:
                logger.info(f"  Extracting the {module_name} module...")

                # Run structured extraction with LangExtract
                extraction_result = lx.extract(
                    text_or_documents=paper_content,
                    prompt_description=config["prompt"],
                    examples=config["examples"],
                    **self.extract_config
                )

                # Process the result - extraction_result is an AnnotatedDocument object
                if extraction_result and hasattr(extraction_result, 'extractions') and extraction_result.extractions:
                    results["modules"][module_name] = {
                        "extractions": [
                            {
                                "extraction_class": ext.extraction_class,
                                "extraction_text": ext.extraction_text,
                                "start_index": getattr(ext, 'start_index', None),
                                "end_index": getattr(ext, 'end_index', None),
                                "attributes": getattr(ext, 'attributes', {}),
                                "confidence": getattr(ext, 'confidence', None)
                            }
                            for ext in extraction_result.extractions
                        ],
                        "extraction_count": len(extraction_result.extractions),
                        "quality_score": self._calculate_quality_score(extraction_result)
                    }
                else:
                    results["modules"][module_name] = {
                        "extractions": [],
                        "extraction_count": 0,
                        "quality_score": 0.0,
                        "error": "No valid extractions found"
                    }

            except Exception as e:
                logger.error(f"  Extraction of the {module_name} module failed: {e}")
                results["modules"][module_name] = {
                    "extractions": [],
                    "extraction_count": 0,
                    "quality_score": 0.0,
                    "error": str(e)
                }

        return results

    def build_reproduction_dataset(self, papers_dir: str, output_file: str, max_papers: Optional[int] = None) -> Dict[str, Any]:
        """Build the complete reproduction dataset."""
        papers = self._load_markdown_papers(papers_dir)

        dataset = {
            "metadata": {
                "creation_date": datetime.now().isoformat(),
                "total_papers": len(papers),
                "extraction_method": "langextract_source_grounded",
                "api_endpoint": "http://100.82.33.121:11001/v1",
                "model": "gpt-oss-20b",
                "langextract_version": getattr(lx, '__version__', 'unknown')
            },
            "papers": {}
        }

        # If a maximum is given, limit the number of papers
        if max_papers and max_papers < len(papers):
            papers_items = list(papers.items())[:max_papers]
            papers = dict(papers_items)
            dataset["metadata"]["total_papers"] = len(papers)
            dataset["metadata"]["note"] = f"Test mode: only the first {max_papers} papers are processed"
            logger.info(f"Test mode: only processing the first {max_papers} papers")

        logger.info(f"Processing {len(papers)} papers...")

        for i, (paper_id, content) in enumerate(papers.items()):
            logger.info(f"[{i+1}/{len(papers)}] Processing paper: {paper_id}")

            paper_result = self.extract_paper_modules(content, paper_id)
            dataset["papers"][paper_id] = paper_result

            # Save each paper's result separately into its own subfolder
            self._save_individual_paper_result(papers_dir, paper_id, paper_result)

            # Periodically save global progress
            if (i + 1) % 10 == 0:
                self._save_progress(dataset, output_file)

        # Save the final result
        self._save_dataset(dataset, output_file)

        # Generate the interactive HTML report
        self._generate_html_report(dataset, output_file.replace('.json', '.html'))

        return dataset

    def _load_markdown_papers(self, papers_dir: str) -> Dict[str, str]:
        """Load the markdown paper files."""
        papers = {}
        papers_path = Path(papers_dir)

        if not papers_path.exists():
            raise FileNotFoundError(f"Papers directory does not exist: {papers_dir}")

        # Loading logic: read .md files from each paper's subdirectory
        markdown_files = []
        for subdir in papers_path.iterdir():
            if subdir.is_dir():
                md_files = list(subdir.glob("*.md"))
                markdown_files.extend(md_files)

        if not markdown_files:
            raise ValueError(f"No markdown files found in directory {papers_dir}")

        logger.info(f"Found {len(markdown_files)} markdown files")

        for file_path in markdown_files:
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                papers[file_path.stem] = content
            except Exception as e:
                logger.warning(f"Failed to read file {file_path}: {e}")

        return papers

    def _calculate_quality_score(self, extraction_result) -> float:
        """Compute a quality score for the extraction result."""
        if not extraction_result or not hasattr(extraction_result, 'extractions'):
            return 0.0

        if not extraction_result.extractions:
            return 0.0

        # Score based on the number of extractions and the richness of their attributes
        total_score = 0.0
        for ext in extraction_result.extractions:
            score = 0.3  # base score

            # Bonus for source-text grounding
            if hasattr(ext, 'start_index') and ext.start_index is not None:
                score += 0.2

            # Bonus for attribute richness
            if ext.attributes and len(ext.attributes) > 0:
                score += min(0.3, len(ext.attributes) * 0.1)

            # Bonus for confidence
            if hasattr(ext, 'confidence') and ext.confidence:
                score += 0.2 * ext.confidence

            total_score += score

        return min(1.0, total_score / len(extraction_result.extractions))

    def _save_progress(self, dataset: Dict[str, Any], output_file: str):
        """Save intermediate progress."""
        try:
            progress_file = output_file.replace('.json', '_progress.json')
            with open(progress_file, 'w', encoding='utf-8') as f:
                json.dump(dataset, f, ensure_ascii=False, indent=2)
            logger.info(f"Progress saved to: {progress_file}")
        except Exception as e:
            logger.error(f"Failed to save progress: {e}")

    def _save_individual_paper_result(self, papers_dir: str, paper_id: str, paper_result: Dict[str, Any]):
        """Save one paper's extraction result into its corresponding subfolder."""
        try:
            # Build the path to the paper's subfolder
            paper_subdir = Path(papers_dir) / paper_id
            if not paper_subdir.exists():
                logger.warning(f"Paper subfolder does not exist: {paper_subdir}")
                return

            # Prepare the single-paper dataset format
            individual_dataset = {
                "metadata": {
                    "creation_date": datetime.now().isoformat(),
                    "total_papers": 1,
                    "extraction_method": "langextract_source_grounded",
                    "api_endpoint": "http://100.82.33.121:11001/v1",
                    "model": "gpt-oss-20b",
                    "langextract_version": getattr(lx, '__version__', 'unknown'),
                    "paper_id": paper_id
                },
                "paper": paper_result  # note: a single paper, hence "paper" rather than "papers"
            }

            # Save the JSON file
            json_file = paper_subdir / "mimic_langextract_dataset.json"
            with open(json_file, 'w', encoding='utf-8') as f:
                json.dump(individual_dataset, f, ensure_ascii=False, indent=2)

            # Generate the HTML report
            html_file = paper_subdir / "mimic_langextract_dataset.html"
            self._generate_individual_html_report(individual_dataset, html_file)

            logger.info(f"Saved results for paper {paper_id} to: {paper_subdir}")

        except Exception as e:
            logger.error(f"Failed to save individual paper result ({paper_id}): {e}")

    def _save_dataset(self, dataset: Dict[str, Any], output_file: str):
        """Save the final dataset."""
        try:
            # Make sure the output directory exists
            output_path = Path(output_file)
            output_path.parent.mkdir(parents=True, exist_ok=True)

            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(dataset, f, ensure_ascii=False, indent=2)
            logger.info(f"Dataset saved to: {output_file}")
        except Exception as e:
            logger.error(f"Failed to save dataset: {e}")
            raise

    def _generate_html_report(self, dataset: Dict[str, Any], output_file: str):
        """Generate a LangExtract-style interactive HTML report."""
        try:
            # Merge all extraction results for visualization
            all_extractions = []
            for paper_id, paper_data in dataset["papers"].items():
                for module_name, module_data in paper_data.get("modules", {}).items():
                    all_extractions.extend(module_data.get("extractions", []))

            # Basic HTML template (simplified visualization)
            html_content = f"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>MIMIC reproduction dataset - LangExtract report</title>
    <style>
        body {{ font-family: Arial, sans-serif; margin: 20px; }}
        .header {{ background: #f0f8ff; padding: 20px; border-radius: 5px; }}
        .stats {{ display: flex; gap: 20px; margin: 20px 0; }}
        .stat-card {{ background: #e6f3ff; padding: 15px; border-radius: 5px; }}
        .extraction {{ border: 1px solid #ddd; margin: 10px 0; padding: 15px; border-radius: 5px; }}
        .class-tag {{ background: #007acc; color: white; padding: 3px 8px; border-radius: 3px; font-size: 12px; }}
    </style>
</head>
<body>
    <div class="header">
        <h1>MIMIC reproduction dataset - LangExtract extraction report</h1>
        <p>Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
        <p>Total papers: {dataset['metadata']['total_papers']}</p>
        <p>Extraction method: {dataset['metadata']['extraction_method']}</p>
    </div>

    <div class="stats">
        <div class="stat-card">
            <h3>Extraction statistics</h3>
            <p>Total extractions: {len(all_extractions)}</p>
            <p>Average per paper: {len(all_extractions)/dataset['metadata']['total_papers']:.1f}</p>
        </div>
        <div class="stat-card">
            <h3>Success rate</h3>
            <p>Successfully processed: {len([p for p in dataset['papers'].values() if any(m.get('extraction_count', 0) > 0 for m in p.get('modules', {}).values())])}/{dataset['metadata']['total_papers']}</p>
        </div>
    </div>

    <div class="extractions">
        <h2>Sample extraction results</h2>
"""

            # Add the first 20 extraction results as examples
            for i, ext in enumerate(all_extractions[:20]):
                html_content += f"""
        <div class="extraction">
            <span class="class-tag">{ext.get('extraction_class', 'unknown')}</span>
            <p><strong>Extracted text:</strong> "{ext.get('extraction_text', 'N/A')}"</p>
            <p><strong>Attributes:</strong> {ext.get('attributes', {})}</p>
            <p><strong>Confidence:</strong> {ext.get('confidence', 'N/A')}</p>
        </div>
"""

            html_content += """
    </div>
</body>
</html>
"""

            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(html_content)

            logger.info(f"Interactive report generated: {output_file}")

        except Exception as e:
            logger.error(f"HTML report generation failed: {e}")

    def _generate_individual_html_report(self, individual_dataset: Dict[str, Any], output_file: Path):
        """Generate a LangExtract-style interactive HTML report for a single paper."""
        try:
            # Collect all extraction results from the single paper
            paper_data = individual_dataset["paper"]
            all_extractions = []
            for module_name, module_data in paper_data.get("modules", {}).items():
                for ext in module_data.get("extractions", []):
                    ext["module"] = module_name  # tag the extraction with its module
                    all_extractions.append(ext)

            # Compute statistics
            successful_modules = len([
                module for module in paper_data.get("modules", {}).values()
                if module.get("extraction_count", 0) > 0
            ])
            total_modules = len(paper_data.get("modules", {}))

            # Generate the HTML content
            html_content = f"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>{individual_dataset['metadata']['paper_id']} - LangExtract extraction report</title>
    <style>
        body {{ font-family: Arial, sans-serif; margin: 20px; background-color: #f9f9f9; }}
        .header {{ background: #e3f2fd; padding: 20px; border-radius: 8px; margin-bottom: 20px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
        .stats {{ display: flex; gap: 20px; margin: 20px 0; }}
        .stat-card {{ background: #ffffff; padding: 15px; border-radius: 8px; flex: 1; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
        .extraction {{ border: 1px solid #e0e0e0; margin: 15px 0; padding: 15px; border-radius: 8px; background: white; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }}
        .class-tag {{ background: #1976d2; color: white; padding: 4px 10px; border-radius: 12px; font-size: 12px; margin-right: 10px; }}
        .module-tag {{ background: #388e3c; color: white; padding: 2px 8px; border-radius: 10px; font-size: 11px; margin-left: 10px; }}
        .attributes {{ background: #f5f5f5; padding: 10px; border-radius: 4px; margin-top: 10px; font-size: 13px; }}
        .no-extractions {{ text-align: center; color: #666; padding: 40px; background: #f0f0f0; border-radius: 8px; }}
        h1 {{ color: #1565c0; margin: 0; }}
        h2 {{ color: #424242; }}
        h3 {{ color: #1976d2; margin: 0; }}
        .meta-info {{ color: #666; font-size: 14px; }}
    </style>
</head>
<body>
    <div class="header">
        <h1>MIMIC paper information extraction report</h1>
        <h2>{individual_dataset['metadata']['paper_id']}</h2>
        <div class="meta-info">
            <p><strong>Generated:</strong> {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
            <p><strong>Extraction method:</strong> {individual_dataset['metadata']['extraction_method']}</p>
            <p><strong>Model:</strong> {individual_dataset['metadata']['model']}</p>
        </div>
    </div>

    <div class="stats">
        <div class="stat-card">
            <h3>Extraction statistics</h3>
            <p><strong>Total extractions:</strong> {len(all_extractions)}</p>
            <p><strong>Successful modules:</strong> {successful_modules}/{total_modules}</p>
        </div>
        <div class="stat-card">
            <h3>Module breakdown</h3>
"""

            # Add per-module statistics
            for module_name, module_data in paper_data.get("modules", {}).items():
                extraction_count = module_data.get("extraction_count", 0)
                html_content += f"            <p><strong>{module_name}:</strong> {extraction_count} item(s)</p>\n"

            html_content += """
        </div>
    </div>

    <div class="extractions">
        <h2>Detailed extraction results</h2>
"""

            if all_extractions:
                # Show the extraction results grouped by module
                for module_name in ["data", "model", "training", "evaluation", "environment"]:
                    module_extractions = [ext for ext in all_extractions if ext.get("module") == module_name]
                    if module_extractions:
                        html_content += f"""        <h3>{module_name.title()} module ({len(module_extractions)} item(s))</h3>\n"""

                        for ext in module_extractions:
                            confidence_text = f" (confidence: {ext.get('confidence', 'N/A')})" if ext.get('confidence') else ""
                            html_content += f"""
        <div class="extraction">
            <span class="class-tag">{ext.get('extraction_class', 'unknown')}</span>
            <span class="module-tag">{module_name}</span>
            <p><strong>Extracted text:</strong> "{ext.get('extraction_text', 'N/A')}"</p>
"""
                            # Add the attribute information
                            attributes = ext.get('attributes', {})
                            if attributes:
                                html_content += """            <div class="attributes">
                <strong>Attributes:</strong> """
                                for key, value in attributes.items():
                                    html_content += f"<span><strong>{key}:</strong> {value}</span> "
                                html_content += """
            </div>"""

                            # Add the source-position information
                            if ext.get('start_index') is not None and ext.get('end_index') is not None:
                                html_content += f"""            <p class="meta-info">Position: {ext.get('start_index')}-{ext.get('end_index')}{confidence_text}</p>"""

                            html_content += """        </div>
"""
            else:
                html_content += """
        <div class="no-extractions">
            <p>No extractions were found</p>
            <p>Possible reasons: the model could not identify relevant information, or the text does not contain the target information types</p>
        </div>
"""

            html_content += """
    </div>
</body>
</html>
"""

            # Write the HTML file
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(html_content)

            logger.info(f"Individual paper HTML report generated: {output_file}")

        except Exception as e:
            logger.error(f"Individual paper HTML report generation failed: {e}")
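As a usage note, a minimal sketch of how the per-paper JSON written by _save_individual_paper_result could be read back afterwards (the path follows the defaults above; "<paper_id>" is a placeholder for an actual paper subfolder, not a real directory):

import json
from pathlib import Path

# One subfolder per paper under dataset/markdowns, as written by _save_individual_paper_result
result_file = Path("dataset/markdowns") / "<paper_id>" / "mimic_langextract_dataset.json"
data = json.loads(result_file.read_text(encoding="utf-8"))

# "paper" holds the single-paper result produced by extract_paper_modules()
for module_name, module in data["paper"]["modules"].items():
    for ext in module["extractions"]:
        print(module_name, ext["extraction_class"], "->", ext["extraction_text"])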