feat: implement a LangExtract-based MIMIC paper information extraction system

- Add the info_extractor.py main file with command-line arguments and a test mode
- Implement the core MIMICLangExtractBuilder class in src/extractor.py
- Integrate the vLLM API service (OpenAI-compatible format) for structured information extraction
- Support extraction for 5 modules: dataset, model, training, evaluation, and environment configuration
- Implement source-text grounding and interactive HTML visualization
- Add the langextract and httpx[socks] dependencies
- Save results into a per-paper subdirectory
- Clean up the obsolete experiment_runner.py and number_extraction_models.py files
iomgaa 2025-08-25 20:51:30 +08:00
parent 1b652502d5
commit c4037325ed
4 changed files with 952 additions and 0 deletions
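For orientation, a minimal sketch of driving the new builder programmatically, mirroring the CLI defaults defined in info_extractor.py below; the paths are the repository defaults and max_papers=5 is only illustrative:

from src.extractor import MIMICLangExtractBuilder

builder = MIMICLangExtractBuilder()  # configures the vLLM (OpenAI-compatible) endpoint
dataset = builder.build_reproduction_dataset(
    papers_dir="dataset/markdowns",  # one subdirectory per paper, each containing a .md file
    output_file="dataset/reproduction_tasks/mimic_langextract_dataset.json",
    max_papers=5,  # same effect as --test_mode --max_papers 5
)
print(dataset["metadata"]["total_papers"])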


@@ -14,6 +14,7 @@ MedResearcher is an automatic experiment platform based on user input
 ## Main file of each module
 1. Paper crawling main file: papers_crawler.py
 2. PDF parsing main file: pdf_parser.py
+3. Information extraction main file: info_extractor.py
 3. Experiment running main file: experiment_runner.py
 ## File structure
@@ -23,6 +24,7 @@ MedResearcher is an automatic experiment platform based on user input
 │ └── mimic.csv # basic information for all MIMIC-related papers to be processed
 ├── papers_crawler.py # paper crawling main file
 ├── pdf_parser.py # PDF parsing main file
+├── info_extractor.py # information extraction main file
 ├── experiment_runner.py # experiment running main file
 ├── src/ # source code directory
 │ └── utils/ # utility function directory

info_extractor.py (new file, 139 lines)

@@ -0,0 +1,139 @@
#!/usr/bin/env python3
"""
LangExtract-based MIMIC paper information extractor
Extracts structured reproduction-task information from medical papers
Author: MedResearcher project
Created: 2025-01-25
"""
import argparse
import logging
from src.extractor import MIMICLangExtractBuilder
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
def setup_args():
"""设置命令行参数解析
Returns:
argparse.Namespace: 解析后的命令行参数
"""
parser = argparse.ArgumentParser(
description='MIMIC论文信息提取工具 - 基于LangExtract从医学论文中提取结构化复现信息',
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog='''
Usage examples:
%(prog)s                                      # use the default arguments
%(prog)s --papers_dir dataset/markdowns       # specify the papers directory
%(prog)s --output_file results/dataset.json   # specify the output file
%(prog)s --test_mode --max_papers 5           # test mode, process only 5 papers
'''
)
parser.add_argument(
'--papers_dir',
type=str,
default='dataset/markdowns',
help='directory containing the markdown paper files (default: dataset/markdowns)'
)
parser.add_argument(
'--output_file',
type=str,
default='dataset/reproduction_tasks/mimic_langextract_dataset.json',
help='output dataset file path (default: dataset/reproduction_tasks/mimic_langextract_dataset.json)'
)
parser.add_argument(
'--test_mode',
action='store_true',
help='test mode: process only a small number of papers for validation'
)
parser.add_argument(
'--max_papers',
type=int,
default=None,
help='maximum number of papers to process, for testing (default: process all papers)'
)
parser.add_argument(
'--log_level',
type=str,
default='INFO',
choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
help='log level (default: INFO)'
)
return parser.parse_args()
def main():
"""主函数 - 执行MIMIC论文信息提取任务"""
try:
# 解析命令行参数
args = setup_args()
# 设置日志级别
logging.getLogger().setLevel(getattr(logging, args.log_level))
# 初始化信息提取器
builder = MIMICLangExtractBuilder()
print(f"=== MIMIC论文信息提取工具启动 ===")
print(f"论文目录: {args.papers_dir}")
print(f"输出文件: {args.output_file}")
print(f"测试模式: {'' if args.test_mode else ''}")
if args.max_papers:
print(f"最大论文数: {args.max_papers}")
print(f"日志级别: {args.log_level}")
print(f"========================")
# 构建复现数据集
print("\n开始构建MIMIC复现数据集...")
dataset = builder.build_reproduction_dataset(
papers_dir=args.papers_dir,
output_file=args.output_file,
max_papers=args.max_papers if args.test_mode or args.max_papers else None
)
# Summarize the results
total_papers = dataset['metadata']['total_papers']
successful_extractions = sum(
1 for paper in dataset['papers'].values()
if any(module.get('extraction_count', 0) > 0
for module in paper.get('modules', {}).values())
)
print("\n=== Build complete ===")
print(f"Total papers: {total_papers}")
print(f"Successful extractions: {successful_extractions}/{total_papers}")
print(f"Success rate: {successful_extractions/total_papers*100:.1f}%")
print(f"Results saved to: {args.output_file}")
print(f"Interactive report: {args.output_file.replace('.json', '.html')}")
print("===============")
return 0
except FileNotFoundError as e:
print(f"Error: the specified file or directory was not found - {e}")
return 1
except ValueError as e:
print(f"Error: invalid argument value - {e}")
return 1
except Exception as e:
print(f"Error: unexpected exception during execution - {e}")
logging.exception("Detailed error information:")
return 1
if __name__ == "__main__":
exit_code = main()
exit(exit_code)


@@ -6,6 +6,8 @@ readme = "README.md"
 requires-python = ">=3.13"
 dependencies = [
     "agno>=1.7.12",
+    "httpx[socks]>=0.28.1",
+    "langextract>=1.0.8",
     "ollama>=0.5.3",
     "openai>=1.101.0",
     "pydantic",

src/extractor.py (new file, 809 lines)

@@ -0,0 +1,809 @@
#!/usr/bin/env python3
"""
LangExtract-based MIMIC paper information extractor - core implementation
Extracts structured reproduction-task information from medical papers
Author: MedResearcher project
Created: 2025-01-25
"""
import langextract as lx
import textwrap
from pathlib import Path
import json
from datetime import datetime
from typing import List, Dict, Any, Optional
import logging
# Configure logging
logger = logging.getLogger(__name__)
class MIMICLangExtractBuilder:
"""基于LangExtract的MIMIC论文信息提取器"""
def __init__(self):
"""初始化提取器配置vllm API服务"""
try:
# 配置LangExtract使用vllm API通过OpenAI兼容接口
import os
os.environ["LANGEXTRACT_API_KEY"] = "dummy"
# 创建ModelConfig强制使用OpenAI提供者访问vllm端点
self.model_config = lx.factory.ModelConfig(
model_id="gpt-oss-20b", # 使用vllm中实际部署的模型名称
provider="OpenAILanguageModel", # 强制指定OpenAI提供者
provider_kwargs={
"base_url": "http://100.82.33.121:11001/v1", # vllm API端点
"api_key": "dummy",
"model_id": "gpt-oss-20b" # 确保使用正确的模型ID
}
)
# LangExtract通用配置参数
self.extract_config = {
"config": self.model_config,
"max_workers": 3, # 降低并发避免过载vllm服务
"max_char_buffer": 6000, # 适合医学论文的上下文长度
"extraction_passes": 1, # 单次提取避免过多API调用
"temperature": 0.1, # 较低温度确保一致性
"fence_output": True, # 期望代码围栏格式输出
"use_schema_constraints": False # vllm可能不支持严格schema
}
# 加载所有模块的提取配置
self.module_configs = {
"data": self._load_data_config(),
"model": self._load_model_config(),
"training": self._load_training_config(),
"evaluation": self._load_evaluation_config(),
"environment": self._load_environment_config()
}
logger.info("MIMICLangExtractBuilder初始化成功")
except Exception as e:
logger.error(f"初始化失败: {e}")
raise
def _load_data_config(self) -> Dict[str, Any]:
"""加载数据模块的LangExtract配置"""
return {
"prompt": textwrap.dedent("""
从医学论文中提取数据处理相关的具体信息严格按照以下规则
1. dataset_source: 提取明确提到的数据集名称"MIMIC-IV", "Stanford EHR"
2. data_scale: 提取具体的数据规模数字"135,483 patients", "2015-2023"
3. preprocessing_step: 提取数据预处理的具体步骤描述
4. feature_type: 提取特征类型和编码方法的描述
5. inclusion_criteria: 提取患者纳入标准的确切文本
6. exclusion_criteria: 提取患者排除标准的确切文本
使用exact text进行提取不要释义为每个提取项提供有意义的属性
"""),
"examples": [
lx.data.ExampleData(
text="We analyzed 135,483 ED blood culture orders from Stanford Medicine EHR between 2015-2023. Adult patients (≥18 years) with blood culture collection in the ED were included. Patients with positive blood cultures within 14 days were excluded. Features were one-hot encoded for ML compatibility.",
extractions=[
lx.data.Extraction(
extraction_class="dataset_source",
extraction_text="Stanford Medicine EHR",
attributes={
"data_type": "electronic health records",
"institution": "Stanford Medicine"
}
),
lx.data.Extraction(
extraction_class="data_scale",
extraction_text="135,483 ED blood culture orders",
attributes={
"sample_size": "135,483",
"time_range": "2015-2023",
"data_unit": "blood culture orders"
}
),
lx.data.Extraction(
extraction_class="inclusion_criteria",
extraction_text="Adult patients (≥18 years) with blood culture collection in the ED",
attributes={
"age_limit": "≥18 years",
"setting": "Emergency Department",
"requirement": "blood culture collection"
}
),
lx.data.Extraction(
extraction_class="exclusion_criteria",
extraction_text="Patients with positive blood cultures within 14 days were excluded",
attributes={
"timeframe": "within 14 days",
"condition": "positive blood cultures"
}
),
lx.data.Extraction(
extraction_class="feature_type",
extraction_text="Features were one-hot encoded for ML compatibility",
attributes={
"encoding_method": "one-hot encoding",
"purpose": "ML compatibility"
}
)
]
),
lx.data.ExampleData(
text="This study utilized MIMIC-IV database, including CHARTEVENTS and LABEVENTS tables. We extracted hourly vital signs and laboratory values for ICU patients. Missing values were imputed using forward-fill method. Outliers beyond 3 standard deviations were removed.",
extractions=[
lx.data.Extraction(
extraction_class="dataset_source",
extraction_text="MIMIC-IV database",
attributes={
"data_type": "public clinical database",
"tables": "CHARTEVENTS, LABEVENTS"
}
),
lx.data.Extraction(
extraction_class="preprocessing_step",
extraction_text="Missing values were imputed using forward-fill method",
attributes={
"method": "forward-fill",
"target": "missing values"
}
),
lx.data.Extraction(
extraction_class="preprocessing_step",
extraction_text="Outliers beyond 3 standard deviations were removed",
attributes={
"method": "outlier removal",
"threshold": "3 standard deviations"
}
)
]
)
]
}
def _load_model_config(self) -> Dict[str, Any]:
"""加载模型模块的LangExtract配置"""
return {
"prompt": textwrap.dedent("""
从医学论文中提取机器学习模型的具体信息严格按照以下规则
1. model_name: 提取明确提到的模型名称"XGBoost", "LSTM", "GPT-4"
2. architecture_detail: 提取架构描述的具体文本
3. hyperparameter: 提取超参数设置的具体数值
4. feature_processing: 提取特征处理方法的描述
5. model_component: 提取模型组件或模块的描述
使用exact text进行提取不要释义为每个提取项提供有意义的属性
"""),
"examples": [
lx.data.ExampleData(
text="We employed XGBoost classifier with max depth of 4 and 30 boosting iterations. Class weights were used to handle imbalanced data. STELLA 1.5B model was used for text embeddings with attention-weighted average pooling.",
extractions=[
lx.data.Extraction(
extraction_class="model_name",
extraction_text="XGBoost classifier",
attributes={
"model_type": "gradient boosting",
"task": "classification"
}
),
lx.data.Extraction(
extraction_class="hyperparameter",
extraction_text="max depth of 4 and 30 boosting iterations",
attributes={
"max_depth": "4",
"n_estimators": "30",
"parameter_type": "tree_structure"
}
),
lx.data.Extraction(
extraction_class="model_name",
extraction_text="STELLA 1.5B model",
attributes={
"model_type": "pretrained language model",
"parameters": "1.5B",
"purpose": "text embeddings"
}
),
lx.data.Extraction(
extraction_class="feature_processing",
extraction_text="attention-weighted average pooling",
attributes={
"technique": "pooling",
"method": "attention-weighted"
}
)
]
)
]
}
def _load_training_config(self) -> Dict[str, Any]:
"""加载训练模块的LangExtract配置"""
return {
"prompt": textwrap.dedent("""
从医学论文中提取模型训练相关的具体信息严格按照以下规则
1. data_split_method: 提取数据分割方法的具体描述
2. validation_approach: 提取验证策略的具体描述
3. hyperparameter_tuning: 提取超参数调优方法
4. stopping_condition: 提取训练停止条件
5. optimizer_config: 提取优化器配置信息
使用exact text进行提取不要释义为每个提取项提供有意义的属性
"""),
"examples": [
lx.data.ExampleData(
text="Data was split temporally: training set (2015-2022), development set (2022-2023) for hyperparameter tuning, and evaluation set (2023+). Grid search was performed on the development set to optimize AUC performance. Early stopping was applied when validation loss did not improve for 10 epochs.",
extractions=[
lx.data.Extraction(
extraction_class="data_split_method",
extraction_text="Data was split temporally: training set (2015-2022), development set (2022-2023), and evaluation set (2023+)",
attributes={
"split_type": "temporal",
"train_period": "2015-2022",
"dev_period": "2022-2023",
"eval_period": "2023+"
}
),
lx.data.Extraction(
extraction_class="hyperparameter_tuning",
extraction_text="Grid search was performed on the development set to optimize AUC performance",
attributes={
"method": "grid search",
"metric": "AUC",
"dataset": "development set"
}
),
lx.data.Extraction(
extraction_class="stopping_condition",
extraction_text="Early stopping was applied when validation loss did not improve for 10 epochs",
attributes={
"method": "early stopping",
"patience": "10 epochs",
"monitor": "validation loss"
}
)
]
)
]
}
def _load_evaluation_config(self) -> Dict[str, Any]:
"""加载评估模块的LangExtract配置"""
return {
"prompt": textwrap.dedent("""
从医学论文中提取模型评估相关的具体信息严格按照以下规则
1. evaluation_metric: 提取具体的评估指标名称"AUC", "F1-score", "sensitivity"
2. baseline_comparison: 提取基线模型或方法的描述
3. performance_result: 提取具体的性能数值结果
4. statistical_test: 提取统计检验方法的描述
5. experimental_setting: 提取实验设置的具体信息
使用exact text进行提取不要释义为每个提取项提供有意义的属性
"""),
"examples": [
lx.data.ExampleData(
text="The model achieved ROC-AUC of 0.85 (95% CI: 0.82-0.88) on the test set. We compared against three baselines: expert framework (manual assessment), structured-only model, and LLM-automated framework. At 90% sensitivity, our model achieved 45% specificity versus 32% for the baseline.",
extractions=[
lx.data.Extraction(
extraction_class="evaluation_metric",
extraction_text="ROC-AUC",
attributes={
"metric_type": "discriminative performance",
"range": "0-1"
}
),
lx.data.Extraction(
extraction_class="performance_result",
extraction_text="ROC-AUC of 0.85 (95% CI: 0.82-0.88)",
attributes={
"metric": "ROC-AUC",
"value": "0.85",
"confidence_interval": "0.82-0.88",
"confidence_level": "95%"
}
),
lx.data.Extraction(
extraction_class="baseline_comparison",
extraction_text="expert framework (manual assessment), structured-only model, and LLM-automated framework",
attributes={
"baseline_count": "3",
"comparison_type": "multiple baselines"
}
)
]
)
]
}
def _load_environment_config(self) -> Dict[str, Any]:
"""加载环境模块的LangExtract配置"""
return {
"prompt": textwrap.dedent("""
从医学论文中提取实验环境相关的具体信息严格按照以下规则
1. software_library: 提取具体的软件工具和库名称
2. hardware_resource: 提取硬件资源需求的描述
3. data_repository: 提取数据存储和访问的具体信息
4. code_availability: 提取代码可用性的具体描述
5. compliance_requirement: 提取合规性和部署要求
使用exact text进行提取不要释义为每个提取项提供有意义的属性
"""),
"examples": [
lx.data.ExampleData(
text="We implemented the models using Python 3.8 with scikit-learn 1.0.2 and XGBoost 1.5.0. Training was performed on NVIDIA A100 GPU with 40GB memory. Code is available at GitHub: https://github.com/HealthRex/CDSS. The study was approved by Stanford IRB.",
extractions=[
lx.data.Extraction(
extraction_class="software_library",
extraction_text="Python 3.8 with scikit-learn 1.0.2 and XGBoost 1.5.0",
attributes={
"language": "Python",
"version": "3.8",
"libraries": "scikit-learn, XGBoost"
}
),
lx.data.Extraction(
extraction_class="hardware_resource",
extraction_text="NVIDIA A100 GPU with 40GB memory",
attributes={
"gpu_type": "NVIDIA A100",
"memory": "40GB",
"resource_type": "GPU"
}
),
lx.data.Extraction(
extraction_class="code_availability",
extraction_text="Code is available at GitHub: https://github.com/HealthRex/CDSS",
attributes={
"platform": "GitHub",
"url": "https://github.com/HealthRex/CDSS",
"access_type": "public"
}
),
lx.data.Extraction(
extraction_class="compliance_requirement",
extraction_text="The study was approved by Stanford IRB",
attributes={
"approval_type": "IRB",
"institution": "Stanford"
}
)
]
)
]
}
def extract_paper_modules(self, paper_content: str, paper_id: str) -> Dict[str, Any]:
"""使用LangExtract提取论文的所有模块信息"""
results = {
"paper_id": paper_id,
"extraction_metadata": {
"timestamp": datetime.now().isoformat(),
"method": "langextract_with_source_grounding",
"model": "gpt-oss-20b"
},
"modules": {}
}
# Extract each module in turn
for module_name, config in self.module_configs.items():
try:
logger.info(f" 提取{module_name}模块...")
# 使用LangExtract进行结构化提取
extraction_result = lx.extract(
text_or_documents=paper_content,
prompt_description=config["prompt"],
examples=config["examples"],
**self.extract_config
)
# Process the extraction result - extraction_result is an AnnotatedDocument object
if extraction_result and hasattr(extraction_result, 'extractions') and extraction_result.extractions:
results["modules"][module_name] = {
"extractions": [
{
"extraction_class": ext.extraction_class,
"extraction_text": ext.extraction_text,
"start_index": getattr(ext, 'start_index', None),
"end_index": getattr(ext, 'end_index', None),
"attributes": getattr(ext, 'attributes', {}),
"confidence": getattr(ext, 'confidence', None)
}
for ext in extraction_result.extractions
],
"extraction_count": len(extraction_result.extractions),
"quality_score": self._calculate_quality_score(extraction_result)
}
else:
results["modules"][module_name] = {
"extractions": [],
"extraction_count": 0,
"quality_score": 0.0,
"error": "No valid extractions found"
}
except Exception as e:
logger.error(f" {module_name}模块提取失败: {e}")
results["modules"][module_name] = {
"extractions": [],
"extraction_count": 0,
"quality_score": 0.0,
"error": str(e)
}
return results
def build_reproduction_dataset(self, papers_dir: str, output_file: str, max_papers: Optional[int] = None) -> Dict[str, Any]:
"""构建完整的复现数据集"""
papers = self._load_markdown_papers(papers_dir)
dataset = {
"metadata": {
"creation_date": datetime.now().isoformat(),
"total_papers": len(papers),
"extraction_method": "langextract_source_grounded",
"api_endpoint": "http://100.82.33.121:11001/v1",
"model": "gpt-oss-20b",
"langextract_version": getattr(lx, '__version__', 'unknown')
},
"papers": {}
}
# If a maximum count was given, limit the number of papers
if max_papers and max_papers < len(papers):
papers_items = list(papers.items())[:max_papers]
papers = dict(papers_items)
dataset["metadata"]["total_papers"] = len(papers)
dataset["metadata"]["note"] = f"Test mode: only the first {max_papers} papers are processed"
logger.info(f"Test mode: processing only the first {max_papers} papers")
logger.info(f"Starting to process {len(papers)} papers...")
for i, (paper_id, content) in enumerate(papers.items()):
logger.info(f"[{i+1}/{len(papers)}] 处理论文: {paper_id}")
paper_result = self.extract_paper_modules(content, paper_id)
dataset["papers"][paper_id] = paper_result
# 为每个论文单独保存结果到其子文件夹
self._save_individual_paper_result(papers_dir, paper_id, paper_result)
# 定期保存全局进度
if (i + 1) % 10 == 0:
self._save_progress(dataset, output_file)
# 保存最终结果
self._save_dataset(dataset, output_file)
# 生成交互式HTML报告
self._generate_html_report(dataset, output_file.replace('.json', '.html'))
return dataset
def _load_markdown_papers(self, papers_dir: str) -> Dict[str, str]:
"""加载markdown论文文件"""
papers = {}
papers_path = Path(papers_dir)
if not papers_path.exists():
raise FileNotFoundError(f"论文目录不存在: {papers_dir}")
# 修改加载逻辑:从子目录中读取.md文件
markdown_files = []
for subdir in papers_path.iterdir():
if subdir.is_dir():
md_files = list(subdir.glob("*.md"))
markdown_files.extend(md_files)
if not markdown_files:
raise ValueError(f"No markdown files found in {papers_dir}")
logger.info(f"Found {len(markdown_files)} markdown files")
for file_path in markdown_files:
try:
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read()
papers[file_path.stem] = content
except Exception as e:
logger.warning(f"读取文件 {file_path} 失败: {e}")
return papers
def _calculate_quality_score(self, extraction_result) -> float:
"""计算提取质量分数"""
if not extraction_result or not hasattr(extraction_result, 'extractions'):
return 0.0
if not extraction_result.extractions:
return 0.0
# Score based on the number of extractions and the richness of their attributes
total_score = 0.0
for ext in extraction_result.extractions:
score = 0.3 # base score
# Bonus for source-text grounding
if hasattr(ext, 'start_index') and ext.start_index is not None:
score += 0.2
# Bonus for attribute richness
if ext.attributes and len(ext.attributes) > 0:
score += min(0.3, len(ext.attributes) * 0.1)
# Bonus for confidence
if hasattr(ext, 'confidence') and ext.confidence:
score += 0.2 * ext.confidence
total_score += score
return min(1.0, total_score / len(extraction_result.extractions))
def _save_progress(self, dataset: Dict[str, Any], output_file: str):
"""保存处理进度"""
try:
progress_file = output_file.replace('.json', '_progress.json')
with open(progress_file, 'w', encoding='utf-8') as f:
json.dump(dataset, f, ensure_ascii=False, indent=2)
logger.info(f"进度已保存至: {progress_file}")
except Exception as e:
logger.error(f"保存进度失败: {e}")
def _save_individual_paper_result(self, papers_dir: str, paper_id: str, paper_result: Dict[str, Any]):
"""为单个论文保存提取结果到其对应的子文件夹"""
try:
# 构建论文子文件夹路径
paper_subdir = Path(papers_dir) / paper_id
if not paper_subdir.exists():
logger.warning(f"论文子文件夹不存在: {paper_subdir}")
return
# 准备单个论文的数据集格式
individual_dataset = {
"metadata": {
"creation_date": datetime.now().isoformat(),
"total_papers": 1,
"extraction_method": "langextract_source_grounded",
"api_endpoint": "http://100.82.33.121:11001/v1",
"model": "gpt-oss-20b",
"langextract_version": getattr(lx, '__version__', 'unknown'),
"paper_id": paper_id
},
"paper": paper_result # 注意:这里是单个论文,所以用"paper"而不是"papers"
}
# 保存JSON文件
json_file = paper_subdir / "mimic_langextract_dataset.json"
with open(json_file, 'w', encoding='utf-8') as f:
json.dump(individual_dataset, f, ensure_ascii=False, indent=2)
# 生成HTML报告
html_file = paper_subdir / "mimic_langextract_dataset.html"
self._generate_individual_html_report(individual_dataset, html_file)
logger.info(f"已保存论文 {paper_id} 的结果到: {paper_subdir}")
except Exception as e:
logger.error(f"保存单个论文结果失败 ({paper_id}): {e}")
def _save_dataset(self, dataset: Dict[str, Any], output_file: str):
"""保存最终数据集"""
try:
# 确保输出目录存在
output_path = Path(output_file)
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(dataset, f, ensure_ascii=False, indent=2)
logger.info(f"数据集已保存至: {output_file}")
except Exception as e:
logger.error(f"保存数据集失败: {e}")
raise
def _generate_html_report(self, dataset: Dict[str, Any], output_file: str):
"""生成LangExtract风格的交互式HTML报告"""
try:
# 合并所有提取结果用于可视化
all_extractions = []
for paper_id, paper_data in dataset["papers"].items():
for module_name, module_data in paper_data.get("modules", {}).items():
all_extractions.extend(module_data.get("extractions", []))
# 基础HTML模板简化版可视化
html_content = f"""
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<title>MIMIC复现数据集 - LangExtract报告</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 20px; }}
.header {{ background: #f0f8ff; padding: 20px; border-radius: 5px; }}
.stats {{ display: flex; gap: 20px; margin: 20px 0; }}
.stat-card {{ background: #e6f3ff; padding: 15px; border-radius: 5px; }}
.extraction {{ border: 1px solid #ddd; margin: 10px 0; padding: 15px; border-radius: 5px; }}
.class-tag {{ background: #007acc; color: white; padding: 3px 8px; border-radius: 3px; font-size: 12px; }}
</style>
</head>
<body>
<div class="header">
<h1>MIMIC Reproduction Dataset - LangExtract Extraction Report</h1>
<p>Generated at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
<p>Total papers: {dataset['metadata']['total_papers']}</p>
<p>Extraction method: {dataset['metadata']['extraction_method']}</p>
</div>
<div class="stats">
<div class="stat-card">
<h3>Extraction statistics</h3>
<p>Total extractions: {len(all_extractions)}</p>
<p>Average per paper: {len(all_extractions)/dataset['metadata']['total_papers']:.1f}</p>
</div>
<div class="stat-card">
<h3>Success rate</h3>
<p>Successfully processed: {len([p for p in dataset['papers'].values() if any(m.get('extraction_count', 0) > 0 for m in p.get('modules', {}).values())])}/{dataset['metadata']['total_papers']}</p>
</div>
</div>
<div class="extractions">
<h2>Sample extraction results</h2>
"""
# Show the first 20 extractions as examples
for i, ext in enumerate(all_extractions[:20]):
html_content += f"""
<div class="extraction">
<span class="class-tag">{ext.get('extraction_class', 'unknown')}</span>
<p><strong>Extracted text:</strong> "{ext.get('extraction_text', 'N/A')}"</p>
<p><strong>Attributes:</strong> {ext.get('attributes', {})}</p>
<p><strong>Confidence:</strong> {ext.get('confidence', 'N/A')}</p>
</div>
"""
html_content += """
</div>
</body>
</html>
"""
with open(output_file, 'w', encoding='utf-8') as f:
f.write(html_content)
logger.info(f"交互式报告已生成: {output_file}")
except Exception as e:
logger.error(f"HTML报告生成失败: {e}")
def _generate_individual_html_report(self, individual_dataset: Dict[str, Any], output_file: Path):
"""生成单个论文的LangExtract风格交互式HTML报告"""
try:
# 从单个论文数据中提取所有提取结果
paper_data = individual_dataset["paper"]
all_extractions = []
for module_name, module_data in paper_data.get("modules", {}).items():
for ext in module_data.get("extractions", []):
ext["module"] = module_name # 添加模块标识
all_extractions.append(ext)
# 计算统计信息
successful_modules = len([
module for module in paper_data.get("modules", {}).values()
if module.get("extraction_count", 0) > 0
])
total_modules = len(paper_data.get("modules", {}))
# 生成HTML内容
html_content = f"""
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<title>{individual_dataset['metadata']['paper_id']} - LangExtract提取报告</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 20px; background-color: #f9f9f9; }}
.header {{ background: #e3f2fd; padding: 20px; border-radius: 8px; margin-bottom: 20px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
.stats {{ display: flex; gap: 20px; margin: 20px 0; }}
.stat-card {{ background: #ffffff; padding: 15px; border-radius: 8px; flex: 1; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
.extraction {{ border: 1px solid #e0e0e0; margin: 15px 0; padding: 15px; border-radius: 8px; background: white; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }}
.class-tag {{ background: #1976d2; color: white; padding: 4px 10px; border-radius: 12px; font-size: 12px; margin-right: 10px; }}
.module-tag {{ background: #388e3c; color: white; padding: 2px 8px; border-radius: 10px; font-size: 11px; margin-left: 10px; }}
.attributes {{ background: #f5f5f5; padding: 10px; border-radius: 4px; margin-top: 10px; font-size: 13px; }}
.no-extractions {{ text-align: center; color: #666; padding: 40px; background: #f0f0f0; border-radius: 8px; }}
h1 {{ color: #1565c0; margin: 0; }}
h2 {{ color: #424242; }}
h3 {{ color: #1976d2; margin: 0; }}
.meta-info {{ color: #666; font-size: 14px; }}
</style>
</head>
<body>
<div class="header">
<h1>MIMIC Paper Information Extraction Report</h1>
<h2>{individual_dataset['metadata']['paper_id']}</h2>
<div class="meta-info">
<p><strong>Generated at:</strong> {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
<p><strong>Extraction method:</strong> {individual_dataset['metadata']['extraction_method']}</p>
<p><strong>Model:</strong> {individual_dataset['metadata']['model']}</p>
</div>
</div>
<div class="stats">
<div class="stat-card">
<h3>Extraction statistics</h3>
<p><strong>Total extractions:</strong> {len(all_extractions)}</p>
<p><strong>Successful modules:</strong> {successful_modules}/{total_modules}</p>
</div>
<div class="stat-card">
<h3>Module distribution</h3>
"""
# Add per-module statistics
for module_name, module_data in paper_data.get("modules", {}).items():
extraction_count = module_data.get("extraction_count", 0)
html_content += f" <p><strong>{module_name}:</strong> {extraction_count} items</p>\n"
html_content += """
</div>
</div>
<div class="extractions">
<h2>Detailed extraction results</h2>
"""
if all_extractions:
# Group the extraction results by module
for module_name in ["data", "model", "training", "evaluation", "environment"]:
module_extractions = [ext for ext in all_extractions if ext.get("module") == module_name]
if module_extractions:
html_content += f""" <h3>{module_name.title()} module ({len(module_extractions)} items)</h3>\n"""
for ext in module_extractions:
confidence_text = f" (confidence: {ext.get('confidence', 'N/A')})" if ext.get('confidence') else ""
html_content += f"""
<div class="extraction">
<span class="class-tag">{ext.get('extraction_class', 'unknown')}</span>
<span class="module-tag">{module_name}</span>
<p><strong>Extracted text:</strong> "{ext.get('extraction_text', 'N/A')}"</p>
"""
# Add attribute information
attributes = ext.get('attributes', {})
if attributes:
html_content += f""" <div class="attributes">
<strong>Attributes:</strong> """
for key, value in attributes.items():
html_content += f"<span><strong>{key}:</strong> {value}</span> &nbsp;&nbsp; "
html_content += """
</div>"""
# Add position information
if ext.get('start_index') is not None and ext.get('end_index') is not None:
html_content += f""" <p class="meta-info">Position: {ext.get('start_index')}-{ext.get('end_index')}{confidence_text}</p>"""
html_content += """ </div>
"""
else:
html_content += """
<div class="no-extractions">
<p>No extraction results were found</p>
<p>Possible reasons: the model could not identify the relevant information, or the text does not contain the target information types</p>
</div>
"""
html_content += """
</div>
</body>
</html>
"""
# Write the HTML file
with open(output_file, 'w', encoding='utf-8') as f:
f.write(html_content)
logger.info(f"Per-paper HTML report generated: {output_file}")
except Exception as e:
logger.error(f"Failed to generate the per-paper HTML report: {e}")