2025-08-25 20:51:30 +08:00
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
"""
|
|
|
|
|
|
基于LangExtract的MIMIC论文信息提取器
|
|
|
|
|
|
从医学论文中提取结构化的复现任务信息
|
|
|
|
|
|
|
|
|
|
|
|
作者:MedResearcher项目
|
|
|
|
|
|
创建时间:2025-01-25
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
|
|
import logging
|
|
|
|
|
|
|
|
|
|
|
|
from src.extractor import MIMICLangExtractBuilder
|
|
|
|
|
|
|
|
|
|
|
|
# Configure the root logger: INFO threshold, "timestamp - level - message" records.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def setup_args(argv=None):
    """Build the command-line parser and parse the arguments.

    Args:
        argv: Optional sequence of argument strings to parse. When ``None``
            (the default) ``argparse`` falls back to ``sys.argv[1:]``, so
            existing zero-argument callers behave exactly as before.

    Returns:
        argparse.Namespace: Parsed arguments with the attributes
        ``papers_dir``, ``output_file``, ``test_mode``, ``max_papers``,
        ``log_level`` and ``doc_workers``.
    """
    parser = argparse.ArgumentParser(
        description='MIMIC论文信息提取工具 - 基于LangExtract从医学论文中提取结构化复现信息',
        # Raw formatter keeps the hand-aligned usage examples in the epilog.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
使用示例:
  %(prog)s                                     # 使用默认参数
  %(prog)s --papers_dir dataset/markdowns      # 指定论文目录
  %(prog)s --output_file results/dataset.json  # 指定输出文件
  %(prog)s --test_mode --max_papers 5          # 测试模式,只处理5篇论文
        '''
    )

    parser.add_argument(
        '--papers_dir',
        type=str,
        default='dataset/markdowns',
        help='markdown论文文件目录 (默认: dataset/markdowns)'
    )

    parser.add_argument(
        '--output_file',
        type=str,
        default='dataset/reproduction_tasks/mimic_langextract_dataset.json',
        help='输出数据集文件路径 (默认: dataset/reproduction_tasks/mimic_langextract_dataset.json)'
    )

    parser.add_argument(
        '--test_mode',
        action='store_true',
        help='测试模式,只处理少量论文进行验证'
    )

    parser.add_argument(
        '--max_papers',
        type=int,
        default=None,
        help='最大处理论文数量,用于测试 (默认: 处理所有论文)'
    )

    parser.add_argument(
        '--log_level',
        type=str,
        default='INFO',
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
        help='日志级别 (默认: INFO)'
    )

    parser.add_argument(
        '--doc_workers',
        type=int,
        default=50,
        # The help used to hard-code "默认: 4" while the real default is 50;
        # %(default)s keeps the advertised value in sync with ``default``.
        help='文档并行处理工作线程数 (默认: %(default)s)'
    )

    return parser.parse_args(argv)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Run the MIMIC paper information extraction job from the command line.

    Parses the CLI arguments, applies the requested log level to the root
    logger, builds the reproduction dataset through
    ``MIMICLangExtractBuilder`` and prints a summary of the run.

    Returns:
        int: ``0`` when the dataset was built and summarised, ``1`` when any
        exception was raised along the way.
    """
    try:
        # Parse command-line arguments.
        args = setup_args()

        # ``--log_level`` is restricted by argparse ``choices`` to names that
        # exist as attributes on the ``logging`` module.
        logging.getLogger().setLevel(getattr(logging, args.log_level))

        # Initialise the information extractor.
        builder = MIMICLangExtractBuilder(doc_workers=args.doc_workers)

        print("=== MIMIC论文信息提取工具启动 ===")
        print(f"论文目录: {args.papers_dir}")
        print(f"输出文件: {args.output_file}")
        print(f"测试模式: {'是' if args.test_mode else '否'}")
        if args.max_papers:
            print(f"最大论文数: {args.max_papers}")
        print(f"文档并行度: {args.doc_workers} 线程")
        print(f"日志级别: {args.log_level}")
        print("========================")

        # Build the reproduction dataset.
        print("\n开始构建MIMIC复现数据集...")
        dataset = builder.build_reproduction_dataset(
            papers_dir=args.papers_dir,
            output_file=args.output_file,
            # NOTE(review): ``--test_mode`` without ``--max_papers`` still
            # passes ``None`` here, i.e. no cap is applied — confirm whether
            # test mode is meant to imply a default limit.
            max_papers=args.max_papers if args.test_mode or args.max_papers else None
        )

        # Summarise the results: a paper counts as successful when at least
        # one of its modules reports a positive ``extraction_count``.
        total_papers = dataset['metadata']['total_papers']
        successful_extractions = sum(
            1 for paper in dataset['papers'].values()
            if any(module.get('extraction_count', 0) > 0
                   for module in paper.get('modules', {}).values())
        )
        # An empty papers directory yields ``total_papers == 0``; without this
        # guard the division raised ZeroDivisionError and the run was reported
        # as a failure even though the dataset had been built.
        success_rate = successful_extractions / total_papers * 100 if total_papers else 0.0

        print("\n=== 构建完成 ===")
        print(f"总论文数: {total_papers}")
        print(f"成功提取: {successful_extractions}/{total_papers}")
        print(f"成功率: {success_rate:.1f}%")
        print(f"结果保存至: {args.output_file}")
        print(f"交互式报告: {args.output_file.replace('.json', '.html')}")
        print("===============")

        return 0

    except FileNotFoundError as e:
        print(f"错误: 找不到指定的文件或目录 - {e}")
        return 1
    except ValueError as e:
        print(f"错误: 参数值无效 - {e}")
        return 1
    except Exception as e:
        # Top-level boundary: report briefly, then log the full traceback.
        print(f"错误: 程序执行异常 - {e}")
        logging.exception("详细错误信息:")
        return 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status. Raising
    # SystemExit directly avoids the ``exit`` helper, which is injected by
    # the ``site`` module for interactive use and is not available when the
    # interpreter runs without it (e.g. ``python -S``).
    raise SystemExit(main())
|