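"""Paper crawler entry point.

Command-line tool for batch-crawling medical research papers (from arXiv and
medRxiv) and downloading their PDFs from a CSV list; see setup_args() for the
available options.
"""
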
import argparse
import sys

from src.crawler import PaperCrawler


def setup_args():
    """Set up command-line argument parsing.

    Returns:
        argparse.Namespace: the parsed command-line arguments
    """
    parser = argparse.ArgumentParser(
        description='Paper crawling tool - batch-crawls and processes medical research papers',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Examples:
  %(prog)s                                 # use the default settings
  %(prog)s --paper_website arxiv medrxiv   # select the paper data sources
  %(prog)s --parallel 10                   # set the parallelism to 10
'''
    )

    parser.add_argument(
        '--paper_website',
        nargs='+',
        choices=["arxiv", "medrxiv"],
        default=["medrxiv"],
        help='paper websites to crawl (default: medrxiv)'
    )

    parser.add_argument(
        '--parallel',
        type=int,
        default=20,
        help='number of parallel worker threads (default: 20)'
    )

    # This option acts as a yes/no switch: main() only checks it for truthiness.
    parser.add_argument(
        '--csv-download',
        type=str,
        default="yes",
        help='whether to crawl papers and save the results to a CSV file (default: yes)'
    )

    parser.add_argument(
        '--pdf_download_list',
        type=str,
        default='dataset/mimic_papers_20250825.csv',
        help='path to a CSV file listing the papers whose PDFs should be downloaded'
    )

    return parser.parse_args()


def main():
    """Main entry point - run the paper crawling task."""
    try:
        # Parse the command-line arguments
        args = setup_args()

        # Initialize the paper crawler
        crawler = PaperCrawler(
            websites=args.paper_website,
            parallel=args.parallel
        )

        print("=== Paper crawling tool started ===")
        print(f"Paper data sources: {args.paper_website}")
        print(f"Parallel workers: {args.parallel}")
        print("========================")

        # Crawl the papers and save them to a CSV file
        if args.csv_download:
            print("Crawling MIMIC-4 related papers...")
            papers = crawler.crawl_papers()

            if papers:
                # Save the results to a CSV file
                csv_file_path = crawler.save_to_csv(papers)
                print("\n=== Crawling finished ===")
                print(f"Successfully crawled: {len(papers)} papers")
                print(f"Saved to: {csv_file_path}")
                print("================")
            else:
                print("No matching papers found; check the network connection or the keyword settings")

        # If a PDF download list was specified, run the PDF download test
        if args.pdf_download_list:
            print("=== PDF download test ===")
            print(f"CSV file: {args.pdf_download_list}")
            print(f"Concurrency: {args.parallel}")
            print("========================")

            # Run the PDF download
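            # Assumption: download_pdfs_from_csv returns a dict with 'total',
            # 'success', and 'failed' counts, which the summary below reports.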
            stats = crawler.download_pdfs_from_csv(args.pdf_download_list)

            print("\n=== PDF download test finished ===")
            print(f"Total: {stats['total']} papers")
            if stats['total']:
                # Guard against division by zero when the CSV contains no papers.
                print(f"Succeeded: {stats['success']} papers ({stats['success'] / stats['total'] * 100:.1f}%)")
                print(f"Failed: {stats['failed']} papers ({stats['failed'] / stats['total'] * 100:.1f}%)")
            print("========================")
            return 0
    except FileNotFoundError as e:
        print(f"Error: the specified file could not be found - {e}")
        return 1
    except ValueError as e:
        print(f"Error: invalid argument value - {e}")
        return 1
    except Exception as e:
        print(f"Error: unexpected exception during execution - {e}")
        return 1

    return 0


if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)