MedResearcher/src/parse.py
iomgaa 22c90728e5 feat: 优化PDF解析筛选逻辑,移除MIMIC关键词依赖并增加AI医学相关性判断
- 移除第一层MIMIC-IV关键词筛选,简化筛选流程
- 增强AI分析功能,同时判断医学相关性和任务类型
- 修改_analyze_research_task函数返回包含医学相关性和任务类型的完整结果
- 更新筛选条件:只有同时满足"医学相关"和"指定任务类型"的论文才通过筛选
- 优化相关注释和日志输出,提高代码可维护性
2025-08-26 23:06:48 +08:00

735 lines
33 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""PDF解析模块
该模块提供PDFParser类用于将PDF文件通过OCR API转换为Markdown格式。
支持并发处理、进度显示、错误处理等功能。
"""
import json
import logging
import os
import re
import tempfile
import time
import zipfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import requests
class PDFParser:
    """PDF parsing class: converts PDF files to Markdown and screens medical papers.

    Screening pipeline:
      1. Medical relevance: an AI model judges whether the paper belongs to a
         medical / clinical / biomedical field.
      2. Task type: medically relevant papers are further filtered down to the
         supported research task types.

    Supported task types (and the folder-name prefix each one receives):
      - prediction:     PRED_
      - classification: CLAS_
      - time_series:    TIME_
      - correlation:    CORR_
    """

    def __init__(self, pdf_dir: str = "dataset/pdfs", parallel: int = 3,
                 markdown_dir: str = "dataset/markdowns"):
        """Initialize parser configuration.

        Args:
            pdf_dir (str): PDF input directory, default "dataset/pdfs".
            parallel (int): Number of concurrent workers, default 3 (kept low
                to avoid overloading the OCR server).
            markdown_dir (str): Markdown output directory, default
                "dataset/markdowns".
        """
        self.pdf_dir = Path(pdf_dir)
        self.parallel = parallel
        self.markdown_dir = Path(markdown_dir)
        # OCR API endpoint
        self.ocr_api_url = "http://100.106.4.14:7861/parse"
        # AI model API used for medical-relevance and task-type analysis
        self.ai_api_url = "http://100.82.33.121:11001/v1/chat/completions"
        self.ai_model = "gpt-oss-20b"
        # NOTE: the former MIMIC keyword filter was removed; relevance is now
        # decided by the AI model instead of keyword matching.
        # Mapping from task type to folder-name prefix
        self.task_type_prefixes = {
            "prediction": "PRED_",
            "classification": "CLAS_",
            "time_series": "TIME_",
            "correlation": "CORR_",
            "none": None  # matches no type: folder is left unmarked
        }
        # HTTP session with an enlarged connection pool and custom retries
        from requests.adapters import HTTPAdapter
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'MedResearcher-PDFParser/1.0'
        })
        adapter = HTTPAdapter(
            pool_connections=10,  # number of connection pools
            pool_maxsize=20,      # max connections per pool
            max_retries=0         # disable built-in retries; see _make_request_with_retry
        )
        self.session.mount('http://', adapter)
        self.session.mount('https://', adapter)
        # Logging configuration
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s - %(levelname)s - %(message)s')

    def _scan_pdf_files(self) -> List[Path]:
        """Scan the PDF directory and collect all PDF files.

        Returns:
            List[Path]: list of PDF file paths.

        Raises:
            FileNotFoundError: if the PDF directory does not exist.
        """
        if not self.pdf_dir.exists():
            raise FileNotFoundError(f"PDF目录不存在: {self.pdf_dir}")
        pdf_files = [p for p in self.pdf_dir.glob("*.pdf") if p.is_file()]
        logging.info(f"发现 {len(pdf_files)} 个PDF文件待处理")
        return pdf_files

    # NOTE: _check_mimic_keywords was removed.
    # It used to check Markdown files for MIMIC-IV keywords; relevance is now
    # judged by the AI model, so keyword screening is no longer needed.

    def _extract_introduction(self, output_subdir: Path) -> Optional[str]:
        """Extract the Introduction section from the converted Markdown files.

        Args:
            output_subdir (Path): output sub-directory containing Markdown files.

        Returns:
            Optional[str]: the extracted Introduction text, or None on failure.
        """
        try:
            md_files = list(output_subdir.glob("*.md"))
            if not md_files:
                logging.warning(f"未找到Markdown文件进行Introduction提取: {output_subdir}")
                return None
            # Use the first Markdown file (there is normally only one)
            md_file = md_files[0]
            try:
                with open(md_file, 'r', encoding='utf-8') as f:
                    content = f.read()
                # Try a series of plausible Introduction heading formats
                patterns = [
                    r'(?i)#\s*Introduction\s*\n(.*?)(?=\n#|\n\n#|$)',
                    r'(?i)##\s*Introduction\s*\n(.*?)(?=\n##|\n\n##|$)',
                    r'(?i)###\s*Introduction\s*\n(.*?)(?=\n###|\n\n###|$)',
                    r'(?i)\*\*Introduction\*\*\s*\n(.*?)(?=\n\*\*|\n\n\*\*|$)',
                    r'(?i)Introduction\s*\n(.*?)(?=\n[A-Z][a-z]+\s*\n|$)'
                ]
                for pattern in patterns:
                    match = re.search(pattern, content, re.DOTALL)
                    if match:
                        introduction = match.group(1).strip()
                        if len(introduction) > 100:  # require enough text for analysis
                            logging.info(f"成功提取Introduction部分 ({len(introduction)} 字符): {md_file.name}")
                            return introduction
                # No explicit Introduction heading found: approximate it with
                # the first few body paragraphs.
                paragraphs = content.split('\n\n')
                introduction_candidates = []
                for para in paragraphs[:5]:  # consider the first 5 paragraphs
                    para = para.strip()
                    if len(para) > 50 and not para.startswith('#'):  # skip headings / short lines
                        introduction_candidates.append(para)
                if introduction_candidates:
                    introduction = '\n\n'.join(introduction_candidates[:3])  # at most 3 paragraphs
                    if len(introduction) > 200:
                        logging.info(f"提取近似Introduction部分 ({len(introduction)} 字符): {md_file.name}")
                        return introduction
                logging.warning(f"未能提取到有效的Introduction内容: {md_file.name}")
                return None
            except Exception as e:
                logging.error(f"读取Markdown文件时发生错误: {md_file.name} - {e}")
                return None
        except Exception as e:
            logging.error(f"提取Introduction时发生错误: {output_subdir} - {e}")
            return None

    def _analyze_research_task(self, introduction: str) -> Dict[str, Any]:
        """Use the AI model to judge medical relevance and research task type.

        Args:
            introduction (str): Introduction text of the paper.

        Returns:
            Dict[str, Any]: analysis result with keys:
                - is_medical (bool): whether the paper is medical
                - task_type (str): 'prediction' | 'classification' |
                  'time_series' | 'correlation' | 'none'
                - medical_confidence (float): confidence of the relevance call
                - task_confidence (float): confidence of the task-type call
        """
        try:
            # Prompt asking the model for a strict-JSON two-dimension verdict
            system_prompt = """你是一个医学研究专家。请分析给定的论文Introduction部分判断两个维度
1. 医学相关性:判断该论文是否属于医学、临床医学、生物医学、公共卫生、护理学等医学相关领域
- 医学相关:涉及疾病、患者、临床数据、医疗干预、生物医学指标等
- 非医学相关:纯计算机科学、工程学、物理学、经济学等非医学领域
2. 任务类型:如果是医学相关论文,进一步判断属于以下哪种任务类型:
- prediction: 预测任务(预测未来事件、结局或数值,如死亡率预测、住院时长预测、疾病进展预测)
- classification: 分类任务(将患者或病例分类到不同类别,如疾病诊断分类、风险等级分类、药物反应分类)
- time_series: 时间序列分析(分析随时间变化的医疗数据,如生命体征趋势分析、病情演进分析、纵向队列研究)
- correlation: 关联性分析(研究变量间的关系或关联,如疾病与人口特征关系、药物与副作用关联、风险因素识别)
- none: 不属于以上任何类型
请以JSON格式回答包含所有字段
{\"is_medical\": true, \"task_type\": \"prediction\", \"medical_confidence\": 0.90, \"task_confidence\": 0.85}
字段说明:
- is_medical: 布尔值,是否为医学相关论文
- task_type: 任务类型prediction/classification/time_series/correlation/none
- medical_confidence: 医学相关性置信度0-1之间
- task_confidence: 任务类型置信度0-1之间
只返回JSON不要添加其他文字。"""
            user_prompt = f"请分析以下论文Introduction判断医学相关性和任务类型\n\n{introduction[:2000]}"  # cap length to limit tokens
            # Chat-completions request payload
            api_data = {
                "model": self.ai_model,
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                "max_tokens": 100,  # the structured JSON answer is short
                "temperature": 0.1  # keep output nearly deterministic
            }
            response = self.session.post(
                self.ai_api_url,
                json=api_data,
                headers={"Content-Type": "application/json"},
                timeout=30
            )
            if response.status_code == 200:
                payload = response.json()
                ai_response = payload['choices'][0]['message']['content'].strip()
                # Some models wrap JSON in Markdown code fences (``` / ```json);
                # strip them so json.loads does not fail spuriously.
                if ai_response.startswith('```'):
                    ai_response = re.sub(r'^```[A-Za-z]*\s*|\s*```$', '', ai_response).strip()
                try:
                    parsed_response = json.loads(ai_response)
                    is_medical = parsed_response.get('is_medical', False)
                    task_type = parsed_response.get('task_type', 'none').lower()
                    medical_confidence = parsed_response.get('medical_confidence', 0.0)
                    task_confidence = parsed_response.get('task_confidence', 0.0)
                    # Reject unknown task types
                    valid_types = ['prediction', 'classification', 'time_series', 'correlation', 'none']
                    if task_type not in valid_types:
                        logging.warning(f"AI返回了无效的任务类型: {task_type},使用默认值 'none'")
                        task_type = "none"
                        task_confidence = 0.0
                    # Medical-relevance confidence threshold (>= 0.7)
                    if medical_confidence < 0.7:
                        logging.info(f"医学相关性置信度过低 ({medical_confidence:.2f}),标记为非医学论文")
                        is_medical = False
                    # Task-type confidence threshold (>= 0.7)
                    if task_confidence < 0.7:
                        logging.info(f"任务类型置信度过低 ({task_confidence:.2f}),标记为 'none'")
                        task_type = "none"
                    analysis = {
                        'is_medical': is_medical,
                        'task_type': task_type,
                        'medical_confidence': medical_confidence,
                        'task_confidence': task_confidence
                    }
                    logging.info(f"AI分析结果: 医学相关={is_medical}({medical_confidence:.2f}), 任务类型={task_type}({task_confidence:.2f})")
                    return analysis
                except json.JSONDecodeError as e:
                    logging.error(f"解析AI JSON响应失败: {ai_response} - 错误: {e}")
                    return {'is_medical': False, 'task_type': 'none', 'medical_confidence': 0.0, 'task_confidence': 0.0}
            else:
                logging.error(f"AI API调用失败状态码: {response.status_code}")
                return {'is_medical': False, 'task_type': 'none', 'medical_confidence': 0.0, 'task_confidence': 0.0}
        except Exception as e:
            logging.error(f"AI分析研究任务时发生错误: {e}")
            return {'is_medical': False, 'task_type': 'none', 'medical_confidence': 0.0, 'task_confidence': 0.0}

    def _mark_valid_folder(self, output_subdir: Path, task_type: str) -> bool:
        """Prefix the folder name of a paper that passed screening.

        Args:
            output_subdir (Path): output sub-directory to mark.
            task_type (str): one of 'prediction', 'classification',
                'time_series', 'correlation'.

        Returns:
            bool: True if marking succeeded (or was unnecessary).
        """
        try:
            prefix = self.task_type_prefixes.get(task_type)
            if not prefix:
                # 'none' (or unknown) maps to None: nothing to mark
                logging.info(f"任务类型 '{task_type}' 不需要标记文件夹")
                return True
            # Already marked with this task's prefix
            if output_subdir.name.startswith(prefix):
                logging.info(f"文件夹已标记为{task_type}任务: {output_subdir.name}")
                return True
            # Already marked with a different task's prefix: keep as-is
            for existing_type, existing_prefix in self.task_type_prefixes.items():
                if existing_prefix and output_subdir.name.startswith(existing_prefix):
                    logging.info(f"文件夹已有{existing_type}任务标记,不需要重新标记: {output_subdir.name}")
                    return True
            # Rename folder with the new prefix
            new_folder_name = prefix + output_subdir.name
            new_folder_path = output_subdir.parent / new_folder_name
            output_subdir.rename(new_folder_path)
            logging.info(f"文件夹标记成功: {output_subdir.name} -> {new_folder_name} (任务类型: {task_type})")
            return True
        except Exception as e:
            logging.error(f"标记文件夹时发生错误: {output_subdir} - {e}")
            return False

    def _prepare_output_dir(self) -> Path:
        """Create the Markdown output directory if needed.

        Returns:
            Path: the Markdown output directory.
        """
        self.markdown_dir.mkdir(parents=True, exist_ok=True)
        logging.info(f"Markdown输出目录已准备: {self.markdown_dir}")
        return self.markdown_dir

    def _call_ocr_api(self, pdf_file: Path) -> Optional[Dict]:
        """Send a PDF to the OCR API for parsing.

        Args:
            pdf_file (Path): path of the PDF file.

        Returns:
            Optional[Dict]: API response payload, or None on failure.
        """
        try:
            with open(pdf_file, 'rb') as f:
                files = {
                    'file': (pdf_file.name, f, 'application/pdf')
                }
                response = self._make_request_with_retry(
                    self.ocr_api_url,
                    files=files,
                    timeout=1800  # 1800 s (30 min) to match slow server-side OCR
                )
            if response.status_code == 200:
                response_data = response.json()
                if response_data.get('success', False):
                    logging.debug(f"OCR API调用成功: {pdf_file.name}")
                    return response_data
                else:
                    logging.warning(f"OCR API处理失败: {pdf_file.name} - {response_data.get('message', 'Unknown error')}")
                    return None
            else:
                logging.error(f"OCR API请求失败状态码: {response.status_code} - {pdf_file.name}")
                return None
        except Exception as e:
            logging.error(f"调用OCR API时发生错误: {pdf_file.name} - {e}")
            return None

    def _download_and_extract_zip(self, download_url: str, pdf_file: Path) -> bool:
        """Download the result ZIP and extract it into a per-PDF sub-folder.

        Args:
            download_url (str): full download URL.
            pdf_file (Path): original PDF path (names the output sub-folder).

        Returns:
            bool: True if download and extraction succeeded.
        """
        try:
            response = self._make_request_with_retry(download_url, timeout=60)
            if response.status_code != 200:
                logging.error(f"下载ZIP失败状态码: {response.status_code} - {pdf_file.name}")
                return False
            # Output sub-folder named after the PDF stem
            output_subdir = self.markdown_dir / pdf_file.stem
            output_subdir.mkdir(parents=True, exist_ok=True)
            # Buffer the ZIP in a temp file so zipfile can seek it
            with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as temp_zip:
                temp_zip.write(response.content)
                temp_zip_path = temp_zip.name
            try:
                with zipfile.ZipFile(temp_zip_path, 'r') as zip_ref:
                    zip_ref.extractall(output_subdir)
                logging.debug(f"ZIP文件解压成功: {pdf_file.name} -> {output_subdir}")
                # Clean the extracted Markdown; a cleaning failure is non-fatal
                if not self._clean_markdown_files(output_subdir):
                    logging.warning(f"Markdown文件清洗失败但解压成功: {pdf_file.name}")
                return True
            finally:
                # Always remove the temp ZIP
                os.unlink(temp_zip_path)
        except zipfile.BadZipFile as e:
            logging.error(f"ZIP文件损坏: {pdf_file.name} - {e}")
            return False
        except Exception as e:
            logging.error(f"下载或解压ZIP时发生错误: {pdf_file.name} - {e}")
            return False

    def _clean_markdown_files(self, output_subdir: Path) -> bool:
        """Clean Markdown files in the output folder (strip line numbers).

        Args:
            output_subdir (Path): output sub-directory containing Markdown files.

        Returns:
            bool: True if cleaning succeeded.
        """
        try:
            md_files = list(output_subdir.glob("*.md"))
            if not md_files:
                logging.debug(f"未找到Markdown文件进行清洗: {output_subdir}")
                return True
            for md_file in md_files:
                try:
                    with open(md_file, 'r', encoding='utf-8') as f:
                        lines = f.readlines()
                    cleaned_lines = []
                    for line in lines:
                        line_content = line.rstrip('\n\r')
                        # Drop lines that are only digits (e.g. "2", "30")
                        if re.match(r'^\d+$', line_content):
                            continue
                        # Drop digits-plus-trailing-space lines (e.g. "30 ")
                        if re.match(r'^\d+\s*$', line_content):
                            continue
                        # Strip leading "NN " numbering (e.g. "1 Title:" -> "Title:")
                        cleaned_line = re.sub(r'^\d+\s+', '', line_content)
                        if cleaned_line.strip():
                            cleaned_lines.append(cleaned_line + '\n')
                        else:
                            # Keep blank lines to preserve document structure
                            cleaned_lines.append('\n')
                    with open(md_file, 'w', encoding='utf-8') as f:
                        f.writelines(cleaned_lines)
                    logging.debug(f"Markdown文件清洗完成: {md_file.name}")
                except Exception as e:
                    logging.error(f"清洗Markdown文件时发生错误: {md_file.name} - {e}")
                    return False
            logging.info(f"成功清洗 {len(md_files)} 个Markdown文件: {output_subdir}")
            return True
        except Exception as e:
            logging.error(f"清洗Markdown文件时发生错误: {output_subdir} - {e}")
            return False

    def _process_single_pdf(self, pdf_file: Path) -> bool:
        """Run the full pipeline for one PDF (OCR, extract, analyze, mark).

        Args:
            pdf_file (Path): path of the PDF file.

        Returns:
            bool: True if processing succeeded (even when the paper is
            filtered out by the medical/task screening).
        """
        try:
            # Sanity-check the input file
            if not pdf_file.exists() or pdf_file.stat().st_size == 0:
                logging.warning(f"PDF文件不存在或为空: {pdf_file}")
                return False
            # Skip PDFs whose output folder already exists and is non-empty
            output_subdir = self.markdown_dir / pdf_file.stem
            if output_subdir.exists() and any(output_subdir.iterdir()):
                logging.info(f"输出文件夹已存在且非空,跳过处理: {pdf_file.stem}")
                return True
            # OCR the PDF
            api_response = self._call_ocr_api(pdf_file)
            if not api_response:
                return False
            # The API returns a relative download URL
            download_url = api_response.get('download_url')
            if not download_url:
                logging.error(f"API响应中缺少下载URL: {pdf_file.name}")
                return False
            full_download_url = f"http://100.106.4.14:7861{download_url}"
            logging.debug(f"完整下载URL: {full_download_url}")
            # Fetch and unpack the result archive
            success = self._download_and_extract_zip(full_download_url, pdf_file)
            if not success:
                return False
            output_subdir = self.markdown_dir / pdf_file.stem
            # AI research-task analysis (medical relevance + task type)
            logging.info(f"开始AI研究任务分析: {pdf_file.stem}")
            introduction = self._extract_introduction(output_subdir)
            if not introduction:
                logging.warning(f"无法提取Introduction跳过AI分析: {pdf_file.stem}")
                return True  # processed OK, but analysis not possible
            analysis_result = self._analyze_research_task(introduction)
            is_medical = analysis_result['is_medical']
            task_type = analysis_result['task_type']
            # A paper passes only if it is medical AND matches a task type
            if not is_medical:
                logging.info(f"未通过医学相关性筛选,跳过: {pdf_file.stem}")
                return True  # processed OK, filtered out
            if task_type == "none":
                logging.info(f"未通过任务类型筛选 (task_type=none),跳过: {pdf_file.stem}")
                return True  # processed OK, filtered out
            # Passed all screening: tag the folder with the task prefix
            logging.info(f"通过所有筛选,标记为{task_type}任务医学论文: {pdf_file.stem}")
            if self._mark_valid_folder(output_subdir, task_type):
                logging.info(f"论文筛选完成,已标记为{task_type}任务: {pdf_file.stem}")
            else:
                logging.warning(f"文件夹标记失败: {pdf_file.stem}")
            return True
        except Exception as e:
            logging.error(f"处理PDF文件时发生错误: {pdf_file.name} - {e}")
            return False

    def _make_request_with_retry(self, url: str, files: Optional[Dict] = None,
                                 max_retries: int = 5, timeout: int = 180) -> requests.Response:
        """HTTP request with an adaptive retry policy.

        Args:
            url (str): request URL.
            files (Optional[Dict]): multipart files; POST when given, else GET.
            max_retries (int): maximum attempts (default 5).
            timeout (int): per-request timeout in seconds.

        Returns:
            requests.Response: the HTTP response (may carry a 500 status after
            exhausting retries).

        Raises:
            requests.RequestException: when all retries fail with an exception.
        """
        for attempt in range(max_retries):
            try:
                if files:
                    response = self.session.post(url, files=files, timeout=timeout)
                else:
                    response = self.session.get(url, timeout=timeout)
                # Retry HTTP 500; return the error response after the last try
                if response.status_code == 500:
                    if attempt == max_retries - 1:
                        logging.error(f"服务器内部错误,已达到最大重试次数: HTTP {response.status_code}")
                        return response  # hand back the error rather than raising
                    # Longer waits for server errors
                    wait_time = min(30, 10 + (attempt * 5))  # 10s, 15s, 20s, 25s, 30s
                    logging.warning(f"服务器内部错误,{wait_time}秒后重试 (第{attempt + 1}次)")
                    time.sleep(wait_time)
                    continue
                return response
            except requests.exceptions.Timeout as e:
                if attempt == max_retries - 1:
                    logging.error(f"请求超时,已达到最大重试次数: {e}")
                    raise
                # Shorter waits for timeouts
                wait_time = min(15, 5 + (attempt * 2))  # 5s, 7s, 9s, 11s, 13s
                logging.warning(f"请求超时,{wait_time}秒后重试 (第{attempt + 1}次): {e}")
                time.sleep(wait_time)
            except requests.exceptions.ConnectionError as e:
                if attempt == max_retries - 1:
                    logging.error(f"连接错误,已达到最大重试次数: {e}")
                    raise
                # Exponential backoff for connection errors
                wait_time = min(60, 5 * (2 ** attempt))  # 5s, 10s, 20s, 40s, 60s
                logging.warning(f"连接错误,{wait_time}秒后重试 (第{attempt + 1}次): {e}")
                time.sleep(wait_time)
            except requests.RequestException as e:
                if attempt == max_retries - 1:
                    logging.error(f"请求失败,已达到最大重试次数: {e}")
                    raise
                # Standard exponential backoff for other request errors
                wait_time = min(30, 3 * (2 ** attempt))  # 3s, 6s, 12s, 24s, 30s
                logging.warning(f"请求失败,{wait_time}秒后重试 (第{attempt + 1}次): {e}")
                time.sleep(wait_time)
        # Defensive: only reachable when max_retries <= 0 (never returns None)
        raise requests.RequestException(f"max_retries must be >= 1, got {max_retries}")

    def parse_all_pdfs(self) -> Dict[str, int]:
        """Convert all PDFs to Markdown concurrently and report statistics.

        Returns:
            Dict[str, int]: {'success': ..., 'failed': ..., 'total': ...}.

        Raises:
            FileNotFoundError: if the PDF directory does not exist.
        """
        try:
            pdf_files = self._scan_pdf_files()
            if not pdf_files:
                logging.warning("未找到PDF文件")
                return {'success': 0, 'failed': 0, 'total': 0}
            self._prepare_output_dir()
            # Progress bookkeeping
            total_files = len(pdf_files)
            success_count = 0
            failed_count = 0
            failed_files = []
            logging.info(f"开始并发处理 {total_files} 个PDF文件")
            logging.info(f"并发数: {self.parallel} (降低并发数以避免服务器过载)")
            logging.info(f"请求超时: 1800秒 (适配服务器处理时间)")
            logging.info(f"重试次数: 5次 (智能重试策略)")
            with ThreadPoolExecutor(max_workers=self.parallel) as executor:
                # Submit one task per PDF
                future_to_pdf = {
                    executor.submit(self._process_single_pdf, pdf_file): pdf_file
                    for pdf_file in pdf_files
                }
                # Report progress as tasks complete
                completed_count = 0
                for future in as_completed(future_to_pdf):
                    pdf_file = future_to_pdf[future]
                    # Truncate long names so the progress line stays readable
                    filename = pdf_file.name[:50] + '...' if len(pdf_file.name) > 50 else pdf_file.name
                    try:
                        success = future.result()
                        completed_count += 1
                        if success:
                            success_count += 1
                            status = "✓"
                        else:
                            failed_count += 1
                            failed_files.append({
                                'filename': pdf_file.name,
                                'path': str(pdf_file)
                            })
                            status = "✗"
                        progress = (completed_count / total_files) * 100
                        print(f"\r[{completed_count:3d}/{total_files}] {progress:5.1f}% {status} {filename}", end='', flush=True)
                    except Exception as e:
                        failed_count += 1
                        completed_count += 1
                        failed_files.append({
                            'filename': pdf_file.name,
                            'path': str(pdf_file),
                            'error': str(e)
                        })
                        progress = (completed_count / total_files) * 100
                        print(f"\r[{completed_count:3d}/{total_files}] {progress:5.1f}% ✗ {filename} (Error: {str(e)[:30]})", end='', flush=True)
            print()  # newline after the carriage-return progress line
            # Log details of failed files
            if failed_files:
                logging.warning(f"以下 {len(failed_files)} 个PDF文件处理失败:")
                for file_info in failed_files:
                    logging.warning(f" - {file_info['filename']}")
                    if 'error' in file_info:
                        logging.warning(f" 错误: {file_info['error']}")
            # Final statistics
            stats = {
                'success': success_count,
                'failed': failed_count,
                'total': total_files
            }
            logging.info(f"PDF解析完成! 成功: {success_count}/{total_files} ({success_count/total_files*100:.1f}%)")
            if failed_count > 0:
                logging.warning(f"失败: {failed_count}/{total_files} ({failed_count/total_files*100:.1f}%)")
            return stats
        except Exception as e:
            logging.error(f"批量处理PDF文件时发生错误: {e}")
            raise