triage/analysis/workflow_file_cleaner.py
iomgaa a1f8ffb09d 增强数据分析工具和工作流检查功能
- 优化数据对比分析工具的准确性和性能
- 完善评估指标分析的算法和统计功能
- 改进医疗工作流分析的深度和覆盖范围
- 增强工作流完整性检查的全面性
- 新增工作流文件清理工具提升维护效率

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-03 21:45:30 +08:00

188 lines
6.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Workflow file cleaner.
Inspects every JSONL file in the given directory and deletes the files that hold incomplete workflow records.
"""
import json
import os
import glob
from pathlib import Path
from typing import Dict, Any, List
import argparse
import logging
# Configure logging: INFO level, each record formatted as "<time> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class WorkflowFileCleaner:
    """Find JSONL workflow logs under a directory and delete the incomplete ones.

    A file counts as complete when its last line is a JSON object whose
    ``event_type`` is ``'workflow_complete'`` and whose
    ``final_summary['phases']`` reports, for every phase in
    ``REQUIRED_PHASES``, a truthy ``is_completed`` and a ``completion_rate``
    equal to ``1.0``.

    Files that could not be inspected at all (unreadable, not valid UTF-8)
    are listed in ``stats['error_files']`` and are never deleted: failing to
    read a file is not evidence that the workflow inside it is incomplete.
    """

    # Phases that must all be finished for a workflow to count as complete.
    REQUIRED_PHASES = ('triage', 'hpi', 'ph')

    def __init__(self, directory: str, dry_run: bool = False):
        """
        Initialise the cleaner.

        Args:
            directory: Directory to scan (searched recursively).
            dry_run: When True, only report what would be deleted.
        """
        self.directory = Path(directory)
        self.dry_run = dry_run
        self.stats = {
            'total_files': 0,
            'complete_files': 0,
            'incomplete_files': 0,
            'deleted_files': [],
            'error_files': [],
        }

    def _inspect_workflow_file(self, jsonl_file: str) -> bool:
        """
        Decide whether the last line of a JSONL file records a completed workflow.

        Unlike ``check_workflow_completion`` this helper lets read failures
        propagate, so callers can tell "incomplete" apart from "could not be
        inspected".

        Args:
            jsonl_file: Path of the JSONL file.

        Returns:
            bool: True if the workflow is complete, False if it is not.

        Raises:
            OSError: The file could not be opened or read.
            UnicodeDecodeError: The file content is not valid UTF-8.
        """
        # Stream the file and keep only the final line instead of loading
        # every line into memory.
        last_raw = None
        with open(jsonl_file, 'r', encoding='utf-8') as f:
            for last_raw in f:
                pass

        if last_raw is None:
            logger.warning(f"文件为空: {jsonl_file}")
            return False

        last_line = last_raw.strip()
        if not last_line:
            logger.warning(f"文件最后一行为空: {jsonl_file}")
            return False

        try:
            last_event = json.loads(last_line)
        except json.JSONDecodeError as e:
            logger.error(f"解析最后一行JSON失败 {jsonl_file}: {e}")
            return False

        # Valid JSON that is not an object (list, string, null, ...) cannot be
        # a workflow_complete event.
        if not isinstance(last_event, dict) or last_event.get('event_type') != 'workflow_complete':
            logger.info(f"工作流未完成 - 缺少workflow_complete事件: {jsonl_file}")
            return False

        # Tolerate a missing or malformed summary: it simply means that no
        # phase can be confirmed as finished.
        final_summary = last_event.get('final_summary')
        phases = final_summary.get('phases') if isinstance(final_summary, dict) else None
        if not isinstance(phases, dict):
            phases = {}

        for phase in self.REQUIRED_PHASES:
            phase_info = phases.get(phase)
            if not isinstance(phase_info, dict):
                phase_info = {}
            is_completed = phase_info.get('is_completed', False)
            completion_rate = phase_info.get('completion_rate', 0.0)
            if not is_completed or completion_rate != 1.0:
                logger.info(f"工作流未完成 - 阶段 {phase} 未完成: {jsonl_file}")
                return False

        logger.info(f"工作流完整: {jsonl_file}")
        return True

    def check_workflow_completion(self, jsonl_file: str) -> bool:
        """
        Check whether the workflow stored in a JSONL file is complete.

        This method never raises: any failure while reading or checking the
        file is logged and reported as False. Because False therefore also
        covers "file could not be read", it must not be used on its own to
        decide whether a file may be deleted.

        Args:
            jsonl_file: Path of the JSONL file.

        Returns:
            bool: True if the workflow is complete, False otherwise.
        """
        try:
            return self._inspect_workflow_file(jsonl_file)
        except Exception as e:
            logger.error(f"检查文件时发生错误 {jsonl_file}: {e}")
            return False

    def scan_and_clean_files(self) -> None:
        """Scan the directory recursively for ``*.jsonl`` files and remove the incomplete ones.

        Results are recorded in ``self.stats``. In dry-run mode nothing is
        removed, but the files that would be removed are still listed in
        ``stats['deleted_files']``. A file whose inspection or removal raises
        is appended to ``stats['error_files']`` and left on disk.
        """
        if not self.directory.exists():
            logger.error(f"目录不存在: {self.directory}")
            return

        # Escape the directory part so that characters such as "[" or "*" in
        # its name are matched literally instead of being read as wildcards.
        jsonl_pattern = str(Path(glob.escape(str(self.directory))) / "**" / "*.jsonl")
        jsonl_files = glob.glob(jsonl_pattern, recursive=True)

        # Start every scan from a clean slate (updated in place so a stats
        # dict handed out earlier stays the same object); otherwise a second
        # scan would overwrite total_files while the other counters kept
        # growing.
        self.stats['total_files'] = len(jsonl_files)
        self.stats['complete_files'] = 0
        self.stats['incomplete_files'] = 0
        self.stats['deleted_files'].clear()
        self.stats['error_files'].clear()
        logger.info(f"找到 {len(jsonl_files)} 个JSONL文件")

        for jsonl_file in jsonl_files:
            try:
                # Use the raising helper, not check_workflow_completion(): a
                # file that cannot be read must land in error_files rather
                # than be mistaken for an incomplete workflow and deleted.
                if self._inspect_workflow_file(jsonl_file):
                    self.stats['complete_files'] += 1
                    continue

                self.stats['incomplete_files'] += 1
                if self.dry_run:
                    logger.info(f"[试运行] 将删除不完整文件: {jsonl_file}")
                else:
                    os.remove(jsonl_file)
                    logger.info(f"已删除不完整文件: {jsonl_file}")
                self.stats['deleted_files'].append(jsonl_file)
            except Exception as e:
                logger.error(f"处理文件时发生错误 {jsonl_file}: {e}")
                self.stats['error_files'].append(jsonl_file)

    def print_summary(self) -> None:
        """Print the collected statistics to standard output."""
        print("\n" + "=" * 60)
        print("工作流文件清理摘要")
        print("=" * 60)
        print(f"总文件数: {self.stats['total_files']}")
        print(f"完整文件数: {self.stats['complete_files']}")
        print(f"不完整文件数: {self.stats['incomplete_files']}")
        print(f"删除文件数: {len(self.stats['deleted_files'])}")
        print(f"错误文件数: {len(self.stats['error_files'])}")

        if self.stats['deleted_files']:
            print("\n已删除的文件:")
            for file in self.stats['deleted_files']:
                print(f" - {file}")

        if self.stats['error_files']:
            print("\n处理错误的文件:")
            for file in self.stats['error_files']:
                print(f" - {file}")

        if self.dry_run and self.stats['deleted_files']:
            print("\n注意: 这是试运行模式,实际上没有删除任何文件")

    def run(self) -> Dict[str, Any]:
        """
        Run the cleaner: scan, delete (unless dry-run) and print the summary.

        Returns:
            Dict: The statistics dictionary (``self.stats``).
        """
        logger.info(f"开始检查目录: {self.directory}")
        if self.dry_run:
            logger.info("运行在试运行模式")
        self.scan_and_clean_files()
        self.print_summary()
        return self.stats
def main():
    """Command-line entry point: parse the arguments and run the cleaner.

    Positional argument ``directory`` is optional and falls back to
    ``results/results0903``; ``--dry-run`` reports what would be deleted
    without removing anything.
    """
    parser = argparse.ArgumentParser(description='工作流文件清理器')
    # %(default)s lets argparse print the real default; the previous help text
    # hard-coded "results", which did not match the actual default value.
    parser.add_argument('directory', nargs='?', default='results/results0903',
                        help='要检查的目录路径 (默认: %(default)s)')
    parser.add_argument('--dry-run', action='store_true',
                        help='试运行模式,不实际删除文件')
    args = parser.parse_args()

    cleaner = WorkflowFileCleaner(args.directory, args.dry_run)
    cleaner.run()


if __name__ == "__main__":
    main()