triage/analysis/workflow_file_cleaner.py


#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
工作流文件清理器
检测指定目录中的所有JSONL文件删除不完整的工作流记录文件
"""
import json
import os
import glob
from pathlib import Path
from typing import Dict, Any, List
import argparse
import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class WorkflowFileCleaner:
    """Workflow file cleaner."""

    def __init__(self, directory: str, dry_run: bool = False):
        """
        Initialize the cleaner.

        Args:
            directory: Path of the directory to check.
            dry_run: If True, run in dry-run mode and do not actually delete files.
        """
        self.directory = Path(directory)
        self.dry_run = dry_run
        self.stats = {
            'total_files': 0,
            'complete_files': 0,
            'incomplete_files': 0,
            'deleted_files': [],
            'error_files': []
        }

    def check_workflow_completion(self, jsonl_file: str) -> bool:
        """
        Check whether the workflow recorded in a JSONL file is complete.

        Args:
            jsonl_file: Path to the JSONL file.

        Returns:
            bool: True if the workflow is complete, False otherwise.
        """
        try:
            with open(jsonl_file, 'r', encoding='utf-8') as f:
                lines = f.readlines()
            if not lines:
                logger.warning(f"File is empty: {jsonl_file}")
                return False
            # Get the last line
            last_line = lines[-1].strip()
            if not last_line:
                logger.warning(f"Last line of file is empty: {jsonl_file}")
                return False
            try:
                last_event = json.loads(last_line)
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse last line as JSON in {jsonl_file}: {e}")
                return False
            # The final record must be a workflow_complete event
            if last_event.get('event_type') != 'workflow_complete':
                logger.info(f"Workflow incomplete - missing workflow_complete event: {jsonl_file}")
                return False
            # Check the completion status of the phases in final_summary
            final_summary = last_event.get('final_summary', {})
            phases = final_summary.get('phases', {})
            required_phases = ['triage', 'hpi', 'ph']
            for phase in required_phases:
                phase_info = phases.get(phase, {})
                is_completed = phase_info.get('is_completed', False)
                completion_rate = phase_info.get('completion_rate', 0.0)
                if not is_completed or completion_rate != 1.0:
                    logger.info(f"Workflow incomplete - phase {phase} not finished: {jsonl_file}")
                    return False
            logger.info(f"Workflow complete: {jsonl_file}")
            return True
        except Exception as e:
            logger.error(f"Error while checking file {jsonl_file}: {e}")
            return False
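
    # Illustrative sketch (an assumption, not the authoritative schema): the check
    # above only relies on the fields shown below in the final JSONL record; any
    # other fields the workflow writer emits are ignored by this cleaner.
    #
    #   {"event_type": "workflow_complete",
    #    "final_summary": {"phases": {
    #        "triage": {"is_completed": true, "completion_rate": 1.0},
    #        "hpi":    {"is_completed": true, "completion_rate": 1.0},
    #        "ph":     {"is_completed": true, "completion_rate": 1.0}}}}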

    def scan_and_clean_files(self) -> None:
        """Scan all JSONL files in the directory and remove the incomplete ones."""
        if not self.directory.exists():
            logger.error(f"Directory does not exist: {self.directory}")
            return
        # Find all JSONL files (recursively)
        jsonl_pattern = str(self.directory / "**" / "*.jsonl")
        jsonl_files = glob.glob(jsonl_pattern, recursive=True)
        self.stats['total_files'] = len(jsonl_files)
        logger.info(f"Found {len(jsonl_files)} JSONL files")
        for jsonl_file in jsonl_files:
            try:
                is_complete = self.check_workflow_completion(jsonl_file)
                if is_complete:
                    self.stats['complete_files'] += 1
                else:
                    self.stats['incomplete_files'] += 1
                    if self.dry_run:
                        logger.info(f"[dry run] Would delete incomplete file: {jsonl_file}")
                        self.stats['deleted_files'].append(jsonl_file)
                    else:
                        os.remove(jsonl_file)
                        logger.info(f"Deleted incomplete file: {jsonl_file}")
                        self.stats['deleted_files'].append(jsonl_file)
            except Exception as e:
                logger.error(f"Error while processing file {jsonl_file}: {e}")
                self.stats['error_files'].append(jsonl_file)

    def print_summary(self) -> None:
        """Print a summary of the cleanup statistics."""
        print("\n" + "=" * 60)
        print("Workflow file cleanup summary")
        print("=" * 60)
        print(f"Total files: {self.stats['total_files']}")
        print(f"Complete files: {self.stats['complete_files']}")
        print(f"Incomplete files: {self.stats['incomplete_files']}")
        print(f"Deleted files: {len(self.stats['deleted_files'])}")
        print(f"Files with errors: {len(self.stats['error_files'])}")
        if self.stats['deleted_files']:
            print("\nDeleted files:")
            for file in self.stats['deleted_files']:
                print(f" - {file}")
        if self.stats['error_files']:
            print("\nFiles that failed to process:")
            for file in self.stats['error_files']:
                print(f" - {file}")
        if self.dry_run and self.stats['deleted_files']:
            print("\nNote: this was a dry run; no files were actually deleted")

    def run(self) -> Dict[str, Any]:
        """
        Run the cleaner.

        Returns:
            Dict: The collected statistics.
        """
        logger.info(f"Starting check of directory: {self.directory}")
        if self.dry_run:
            logger.info("Running in dry-run mode")
        self.scan_and_clean_files()
        self.print_summary()
        return self.stats
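
# Programmatic use (based on the class as written above): run() returns the stats
# dict, so a caller can inspect the outcome without parsing the printed summary.
#
#   cleaner = WorkflowFileCleaner("results/results0903", dry_run=True)
#   stats = cleaner.run()
#   print(stats['incomplete_files'], len(stats['deleted_files']))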


def main():
    """Command-line entry point."""
    parser = argparse.ArgumentParser(description='Workflow file cleaner')
    parser.add_argument('directory', nargs='?', default='results/results0903',
                        help='Directory to check (default: results/results0903)')
    parser.add_argument('--dry-run', action='store_true',
                        help='Dry-run mode: report but do not actually delete files')
    args = parser.parse_args()
    cleaner = WorkflowFileCleaner(args.directory, args.dry_run)
    cleaner.run()


if __name__ == "__main__":
    main()
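
# Example invocations (a dry run first is recommended, since deletion is permanent):
#   python triage/analysis/workflow_file_cleaner.py results/results0903 --dry-run
#   python triage/analysis/workflow_file_cleaner.py results/results0903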