#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Workflow file cleaner.

Scans a directory tree for JSONL workflow logs and deletes every file whose
final record does not describe a fully completed workflow.
"""

import json
import os
import glob
from pathlib import Path
from typing import Dict, Any, List, Optional
import argparse
import logging

# Logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def _dict_or_empty(value: Any) -> Dict[str, Any]:
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


class WorkflowFileCleaner:
    """Find JSONL workflow logs under a directory and remove incomplete ones.

    A log counts as complete when its last non-blank line is a JSON object
    with ``event_type == 'workflow_complete'`` and every phase listed in
    ``REQUIRED_PHASES`` is reported in ``final_summary.phases`` with
    ``is_completed`` truthy and ``completion_rate`` equal to 1.0.
    """

    # Phases that must all be finished for a workflow to count as complete.
    REQUIRED_PHASES = ('triage', 'hpi', 'ph')

    def __init__(self, directory: str, dry_run: bool = False):
        """Initialise the cleaner.

        Args:
            directory: Root directory to scan (searched recursively).
            dry_run: When True, report what would be deleted without
                removing anything.
        """
        self.directory = Path(directory)
        self.dry_run = dry_run
        self.stats: Dict[str, Any] = {
            'total_files': 0,
            'complete_files': 0,
            'incomplete_files': 0,
            'deleted_files': [],
            'error_files': [],
        }

    def _inspect_workflow(self, jsonl_file: str) -> bool:
        """Decide whether *jsonl_file* holds a complete workflow.

        Unlike ``check_workflow_completion`` this lets read failures
        propagate, so the caller can tell "incomplete" apart from
        "could not be inspected".

        Args:
            jsonl_file: Path of the JSONL file to inspect.

        Returns:
            True if the workflow is complete, False if it is incomplete.

        Raises:
            OSError: The file could not be opened or read.
            UnicodeDecodeError: The file is not valid UTF-8.
        """
        saw_any_line = False
        last_line = ''
        # Stream the file: only the final non-blank line matters, so there is
        # no need to hold every line in memory.
        with open(jsonl_file, 'r', encoding='utf-8') as f:
            for raw_line in f:
                saw_any_line = True
                stripped = raw_line.strip()
                if stripped:
                    last_line = stripped

        if not saw_any_line:
            logger.warning(f"文件为空: {jsonl_file}")
            return False

        if not last_line:
            logger.warning(f"文件最后一行为空: {jsonl_file}")
            return False

        try:
            last_event = json.loads(last_line)
        except json.JSONDecodeError as e:
            # A truncated final record means the writer never finished.
            logger.error(f"解析最后一行JSON失败 {jsonl_file}: {e}")
            return False

        # The final record must be the workflow_complete event.
        if not isinstance(last_event, dict) or last_event.get('event_type') != 'workflow_complete':
            logger.info(f"工作流未完成 - 缺少workflow_complete事件: {jsonl_file}")
            return False

        # Every required phase must be reported as fully completed.
        final_summary = _dict_or_empty(last_event.get('final_summary', {}))
        phases = _dict_or_empty(final_summary.get('phases', {}))
        for phase in self.REQUIRED_PHASES:
            phase_info = _dict_or_empty(phases.get(phase, {}))
            is_completed = phase_info.get('is_completed', False)
            completion_rate = phase_info.get('completion_rate', 0.0)
            if not is_completed or completion_rate != 1.0:
                logger.info(f"工作流未完成 - 阶段 {phase} 未完成: {jsonl_file}")
                return False

        logger.info(f"工作流完整: {jsonl_file}")
        return True

    def check_workflow_completion(self, jsonl_file: str) -> bool:
        """Check whether the workflow stored in *jsonl_file* is complete.

        Args:
            jsonl_file: Path of the JSONL file to check.

        Returns:
            True if the workflow is complete; False if it is incomplete or
            the file could not be inspected (the error is logged).
        """
        try:
            return self._inspect_workflow(jsonl_file)
        except Exception as e:
            logger.error(f"检查文件时发生错误 {jsonl_file}: {e}")
            return False

    def scan_and_clean_files(self) -> None:
        """Scan the directory tree and delete incomplete JSONL files.

        Updates ``self.stats`` in place. Files that cannot be read or removed
        are recorded under ``error_files`` and are never deleted on the basis
        of a failed read.
        """
        if not self.directory.exists():
            logger.error(f"目录不存在: {self.directory}")
            return

        # Escape the directory part so names containing glob metacharacters
        # (e.g. "run[1]") are matched literally rather than as a pattern.
        jsonl_pattern = str(Path(glob.escape(str(self.directory))) / "**" / "*.jsonl")
        jsonl_files = sorted(glob.glob(jsonl_pattern, recursive=True))

        self.stats['total_files'] = len(jsonl_files)
        logger.info(f"找到 {len(jsonl_files)} 个JSONL文件")

        for jsonl_file in jsonl_files:
            try:
                if self._inspect_workflow(jsonl_file):
                    self.stats['complete_files'] += 1
                    continue

                self.stats['incomplete_files'] += 1
                if self.dry_run:
                    logger.info(f"[试运行] 将删除不完整文件: {jsonl_file}")
                else:
                    os.remove(jsonl_file)
                    logger.info(f"已删除不完整文件: {jsonl_file}")
                self.stats['deleted_files'].append(jsonl_file)
            except Exception as e:
                logger.error(f"处理文件时发生错误 {jsonl_file}: {e}")
                self.stats['error_files'].append(jsonl_file)

    def print_summary(self) -> None:
        """Print the collected statistics to standard output."""
        deleted = self.stats['deleted_files']
        errors = self.stats['error_files']

        print("\n" + "=" * 60)
        print("工作流文件清理摘要")
        print("=" * 60)
        print(f"总文件数: {self.stats['total_files']}")
        print(f"完整文件数: {self.stats['complete_files']}")
        print(f"不完整文件数: {self.stats['incomplete_files']}")
        print(f"删除文件数: {len(deleted)}")
        print(f"错误文件数: {len(errors)}")

        if deleted:
            print("\n已删除的文件:")
            for path in deleted:
                print(f" - {path}")

        if errors:
            print("\n处理错误的文件:")
            for path in errors:
                print(f" - {path}")

        if self.dry_run and deleted:
            print("\n注意: 这是试运行模式,实际上没有删除任何文件")

    def run(self) -> Dict[str, Any]:
        """Run the cleaner and print the summary.

        Returns:
            The statistics dictionary (``total_files``, ``complete_files``,
            ``incomplete_files``, ``deleted_files``, ``error_files``).
        """
        logger.info(f"开始检查目录: {self.directory}")
        if self.dry_run:
            logger.info("运行在试运行模式")

        self.scan_and_clean_files()
        self.print_summary()

        return self.stats


def main(argv: Optional[List[str]] = None) -> None:
    """Command-line entry point.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]`` when None.
    """
    parser = argparse.ArgumentParser(description='工作流文件清理器')
    parser.add_argument('directory', nargs='?', default='results/results0903',
                        help='要检查的目录路径 (默认: %(default)s)')
    parser.add_argument('--dry-run', action='store_true',
                        help='试运行模式,不实际删除文件')

    args = parser.parse_args(argv)

    cleaner = WorkflowFileCleaner(args.directory, args.dry_run)
    cleaner.run()


if __name__ == "__main__":
    main()