188 lines
6.6 KiB
Python
188 lines
6.6 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
"""
|
|||
|
|
Workflow file cleaner.

Scans every JSONL file under the given directory and deletes the files whose workflow record is incomplete.
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import json
|
|||
|
|
import os
|
|||
|
|
import glob
|
|||
|
|
from pathlib import Path
|
|||
|
|
from typing import Dict, Any, List
|
|||
|
|
import argparse
|
|||
|
|
import logging
|
|||
|
|
|
|||
|
|
# Logging setup: INFO-level records formatted as "time - level - message",
# plus the module-level logger used throughout this file.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
|
|||
|
|
|
|||
|
|
|
|||
|
|
class WorkflowFileCleaner:
    """Find JSONL workflow logs under a directory and delete incomplete ones.

    A log is treated as complete when its last line is a JSON object whose
    ``event_type`` is ``workflow_complete`` and whose
    ``final_summary.phases`` lists every phase in ``REQUIRED_PHASES`` with a
    truthy ``is_completed`` and a ``completion_rate`` equal to ``1.0``.
    """

    # Phases that must all be finished for a workflow to count as complete.
    REQUIRED_PHASES = ('triage', 'hpi', 'ph')

    def __init__(self, directory: str, dry_run: bool = False):
        """Initialise the cleaner.

        Args:
            directory: Path of the directory to scan (searched recursively).
            dry_run: When True, only report what would be deleted; no file
                is removed.
        """
        self.directory = Path(directory)
        self.dry_run = dry_run
        # Counters plus the lists of affected paths, filled in by
        # scan_and_clean_files() and returned by run().
        self.stats: Dict[str, Any] = {
            'total_files': 0,
            'complete_files': 0,
            'incomplete_files': 0,
            'deleted_files': [],
            'error_files': [],
        }

    def check_workflow_completion(self, jsonl_file: str) -> bool:
        """Check whether the workflow recorded in a JSONL file is complete.

        Only the last line of the file is inspected.

        Args:
            jsonl_file: Path of the JSONL file to inspect.

        Returns:
            True when the workflow is complete. False when it is incomplete,
            or when the file is empty, ends with a blank line, cannot be
            read, or its last line is not the expected JSON structure.
        """
        try:
            # Stream the file so that only one line is held in memory at a
            # time; just the final line matters.
            last_line = None
            with open(jsonl_file, 'r', encoding='utf-8') as f:
                for last_line in f:
                    pass

            if last_line is None:
                logger.warning(f"文件为空: {jsonl_file}")
                return False

            last_line = last_line.strip()
            if not last_line:
                logger.warning(f"文件最后一行为空: {jsonl_file}")
                return False

            try:
                last_event = json.loads(last_line)
            except json.JSONDecodeError as e:
                logger.error(f"解析最后一行JSON失败 {jsonl_file}: {e}")
                return False

            # The final event must be the workflow_complete marker.
            if last_event.get('event_type') != 'workflow_complete':
                logger.info(f"工作流未完成 - 缺少workflow_complete事件: {jsonl_file}")
                return False

            # Every required phase must be reported as fully completed.
            phases = last_event.get('final_summary', {}).get('phases', {})
            for phase in self.REQUIRED_PHASES:
                phase_info = phases.get(phase, {})
                is_completed = phase_info.get('is_completed', False)
                completion_rate = phase_info.get('completion_rate', 0.0)
                if not is_completed or completion_rate != 1.0:
                    logger.info(f"工作流未完成 - 阶段 {phase} 未完成: {jsonl_file}")
                    return False

            logger.info(f"工作流完整: {jsonl_file}")
            return True

        except Exception as e:
            # Deliberately broad: any failure (unreadable file, undecodable
            # bytes, last line that is valid JSON but not an object, ...) is
            # reported as "not complete" so this method always returns a bool.
            # NOTE(review): scan_and_clean_files() deletes files for which
            # this returns False, so unreadable files also become deletion
            # candidates - confirm that this is intended.
            logger.error(f"检查文件时发生错误 {jsonl_file}: {e}")
            return False

    def scan_and_clean_files(self) -> None:
        """Scan the directory recursively and remove incomplete JSONL files.

        Updates ``self.stats`` in place. In dry-run mode nothing is removed,
        but the files that would be removed are still recorded under
        ``deleted_files``. Files whose removal fails are recorded under
        ``error_files``.
        """
        if not self.directory.exists():
            logger.error(f"目录不存在: {self.directory}")
            return

        # Escape the directory part so that characters such as "[" or "*" in
        # its name are matched literally instead of being read as a pattern.
        escaped_root = Path(glob.escape(str(self.directory)))
        jsonl_pattern = str(escaped_root / "**" / "*.jsonl")
        # Sorted so that log output and the summary are deterministic.
        jsonl_files: List[str] = sorted(glob.glob(jsonl_pattern, recursive=True))

        self.stats['total_files'] = len(jsonl_files)
        logger.info(f"找到 {len(jsonl_files)} 个JSONL文件")

        for jsonl_file in jsonl_files:
            try:
                if self.check_workflow_completion(jsonl_file):
                    self.stats['complete_files'] += 1
                    continue

                self.stats['incomplete_files'] += 1
                if self.dry_run:
                    logger.info(f"[试运行] 将删除不完整文件: {jsonl_file}")
                else:
                    os.remove(jsonl_file)
                    logger.info(f"已删除不完整文件: {jsonl_file}")
                self.stats['deleted_files'].append(jsonl_file)

            except Exception as e:
                # One failing file (e.g. removal denied) must not abort the
                # rest of the scan; record it and carry on.
                logger.error(f"处理文件时发生错误 {jsonl_file}: {e}")
                self.stats['error_files'].append(jsonl_file)

    def print_summary(self) -> None:
        """Print the collected statistics to standard output."""
        stats = self.stats
        print("\n" + "=" * 60)
        print("工作流文件清理摘要")
        print("=" * 60)
        print(f"总文件数: {stats['total_files']}")
        print(f"完整文件数: {stats['complete_files']}")
        print(f"不完整文件数: {stats['incomplete_files']}")
        print(f"删除文件数: {len(stats['deleted_files'])}")
        print(f"错误文件数: {len(stats['error_files'])}")

        if stats['deleted_files']:
            print("\n已删除的文件:")
            for path in stats['deleted_files']:
                print(f" - {path}")

        if stats['error_files']:
            print("\n处理错误的文件:")
            for path in stats['error_files']:
                print(f" - {path}")

        # In dry-run mode the "deleted" list only holds candidates.
        if self.dry_run and stats['deleted_files']:
            print("\n注意: 这是试运行模式,实际上没有删除任何文件")

    def run(self) -> Dict[str, Any]:
        """Run the scan, print the summary and return the statistics.

        Returns:
            The ``stats`` dictionary (the same object as ``self.stats``).
        """
        logger.info(f"开始检查目录: {self.directory}")
        if self.dry_run:
            logger.info("运行在试运行模式")

        self.scan_and_clean_files()
        self.print_summary()

        return self.stats
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main(argv=None):
    """Parse the command line and run the workflow file cleaner.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. ``None`` (the default) makes argparse read
            ``sys.argv[1:]``, which is the behaviour when run as a script.
    """
    parser = argparse.ArgumentParser(description='工作流文件清理器')
    # %(default)s keeps the help text in sync with the real default value.
    parser.add_argument('directory', nargs='?', default='results/results0903',
                        help='要检查的目录路径 (默认: %(default)s)')
    parser.add_argument('--dry-run', action='store_true',
                        help='试运行模式,不实际删除文件')

    args = parser.parse_args(argv)

    cleaner = WorkflowFileCleaner(args.directory, args.dry_run)
    cleaner.run()
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
|