triage/analysis/workflow_completeness_checker.py

189 lines
7.0 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
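Workflow completeness checker.

Checks whether each workflow file has completed all of its tasks, and
generates filter lists for use by other analysis scripts.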
"""
import os
import json
import sys
from pathlib import Path
from typing import List, Dict, Tuple
class WorkflowCompletenessChecker:
    """Classify workflow ``*.jsonl`` files as complete or incomplete.

    A file counts as complete when the JSON object on its second-to-last
    non-blank line has a ``task_completion_summary`` whose ``phases`` entry
    marks every phase in ``REQUIRED_PHASES`` with a truthy ``is_completed``.

    Results are accumulated in three lists of file names:

    * ``complete_files``   -- files that passed the check.
    * ``incomplete_files`` -- files that failed the check for any reason.
    * ``error_files``      -- files whose check raised an unexpected
      exception. When populated through ``scan_directory`` these names are
      also present in ``incomplete_files``.
    """

    # Phases that must all be flagged ``is_completed`` for a file to pass.
    REQUIRED_PHASES = ('triage', 'hpi', 'ph')

    def __init__(self, data_dir: str, output_dir: str):
        """Set up the checker.

        Args:
            data_dir: Directory that is scanned for ``*.jsonl`` files.
            output_dir: Directory the filter lists and report are written to.
        """
        self.data_dir = Path(data_dir)
        self.output_dir = Path(output_dir)
        self.incomplete_files: List[str] = []
        self.complete_files: List[str] = []
        self.error_files: List[str] = []

    @staticmethod
    def _second_to_last_record(filepath: Path) -> str:
        """Return the stripped second-to-last non-blank line, or ``""``.

        The file is streamed so only two lines are held in memory, and blank
        lines are skipped so that a trailing empty line does not shift the
        summary record out of position.
        """
        previous = latest = ""
        with open(filepath, 'r', encoding='utf-8') as f:
            for raw_line in f:
                text = raw_line.strip()
                if text:
                    previous, latest = latest, text
        return previous

    def check_file_completeness(self, filepath: Path) -> bool:
        """Check whether a single workflow file is complete.

        Args:
            filepath: Path of the file to inspect.

        Returns:
            True when every required phase is flagged as completed; False
            when the file has fewer than two non-blank lines, the summary
            line is not valid JSON, the summary or a phase flag is missing,
            or any unexpected error occurs. Unexpected errors are printed
            and the file name is appended to ``error_files``.
        """
        try:
            summary_line = self._second_to_last_record(filepath)
            if not summary_line:
                return False

            try:
                event = json.loads(summary_line)
            except json.JSONDecodeError:
                return False

            task_summary = event.get('task_completion_summary', {})
            if not task_summary:
                return False

            phases = task_summary.get('phases', {})
            return all(
                bool(phases.get(phase, {}).get('is_completed', False))
                for phase in self.REQUIRED_PHASES
            )
        except Exception as e:
            # Per-file boundary: one unreadable or malformed file must not
            # abort the whole scan, so report it and treat it as incomplete.
            print(f"检查文件 {filepath.name} 时出错: {e}")
            self.error_files.append(filepath.name)
            return False

    def scan_directory(self) -> None:
        """Check every ``*.jsonl`` file directly inside ``data_dir``.

        Files are processed in sorted order and their names are appended to
        ``complete_files`` or ``incomplete_files``. Files that raised during
        the check land in ``incomplete_files`` as well as ``error_files``.
        If the directory does not exist a message is printed and nothing is
        scanned.
        """
        if not self.data_dir.exists():
            print(f"数据目录不存在: {self.data_dir}")
            return

        jsonl_files = sorted(self.data_dir.glob("*.jsonl"))
        print(f"找到 {len(jsonl_files)} 个数据文件")

        for filepath in jsonl_files:
            if self.check_file_completeness(filepath):
                self.complete_files.append(filepath.name)
            else:
                self.incomplete_files.append(filepath.name)

        print(f"完成文件: {len(self.complete_files)}")
        print(f"未完成文件: {len(self.incomplete_files)}")
        print(f"错误文件: {len(self.error_files)}")

    def generate_filter_files(self) -> None:
        """Write the filter lists and the JSON report into ``output_dir``.

        Creates ``incomplete_files.txt`` and ``complete_files.txt`` (one file
        name per line) and ``completeness_report.json``. The output directory
        is created if it is missing.
        """
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # List of incomplete files (consumed by other scripts).
        incomplete_list_file = self.output_dir / "incomplete_files.txt"
        with open(incomplete_list_file, 'w', encoding='utf-8') as f:
            f.writelines(f"{filename}\n" for filename in self.incomplete_files)

        # List of complete files.
        complete_list_file = self.output_dir / "complete_files.txt"
        with open(complete_list_file, 'w', encoding='utf-8') as f:
            f.writelines(f"{filename}\n" for filename in self.complete_files)

        complete_count = len(self.complete_files)
        incomplete_count = len(self.incomplete_files)
        checked_count = complete_count + incomplete_count
        # Error files are normally listed in incomplete_files too, so count
        # distinct names instead of summing the three list lengths.
        distinct_names = {
            *self.complete_files,
            *self.incomplete_files,
            *self.error_files,
        }

        report_file = self.output_dir / "completeness_report.json"
        report_data = {
            "scan_directory": str(self.data_dir),
            "total_files": len(distinct_names),
            "complete_files_count": complete_count,
            "incomplete_files_count": incomplete_count,
            "error_files_count": len(self.error_files),
            "completion_rate": (
                complete_count / checked_count if checked_count > 0 else 0.0
            ),
            "incomplete_files": self.incomplete_files,
            "error_files": self.error_files
        }
        with open(report_file, 'w', encoding='utf-8') as f:
            json.dump(report_data, f, ensure_ascii=False, indent=2)

        print(f"\n过滤文件已生成:")
        print(f" - 未完成文件列表: {incomplete_list_file}")
        print(f" - 完成文件列表: {complete_list_file}")
        print(f" - 完成度报告: {report_file}")

    def print_summary(self) -> None:
        """Print totals, the completion percentage and up to ten incomplete files.

        Nothing is printed when no file has been classified.
        """
        total = len(self.complete_files) + len(self.incomplete_files)
        if total == 0:
            return

        completion_rate = len(self.complete_files) / total * 100
        print(f"\n=== 工作流完成度检查汇总 ===")
        print(f"总文件数: {total}")
        print(f"完成文件: {len(self.complete_files)} 个 ({completion_rate:.1f}%)")
        print(f"未完成文件: {len(self.incomplete_files)}")
        if self.error_files:
            print(f"错误文件: {len(self.error_files)}")

        if self.incomplete_files:
            print(f"\n未完成的文件前10个:")
            for filename in self.incomplete_files[:10]:
                print(f" - {filename}")
            if len(self.incomplete_files) > 10:
                print(f" ... 还有 {len(self.incomplete_files) - 10}")

    def run_check(self) -> None:
        """Run the full flow: scan, write the filter files, print the summary."""
        print("开始检查工作流完成度...")
        self.scan_directory()
        self.generate_filter_files()
        self.print_summary()
        print("完成度检查完成!")
def main():
    """Command-line entry point.

    Usage: ``workflow_completeness_checker.py [DATA_DIR OUTPUT_DIR]``.
    When fewer than two positional arguments are supplied, the built-in
    default directories are used for both values.
    """
    # Take the paths from the command line, otherwise fall back to defaults.
    if len(sys.argv) >= 3:
        data_dir, output_dir = sys.argv[1], sys.argv[2]
    else:
        data_dir = "results/results0902"
        output_dir = "analysis/0902"

    checker = WorkflowCompletenessChecker(data_dir=data_dir, output_dir=output_dir)
    checker.run_check()


if __name__ == "__main__":
    main()