#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Workflow completeness checker.

Checks whether each workflow file has completed all of its tasks and
generates filter lists for use by other analysis scripts.
"""

import json
import os
import sys
from collections import deque
from pathlib import Path
from typing import List, Dict, Tuple


class WorkflowCompletenessChecker:
    """Classify workflow ``.jsonl`` files as complete or incomplete.

    A file counts as complete when the JSON object on its second-to-last
    line carries a non-empty ``task_completion_summary`` whose ``phases``
    mapping flags every phase in ``REQUIRED_PHASES`` as ``is_completed``.
    """

    # Phases that must all be flagged ``is_completed`` for a file to pass.
    REQUIRED_PHASES = ('triage', 'hpi', 'ph')

    def __init__(self, data_dir: str, output_dir: str):
        """
        Initialise the checker.

        Args:
            data_dir: Directory that is scanned for ``*.jsonl`` files.
            output_dir: Directory the list files and JSON report are written to.
        """
        self.data_dir = Path(data_dir)
        self.output_dir = Path(output_dir)
        # File names (not full paths) collected by scan_directory().
        self.incomplete_files: List[str] = []
        self.complete_files: List[str] = []
        # Names of files that raised while being read or checked.
        # scan_directory() lists these in incomplete_files as well.
        self.error_files: List[str] = []

    @staticmethod
    def _summary_is_complete(event) -> bool:
        """Return True if *event* marks every required phase as completed."""
        # Any structurally unexpected record (non-object JSON, missing or
        # non-dict summary/phases) is treated as "not completed", the same
        # way malformed JSON is, instead of surfacing as an AttributeError.
        if not isinstance(event, dict):
            return False
        summary = event.get('task_completion_summary')
        if not summary or not isinstance(summary, dict):
            return False
        phases = summary.get('phases')
        if not isinstance(phases, dict):
            return False
        for phase in WorkflowCompletenessChecker.REQUIRED_PHASES:
            phase_info = phases.get(phase)
            if not isinstance(phase_info, dict):
                return False
            if not phase_info.get('is_completed', False):
                return False
        return True

    def check_file_completeness(self, filepath: Path) -> bool:
        """
        Check whether a single workflow file is complete.

        Only the second-to-last line of the file is inspected.

        Args:
            filepath: Path of the file to check.

        Returns:
            True if the file is complete. False if it has fewer than two
            lines, its second-to-last line is blank, is not valid JSON, or
            does not mark all required phases as completed. False is also
            returned when reading or checking raises; in that case the file
            name is additionally appended to ``self.error_files``.
        """
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                # Stream the file and keep only its last two lines instead
                # of loading every line into memory.
                tail = deque(f, maxlen=2)

            # Need at least two lines: the second-to-last and the last.
            if len(tail) < 2:
                return False

            second_to_last_line = tail[0].strip()
            if not second_to_last_line:
                return False

            try:
                second_to_last_event = json.loads(second_to_last_line)
            except json.JSONDecodeError:
                return False

            return self._summary_is_complete(second_to_last_event)

        except Exception as e:
            # Deliberately broad: one unreadable file (I/O error, bad
            # encoding, ...) must not abort the scan of the others.
            print(f"检查文件 {filepath.name} 时出错: {e}")
            self.error_files.append(filepath.name)
            return False

    def scan_directory(self) -> None:
        """Scan ``data_dir`` for ``*.jsonl`` files and classify each one.

        Results from any previous scan are discarded first, so calling this
        method repeatedly does not accumulate duplicate entries.
        """
        self.complete_files.clear()
        self.incomplete_files.clear()
        self.error_files.clear()

        if not self.data_dir.exists():
            print(f"数据目录不存在: {self.data_dir}")
            return

        # Non-recursive: only files directly inside data_dir are considered.
        jsonl_files = list(self.data_dir.glob("*.jsonl"))
        print(f"找到 {len(jsonl_files)} 个数据文件")

        for filepath in sorted(jsonl_files):
            if self.check_file_completeness(filepath):
                self.complete_files.append(filepath.name)
            else:
                self.incomplete_files.append(filepath.name)

        print(f"完成文件: {len(self.complete_files)} 个")
        print(f"未完成文件: {len(self.incomplete_files)} 个")
        print(f"错误文件: {len(self.error_files)} 个")

    @staticmethod
    def _write_name_list(list_file: Path, filenames: List[str]) -> None:
        """Write *filenames* to *list_file*, one name per line."""
        with open(list_file, 'w', encoding='utf-8') as f:
            f.writelines(f"{filename}\n" for filename in filenames)

    def generate_filter_files(self) -> None:
        """Write the incomplete/complete name lists and the JSON report.

        Creates ``output_dir`` if necessary and writes
        ``incomplete_files.txt``, ``complete_files.txt`` and
        ``completeness_report.json`` into it.
        """
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # List of incomplete files (consumed by other scripts).
        incomplete_list_file = self.output_dir / "incomplete_files.txt"
        self._write_name_list(incomplete_list_file, self.incomplete_files)

        # List of complete files.
        complete_list_file = self.output_dir / "complete_files.txt"
        self._write_name_list(complete_list_file, self.complete_files)

        # Detailed statistics report.
        complete_count = len(self.complete_files)
        incomplete_count = len(self.incomplete_files)
        # scan_directory() puts every checked file into exactly one of the
        # complete/incomplete lists; files recorded in error_files are also
        # in incomplete_files, so adding the error count would count them
        # twice.
        checked_count = complete_count + incomplete_count

        report_file = self.output_dir / "completeness_report.json"
        report_data = {
            "scan_directory": str(self.data_dir),
            "total_files": checked_count,
            "complete_files_count": complete_count,
            "incomplete_files_count": incomplete_count,
            "error_files_count": len(self.error_files),
            "completion_rate": (
                complete_count / checked_count if checked_count > 0 else 0.0
            ),
            "incomplete_files": self.incomplete_files,
            "error_files": self.error_files,
        }

        with open(report_file, 'w', encoding='utf-8') as f:
            json.dump(report_data, f, ensure_ascii=False, indent=2)

        print("\n过滤文件已生成:")
        print(f"  - 未完成文件列表: {incomplete_list_file}")
        print(f"  - 完成文件列表: {complete_list_file}")
        print(f"  - 完成度报告: {report_file}")

    def print_summary(self) -> None:
        """Print totals, the completion rate and up to ten incomplete names."""
        total = len(self.complete_files) + len(self.incomplete_files)
        if total > 0:
            completion_rate = len(self.complete_files) / total * 100
            print("\n=== 工作流完成度检查汇总 ===")
            print(f"总文件数: {total}")
            print(f"完成文件: {len(self.complete_files)} 个 ({completion_rate:.1f}%)")
            print(f"未完成文件: {len(self.incomplete_files)} 个")

        if self.error_files:
            print(f"错误文件: {len(self.error_files)} 个")

        if self.incomplete_files:
            print("\n未完成的文件(前10个):")
            for filename in self.incomplete_files[:10]:
                print(f"  - {filename}")
            if len(self.incomplete_files) > 10:
                print(f"  ... 还有 {len(self.incomplete_files) - 10} 个")

    def run_check(self) -> None:
        """Run the full pipeline: scan, write the filter files, print summary."""
        print("开始检查工作流完成度...")

        # 1. Scan the data directory.
        self.scan_directory()

        # 2. Write the filter lists and the report.
        self.generate_filter_files()

        # 3. Print the summary.
        self.print_summary()

        print("完成度检查完成!")


def main():
    """Command-line entry point.

    Usage: ``script.py [DATA_DIR OUTPUT_DIR]``

    When both positional arguments are given they are used as the data and
    output directories (any further arguments are ignored). Otherwise the
    built-in default paths are used.
    """
    default_data_dir = "results/results0902"
    default_output_dir = "analysis/0902"

    if len(sys.argv) >= 3:
        data_dir = sys.argv[1]
        output_dir = sys.argv[2]
    else:
        if len(sys.argv) == 2:
            # A lone argument cannot be mapped to both directories; say so
            # instead of silently discarding it.
            print(
                "警告: 需要同时提供数据目录和输出目录两个参数,"
                "已忽略传入的参数并使用默认路径",
                file=sys.stderr,
            )
        data_dir = default_data_dir
        output_dir = default_output_dir

    checker = WorkflowCompletenessChecker(data_dir=data_dir, output_dir=output_dir)
    checker.run_check()


# Run the check only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()