#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Workflow completeness checker.

Checks whether each workflow ``*.jsonl`` file finished all of its tasks and
writes filter lists (plus a JSON report) for other analysis scripts to use.
"""

import os
import json
import sys
from pathlib import Path
from typing import List, Dict, Tuple


class WorkflowCompletenessChecker:
    """Classify workflow ``*.jsonl`` files as complete or incomplete.

    A file counts as complete when its last non-blank line is a JSON object
    whose ``event_type`` is ``"workflow_complete"``.
    """

    def __init__(self, data_dir: str, output_dir: str):
        """Initialise the checker.

        Args:
            data_dir: Directory that holds the ``*.jsonl`` workflow files.
            output_dir: Directory the filter lists and report are written to.
        """
        self.data_dir = Path(data_dir)
        self.output_dir = Path(output_dir)
        self.incomplete_files: List[str] = []
        self.complete_files: List[str] = []
        # Names of files that could not be read. scan_directory() also lists
        # them in incomplete_files, so this is a subset of that list.
        self.error_files: List[str] = []

    def check_file_completeness(self, filepath: Path) -> bool:
        """Check whether a single workflow file is complete.

        Args:
            filepath: Path of the file to inspect.

        Returns:
            True if the last non-blank line is a JSON object with
            ``event_type == "workflow_complete"``; False otherwise (empty
            file, invalid JSON, non-object JSON, or unreadable file).
            Unreadable files are additionally recorded in ``error_files``.
        """
        last_line = ""
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                # Stream the file instead of loading every line, and remember
                # the last non-blank line so trailing blank lines do not make
                # a finished workflow look incomplete.
                for raw_line in f:
                    stripped = raw_line.strip()
                    if stripped:
                        last_line = stripped
        except (OSError, UnicodeError) as e:
            print(f"检查文件 {filepath.name} 时出错: {e}")
            self.error_files.append(filepath.name)
            return False

        if not last_line:
            return False

        try:
            last_event = json.loads(last_line)
        except json.JSONDecodeError:
            return False

        # Valid JSON that is not an object (list, number, ...) is not an event.
        if not isinstance(last_event, dict):
            return False
        return last_event.get('event_type') == 'workflow_complete'

    def scan_directory(self) -> None:
        """Scan ``data_dir`` and classify every ``*.jsonl`` file in it.

        Results from any previous scan are discarded first, so calling this
        method repeatedly does not duplicate entries.
        """
        self.complete_files = []
        self.incomplete_files = []
        self.error_files = []

        if not self.data_dir.exists():
            print(f"数据目录不存在: {self.data_dir}")
            return

        jsonl_files = sorted(self.data_dir.glob("*.jsonl"))
        print(f"找到 {len(jsonl_files)} 个数据文件")

        for filepath in jsonl_files:
            if self.check_file_completeness(filepath):
                self.complete_files.append(filepath.name)
            else:
                self.incomplete_files.append(filepath.name)

        print(f"完成文件: {len(self.complete_files)} 个")
        print(f"未完成文件: {len(self.incomplete_files)} 个")
        print(f"错误文件: {len(self.error_files)} 个")

    def _scanned_total(self) -> int:
        """Return the number of classified files.

        Every scanned file lands in exactly one of ``complete_files`` or
        ``incomplete_files``; ``error_files`` only annotates some of the
        incomplete ones and must not be added again.
        """
        return len(self.complete_files) + len(self.incomplete_files)

    @staticmethod
    def _write_name_list(path: Path, names: List[str]) -> None:
        """Write one file name per line to ``path`` (UTF-8)."""
        with open(path, 'w', encoding='utf-8') as f:
            f.writelines(f"{name}\n" for name in names)

    def generate_filter_files(self) -> None:
        """Write the filter lists and the JSON completeness report.

        Creates ``output_dir`` if needed and writes ``incomplete_files.txt``,
        ``complete_files.txt`` and ``completeness_report.json`` into it.
        """
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # List of incomplete files (consumed by other scripts).
        incomplete_list_file = self.output_dir / "incomplete_files.txt"
        self._write_name_list(incomplete_list_file, self.incomplete_files)

        # List of complete files.
        complete_list_file = self.output_dir / "complete_files.txt"
        self._write_name_list(complete_list_file, self.complete_files)

        # Detailed statistics report.
        total = self._scanned_total()
        report_file = self.output_dir / "completeness_report.json"
        report_data = {
            "scan_directory": str(self.data_dir),
            "total_files": total,
            "complete_files_count": len(self.complete_files),
            "incomplete_files_count": len(self.incomplete_files),
            "error_files_count": len(self.error_files),
            "completion_rate": (
                len(self.complete_files) / total if total > 0 else 0.0
            ),
            "incomplete_files": self.incomplete_files,
            "error_files": self.error_files,
        }

        with open(report_file, 'w', encoding='utf-8') as f:
            json.dump(report_data, f, ensure_ascii=False, indent=2)

        print("\n过滤文件已生成:")
        print(f" - 未完成文件列表: {incomplete_list_file}")
        print(f" - 完成文件列表: {complete_list_file}")
        print(f" - 完成度报告: {report_file}")

    def print_summary(self) -> None:
        """Print a human-readable summary; prints nothing if no files were scanned."""
        total = self._scanned_total()
        if total <= 0:
            return

        completion_rate = len(self.complete_files) / total * 100
        print("\n=== 工作流完成度检查汇总 ===")
        print(f"总文件数: {total}")
        print(f"完成文件: {len(self.complete_files)} 个 ({completion_rate:.1f}%)")
        print(f"未完成文件: {len(self.incomplete_files)} 个")

        if self.error_files:
            print(f"错误文件: {len(self.error_files)} 个")

        if self.incomplete_files:
            # Only the first 10 names are shown to keep the output short.
            print("\n未完成的文件(前10个):")
            for filename in self.incomplete_files[:10]:
                print(f" - {filename}")
            if len(self.incomplete_files) > 10:
                print(f" ... 还有 {len(self.incomplete_files) - 10} 个")

    def run_check(self) -> None:
        """Run the full flow: scan, write filter files, print the summary."""
        print("开始检查工作流完成度...")

        # 1. Scan the directory.
        self.scan_directory()

        # 2. Generate the filter files.
        self.generate_filter_files()

        # 3. Print the summary.
        self.print_summary()

        print("完成度检查完成!")


def main():
    """Script entry point.

    Usage: ``script.py DATA_DIR OUTPUT_DIR``. The built-in default paths are
    used unless both arguments are supplied.
    """
    if len(sys.argv) >= 3:
        data_dir = sys.argv[1]
        output_dir = sys.argv[2]
    else:
        data_dir = "results/results0902"
        output_dir = "analysis/0902"

    checker = WorkflowCompletenessChecker(data_dir=data_dir, output_dir=output_dir)
    checker.run_check()


if __name__ == "__main__":
    main()