triage/analysis/workflow_completeness_checker.py
iomgaa a1f8ffb09d 增强数据分析工具和工作流检查功能
- 优化数据对比分析工具的准确性和性能
- 完善评估指标分析的算法和统计功能
- 改进医疗工作流分析的深度和覆盖范围
- 增强工作流完整性检查的全面性
- 新增工作流文件清理工具提升维护效率

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-03 21:45:30 +08:00

189 lines
7.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
工作流完成度检查器
检查workflow文件是否完成所有任务生成过滤列表供其他分析脚本使用
"""
import os
import json
import sys
from pathlib import Path
from typing import List, Dict, Tuple
class WorkflowCompletenessChecker:
    """Check workflow ``*.jsonl`` files for completion and emit filter lists.

    A file counts as complete when the JSON object on its second-to-last
    line has a ``task_completion_summary`` whose ``phases`` entry marks every
    phase in ``REQUIRED_PHASES`` with a truthy ``is_completed``.

    Attributes:
        data_dir: Directory scanned for ``*.jsonl`` files.
        output_dir: Directory the list and report files are written to.
        complete_files: Names of files that passed the check.
        incomplete_files: Names of files that did not pass the check.
        error_files: Names of files whose check raised an exception. After
            ``scan_directory`` these names are also in ``incomplete_files``.
    """

    # Every one of these phases must be completed for a file to pass.
    REQUIRED_PHASES = ('triage', 'hpi', 'ph')

    def __init__(self, data_dir: str, output_dir: str):
        """Store the input/output locations and start with empty results.

        Args:
            data_dir: Path of the directory holding the workflow files.
            output_dir: Path of the directory to write results into.
        """
        self.data_dir = Path(data_dir)
        self.output_dir = Path(output_dir)
        self.incomplete_files = []
        self.complete_files = []
        self.error_files = []

    def check_file_completeness(self, filepath: Path) -> bool:
        """Return whether a single workflow file is complete.

        Only the second-to-last line of the file is inspected. A file with
        fewer than two lines, a blank second-to-last line, or a second-to-last
        line that is not valid JSON is reported as not complete.

        Any other exception (for example an unreadable file, or a JSON value
        that is not shaped like the expected object) is printed, the file name
        is appended to ``self.error_files``, and ``False`` is returned.

        Args:
            filepath: Path of the file to check.

        Returns:
            True if every required phase is completed, False otherwise.
        """
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                lines = f.readlines()

            # Both the second-to-last line and the last line must exist.
            if len(lines) < 2:
                return False

            summary_line = lines[-2].strip()
            if not summary_line:
                return False

            try:
                event = json.loads(summary_line)
            except json.JSONDecodeError:
                return False

            task_summary = event.get('task_completion_summary', {})
            if not task_summary:
                return False

            phases = task_summary.get('phases', {})
            return all(
                phases.get(phase, {}).get('is_completed', False)
                for phase in self.REQUIRED_PHASES
            )
        except Exception as e:
            # Best-effort: one bad file must not abort the whole scan.
            print(f"检查文件 {filepath.name} 时出错: {e}")
            self.error_files.append(filepath.name)
            return False

    def scan_directory(self) -> None:
        """Classify every ``*.jsonl`` file directly inside ``data_dir``.

        Results from any previous scan are discarded first, so calling this
        method repeatedly does not accumulate duplicate entries. If the data
        directory does not exist, a message is printed and the result lists
        stay empty.
        """
        # Start from a clean slate so a rescan does not duplicate entries.
        self.complete_files = []
        self.incomplete_files = []
        self.error_files = []

        if not self.data_dir.exists():
            print(f"数据目录不存在: {self.data_dir}")
            return

        jsonl_files = sorted(self.data_dir.glob("*.jsonl"))
        print(f"找到 {len(jsonl_files)} 个数据文件")

        for filepath in jsonl_files:
            if self.check_file_completeness(filepath):
                self.complete_files.append(filepath.name)
            else:
                # Files that raised during the check land here as well.
                self.incomplete_files.append(filepath.name)

        print(f"完成文件: {len(self.complete_files)}")
        print(f"未完成文件: {len(self.incomplete_files)}")
        print(f"错误文件: {len(self.error_files)}")

    def generate_filter_files(self) -> None:
        """Write the file lists and the JSON report into ``output_dir``.

        Creates ``output_dir`` if needed, then writes:

        * ``incomplete_files.txt`` - one incomplete file name per line.
        * ``complete_files.txt`` - one complete file name per line.
        * ``completeness_report.json`` - counts, completion rate, and the
          incomplete and error file names.
        """
        self.output_dir.mkdir(parents=True, exist_ok=True)

        incomplete_list_file = self.output_dir / "incomplete_files.txt"
        with open(incomplete_list_file, 'w', encoding='utf-8') as f:
            f.writelines(f"{filename}\n" for filename in self.incomplete_files)

        complete_list_file = self.output_dir / "complete_files.txt"
        with open(complete_list_file, 'w', encoding='utf-8') as f:
            f.writelines(f"{filename}\n" for filename in self.complete_files)

        # Error files are already part of incomplete_files, so adding
        # len(self.error_files) here would count each of them twice.
        total = len(self.complete_files) + len(self.incomplete_files)
        completion_rate = len(self.complete_files) / total if total > 0 else 0.0

        report_file = self.output_dir / "completeness_report.json"
        report_data = {
            "scan_directory": str(self.data_dir),
            "total_files": total,
            "complete_files_count": len(self.complete_files),
            "incomplete_files_count": len(self.incomplete_files),
            "error_files_count": len(self.error_files),
            "completion_rate": completion_rate,
            "incomplete_files": self.incomplete_files,
            "error_files": self.error_files,
        }
        with open(report_file, 'w', encoding='utf-8') as f:
            json.dump(report_data, f, ensure_ascii=False, indent=2)

        print("\n过滤文件已生成:")
        print(f" - 未完成文件列表: {incomplete_list_file}")
        print(f" - 完成文件列表: {complete_list_file}")
        print(f" - 完成度报告: {report_file}")

    def print_summary(self) -> None:
        """Print totals and up to ten incomplete file names.

        Nothing is printed when no files were classified.
        """
        total = len(self.complete_files) + len(self.incomplete_files)
        if total <= 0:
            return

        completion_rate = len(self.complete_files) / total * 100
        print("\n=== 工作流完成度检查汇总 ===")
        print(f"总文件数: {total}")
        print(f"完成文件: {len(self.complete_files)} 个 ({completion_rate:.1f}%)")
        print(f"未完成文件: {len(self.incomplete_files)}")
        if self.error_files:
            print(f"错误文件: {len(self.error_files)}")

        if self.incomplete_files:
            print("\n未完成的文件前10个:")
            for filename in self.incomplete_files[:10]:
                print(f" - {filename}")
            if len(self.incomplete_files) > 10:
                print(f" ... 还有 {len(self.incomplete_files) - 10}")

    def run_check(self) -> None:
        """Run the full pipeline: scan, write output files, print summary."""
        print("开始检查工作流完成度...")
        self.scan_directory()
        self.generate_filter_files()
        self.print_summary()
        print("完成度检查完成!")
def main():
    """Run the completeness check from the command line.

    When at least two positional arguments are given, the first is used as
    the data directory and the second as the output directory. Otherwise
    the defaults "results/results0902" and "analysis/0902" are used.
    """
    cli_paths = sys.argv[1:3]
    if len(cli_paths) == 2:
        data_dir, output_dir = cli_paths
    else:
        data_dir, output_dir = "results/results0902", "analysis/0902"

    WorkflowCompletenessChecker(data_dir=data_dir, output_dir=output_dir).run_check()
# Run the check only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()