triage/analysis/workflow_file_cleaner.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
智能工作流文件清理器
基于质量评估的智能清理策略:
- 不完整项目保留10%最优质的删除90%
- 完整项目删除20%质量最差的保留80%
"""
import json
import os
import glob
import re
import shutil
from pathlib import Path
from typing import Dict, Any, List, Optional, Set
import argparse
import logging
from dataclasses import dataclass
from datetime import datetime
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
@dataclass
class QualityScore:
"""质量评分数据类"""
professional_penalty: float # 专业指标惩罚分
triage_penalty: float # 分诊错误惩罚分
total_penalty: float # 总惩罚分
is_complete: bool # 是否完整
file_path: str # 文件路径
class IntelligentWorkflowCleaner:
"""基于质量评估的智能工作流文件清理器"""
def __init__(self, directory: str, dry_run: bool = False,
keep_incomplete_ratio: float = 0.1,
remove_complete_ratio: float = 0.2):
"""
初始化智能清理器
Args:
directory: 要检查的目录路径
dry_run: 是否为试运行模式(不实际删除文件)
keep_incomplete_ratio: 不完整项目保留比例默认10%
remove_complete_ratio: 完整项目删除比例默认20%
"""
self.directory = Path(directory)
self.dry_run = dry_run
self.keep_incomplete_ratio = keep_incomplete_ratio
self.remove_complete_ratio = remove_complete_ratio
# Mapping of the four core quality-evaluation indicators
# (internal key -> field name in the evaluator output)
self.quality_indicators = {
'clinical_inquiry': 'clinical_inquiry',
'communication_quality': 'communication_quality',
'information_completeness': 'information_completeness',  # corrected field name
'overall_professional': 'overall_professionalism'
}
# Path to the dataset file
self.dataset_path = Path('dataset/bbb.json')
self.stats = {
'total_files': 0,
'complete_files': 0,
'incomplete_files': 0,
'kept_incomplete_files': [],
'deleted_incomplete_files': [],
'kept_complete_files': [],
'deleted_complete_files': [],
'error_files': [],
'deleted_case_indices': [],  # indices of cases that were deleted
'deleted_cases_info': [],  # detailed info about the deleted cases
'dataset_backup_path': '',  # path of the dataset backup file
'quality_analysis': {
'incomplete': {'avg_penalty': 0.0, 'score_range': (0.0, 0.0)},
'complete': {'avg_penalty': 0.0, 'score_range': (0.0, 0.0)}
}
}
def calculate_professional_penalty(self, evaluation_data_by_round: Dict[int, Dict[str, Any]]) -> float:
"""
计算专业指标惩罚分数
公式: Σ(round_i * Σ(四个指标的惩罚分))
Args:
evaluation_data_by_round: 按轮次组织的评估数据字典
Returns:
float: 专业指标惩罚分数
"""
penalty = 0.0
# 遍历所有轮次
for round_num, round_data in evaluation_data_by_round.items():
# 计算该轮次四个指标的惩罚分总和
round_penalty_sum = 0.0
for indicator_key in self.quality_indicators.values():
if indicator_key in round_data:
indicator_data = round_data[indicator_key]
# 处理嵌套的score结构
if isinstance(indicator_data, dict):
score = indicator_data.get('score', 3.0)
else:
# 兼容直接存储score的情况
score = float(indicator_data) if isinstance(indicator_data, (int, float)) else 3.0
# 只有分数低于3.0才计算惩罚
if score < 3.0:
round_penalty_sum += (3.0 - score)
# 轮次惩罚 = 轮次编号 × 该轮次四个指标惩罚分之和
penalty += round_num * round_penalty_sum
return penalty
def calculate_triage_penalty(self, jsonl_file: str, case_data: Dict[str, Any]) -> float:
"""
计算分诊错误惩罚分数
如果第一轮的一级和二级都正确,才开始计算。后续错几轮就是几分
Args:
jsonl_file: JSONL文件路径
case_data: 案例数据
Returns:
float: 分诊错误惩罚分数
"""
try:
correct_primary = case_data.get('一级科室', '')
correct_secondary = case_data.get('二级科室', '')
# 提取所有triager agent的分诊结果
triage_steps = []
with open(jsonl_file, 'r', encoding='utf-8') as f:
for line in f:
try:
event = json.loads(line.strip())
if (event.get('event_type') == 'agent_execution' and
event.get('agent_name') == 'triager'):
output_data = event.get('output_data', {})
step_number = event.get('step_number', 0)
predicted_primary = output_data.get('primary_department', '')
predicted_secondary = output_data.get('secondary_department', '')
triage_steps.append({
'step_number': step_number,
'primary_department': predicted_primary,
'secondary_department': predicted_secondary,
'primary_correct': predicted_primary == correct_primary,
'secondary_correct': predicted_secondary == correct_secondary
})
except (json.JSONDecodeError, KeyError):
continue
if not triage_steps:
return 0.0
# 按步骤号排序
triage_steps.sort(key=lambda x: x['step_number'])
# 检查第一轮是否完全正确(一级和二级都正确)
first_round = triage_steps[0]
if not (first_round['primary_correct'] and first_round['secondary_correct']):
# 第一轮不完全正确,不计算惩罚分
return 0.0
# 计算后续轮次的错误数
error_rounds = 0
for step in triage_steps[1:]: # 从第二轮开始
# 只要一级或二级有一个错误,就算这轮错误
if not (step['primary_correct'] and step['secondary_correct']):
error_rounds += 1
return float(error_rounds)
except Exception as e:
logger.warning(f"计算分诊惩罚分时出错 {jsonl_file}: {e}")
return 0.0
def calculate_quality_score(self, jsonl_file: str) -> Optional[QualityScore]:
"""
计算文件的质量分数
Returns:
QualityScore: 质量评分对象如果无法计算则返回None
"""
try:
with open(jsonl_file, 'r', encoding='utf-8') as f:
lines = f.readlines()
if not lines:
return None
# 检查是否完整
is_complete = self.check_workflow_completion(jsonl_file)
# 获取案例数据
case_data = {}
evaluation_data_by_round = {} # 按轮次组织评估数据
for line in lines:
try:
event = json.loads(line.strip())
# 获取案例数据
if event.get('event_type') == 'workflow_start':
case_data = event.get('case_data', {})
# 获取评估数据,按轮次组织
elif (event.get('event_type') == 'agent_execution' and
event.get('agent_name') == 'evaluator'):
output_data = event.get('output_data', {})
# 从execution_metadata中获取轮次信息
execution_metadata = event.get('execution_metadata', {})
round_num = execution_metadata.get('round', 1) # 默认第1轮
# 按轮次存储评估数据
if round_num not in evaluation_data_by_round:
evaluation_data_by_round[round_num] = {}
evaluation_data_by_round[round_num].update(output_data)
except (json.JSONDecodeError, KeyError):
continue
# 计算专业指标惩罚分
professional_penalty = self.calculate_professional_penalty(evaluation_data_by_round)
# 计算分诊惩罚分
triage_penalty = self.calculate_triage_penalty(jsonl_file, case_data)
# Total penalty = professional penalty + 5 × triage penalty (triage errors weighted more heavily)
total_penalty = professional_penalty + 5 * triage_penalty
return QualityScore(
professional_penalty=professional_penalty,
triage_penalty=triage_penalty,
total_penalty=total_penalty,
is_complete=is_complete,
file_path=jsonl_file
)
except Exception as e:
logger.error(f"计算质量分数时出错 {jsonl_file}: {e}")
return None
def check_workflow_completion(self, jsonl_file: str) -> bool:
"""
检查工作流是否完整
Args:
jsonl_file: JSONL文件路径
Returns:
bool: True表示工作流完整False表示不完整
"""
try:
with open(jsonl_file, 'r', encoding='utf-8') as f:
lines = f.readlines()
if not lines:
logger.warning(f"文件为空: {jsonl_file}")
return False
# 获取最后一行
last_line = lines[-1].strip()
if not last_line:
logger.warning(f"文件最后一行为空: {jsonl_file}")
return False
try:
last_event = json.loads(last_line)
except json.JSONDecodeError as e:
logger.error(f"解析最后一行JSON失败 {jsonl_file}: {e}")
return False
# 检查是否包含workflow_complete事件
if last_event.get('event_type') != 'workflow_complete':
logger.info(f"工作流未完成 - 缺少workflow_complete事件: {jsonl_file}")
return False
# 检查final_summary中的phases完成状态
final_summary = last_event.get('final_summary', {})
phases = final_summary.get('phases', {})
required_phases = ['triage', 'hpi', 'ph']
for phase in required_phases:
phase_info = phases.get(phase, {})
is_completed = phase_info.get('is_completed', False)
completion_rate = phase_info.get('completion_rate', 0.0)
if not is_completed or completion_rate != 1.0:
logger.info(f"工作流未完成 - 阶段 {phase} 未完成: {jsonl_file}")
return False
logger.info(f"工作流完整: {jsonl_file}")
return True
except Exception as e:
logger.error(f"检查文件时发生错误 {jsonl_file}: {e}")
return False
def extract_case_index_from_filename(self, filename: str) -> Optional[int]:
"""
从工作流文件名中提取case索引
Args:
filename: 工作流文件名 (如: workflow_20250819_001717_case_0000.jsonl)
Returns:
int: case索引号如果无法提取则返回None
"""
try:
# 匹配模式: workflow_*_case_*.jsonl
match = re.search(r'workflow_.*_case_(\d+)\.jsonl$', filename)
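# e.g. 'workflow_20250819_001717_case_0000.jsonl' -> captures '0000' -> 0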
if match:
return int(match.group(1))
return None
except Exception as e:
logger.warning(f"无法从文件名提取case索引 {filename}: {e}")
return None
def backup_dataset(self) -> bool:
"""
备份dataset文件
Returns:
bool: 备份成功返回True失败返回False
"""
try:
if not self.dataset_path.exists():
logger.warning(f"Dataset文件不存在: {self.dataset_path}")
return False
# 生成带时间戳的备份文件名
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
backup_filename = f"bbb_backup_{timestamp}.json"
backup_path = self.dataset_path.parent / backup_filename
# 执行备份
shutil.copy2(self.dataset_path, backup_path)
self.stats['dataset_backup_path'] = str(backup_path)
logger.info(f"Dataset已备份到: {backup_path}")
return True
except Exception as e:
logger.error(f"备份dataset失败: {e}")
return False
def load_dataset(self) -> Optional[List[Dict[str, Any]]]:
"""
加载dataset数据
Returns:
List: dataset数据列表失败返回None
"""
try:
if not self.dataset_path.exists():
logger.error(f"Dataset文件不存在: {self.dataset_path}")
return None
with open(self.dataset_path, 'r', encoding='utf-8') as f:
dataset = json.load(f)
logger.info(f"成功加载dataset包含{len(dataset)}个case")
return dataset
except Exception as e:
logger.error(f"加载dataset失败: {e}")
return None
def save_dataset(self, dataset: List[Dict[str, Any]]) -> bool:
"""
保存更新后的dataset
Args:
dataset: 更新后的dataset数据
Returns:
bool: 保存成功返回True失败返回False
"""
try:
with open(self.dataset_path, 'w', encoding='utf-8') as f:
json.dump(dataset, f, ensure_ascii=False, indent=2)
logger.info(f"成功保存更新后的dataset包含{len(dataset)}个case")
return True
except Exception as e:
logger.error(f"保存dataset失败: {e}")
return False
def collect_case_info(self, jsonl_file: str, case_index: int,
dataset: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
收集被删除case的详细信息
Args:
jsonl_file: 工作流文件路径
case_index: case索引号
dataset: dataset数据
Returns:
Dict: case详细信息
"""
case_info = {
'case_index': case_index,
'jsonl_file': jsonl_file,
'case_data': None,
'primary_department': '',
'secondary_department': ''
}
try:
# 从dataset获取case数据
if 0 <= case_index < len(dataset):
case_info['case_data'] = dataset[case_index]
case_info['primary_department'] = dataset[case_index].get('一级科室', '')
case_info['secondary_department'] = dataset[case_index].get('二级科室', '')
else:
logger.warning(f"Case索引超出范围: {case_index}")
except Exception as e:
logger.error(f"收集case信息时出错 {jsonl_file}: {e}")
return case_info
def sync_delete_dataset_cases(self, deleted_case_indices: Set[int]) -> bool:
"""
同步删除dataset中的case数据
Args:
deleted_case_indices: 要删除的case索引集合
Returns:
bool: 删除成功返回True失败返回False
"""
try:
# 加载dataset
dataset = self.load_dataset()
if dataset is None:
return False
# 备份dataset
if not self.backup_dataset():
logger.error("无法备份dataset取消删除操作")
return False
# 按索引降序排列,避免删除时索引偏移
sorted_indices = sorted(deleted_case_indices, reverse=True)
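# e.g. with indices {2, 5}: popping 5 first leaves position 2 untouched,
# whereas popping in ascending order would shift the later positions.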
original_count = len(dataset)
# 删除对应的case
for case_index in sorted_indices:
if 0 <= case_index < len(dataset):
removed_case = dataset.pop(case_index)
logger.info(f"从dataset删除case {case_index}: {removed_case.get('一级科室', '')}-{removed_case.get('二级科室', '')}")
else:
logger.warning(f"无效的case索引: {case_index}")
# 保存更新后的dataset
if self.save_dataset(dataset):
logger.info(f"成功从dataset删除{original_count - len(dataset)}个case")
return True
else:
logger.error("保存更新后的dataset失败")
return False
except Exception as e:
logger.error(f"同步删除dataset中的case时出错: {e}")
return False
def validate_data_consistency(self) -> Dict[str, Any]:
"""
验证工作流文件与dataset的数据一致性
Returns:
Dict: 验证结果
"""
validation_results = {
'total_jsonl_files': 0,
'valid_case_mappings': 0,
'invalid_case_mappings': [],
'missing_case_indices': [],
'dataset_size': 0,
'max_case_index': -1,
'consistency_rate': 0.0,
'validation_passed': False
}
try:
# 加载dataset
dataset = self.load_dataset()
if dataset is None:
validation_results['error'] = "无法加载dataset"
return validation_results
validation_results['dataset_size'] = len(dataset)
# 查找所有JSONL文件
jsonl_pattern = str(self.directory / "**" / "*.jsonl")
jsonl_files = glob.glob(jsonl_pattern, recursive=True)
validation_results['total_jsonl_files'] = len(jsonl_files)
# 验证每个文件的case索引
for jsonl_file in jsonl_files:
filename = os.path.basename(jsonl_file)
case_index = self.extract_case_index_from_filename(filename)
if case_index is not None:
validation_results['max_case_index'] = max(validation_results['max_case_index'], case_index)
if 0 <= case_index < len(dataset):
validation_results['valid_case_mappings'] += 1
else:
validation_results['invalid_case_mappings'].append({
'file': jsonl_file,
'case_index': case_index,
'reason': '索引超出dataset范围'
})
else:
validation_results['invalid_case_mappings'].append({
'file': jsonl_file,
'case_index': None,
'reason': '无法从文件名提取case索引'
})
# 检查缺失的case索引
if validation_results['max_case_index'] >= 0:
existing_indices = set()
for jsonl_file in jsonl_files:
filename = os.path.basename(jsonl_file)
case_index = self.extract_case_index_from_filename(filename)
if case_index is not None:
existing_indices.add(case_index)
expected_indices = set(range(validation_results['max_case_index'] + 1))
missing_indices = expected_indices - existing_indices
validation_results['missing_case_indices'] = sorted(missing_indices)
# 计算一致性率
if validation_results['total_jsonl_files'] > 0:
validation_results['consistency_rate'] = validation_results['valid_case_mappings'] / validation_results['total_jsonl_files']
# 判断验证是否通过
validation_results['validation_passed'] = (
validation_results['consistency_rate'] >= 0.95 and
len(validation_results['missing_case_indices']) == 0
)
logger.info(f"数据一致性验证完成: 一致性率 {validation_results['consistency_rate']:.2%}")
except Exception as e:
logger.error(f"数据一致性验证时出错: {e}")
validation_results['error'] = str(e)
return validation_results
def analyze_and_clean_files(self) -> None:
"""基于质量评估扫描并智能清理文件"""
if not self.directory.exists():
logger.error(f"目录不存在: {self.directory}")
return
# 查找所有JSONL文件
jsonl_pattern = str(self.directory / "**" / "*.jsonl")
jsonl_files = glob.glob(jsonl_pattern, recursive=True)
self.stats['total_files'] = len(jsonl_files)
logger.info(f"找到 {len(jsonl_files)} 个JSONL文件")
# 预加载dataset以供后续使用
dataset = self.load_dataset()
if dataset is None:
logger.warning("无法加载dataset将跳过dataset同步删除")
# 计算所有文件的质量分数
logger.info("正在计算质量分数...")
complete_files = []
incomplete_files = []
for jsonl_file in jsonl_files:
try:
quality_score = self.calculate_quality_score(jsonl_file)
if quality_score is None:
self.stats['error_files'].append(jsonl_file)
continue
if quality_score.is_complete:
complete_files.append(quality_score)
self.stats['complete_files'] += 1
else:
incomplete_files.append(quality_score)
self.stats['incomplete_files'] += 1
except Exception as e:
logger.error(f"处理文件时发生错误 {jsonl_file}: {e}")
self.stats['error_files'].append(jsonl_file)
# 智能清理逻辑增强版包含dataset同步删除
self._smart_cleanup_with_sync(complete_files, incomplete_files, dataset)
def _smart_cleanup_with_sync(self, complete_files: List[QualityScore],
incomplete_files: List[QualityScore],
dataset: Optional[List[Dict[str, Any]]]) -> None:
"""
执行智能清理逻辑包含dataset同步删除功能
Args:
complete_files: 完整文件的质量评分列表
incomplete_files: 不完整文件的质量评分列表
dataset: dataset数据用于收集case信息和同步删除
"""
deleted_case_indices = set() # 收集所有要删除的case索引
# 处理不完整文件保留10%最优质的
if incomplete_files:
# 按总惩罚分排序(分数越低质量越好)
incomplete_files.sort(key=lambda x: x.total_penalty)
keep_count = max(1, int(len(incomplete_files) * self.keep_incomplete_ratio))
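# max(1, ...) guarantees at least one incomplete file survives even when the
# ratio rounds down to zero.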
keep_files = incomplete_files[:keep_count]
delete_files = incomplete_files[keep_count:]
self.stats['kept_incomplete_files'] = [f.file_path for f in keep_files]
# 记录质量分析
if incomplete_files:
penalties = [f.total_penalty for f in incomplete_files]
self.stats['quality_analysis']['incomplete'] = {
'avg_penalty': sum(penalties) / len(penalties),
'score_range': (min(penalties), max(penalties))
}
logger.info(f"不完整文件: 总数 {len(incomplete_files)}, 保留 {len(keep_files)}, 删除 {len(delete_files)}")
# 删除不完整文件并收集case信息
for quality_score in delete_files:
self._delete_file_with_case_tracking(quality_score, "低质量不完整文件", dataset, deleted_case_indices)
self.stats['deleted_incomplete_files'].append(quality_score.file_path)
# 处理完整文件删除20%质量最差的
if complete_files:
# 按总惩罚分排序(分数越高质量越差)
complete_files.sort(key=lambda x: x.total_penalty, reverse=True)
delete_count = int(len(complete_files) * self.remove_complete_ratio)
delete_files = complete_files[:delete_count]
keep_files = complete_files[delete_count:]
self.stats['kept_complete_files'] = [f.file_path for f in keep_files]
# 记录质量分析
if complete_files:
penalties = [f.total_penalty for f in complete_files]
self.stats['quality_analysis']['complete'] = {
'avg_penalty': sum(penalties) / len(penalties),
'score_range': (min(penalties), max(penalties))
}
logger.info(f"完整文件: 总数 {len(complete_files)}, 保留 {len(keep_files)}, 删除 {len(delete_files)}")
# 删除低质量完整文件并收集case信息
for quality_score in delete_files:
self._delete_file_with_case_tracking(quality_score, "低质量完整文件", dataset, deleted_case_indices)
self.stats['deleted_complete_files'].append(quality_score.file_path)
# Sync-delete the corresponding cases from the dataset. Skipped in dry-run
# mode so that dataset/bbb.json is never modified during a trial run.
if deleted_case_indices and dataset is not None:
    if self.dry_run:
        logger.info(f"[试运行] 将从dataset删除 {len(deleted_case_indices)} 个case: {sorted(deleted_case_indices)}")
    else:
        logger.info(f"准备从dataset中删除 {len(deleted_case_indices)} 个case: {sorted(deleted_case_indices)}")
        if self.sync_delete_dataset_cases(deleted_case_indices):
            logger.info("Dataset同步删除完成")
        else:
            logger.error("Dataset同步删除失败")
elif deleted_case_indices:
    logger.warning(f"检测到 {len(deleted_case_indices)} 个case需要删除但dataset不可用")
# 记录删除的case索引
self.stats['deleted_case_indices'] = sorted(deleted_case_indices)
def _delete_file_with_case_tracking(self, quality_score: QualityScore, reason: str,
dataset: Optional[List[Dict[str, Any]]],
deleted_case_indices: Set[int]) -> None:
"""
删除文件并跟踪相关的case信息
Args:
quality_score: 质量评分对象
reason: 删除原因
dataset: dataset数据
deleted_case_indices: 用于收集被删除case索引的集合
"""
file_path = quality_score.file_path
# 从文件名提取case索引
filename = os.path.basename(file_path)
case_index = self.extract_case_index_from_filename(filename)
if case_index is not None and dataset is not None:
# 收集case信息
case_info = self.collect_case_info(file_path, case_index, dataset)
self.stats['deleted_cases_info'].append(case_info)
deleted_case_indices.add(case_index)
logger.info(f"准备删除{reason}: {file_path} (case_{case_index}: {case_info['primary_department']}-{case_info['secondary_department']})")
else:
logger.info(f"准备删除{reason}: {file_path} (无法提取case索引)")
# 执行文件删除
if self.dry_run:
logger.info(f"[试运行] 将删除{reason}: {file_path}")
else:
try:
os.remove(file_path)
logger.info(f"已删除{reason}: {file_path}")
except Exception as e:
logger.error(f"删除文件失败 {file_path}: {e}")
self.stats['error_files'].append(file_path)
def _delete_file(self, file_path: str, reason: str) -> None:
"""
删除文件(兼容性方法)
Args:
file_path: 文件路径
reason: 删除原因
"""
if self.dry_run:
logger.info(f"[试运行] 将删除{reason}: {file_path}")
else:
try:
os.remove(file_path)
logger.info(f"已删除{reason}: {file_path}")
except Exception as e:
logger.error(f"删除文件失败 {file_path}: {e}")
self.stats['error_files'].append(file_path)
def print_summary(self) -> None:
"""打印详细的统计摘要"""
print("\n" + "="*80)
print("🧠 智能工作流文件清理摘要")
print("="*80)
# 基本统计
print(f"📊 基本统计:")
print(f" 总文件数: {self.stats['total_files']}")
print(f" 完整文件数: {self.stats['complete_files']}")
print(f" 不完整文件数: {self.stats['incomplete_files']}")
print(f" 错误文件数: {len(self.stats['error_files'])}")
# 清理策略统计
print(f"\n🎯 清理策略统计:")
print(f" 不完整文件保留比例: {self.keep_incomplete_ratio*100:.1f}%")
print(f" 完整文件删除比例: {self.remove_complete_ratio*100:.1f}%")
# 不完整文件处理结果
if self.stats['incomplete_files'] > 0:
kept_incomplete = len(self.stats['kept_incomplete_files'])
deleted_incomplete = len(self.stats['deleted_incomplete_files'])
print(f"\n📋 不完整文件处理:")
print(f" 保留数量: {kept_incomplete} ({kept_incomplete/self.stats['incomplete_files']*100:.1f}%)")
print(f" 删除数量: {deleted_incomplete} ({deleted_incomplete/self.stats['incomplete_files']*100:.1f}%)")
qa = self.stats['quality_analysis']['incomplete']
if qa['avg_penalty'] > 0:
print(f" 平均惩罚分: {qa['avg_penalty']:.2f}")
print(f" 分数范围: {qa['score_range'][0]:.2f} - {qa['score_range'][1]:.2f}")
# 完整文件处理结果
if self.stats['complete_files'] > 0:
kept_complete = len(self.stats['kept_complete_files'])
deleted_complete = len(self.stats['deleted_complete_files'])
print(f"\n✅ 完整文件处理:")
print(f" 保留数量: {kept_complete} ({kept_complete/self.stats['complete_files']*100:.1f}%)")
print(f" 删除数量: {deleted_complete} ({deleted_complete/self.stats['complete_files']*100:.1f}%)")
qa = self.stats['quality_analysis']['complete']
if qa['avg_penalty'] > 0:
print(f" 平均惩罚分: {qa['avg_penalty']:.2f}")
print(f" 分数范围: {qa['score_range'][0]:.2f} - {qa['score_range'][1]:.2f}")
# 总删除统计
total_deleted = len(self.stats['deleted_incomplete_files']) + len(self.stats['deleted_complete_files'])
if total_deleted > 0:
print(f"\n🗑️ 总删除统计:")
print(f" 删除的不完整文件: {len(self.stats['deleted_incomplete_files'])}")
print(f" 删除的完整文件: {len(self.stats['deleted_complete_files'])}")
print(f" 总删除数量: {total_deleted}")
# 删除的case信息统计
if self.stats['deleted_case_indices']:
print(f"\n📋 删除的Case统计:")
print(f" 删除的case数量: {len(self.stats['deleted_case_indices'])}")
print(f" 删除的case索引: {self.stats['deleted_case_indices'][:10]}{'...' if len(self.stats['deleted_case_indices']) > 10 else ''}")
# 按科室统计删除的case
if self.stats['deleted_cases_info']:
dept_stats = {}
for case_info in self.stats['deleted_cases_info']:
dept_key = f"{case_info['primary_department']}-{case_info['secondary_department']}"
dept_stats[dept_key] = dept_stats.get(dept_key, 0) + 1
print(f"\n 按科室统计删除的case:")
for dept, count in sorted(dept_stats.items(), key=lambda x: x[1], reverse=True)[:10]:
print(f" {dept}: {count}")
if len(dept_stats) > 10:
print(f" ... 以及其他 {len(dept_stats) - 10} 个科室")
# Dataset备份信息
if self.stats['dataset_backup_path']:
print(f"\n💾 Dataset备份:")
print(f" 备份文件: {self.stats['dataset_backup_path']}")
# 错误文件
if self.stats['error_files']:
print(f"\n⚠️ 处理错误的文件 ({len(self.stats['error_files'])})个:")
for file in self.stats['error_files'][:5]: # 只显示前5个
print(f" - {file}")
if len(self.stats['error_files']) > 5:
print(f" ... 以及其他 {len(self.stats['error_files'])-5} 个文件")
# 数据一致性验证结果
if 'validation_results' in self.stats:
validation = self.stats['validation_results']
print(f"\n🔍 数据一致性验证:")
print(f" Dataset大小: {validation.get('dataset_size', 0)}")
print(f" JSONL文件数: {validation.get('total_jsonl_files', 0)}")
print(f" 有效映射数: {validation.get('valid_case_mappings', 0)}")
print(f" 一致性率: {validation.get('consistency_rate', 0):.2%}")
print(f" 验证状态: {'✅ 通过' if validation.get('validation_passed', False) else '❌ 未通过'}")
if validation.get('missing_case_indices'):
missing_count = len(validation['missing_case_indices'])
print(f" 缺失索引: {missing_count}{validation['missing_case_indices'][:5]}{'...' if missing_count > 5 else ''}")
if validation.get('invalid_case_mappings'):
invalid_count = len(validation['invalid_case_mappings'])
print(f" 无效映射: {invalid_count}")
if self.dry_run:
print(f"\n💡 注意: 这是试运行模式,实际上没有删除任何文件")
# 质量分析建议
print(f"\n🔍 质量分析建议:")
incomplete_avg = self.stats['quality_analysis']['incomplete']['avg_penalty']
complete_avg = self.stats['quality_analysis']['complete']['avg_penalty']
if incomplete_avg > complete_avg:
print(f" - 不完整文件的平均质量较低,建议优化工作流执行")
else:
print(f" - 完整文件中仍有质量问题,建议加强质量控制")
if incomplete_avg > 3.0:
print(f" - 不完整文件质量分数偏高,建议检查中断原因")
if complete_avg > 2.0:
print(f" - 完整文件质量有待提升,建议优化评估标准")
def run(self) -> Dict[str, Any]:
"""
运行清理器
Returns:
Dict: 包含统计信息的字典
"""
logger.info(f"🚀 开始智能分析目录: {self.directory}")
logger.info(f"📋 清理策略: 保留{self.keep_incomplete_ratio*100:.0f}%最优不完整文件,删除{self.remove_complete_ratio*100:.0f}%最差完整文件")
if self.dry_run:
logger.info("🧪 运行在试运行模式")
# 执行数据一致性验证
logger.info("🔍 执行数据一致性验证...")
validation_results = self.validate_data_consistency()
self.stats['validation_results'] = validation_results
if not validation_results.get('validation_passed', False):
logger.warning(f"⚠️ 数据一致性验证未通过: 一致性率 {validation_results.get('consistency_rate', 0):.2%}")
if validation_results.get('missing_case_indices'):
logger.warning(f" 缺失的case索引: {validation_results['missing_case_indices'][:10]}{'...' if len(validation_results['missing_case_indices']) > 10 else ''}")
if validation_results.get('invalid_case_mappings'):
logger.warning(f" 无效的case映射: {len(validation_results['invalid_case_mappings'])}")
else:
logger.info("✅ 数据一致性验证通过")
self.analyze_and_clean_files()
self.print_summary()
return self.stats
def main():
"""主函数"""
parser = argparse.ArgumentParser(description='基于质量评估的智能工作流文件清理器')
parser.add_argument('directory', nargs='?', default='results/results0905-2',
help='要检查的目录路径 (默认: results/results0903)')
parser.add_argument('--dry-run', action='store_true',
help='试运行模式,不实际删除文件')
parser.add_argument('--keep-incomplete', type=float, default=0.1,
help='不完整文件保留比例 (默认: 0.1, 即10%%)')
parser.add_argument('--remove-complete', type=float, default=0.2,
help='完整文件删除比例 (默认: 0.2, 即20%%)')
args = parser.parse_args()
# 参数验证
if not (0.0 <= args.keep_incomplete <= 1.0):
logger.error("--keep-incomplete 参数必须在 0.0 到 1.0 之间")
return
if not (0.0 <= args.remove_complete <= 1.0):
logger.error("--remove-complete 参数必须在 0.0 到 1.0 之间")
return
cleaner = IntelligentWorkflowCleaner(
directory=args.directory,
dry_run=args.dry_run,
keep_incomplete_ratio=args.keep_incomplete,
remove_complete_ratio=args.remove_complete
)
cleaner.run()
if __name__ == "__main__":
main()