- Simplify the evaluator prompt: remove verbose examples, keep only the core scoring criteria
- Refactor the workflow cleanup tool into an intelligent quality-assessment cleaner
- Enhance the quality analysis algorithm with professional-indicator and triage-error penalty calculation
- Add synchronized dataset deletion to keep data consistent
- Add quality validation and data-consistency checking

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Intelligent workflow file cleaner

Quality-assessment-based cleanup strategy:
- Incomplete runs: keep the best 10%, delete the remaining 90%
- Complete runs: delete the worst 20%, keep the remaining 80%
"""
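# Example invocations (illustrative; the script filename is an assumption, not given in the repo view):
#   python clean_workflows.py results/results0905-2 --dry-run
#   python clean_workflows.py results/results0905-2 --keep-incomplete 0.1 --remove-complete 0.2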

import json
import os
import glob
import re
import shutil
from pathlib import Path
from typing import Dict, Any, List, Optional, Set
import argparse
import logging
from dataclasses import dataclass
from datetime import datetime

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


@dataclass
class QualityScore:
    """Quality score record for one workflow file"""
    professional_penalty: float  # penalty from the professional indicators
    triage_penalty: float        # penalty from triage errors
    total_penalty: float         # combined penalty
    is_complete: bool            # whether the workflow is complete
    file_path: str               # path to the workflow file


class IntelligentWorkflowCleaner:
    """Quality-assessment-based intelligent workflow file cleaner"""

    def __init__(self, directory: str, dry_run: bool = False,
                 keep_incomplete_ratio: float = 0.1,
                 remove_complete_ratio: float = 0.2):
        """
        Initialize the cleaner.

        Args:
            directory: directory to scan
            dry_run: dry-run mode (do not actually delete files)
            keep_incomplete_ratio: fraction of incomplete runs to keep (default 10%)
            remove_complete_ratio: fraction of complete runs to delete (default 20%)
        """
        self.directory = Path(directory)
        self.dry_run = dry_run
        self.keep_incomplete_ratio = keep_incomplete_ratio
        self.remove_complete_ratio = remove_complete_ratio

        # Mapping of the four core quality-assessment indicators
        self.quality_indicators = {
            'clinical_inquiry': 'clinical_inquiry',
            'communication_quality': 'communication_quality',
            'information_completeness': 'information_completeness',  # corrected field name
            'overall_professional': 'overall_professionalism'
        }

        # Dataset path
        self.dataset_path = Path('dataset/bbb.json')

        self.stats = {
            'total_files': 0,
            'complete_files': 0,
            'incomplete_files': 0,
            'kept_incomplete_files': [],
            'deleted_incomplete_files': [],
            'kept_complete_files': [],
            'deleted_complete_files': [],
            'error_files': [],
            'deleted_case_indices': [],  # indices of the deleted cases
            'deleted_cases_info': [],    # detailed info of the deleted cases
            'dataset_backup_path': '',   # path of the dataset backup file
            'quality_analysis': {
                'incomplete': {'avg_penalty': 0.0, 'score_range': (0.0, 0.0)},
                'complete': {'avg_penalty': 0.0, 'score_range': (0.0, 0.0)}
            }
        }
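    # Scoring sketch (illustrative numbers, not taken from any real run):
    # professional penalty = Σ over rounds of round_num × Σ max(0, 3.0 - score) over the four indicators,
    # e.g. round 1 scores (2.5, 3.0, 2.0, 3.5) -> 1 × (0.5 + 0 + 1.0 + 0) = 1.5;
    #      round 2 scores (2.5, 3.0, 3.0, 3.0) -> 2 × 0.5 = 1.0; professional penalty = 2.5.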

    def calculate_professional_penalty(self, evaluation_data_by_round: Dict[int, Dict[str, Any]]) -> float:
        """
        Compute the professional-indicator penalty.

        Formula: Σ(round_i * Σ(penalties of the four indicators))

        Args:
            evaluation_data_by_round: evaluation data organized by round

        Returns:
            float: professional-indicator penalty
        """
        penalty = 0.0

        # Iterate over all rounds
        for round_num, round_data in evaluation_data_by_round.items():
            # Sum the penalties of the four indicators for this round
            round_penalty_sum = 0.0

            for indicator_key in self.quality_indicators.values():
                if indicator_key in round_data:
                    indicator_data = round_data[indicator_key]

                    # Handle the nested score structure
                    if isinstance(indicator_data, dict):
                        score = indicator_data.get('score', 3.0)
                    else:
                        # Also accept a directly stored score
                        score = float(indicator_data) if isinstance(indicator_data, (int, float)) else 3.0

                    # Only scores below 3.0 incur a penalty
                    if score < 3.0:
                        round_penalty_sum += (3.0 - score)

            # Round penalty = round number × sum of the four indicator penalties for that round
            penalty += round_num * round_penalty_sum

        return penalty
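    # Triage rule of thumb (illustrative): penalties only apply when round 1 got both the
    # primary and the secondary department right; every later round with either one wrong
    # then adds 1.0, e.g. rounds (correct, wrong, correct, wrong) -> 2.0.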

    def calculate_triage_penalty(self, jsonl_file: str, case_data: Dict[str, Any]) -> float:
        """
        Compute the triage-error penalty.

        Penalties are only counted when the first round got both the primary and the
        secondary department right; after that, each wrong round adds one point.

        Args:
            jsonl_file: path to the JSONL file
            case_data: case data

        Returns:
            float: triage-error penalty
        """
        try:
            correct_primary = case_data.get('一级科室', '')
            correct_secondary = case_data.get('二级科室', '')

            # Extract all triage results produced by the triager agent
            triage_steps = []
            with open(jsonl_file, 'r', encoding='utf-8') as f:
                for line in f:
                    try:
                        event = json.loads(line.strip())
                        if (event.get('event_type') == 'agent_execution' and
                                event.get('agent_name') == 'triager'):

                            output_data = event.get('output_data', {})
                            step_number = event.get('step_number', 0)

                            predicted_primary = output_data.get('primary_department', '')
                            predicted_secondary = output_data.get('secondary_department', '')

                            triage_steps.append({
                                'step_number': step_number,
                                'primary_department': predicted_primary,
                                'secondary_department': predicted_secondary,
                                'primary_correct': predicted_primary == correct_primary,
                                'secondary_correct': predicted_secondary == correct_secondary
                            })

                    except (json.JSONDecodeError, KeyError):
                        continue

            if not triage_steps:
                return 0.0

            # Sort by step number
            triage_steps.sort(key=lambda x: x['step_number'])

            # Check whether the first round is fully correct (both primary and secondary)
            first_round = triage_steps[0]
            if not (first_round['primary_correct'] and first_round['secondary_correct']):
                # First round not fully correct: no penalty is counted
                return 0.0

            # Count the wrong rounds after the first one
            error_rounds = 0
            for step in triage_steps[1:]:  # starting from the second round
                # A round counts as wrong if either the primary or the secondary is wrong
                if not (step['primary_correct'] and step['secondary_correct']):
                    error_rounds += 1

            return float(error_rounds)

        except Exception as e:
            logger.warning(f"Error while computing the triage penalty for {jsonl_file}: {e}")

        return 0.0
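    # Total penalty (illustrative): total = professional + 5 × triage, so a professional
    # penalty of 2.5 with 2 wrong triage rounds gives 2.5 + 5 × 2 = 12.5.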

    def calculate_quality_score(self, jsonl_file: str) -> Optional[QualityScore]:
        """
        Compute the quality score of a workflow file.

        Returns:
            QualityScore: the quality score object, or None if it cannot be computed
        """
        try:
            with open(jsonl_file, 'r', encoding='utf-8') as f:
                lines = f.readlines()

            if not lines:
                return None

            # Check completeness
            is_complete = self.check_workflow_completion(jsonl_file)

            # Collect the case data
            case_data = {}
            evaluation_data_by_round = {}  # evaluation data organized by round

            for line in lines:
                try:
                    event = json.loads(line.strip())

                    # Case data
                    if event.get('event_type') == 'workflow_start':
                        case_data = event.get('case_data', {})

                    # Evaluation data, organized by round
                    elif (event.get('event_type') == 'agent_execution' and
                          event.get('agent_name') == 'evaluator'):
                        output_data = event.get('output_data', {})

                        # The round number comes from execution_metadata
                        execution_metadata = event.get('execution_metadata', {})
                        round_num = execution_metadata.get('round', 1)  # default: round 1

                        # Store evaluation data per round
                        if round_num not in evaluation_data_by_round:
                            evaluation_data_by_round[round_num] = {}
                        evaluation_data_by_round[round_num].update(output_data)

                except (json.JSONDecodeError, KeyError):
                    continue

            # Professional-indicator penalty
            professional_penalty = self.calculate_professional_penalty(evaluation_data_by_round)

            # Triage penalty
            triage_penalty = self.calculate_triage_penalty(jsonl_file, case_data)

            # Total penalty
            total_penalty = professional_penalty + 5 * triage_penalty

            return QualityScore(
                professional_penalty=professional_penalty,
                triage_penalty=triage_penalty,
                total_penalty=total_penalty,
                is_complete=is_complete,
                file_path=jsonl_file
            )

        except Exception as e:
            logger.error(f"Error while computing the quality score for {jsonl_file}: {e}")
            return None
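    # A complete run is expected to end with a workflow_complete event roughly shaped like
    # (illustrative sketch reconstructed from the checks below, not a documented schema):
    #   {"event_type": "workflow_complete",
    #    "final_summary": {"phases": {"triage": {"is_completed": true, "completion_rate": 1.0},
    #                                 "hpi": {...}, "ph": {...}}}}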

    def check_workflow_completion(self, jsonl_file: str) -> bool:
        """
        Check whether a workflow is complete.

        Args:
            jsonl_file: path to the JSONL file

        Returns:
            bool: True if the workflow is complete, False otherwise
        """
        try:
            with open(jsonl_file, 'r', encoding='utf-8') as f:
                lines = f.readlines()

            if not lines:
                logger.warning(f"File is empty: {jsonl_file}")
                return False

            # Read the last line
            last_line = lines[-1].strip()
            if not last_line:
                logger.warning(f"Last line of the file is empty: {jsonl_file}")
                return False

            try:
                last_event = json.loads(last_line)
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse the last line as JSON in {jsonl_file}: {e}")
                return False

            # Check for the workflow_complete event
            if last_event.get('event_type') != 'workflow_complete':
                logger.info(f"Workflow not finished - missing workflow_complete event: {jsonl_file}")
                return False

            # Check the phase completion status in final_summary
            final_summary = last_event.get('final_summary', {})
            phases = final_summary.get('phases', {})

            required_phases = ['triage', 'hpi', 'ph']
            for phase in required_phases:
                phase_info = phases.get(phase, {})
                is_completed = phase_info.get('is_completed', False)
                completion_rate = phase_info.get('completion_rate', 0.0)

                if not is_completed or completion_rate != 1.0:
                    logger.info(f"Workflow not finished - phase {phase} incomplete: {jsonl_file}")
                    return False

            logger.info(f"Workflow complete: {jsonl_file}")
            return True

        except Exception as e:
            logger.error(f"Error while checking file {jsonl_file}: {e}")
            return False
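    # Filename-to-index example: "workflow_20250819_001717_case_0042.jsonl" -> 42
    # (0042 is an illustrative value, not taken from a real run).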

    def extract_case_index_from_filename(self, filename: str) -> Optional[int]:
        """
        Extract the case index from a workflow filename.

        Args:
            filename: workflow filename (e.g. workflow_20250819_001717_case_0000.jsonl)

        Returns:
            int: the case index, or None if it cannot be extracted
        """
        try:
            # Pattern: workflow_*_case_*.jsonl
            match = re.search(r'workflow_.*_case_(\d+)\.jsonl$', filename)
            if match:
                return int(match.group(1))
            return None
        except Exception as e:
            logger.warning(f"Could not extract the case index from filename {filename}: {e}")
            return None

    def backup_dataset(self) -> bool:
        """
        Back up the dataset file.

        Returns:
            bool: True if the backup succeeded, False otherwise
        """
        try:
            if not self.dataset_path.exists():
                logger.warning(f"Dataset file does not exist: {self.dataset_path}")
                return False

            # Build a timestamped backup filename
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            backup_filename = f"bbb_backup_{timestamp}.json"
            backup_path = self.dataset_path.parent / backup_filename

            # Perform the backup
            shutil.copy2(self.dataset_path, backup_path)
            self.stats['dataset_backup_path'] = str(backup_path)
            logger.info(f"Dataset backed up to: {backup_path}")
            return True

        except Exception as e:
            logger.error(f"Failed to back up the dataset: {e}")
            return False

    def load_dataset(self) -> Optional[List[Dict[str, Any]]]:
        """
        Load the dataset.

        Returns:
            List: the dataset as a list, or None on failure
        """
        try:
            if not self.dataset_path.exists():
                logger.error(f"Dataset file does not exist: {self.dataset_path}")
                return None

            with open(self.dataset_path, 'r', encoding='utf-8') as f:
                dataset = json.load(f)

            logger.info(f"Dataset loaded successfully with {len(dataset)} cases")
            return dataset

        except Exception as e:
            logger.error(f"Failed to load the dataset: {e}")
            return None

    def save_dataset(self, dataset: List[Dict[str, Any]]) -> bool:
        """
        Save the updated dataset.

        Args:
            dataset: the updated dataset

        Returns:
            bool: True if saving succeeded, False otherwise
        """
        try:
            with open(self.dataset_path, 'w', encoding='utf-8') as f:
                json.dump(dataset, f, ensure_ascii=False, indent=2)

            logger.info(f"Updated dataset saved successfully with {len(dataset)} cases")
            return True

        except Exception as e:
            logger.error(f"Failed to save the dataset: {e}")
            return False

    def collect_case_info(self, jsonl_file: str, case_index: int,
                          dataset: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Collect detailed information about a deleted case.

        Args:
            jsonl_file: path to the workflow file
            case_index: case index
            dataset: the dataset

        Returns:
            Dict: detailed case information
        """
        case_info = {
            'case_index': case_index,
            'jsonl_file': jsonl_file,
            'case_data': None,
            'primary_department': '',
            'secondary_department': ''
        }

        try:
            # Fetch the case data from the dataset
            if 0 <= case_index < len(dataset):
                case_info['case_data'] = dataset[case_index]
                case_info['primary_department'] = dataset[case_index].get('一级科室', '')
                case_info['secondary_department'] = dataset[case_index].get('二级科室', '')
            else:
                logger.warning(f"Case index out of range: {case_index}")

        except Exception as e:
            logger.error(f"Error while collecting case info for {jsonl_file}: {e}")

        return case_info
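    # Deleting by descending index keeps the remaining indices valid, e.g. for a 6-item
    # list removing {2, 5}: pop(5) first, then pop(2) (illustrative values only).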

    def sync_delete_dataset_cases(self, deleted_case_indices: Set[int]) -> bool:
        """
        Delete the corresponding cases from the dataset.

        Args:
            deleted_case_indices: set of case indices to delete

        Returns:
            bool: True if the deletion succeeded, False otherwise
        """
        try:
            # Load the dataset
            dataset = self.load_dataset()
            if dataset is None:
                return False

            # Back up the dataset
            if not self.backup_dataset():
                logger.error("Could not back up the dataset, aborting the deletion")
                return False

            # Sort indices in descending order to avoid index shifting during deletion
            sorted_indices = sorted(deleted_case_indices, reverse=True)
            original_count = len(dataset)

            # Delete the corresponding cases
            for case_index in sorted_indices:
                if 0 <= case_index < len(dataset):
                    removed_case = dataset.pop(case_index)
                    logger.info(f"Deleted case {case_index} from the dataset: {removed_case.get('一级科室', '')}-{removed_case.get('二级科室', '')}")
                else:
                    logger.warning(f"Invalid case index: {case_index}")

            # Save the updated dataset
            if self.save_dataset(dataset):
                logger.info(f"Deleted {original_count - len(dataset)} cases from the dataset")
                return True
            else:
                logger.error("Failed to save the updated dataset")
                return False

        except Exception as e:
            logger.error(f"Error while synchronously deleting cases from the dataset: {e}")
            return False

    def validate_data_consistency(self) -> Dict[str, Any]:
        """
        Validate the consistency between the workflow files and the dataset.

        Returns:
            Dict: validation results
        """
        validation_results = {
            'total_jsonl_files': 0,
            'valid_case_mappings': 0,
            'invalid_case_mappings': [],
            'missing_case_indices': [],
            'dataset_size': 0,
            'max_case_index': -1,
            'consistency_rate': 0.0,
            'validation_passed': False
        }

        try:
            # Load the dataset
            dataset = self.load_dataset()
            if dataset is None:
                validation_results['error'] = "Could not load the dataset"
                return validation_results

            validation_results['dataset_size'] = len(dataset)

            # Find all JSONL files
            jsonl_pattern = str(self.directory / "**" / "*.jsonl")
            jsonl_files = glob.glob(jsonl_pattern, recursive=True)
            validation_results['total_jsonl_files'] = len(jsonl_files)

            # Validate the case index of every file
            for jsonl_file in jsonl_files:
                filename = os.path.basename(jsonl_file)
                case_index = self.extract_case_index_from_filename(filename)

                if case_index is not None:
                    validation_results['max_case_index'] = max(validation_results['max_case_index'], case_index)

                    if 0 <= case_index < len(dataset):
                        validation_results['valid_case_mappings'] += 1
                    else:
                        validation_results['invalid_case_mappings'].append({
                            'file': jsonl_file,
                            'case_index': case_index,
                            'reason': 'index outside the dataset range'
                        })
                else:
                    validation_results['invalid_case_mappings'].append({
                        'file': jsonl_file,
                        'case_index': None,
                        'reason': 'could not extract a case index from the filename'
                    })

            # Check for missing case indices
            if validation_results['max_case_index'] >= 0:
                existing_indices = set()
                for jsonl_file in jsonl_files:
                    filename = os.path.basename(jsonl_file)
                    case_index = self.extract_case_index_from_filename(filename)
                    if case_index is not None:
                        existing_indices.add(case_index)

                expected_indices = set(range(validation_results['max_case_index'] + 1))
                missing_indices = expected_indices - existing_indices
                validation_results['missing_case_indices'] = sorted(missing_indices)

            # Compute the consistency rate
            if validation_results['total_jsonl_files'] > 0:
                validation_results['consistency_rate'] = validation_results['valid_case_mappings'] / validation_results['total_jsonl_files']

            # Decide whether the validation passed
            validation_results['validation_passed'] = (
                validation_results['consistency_rate'] >= 0.95 and
                len(validation_results['missing_case_indices']) == 0
            )

            logger.info(f"Data consistency validation finished: consistency rate {validation_results['consistency_rate']:.2%}")

        except Exception as e:
            logger.error(f"Error during data consistency validation: {e}")
            validation_results['error'] = str(e)

        return validation_results
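    # Cleanup flow: score every JSONL file, split them into complete / incomplete,
    # then hand both groups to _smart_cleanup_with_sync for the ratio-based cleanup.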

    def analyze_and_clean_files(self) -> None:
        """Scan the files and clean them intelligently based on the quality assessment"""
        if not self.directory.exists():
            logger.error(f"Directory does not exist: {self.directory}")
            return

        # Find all JSONL files
        jsonl_pattern = str(self.directory / "**" / "*.jsonl")
        jsonl_files = glob.glob(jsonl_pattern, recursive=True)

        self.stats['total_files'] = len(jsonl_files)
        logger.info(f"Found {len(jsonl_files)} JSONL files")

        # Preload the dataset for later use
        dataset = self.load_dataset()
        if dataset is None:
            logger.warning("Could not load the dataset; dataset sync deletion will be skipped")

        # Compute the quality score of every file
        logger.info("Computing quality scores...")
        complete_files = []
        incomplete_files = []

        for jsonl_file in jsonl_files:
            try:
                quality_score = self.calculate_quality_score(jsonl_file)
                if quality_score is None:
                    self.stats['error_files'].append(jsonl_file)
                    continue

                if quality_score.is_complete:
                    complete_files.append(quality_score)
                    self.stats['complete_files'] += 1
                else:
                    incomplete_files.append(quality_score)
                    self.stats['incomplete_files'] += 1

            except Exception as e:
                logger.error(f"Error while processing file {jsonl_file}: {e}")
                self.stats['error_files'].append(jsonl_file)

        # Intelligent cleanup (enhanced version with dataset sync deletion)
        self._smart_cleanup_with_sync(complete_files, incomplete_files, dataset)
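    # Ratio example (illustrative): with keep_incomplete_ratio=0.1 and 50 incomplete files,
    # the 5 with the lowest total_penalty are kept and 45 are deleted; with
    # remove_complete_ratio=0.2 and 100 complete files, the 20 with the highest penalty go.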

    def _smart_cleanup_with_sync(self, complete_files: List[QualityScore],
                                 incomplete_files: List[QualityScore],
                                 dataset: Optional[List[Dict[str, Any]]]) -> None:
        """
        Run the intelligent cleanup, including the dataset sync deletion.

        Args:
            complete_files: quality scores of the complete files
            incomplete_files: quality scores of the incomplete files
            dataset: dataset used to collect case info and for the sync deletion
        """
        deleted_case_indices = set()  # collect all case indices to delete

        # Incomplete files: keep the best 10%
        if incomplete_files:
            # Sort by total penalty (lower is better)
            incomplete_files.sort(key=lambda x: x.total_penalty)

            keep_count = max(1, int(len(incomplete_files) * self.keep_incomplete_ratio))
            keep_files = incomplete_files[:keep_count]
            delete_files = incomplete_files[keep_count:]

            self.stats['kept_incomplete_files'] = [f.file_path for f in keep_files]

            # Record the quality analysis
            if incomplete_files:
                penalties = [f.total_penalty for f in incomplete_files]
                self.stats['quality_analysis']['incomplete'] = {
                    'avg_penalty': sum(penalties) / len(penalties),
                    'score_range': (min(penalties), max(penalties))
                }

            logger.info(f"Incomplete files: total {len(incomplete_files)}, kept {len(keep_files)}, deleted {len(delete_files)}")

            # Delete incomplete files and collect their case info
            for quality_score in delete_files:
                self._delete_file_with_case_tracking(quality_score, "low-quality incomplete file", dataset, deleted_case_indices)
                self.stats['deleted_incomplete_files'].append(quality_score.file_path)

        # Complete files: delete the worst 20%
        if complete_files:
            # Sort by total penalty (higher is worse)
            complete_files.sort(key=lambda x: x.total_penalty, reverse=True)

            delete_count = int(len(complete_files) * self.remove_complete_ratio)
            delete_files = complete_files[:delete_count]
            keep_files = complete_files[delete_count:]

            self.stats['kept_complete_files'] = [f.file_path for f in keep_files]

            # Record the quality analysis
            if complete_files:
                penalties = [f.total_penalty for f in complete_files]
                self.stats['quality_analysis']['complete'] = {
                    'avg_penalty': sum(penalties) / len(penalties),
                    'score_range': (min(penalties), max(penalties))
                }

            logger.info(f"Complete files: total {len(complete_files)}, kept {len(keep_files)}, deleted {len(delete_files)}")

            # Delete low-quality complete files and collect their case info
            for quality_score in delete_files:
                self._delete_file_with_case_tracking(quality_score, "low-quality complete file", dataset, deleted_case_indices)
                self.stats['deleted_complete_files'].append(quality_score.file_path)

        # Delete the corresponding cases from the dataset
        if deleted_case_indices and dataset is not None:
            logger.info(f"About to delete {len(deleted_case_indices)} cases from the dataset: {sorted(deleted_case_indices)}")
            if self.sync_delete_dataset_cases(deleted_case_indices):
                logger.info("Dataset sync deletion finished")
            else:
                logger.error("Dataset sync deletion failed")
        elif deleted_case_indices:
            logger.warning(f"Found {len(deleted_case_indices)} cases to delete, but the dataset is unavailable")

        # Record the deleted case indices
        self.stats['deleted_case_indices'] = sorted(deleted_case_indices)

    def _delete_file_with_case_tracking(self, quality_score: QualityScore, reason: str,
                                        dataset: Optional[List[Dict[str, Any]]],
                                        deleted_case_indices: Set[int]) -> None:
        """
        Delete a file and track the related case information.

        Args:
            quality_score: quality score object
            reason: reason for the deletion
            dataset: the dataset
            deleted_case_indices: set collecting the indices of deleted cases
        """
        file_path = quality_score.file_path

        # Extract the case index from the filename
        filename = os.path.basename(file_path)
        case_index = self.extract_case_index_from_filename(filename)

        if case_index is not None and dataset is not None:
            # Collect the case info
            case_info = self.collect_case_info(file_path, case_index, dataset)
            self.stats['deleted_cases_info'].append(case_info)
            deleted_case_indices.add(case_index)

            logger.info(f"About to delete {reason}: {file_path} (case_{case_index}: {case_info['primary_department']}-{case_info['secondary_department']})")
        else:
            logger.info(f"About to delete {reason}: {file_path} (could not extract a case index)")

        # Perform the file deletion
        if self.dry_run:
            logger.info(f"[dry run] Would delete {reason}: {file_path}")
        else:
            try:
                os.remove(file_path)
                logger.info(f"Deleted {reason}: {file_path}")
            except Exception as e:
                logger.error(f"Failed to delete file {file_path}: {e}")
                self.stats['error_files'].append(file_path)

    def _delete_file(self, file_path: str, reason: str) -> None:
        """
        Delete a file (compatibility helper).

        Args:
            file_path: path of the file
            reason: reason for the deletion
        """
        if self.dry_run:
            logger.info(f"[dry run] Would delete {reason}: {file_path}")
        else:
            try:
                os.remove(file_path)
                logger.info(f"Deleted {reason}: {file_path}")
            except Exception as e:
                logger.error(f"Failed to delete file {file_path}: {e}")
                self.stats['error_files'].append(file_path)

    def print_summary(self) -> None:
        """Print a detailed summary of the statistics"""
        print("\n" + "="*80)
        print("🧠 Intelligent workflow file cleanup summary")
        print("="*80)

        # Basic statistics
        print(f"📊 Basic statistics:")
        print(f"  Total files: {self.stats['total_files']}")
        print(f"  Complete files: {self.stats['complete_files']}")
        print(f"  Incomplete files: {self.stats['incomplete_files']}")
        print(f"  Files with errors: {len(self.stats['error_files'])}")

        # Cleanup strategy statistics
        print(f"\n🎯 Cleanup strategy:")
        print(f"  Keep ratio for incomplete files: {self.keep_incomplete_ratio*100:.1f}%")
        print(f"  Delete ratio for complete files: {self.remove_complete_ratio*100:.1f}%")

        # Incomplete file results
        if self.stats['incomplete_files'] > 0:
            kept_incomplete = len(self.stats['kept_incomplete_files'])
            deleted_incomplete = len(self.stats['deleted_incomplete_files'])
            print(f"\n📋 Incomplete files:")
            print(f"  Kept: {kept_incomplete} ({kept_incomplete/self.stats['incomplete_files']*100:.1f}%)")
            print(f"  Deleted: {deleted_incomplete} ({deleted_incomplete/self.stats['incomplete_files']*100:.1f}%)")

            qa = self.stats['quality_analysis']['incomplete']
            if qa['avg_penalty'] > 0:
                print(f"  Average penalty: {qa['avg_penalty']:.2f}")
                print(f"  Score range: {qa['score_range'][0]:.2f} - {qa['score_range'][1]:.2f}")

        # Complete file results
        if self.stats['complete_files'] > 0:
            kept_complete = len(self.stats['kept_complete_files'])
            deleted_complete = len(self.stats['deleted_complete_files'])
            print(f"\n✅ Complete files:")
            print(f"  Kept: {kept_complete} ({kept_complete/self.stats['complete_files']*100:.1f}%)")
            print(f"  Deleted: {deleted_complete} ({deleted_complete/self.stats['complete_files']*100:.1f}%)")

            qa = self.stats['quality_analysis']['complete']
            if qa['avg_penalty'] > 0:
                print(f"  Average penalty: {qa['avg_penalty']:.2f}")
                print(f"  Score range: {qa['score_range'][0]:.2f} - {qa['score_range'][1]:.2f}")

        # Overall deletion statistics
        total_deleted = len(self.stats['deleted_incomplete_files']) + len(self.stats['deleted_complete_files'])
        if total_deleted > 0:
            print(f"\n🗑️ Deletion totals:")
            print(f"  Deleted incomplete files: {len(self.stats['deleted_incomplete_files'])}")
            print(f"  Deleted complete files: {len(self.stats['deleted_complete_files'])}")
            print(f"  Total deleted: {total_deleted}")

        # Deleted case statistics
        if self.stats['deleted_case_indices']:
            print(f"\n📋 Deleted cases:")
            print(f"  Number of deleted cases: {len(self.stats['deleted_case_indices'])}")
            print(f"  Deleted case indices: {self.stats['deleted_case_indices'][:10]}{'...' if len(self.stats['deleted_case_indices']) > 10 else ''}")

            # Deleted cases grouped by department
            if self.stats['deleted_cases_info']:
                dept_stats = {}
                for case_info in self.stats['deleted_cases_info']:
                    dept_key = f"{case_info['primary_department']}-{case_info['secondary_department']}"
                    dept_stats[dept_key] = dept_stats.get(dept_key, 0) + 1

                print(f"\n  Deleted cases by department:")
                for dept, count in sorted(dept_stats.items(), key=lambda x: x[1], reverse=True)[:10]:
                    print(f"    {dept}: {count}")
                if len(dept_stats) > 10:
                    print(f"    ... and {len(dept_stats) - 10} more departments")

        # Dataset backup info
        if self.stats['dataset_backup_path']:
            print(f"\n💾 Dataset backup:")
            print(f"  Backup file: {self.stats['dataset_backup_path']}")

        # Files with errors
        if self.stats['error_files']:
            print(f"\n⚠️ Files with processing errors ({len(self.stats['error_files'])}):")
            for file in self.stats['error_files'][:5]:  # show only the first 5
                print(f"  - {file}")
            if len(self.stats['error_files']) > 5:
                print(f"  ... and {len(self.stats['error_files'])-5} more files")

        # Data consistency validation results
        if 'validation_results' in self.stats:
            validation = self.stats['validation_results']
            print(f"\n🔍 Data consistency validation:")
            print(f"  Dataset size: {validation.get('dataset_size', 0)}")
            print(f"  JSONL files: {validation.get('total_jsonl_files', 0)}")
            print(f"  Valid mappings: {validation.get('valid_case_mappings', 0)}")
            print(f"  Consistency rate: {validation.get('consistency_rate', 0):.2%}")
            print(f"  Validation status: {'✅ passed' if validation.get('validation_passed', False) else '❌ failed'}")

            if validation.get('missing_case_indices'):
                missing_count = len(validation['missing_case_indices'])
                print(f"  Missing indices: {missing_count} {validation['missing_case_indices'][:5]}{'...' if missing_count > 5 else ''}")

            if validation.get('invalid_case_mappings'):
                invalid_count = len(validation['invalid_case_mappings'])
                print(f"  Invalid mappings: {invalid_count}")

        if self.dry_run:
            print(f"\n💡 Note: this was a dry run, no files were actually deleted")

        # Quality analysis suggestions
        print(f"\n🔍 Quality analysis suggestions:")
        incomplete_avg = self.stats['quality_analysis']['incomplete']['avg_penalty']
        complete_avg = self.stats['quality_analysis']['complete']['avg_penalty']

        if incomplete_avg > complete_avg:
            print(f"  - Incomplete files have lower average quality; consider improving workflow execution")
        else:
            print(f"  - Even complete files show quality issues; consider tightening quality control")

        if incomplete_avg > 3.0:
            print(f"  - Incomplete files have a high average penalty; investigate why runs are interrupted")

        if complete_avg > 2.0:
            print(f"  - Complete files still have room to improve; consider refining the evaluation criteria")

    def run(self) -> Dict[str, Any]:
        """
        Run the cleaner.

        Returns:
            Dict: the collected statistics
        """
        logger.info(f"🚀 Starting intelligent analysis of directory: {self.directory}")
        logger.info(f"📋 Cleanup strategy: keep the best {self.keep_incomplete_ratio*100:.0f}% of incomplete files, delete the worst {self.remove_complete_ratio*100:.0f}% of complete files")
        if self.dry_run:
            logger.info("🧪 Running in dry-run mode")

        # Run the data consistency validation
        logger.info("🔍 Running data consistency validation...")
        validation_results = self.validate_data_consistency()
        self.stats['validation_results'] = validation_results

        if not validation_results.get('validation_passed', False):
            logger.warning(f"⚠️ Data consistency validation failed: consistency rate {validation_results.get('consistency_rate', 0):.2%}")
            if validation_results.get('missing_case_indices'):
                logger.warning(f"  Missing case indices: {validation_results['missing_case_indices'][:10]}{'...' if len(validation_results['missing_case_indices']) > 10 else ''}")
            if validation_results.get('invalid_case_mappings'):
                logger.warning(f"  Invalid case mappings: {len(validation_results['invalid_case_mappings'])}")
        else:
            logger.info("✅ Data consistency validation passed")

        self.analyze_and_clean_files()
        self.print_summary()

        return self.stats
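

# Programmatic usage sketch (illustrative; it simply mirrors what main() does below):
#   cleaner = IntelligentWorkflowCleaner('results/results0905-2', dry_run=True)
#   stats = cleaner.run()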


def main():
    """Entry point"""
    parser = argparse.ArgumentParser(description='Quality-assessment-based intelligent workflow file cleaner')
    parser.add_argument('directory', nargs='?', default='results/results0905-2',
                        help='directory to scan (default: results/results0905-2)')
    parser.add_argument('--dry-run', action='store_true',
                        help='dry-run mode, do not actually delete files')
    parser.add_argument('--keep-incomplete', type=float, default=0.1,
                        help='keep ratio for incomplete files (default: 0.1, i.e. 10%%)')
    parser.add_argument('--remove-complete', type=float, default=0.2,
                        help='delete ratio for complete files (default: 0.2, i.e. 20%%)')

    args = parser.parse_args()

    # Validate the arguments
    if not (0.0 <= args.keep_incomplete <= 1.0):
        logger.error("--keep-incomplete must be between 0.0 and 1.0")
        return

    if not (0.0 <= args.remove_complete <= 1.0):
        logger.error("--remove-complete must be between 0.0 and 1.0")
        return

    cleaner = IntelligentWorkflowCleaner(
        directory=args.directory,
        dry_run=args.dry_run,
        keep_incomplete_ratio=args.keep_incomplete,
        remove_complete_ratio=args.remove_complete
    )
    cleaner.run()


if __name__ == "__main__":
    main()