# 统计标注好的数据，同时给出统计结果保存为txt
import os

def count_imgs(
    path: str,
    tag: str,
    extensions: tuple = ('.jpg', '.jpeg', '.png', '.bmp', '.gif'),
) -> dict:
    """Count the image files of every class folder inside one dataset split.

    The split directory is ``os.path.join(path, tag)``. Each sub-directory of
    it is treated as one class; entries of the split directory that are not
    directories are ignored.

    Args:
        path: Dataset root directory that contains the split folder.
        tag: Name of the split folder (for example ``'train'``).
        extensions: File-name suffixes that identify an image. Matching is
            case-insensitive. A single string is accepted as one suffix.

    Returns:
        A dict with the keys ``'dataset_type'`` (the given ``tag``),
        ``'class_counts'`` (class name -> image count, in sorted class-name
        order), ``'total_count'`` (sum of all class counts) and
        ``'num_classes'`` (number of class folders found).

    Raises:
        OSError: Propagated from ``os.listdir`` / ``os.scandir`` when the
            split directory or a class directory cannot be listed.
    """
    target_path = os.path.join(path, tag)

    # str.endswith needs a tuple; lower-case it once so the per-file check
    # only has to lower-case the file name.
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    sta_res = {}
    # Sorted so the resulting dict order does not depend on the file system.
    for c in sorted(os.listdir(target_path)):
        cls_path = os.path.join(target_path, c)
        if not os.path.isdir(cls_path):
            continue
        with os.scandir(cls_path) as entries:
            # Only regular files count: a sub-directory whose name happens to
            # end in an image suffix is not an image.
            sta_res[c] = sum(
                1
                for entry in entries
                if entry.is_file() and entry.name.lower().endswith(suffixes)
            )

    return {
        'dataset_type': tag,
        'class_counts': sta_res,
        'total_count': sum(sta_res.values()),
        'num_classes': len(sta_res),
    }

def format_statistics(stats: dict) -> str:
    """Render one split's statistics dict as a human-readable text block.

    Args:
        stats: Dict with the keys ``'dataset_type'``, ``'class_counts'``,
            ``'total_count'`` and ``'num_classes'``.

    Returns:
        A newline-joined string: a header with the totals, one row per class
        (sorted by class name) showing its count and percentage of the total,
        and a closing rule. Percentages are reported as 0 when the total is 0.
    """
    split_name = stats['dataset_type']
    per_class = stats['class_counts']
    grand_total = stats['total_count']
    class_total = stats['num_classes']

    heavy_rule = "=" * 50

    def share(n):
        # Guard against division by zero for an empty split.
        if grand_total > 0:
            return n / grand_total * 100
        return 0

    header = [
        heavy_rule,
        f"{split_name.upper()} 数据集统计",
        heavy_rule,
        f"总图片数量: {grand_total}",
        f"类别数量: {class_total}",
        "-" * 30,
        "各类别分布:",
    ]

    # One row per class, ordered by class name.
    rows = [
        f"  {name:<20}: {per_class[name]:>6} 张 ({share(per_class[name]):>5.1f}%)"
        for name in sorted(per_class)
    ]

    return "\n".join(header + rows + [heavy_rule])

def main(data_path: str = r'D:\code\water_turbidity_det\label_data') -> None:
    """Count images in the train/test/val splits and write a text report.

    For every split folder (``train``, ``test``, ``val``) that exists as a
    directory under ``data_path``, the per-class image counts are collected
    with ``count_imgs``. A detailed report is written to ``statistic.txt``
    inside ``data_path`` and a short summary is printed to the console.

    If ``data_path`` is not a directory, or none of the split folders is
    present, a message is printed and nothing is written.

    Args:
        data_path: Dataset root directory. Defaults to the path configured
            for this project.
    """
    # TODO: change the default dataset path above, or pass data_path explicitly.
    train_data_path = data_path

    # Fail with a readable message instead of an os.listdir traceback.
    if not os.path.isdir(train_data_path):
        print(f"数据集路径不存在或不是目录: {train_data_path}")
        return

    # Keep only the splits that really are directories; a plain file named
    # "train" must not be handed to count_imgs.
    dataset_types = [
        name
        for name in ('train', 'test', 'val')
        if os.path.isdir(os.path.join(train_data_path, name))
    ]

    if not dataset_types:
        print(f"在 {train_data_path} 中未找到 train, test, 或 val 目录")
        return

    all_stats = [count_imgs(train_data_path, name) for name in dataset_types]
    total_overall = sum(stats['total_count'] for stats in all_stats)

    # Write the detailed report.
    output_path = os.path.join(train_data_path, 'statistic.txt')
    with open(output_path, 'w', encoding='utf-8') as fw:
        fw.write("数据集详细统计报告\n")
        fw.write("=" * 60 + "\n\n")

        for stats in all_stats:
            fw.write(format_statistics(stats))
            fw.write("\n\n")

        # Overall summary across all splits.
        fw.write("汇总统计\n")
        fw.write("=" * 30 + "\n")
        fw.write(f"总体图片数量: {total_overall}\n")
        fw.write(f"数据集类型: {', '.join(dataset_types)}\n")

        # The split distribution is only meaningful with more than one split.
        if len(all_stats) > 1:
            fw.write("\n数据集分布:\n")
            for dataset_type, stats in zip(dataset_types, all_stats):
                if total_overall > 0:
                    percentage = stats['total_count'] / total_overall * 100
                else:
                    percentage = 0
                fw.write(f"  {dataset_type:<10}: {stats['total_count']:>6} 张 ({percentage:>5.1f}%)\n")

    # Also print a brief summary to the console.
    print("数据集统计完成！详细信息已保存到:", output_path)
    print("\n简要统计:")
    for stats in all_stats:
        print(f"{stats['dataset_type']}集: {stats['total_count']}张图片, {stats['num_classes']}个类别")
    print(f"总计: {total_overall}张图片")

# Run the statistics report only when this file is executed as a script.
if __name__ == '__main__':
    main()