| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111 |
- # 统计标注好的数据,同时给出统计结果保存为txt
- import os
def count_imgs(path: str, tag: str, *,
               extensions: tuple = ('.jpg', '.jpeg', '.png', '.bmp', '.gif')) -> dict:
    """Count the image files in every class sub-directory of one dataset split.

    Args:
        path: Root directory that contains the split directories.
        tag: Name of the split directory under ``path`` (for example 'train').
        extensions: File-name suffixes treated as images. Matching is
            case-insensitive. Defaults to common raster image formats.

    Returns:
        A dict with the keys 'dataset_type' (the ``tag``), 'class_counts'
        (class directory name -> image count), 'total_count' (sum of all
        class counts) and 'num_classes' (number of class directories).

    Raises:
        OSError: If ``path/tag`` does not exist or cannot be listed.
    """
    target_path = os.path.join(path, tag)
    # str.endswith needs a tuple; lower-case it once so matching stays
    # case-insensitive even for caller-supplied suffixes.
    suffixes = tuple(ext.lower() for ext in extensions)

    class_counts = {}
    for cls_name in os.listdir(target_path):
        cls_path = os.path.join(target_path, cls_name)
        if not os.path.isdir(cls_path):
            # Stray files next to the class directories are not classes.
            continue
        # Only regular files count: a nested directory named like an image
        # (e.g. "crops.jpg/") must not inflate the total.
        class_counts[cls_name] = sum(
            1
            for entry in os.listdir(cls_path)
            if entry.lower().endswith(suffixes)
            and os.path.isfile(os.path.join(cls_path, entry))
        )

    return {
        'dataset_type': tag,
        'class_counts': class_counts,
        'total_count': sum(class_counts.values()),
        'num_classes': len(class_counts),
    }
def format_statistics(stats: dict) -> str:
    """Render the statistics of one dataset split as a readable text block.

    Args:
        stats: Mapping with the keys 'dataset_type', 'class_counts',
            'total_count' and 'num_classes'.

    Returns:
        The report lines joined with newlines (no trailing newline). Classes
        are listed in sorted name order with their count and percentage of
        the split total.
    """
    split_name = stats['dataset_type']
    per_class = stats['class_counts']
    grand_total = stats['total_count']
    n_classes = stats['num_classes']

    heavy_rule = '=' * 50
    report = [
        heavy_rule,
        f"{split_name.upper()} 数据集统计",
        heavy_rule,
        f"总图片数量: {grand_total}",
        f"类别数量: {n_classes}",
        '-' * 30,
        "各类别分布:",
    ]

    # One row per class, ordered by class name.
    for name in sorted(per_class):
        n_imgs = per_class[name]
        if grand_total > 0:
            share = n_imgs / grand_total * 100
        else:
            # Empty split: report 0% instead of dividing by zero.
            share = 0
        report.append(f" {name:<20}: {n_imgs:>6} 张 ({share:>5.1f}%)")

    report.append(heavy_rule)
    return "\n".join(report)
def main(data_path: str = r'D:\code\water_turbidity_det\label_data') -> None:
    """Count images in the train/test/val splits and write a text report.

    The report is saved as 'statistic.txt' inside ``data_path`` and a short
    summary is printed to the console.

    Args:
        data_path: Dataset root that holds the 'train', 'test' and/or 'val'
            directories.
            TODO: point the default at your own dataset, or pass the path
            explicitly.
    """
    if not os.path.isdir(data_path):
        # Report a missing root the same way as missing splits instead of
        # crashing inside os.listdir.
        print(f"数据集路径不存在: {data_path}")
        return

    # Keep the fixed train/test/val order. Require real directories so that a
    # stray file named e.g. "train" is not mistaken for a split.
    dataset_types = [
        name
        for name in ('train', 'test', 'val')
        if os.path.isdir(os.path.join(data_path, name))
    ]
    if not dataset_types:
        print(f"在 {data_path} 中未找到 train, test, 或 val 目录")
        return

    all_stats = [count_imgs(data_path, name) for name in dataset_types]
    total_overall = sum(stats['total_count'] for stats in all_stats)

    # Assemble the whole report in memory, then write it in one go.
    chunks = ["数据集详细统计报告\n", "=" * 60 + "\n\n"]
    for stats in all_stats:
        chunks.append(format_statistics(stats))
        chunks.append("\n\n")

    # Overall summary across all splits.
    chunks.extend([
        "汇总统计\n",
        "=" * 30 + "\n",
        f"总体图片数量: {total_overall}\n",
        f"数据集类型: {', '.join(dataset_types)}\n",
    ])

    # A split distribution is only meaningful with more than one split.
    if len(all_stats) > 1:
        chunks.append("\n数据集分布:\n")
        for stats in all_stats:
            percentage = (stats['total_count'] / total_overall * 100) if total_overall > 0 else 0
            chunks.append(
                f" {stats['dataset_type']:<10}: {stats['total_count']:>6} 张 ({percentage:>5.1f}%)\n"
            )

    output_path = os.path.join(data_path, 'statistic.txt')
    with open(output_path, 'w', encoding='utf-8') as fw:
        fw.writelines(chunks)

    # Brief console summary alongside the saved report.
    print("数据集统计完成!详细信息已保存到:", output_path)
    print("\n简要统计:")
    for stats in all_stats:
        print(f"{stats['dataset_type']}集: {stats['total_count']}张图片, {stats['num_classes']}个类别")
    print(f"总计: {total_overall}张图片")
# Run the statistics only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
|