statistic.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111
  1. # 统计标注好的数据,同时给出统计结果保存为txt
  2. import os
  3. def count_imgs(path: str, tag: str) -> dict:
  4. target_path = os.path.join(path, tag)
  5. # 获取类别子目录
  6. sta_res = {}
  7. total_count = 0
  8. for c in os.listdir(target_path):
  9. cls_path = os.path.join(target_path, c)
  10. if os.path.isdir(cls_path): # 确保是目录
  11. # 获取图片
  12. imgs = [f for f in os.listdir(cls_path) if f.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp', '.gif'))]
  13. count = len(imgs)
  14. sta_res[c] = count
  15. total_count += count
  16. return {
  17. 'dataset_type': tag,
  18. 'class_counts': sta_res,
  19. 'total_count': total_count,
  20. 'num_classes': len(sta_res)
  21. }
  22. def format_statistics(stats: dict) -> str:
  23. """格式化统计数据为易读的字符串"""
  24. dataset_type = stats['dataset_type']
  25. class_counts = stats['class_counts']
  26. total_count = stats['total_count']
  27. num_classes = stats['num_classes']
  28. formatted = []
  29. formatted.append(f"{'='*50}")
  30. formatted.append(f"{dataset_type.upper()} 数据集统计")
  31. formatted.append(f"{'='*50}")
  32. formatted.append(f"总图片数量: {total_count}")
  33. formatted.append(f"类别数量: {num_classes}")
  34. formatted.append("-" * 30)
  35. formatted.append("各类别分布:")
  36. # 按类别名称排序
  37. for class_name in sorted(class_counts.keys()):
  38. count = class_counts[class_name]
  39. percentage = (count / total_count * 100) if total_count > 0 else 0
  40. formatted.append(f" {class_name:<20}: {count:>6} 张 ({percentage:>5.1f}%)")
  41. formatted.append(f"{'='*50}")
  42. return "\n".join(formatted)
  43. def main():
  44. # TODO:修改数据集路径
  45. train_data_path = r'D:\code\water_turbidity_det\label_data'
  46. dirs = os.listdir(train_data_path)
  47. # 检查数据集目录是否存在
  48. dataset_types = []
  49. if 'train' in dirs:
  50. dataset_types.append('train')
  51. if 'test' in dirs:
  52. dataset_types.append('test')
  53. if 'val' in dirs:
  54. dataset_types.append('val')
  55. if not dataset_types:
  56. print(f"在 {train_data_path} 中未找到 train, test, 或 val 目录")
  57. return
  58. all_stats = []
  59. summary_stats = {
  60. 'total_overall': 0,
  61. 'datasets': {}
  62. }
  63. for dataset_type in dataset_types:
  64. stats = count_imgs(train_data_path, dataset_type)
  65. all_stats.append(stats)
  66. summary_stats['datasets'][dataset_type] = stats
  67. summary_stats['total_overall'] += stats['total_count']
  68. # 写入详细的统计信息到文件
  69. output_path = os.path.join(train_data_path, 'statistic.txt')
  70. with open(output_path, 'w', encoding='utf-8') as fw:
  71. fw.write("数据集详细统计报告\n")
  72. fw.write("="*60 + "\n\n")
  73. for stats in all_stats:
  74. fw.write(format_statistics(stats))
  75. fw.write("\n\n")
  76. # 添加汇总统计
  77. fw.write("汇总统计\n")
  78. fw.write("="*30 + "\n")
  79. fw.write(f"总体图片数量: {summary_stats['total_overall']}\n")
  80. fw.write(f"数据集类型: {', '.join(summary_stats['datasets'].keys())}\n")
  81. if len(summary_stats['datasets']) > 1:
  82. fw.write("\n数据集分布:\n")
  83. for dataset_type, stats in summary_stats['datasets'].items():
  84. percentage = (stats['total_count'] / summary_stats['total_overall'] * 100) if summary_stats['total_overall'] > 0 else 0
  85. fw.write(f" {dataset_type:<10}: {stats['total_count']:>6} 张 ({percentage:>5.1f}%)\n")
  86. # 同时在控制台输出简要统计
  87. print("数据集统计完成!详细信息已保存到:", output_path)
  88. print("\n简要统计:")
  89. for stats in all_stats:
  90. print(f"{stats['dataset_type']}集: {stats['total_count']}张图片, {stats['num_classes']}个类别")
  91. print(f"总计: {summary_stats['total_overall']}张图片")
  92. if __name__ == '__main__':
  93. main()