# Count the labeled images of every class in each dataset split and save the
# resulting statistics report as a txt file.
import os

# Lower-case file extensions that are counted as images.
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp', '.gif')

# Sub-directory names recognised as dataset splits, in reporting order.
DATASET_TYPES = ('train', 'test', 'val')

# TODO: change this to the dataset root path
DEFAULT_DATA_PATH = r'D:\code\water_turbidity_det\label_data'


def count_imgs(path: str, tag: str) -> dict:
    """Count the image files of every class sub-directory of one split.

    Args:
        path: Dataset root directory.
        tag: Name of the split sub-directory inside ``path`` (e.g. ``'train'``).

    Returns:
        A dict with the keys ``'dataset_type'`` (the given ``tag``),
        ``'class_counts'`` (class directory name -> image count),
        ``'total_count'`` (sum of all class counts) and ``'num_classes'``
        (number of class directories found).

    Raises:
        OSError: If ``path/tag`` cannot be listed (missing, not a directory,
            or not readable).
    """
    target_path = os.path.join(path, tag)

    sta_res = {}
    for c in os.listdir(target_path):
        cls_path = os.path.join(target_path, c)
        # Only directories are classes; loose files next to them are ignored.
        if not os.path.isdir(cls_path):
            continue
        # Count regular files only, so that a sub-directory whose name happens
        # to end with an image extension is not mistaken for an image.
        with os.scandir(cls_path) as entries:
            sta_res[c] = sum(
                1
                for entry in entries
                if entry.is_file()
                and entry.name.lower().endswith(IMAGE_EXTENSIONS)
            )

    return {
        'dataset_type': tag,
        'class_counts': sta_res,
        'total_count': sum(sta_res.values()),
        'num_classes': len(sta_res),
    }


def format_statistics(stats: dict) -> str:
    """Format the statistics of one split as a human-readable string.

    Args:
        stats: A dict as returned by ``count_imgs``.

    Returns:
        A multi-line report (no trailing newline) with the totals followed by
        one line per class, sorted by class name.
    """
    dataset_type = stats['dataset_type']
    class_counts = stats['class_counts']
    total_count = stats['total_count']
    num_classes = stats['num_classes']

    separator = "=" * 50
    formatted = [
        separator,
        f"{dataset_type.upper()} 数据集统计",
        separator,
        f"总图片数量: {total_count}",
        f"类别数量: {num_classes}",
        "-" * 30,
        "各类别分布:",
    ]

    # One line per class, sorted by class name.
    for class_name in sorted(class_counts):
        count = class_counts[class_name]
        # Guard against division by zero when the split holds no images.
        percentage = (count / total_count * 100) if total_count > 0 else 0
        formatted.append(
            f" {class_name:<20}: {count:>6} 张 ({percentage:>5.1f}%)"
        )

    formatted.append(separator)
    return "\n".join(formatted)


def main(data_path: str = DEFAULT_DATA_PATH) -> None:
    """Collect statistics for every split found and write ``statistic.txt``.

    The report is written to ``statistic.txt`` inside ``data_path`` and a
    short summary is printed to the console. If ``data_path`` is not a
    directory, or contains none of the ``train``/``test``/``val``
    directories, a message is printed and nothing is written.

    Args:
        data_path: Dataset root directory; defaults to ``DEFAULT_DATA_PATH``.
    """
    # Fail with a readable message instead of an unhandled OSError.
    if not os.path.isdir(data_path):
        print(f"数据集路径不存在或不是目录: {data_path}")
        return

    # A split only counts when it is an actual directory, not merely an entry
    # that happens to carry the expected name.
    dataset_types = [
        name
        for name in DATASET_TYPES
        if os.path.isdir(os.path.join(data_path, name))
    ]
    if not dataset_types:
        print(f"在 {data_path} 中未找到 train, test, 或 val 目录")
        return

    all_stats = [count_imgs(data_path, name) for name in dataset_types]
    total_overall = sum(stats['total_count'] for stats in all_stats)

    # Write the detailed statistics to the report file.
    output_path = os.path.join(data_path, 'statistic.txt')
    with open(output_path, 'w', encoding='utf-8') as fw:
        fw.write("数据集详细统计报告\n")
        fw.write("=" * 60 + "\n\n")

        for stats in all_stats:
            fw.write(format_statistics(stats))
            fw.write("\n\n")

        # Overall summary across all splits.
        fw.write("汇总统计\n")
        fw.write("=" * 30 + "\n")
        fw.write(f"总体图片数量: {total_overall}\n")
        fw.write(f"数据集类型: {', '.join(dataset_types)}\n")

        # The per-split share is only meaningful with more than one split.
        if len(all_stats) > 1:
            fw.write("\n数据集分布:\n")
            for stats in all_stats:
                split_total = stats['total_count']
                percentage = (
                    split_total / total_overall * 100
                    if total_overall > 0
                    else 0
                )
                fw.write(
                    f" {stats['dataset_type']:<10}: {split_total:>6} 张 "
                    f"({percentage:>5.1f}%)\n"
                )

    # Also print a brief summary to the console.
    print("数据集统计完成!详细信息已保存到:", output_path)
    print("\n简要统计:")
    for stats in all_stats:
        print(
            f"{stats['dataset_type']}集: {stats['total_count']}张图片, "
            f"{stats['num_classes']}个类别"
        )
    print(f"总计: {total_overall}张图片")


if __name__ == '__main__':
    main()