import os import glob import pandas as pd import numpy as np import matplotlib.pyplot as plt from matplotlib.font_manager import FontProperties # ===================== 配置 ===================== data_dir = r"E:\Greentech\models\uf-rl\datasets\processed\segments" target_col = "cycle_long_r2" # ===================== 中文字体设置 ===================== # 注意:这里使用 SimHei 字体,可显示中文 font = FontProperties(fname=r"C:\Windows\Fonts\simhei.ttf", size=12) # ===================== 读取所有 CSV ===================== all_files = glob.glob(os.path.join(data_dir, "*.csv")) values = [] for file in all_files: try: df = pd.read_csv(file) if target_col in df.columns: vals = df[target_col].dropna().values values.append(vals) except Exception as e: print(f"读取失败: {file}, 错误: {e}") # 合并所有数据 if len(values) == 0: raise ValueError("未在任何 CSV 中找到有效的 cycle_long_R2 数据") data = np.concatenate(values) total_count = len(data) # ===================== 定义区间 ===================== bins = [ -np.inf, 0.0, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0 ] labels = [ "<0", "0 – 0.5", "0.5 – 0.6", "0.6 – 0.7", "0.7 – 0.8", "0.8 – 0.9", "0.9 – 1.0" ] # ===================== 统计分布 ===================== counts = pd.cut( data, bins=bins, labels=labels, right=True, include_lowest=True ).value_counts().sort_index() ratios = counts / total_count * 100 # ===================== 输出结果 ===================== result = pd.DataFrame({ "样本数": counts, "占比 (%)": ratios.round(2) }) print(f"\n总样本数: {total_count}\n") print(result) # ===================== 绘制柱状图 ===================== plt.figure(figsize=(10, 6)) plt.bar(labels, ratios, color='skyblue', edgecolor='black') plt.title("cycle_long_R2 数据分布柱状图", fontproperties=font) plt.xlabel("区间", fontproperties=font) plt.ylabel("占比 (%)", fontproperties=font) plt.ylim(0, 100) plt.grid(axis='y', linestyle='--', alpha=0.7) # 在柱子上显示百分比 for i, v in enumerate(ratios): plt.text(i, v + 1, f"{v:.1f}%", ha='center', va='bottom', fontsize=10, fontproperties=font) plt.tight_layout() plt.show()