# build_cip_cycles.py

from datetime import timedelta

import pandas as pd
# =========================
# 1. Load the label data
# =========================
  6. def load_labels(file_path):
  7. df = pd.read_csv(file_path)
  8. df['date'] = pd.to_datetime(df['date'])
  9. df['unit_id'] = df['unit_id'].astype(str).str.strip()
  10. return df
# =========================
# 2. Extract high-confidence CIP dates (filtered per month)
# =========================
  14. def get_cip_anchor_days(df_unit):
  15. df_unit = df_unit.copy()
  16. df_unit['year_month'] = df_unit['date'].dt.to_period('M')
  17. selected_dates = []
  18. for ym, group in sorted(df_unit.groupby('year_month')):
  19. group = group.sort_values('date')
  20. if len(group) == 1:
  21. selected_dates.extend(group['date'].tolist())
  22. else:
  23. group_0 = group[group['label'] == 0]
  24. if len(group_0) > 0:
  25. selected_dates.extend(group_0['date'].tolist())
  26. else:
  27. selected_dates.extend(group['date'].tolist())
  28. return sorted(selected_dates)
# =========================
# 3. Merge consecutive CIP days
# =========================
  32. def merge_consecutive_days(dates):
  33. if len(dates) == 0:
  34. return []
  35. merged = []
  36. start = dates[0]
  37. prev = dates[0]
  38. for d in dates[1:]:
  39. if (d - prev).days <= 1:
  40. prev = d
  41. else:
  42. merged.append((start, prev))
  43. start = d
  44. prev = d
  45. merged.append((start, prev))
  46. return merged
# =========================
# 4. Build cycles + filter
# =========================
  50. def build_cycles(cip_events, unit_id):
  51. MIN_CYCLE_DAYS = 25
  52. MAX_CYCLE_DAYS = 150
  53. cycles = []
  54. valid_id = 1
  55. for i in range(len(cip_events) - 1):
  56. prev_end = cip_events[i][1]
  57. next_start = cip_events[i + 1][0]
  58. cycle_start = prev_end + timedelta(days=2)
  59. cycle_end = next_start - timedelta(days=1)
  60. if cycle_start <= cycle_end:
  61. length_days = (cycle_end - cycle_start).days + 1
  62. if MIN_CYCLE_DAYS <= length_days <= MAX_CYCLE_DAYS:
  63. cycles.append({
  64. "unit_id": unit_id,
  65. "cycle_id": valid_id,
  66. "start_date": cycle_start,
  67. "end_date": cycle_end,
  68. "length_days": length_days
  69. })
  70. valid_id += 1
  71. else:
  72. print(
  73. f"[过滤] 机组{unit_id} 周期候选{i+1} "
  74. f"长度{length_days}天(不在25–150范围)"
  75. )
  76. return pd.DataFrame(cycles)
# =========================
# 5. Per-unit processing (core bug fixed)
# =========================
  80. def process_unit(df, unit_id):
  81. print(f"\n处理机组 {unit_id}...")
  82. df_unit = df[df['unit_id'] == unit_id].copy()
  83. if df_unit.empty:
  84. print(f"机组 {unit_id} 无数据")
  85. return pd.DataFrame()
  86. cip_days = get_cip_anchor_days(df_unit)
  87. cip_events = merge_consecutive_days(cip_days)
  88. cycles_df = build_cycles(cip_events, unit_id)
  89. return cycles_df
# =========================
# 6. Main program
# =========================
  93. def main():
  94. input_path = "../use_data/cip_day_labels_all_units.csv"
  95. output_path = "../use_data/cip_cycles_all_units.csv"
  96. df = load_labels(input_path)
  97. all_cycles = []
  98. for unit_id in sorted(df['unit_id'].unique()):
  99. cycles_df = process_unit(df, unit_id)
  100. if not cycles_df.empty:
  101. all_cycles.append(cycles_df)
  102. else:
  103. print(f"机组{unit_id}无有效周期")
  104. if len(all_cycles) == 0:
  105. print("未检测到任何有效周期")
  106. return
  107. final_df = pd.concat(all_cycles, ignore_index=True)
  108. final_df = final_df.sort_values(['unit_id', 'cycle_id'])
  109. final_df['start_date'] = final_df['start_date'].dt.date
  110. final_df['end_date'] = final_df['end_date'].dt.date
  111. final_df.to_csv(output_path, index=False)
  112. print(f"\nCIP周期已保存到: {output_path}")
  113. print("\n周期长度统计:")
  114. print(final_df.groupby('unit_id')['length_days'].describe())
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()