uf_env.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446
  1. """
  2. 超滤强化学习环境模块
  3. ========================
  4. 本模块定义了超滤系统的强化学习环境,包括:
  5. 1. UFParams: 超滤系统参数配置类
  6. 2. 膜阻力与跨膜压差转换函数
  7. 3. simulate_one_supercycle: 超级周期模拟函数
  8. 4. calculate_reward: 奖励函数
  9. 5. is_dead_cycle: 失败判定函数
  10. 6. UFSuperCycleEnv: Gymnasium环境类
  11. 模块设计说明:
  12. - 基于 Gymnasium (原OpenAI Gym) 标准接口
  13. - 模拟超滤膜的"超级周期"运行(多次物理反洗 + 一次化学反洗)
  14. - 强化学习智能体通过优化过滤时长和反洗时长来最大化回收率并控制污染累积
  15. """
  16. import os
  17. import torch
  18. import numpy as np
  19. import gymnasium as gym
  20. from gymnasium import spaces
  21. from uf_train.env.env_params import UFState, UFPhysicsParams, UFStateBounds, UFRewardParams, UFActionSpec
  22. from uf_train.env.uf_physics import UFPhysicsModel
  23. from uf_train.env.env_reset import ResetSampler
  24. import copy
  25. class UFSuperCycleEnv(gym.Env):
  26. """
  27. 超滤系统强化学习环境(Gymnasium标准接口)
  28. 功能:
  29. - 模拟超滤膜的超级周期运行
  30. - 智能体在每个超级周期选择过滤时长和反洗时长
  31. - 目标:最大化回收率同时控制污染累积
  32. 状态空间 (8维,归一化到 [0,1]):
  33. 1. TMP0: 初始跨膜压差
  34. 2. q_UF: 过滤流量
  35. 3. temp: 水温
  36. 4. R0: 初始膜阻力
  37. 5. nuK: 短期污染系数
  38. 6. slope: 长期污染斜率
  39. 7. power: 长期污染幂次
  40. 8. ceb_removal: CEB去除能力
  41. 动作空间 (离散):
  42. - 二维离散动作组合:(过滤时长, 反洗时长)
  43. - 过滤时长: L_min_s ~ L_max_s,步长 L_step_s
  44. - 反洗时长: t_bw_min_s ~ t_bw_max_s,步长 t_bw_step_s
  45. - 总动作数 = len(L_values) × len(t_bw_values)
  46. 奖励机制:
  47. - 基于回收率和残余污染的平衡
  48. - 失败 (TMP超限、回收率过低、污染过快) 时给予大负奖励 (-10)
  49. 终止条件:
  50. - terminated: 违反运行约束(失败)
  51. - truncated: 达到最大步数 (max_episode_steps)
  52. """
  53. metadata = {"render_modes": ["human"]}
  54. def __init__(
  55. self,
  56. physics: UFPhysicsModel,
  57. reward_params: UFRewardParams,
  58. action_spec:UFActionSpec,
  59. statebounds:UFStateBounds,
  60. real_state_pool,
  61. max_episode_steps: int = 45,
  62. RANDOM_SEED = 1024
  63. ):
  64. """
  65. 超滤强化学习环境
  66. 参数:
  67. physics(UFPhysicsModel): 超滤物理模型
  68. reward_params(UFRewardParams): 奖励函数参数
  69. max_episode_steps (int): 每个episode的最大步数,默认45
  70. 注:每步代表一个超级周期(约2-3天),45步约三个月
  71. """
  72. super(UFSuperCycleEnv, self).__init__()
  73. self.RANDOM_SEED = RANDOM_SEED
  74. self.physics = physics
  75. self.reward_params = reward_params
  76. self.max_episode_steps = max_episode_steps
  77. self.current_step = 0
  78. # -------- 动作空间 --------
  79. self.action_spec = action_spec
  80. self.L_values = np.arange(
  81. self.action_spec.L_min_s,
  82. self.action_spec.L_max_s + self.action_spec.L_step_s,
  83. self.action_spec.L_step_s,
  84. )
  85. self.t_bw_values = np.arange(
  86. self.action_spec.t_bw_min_s,
  87. self.action_spec.t_bw_max_s + self.action_spec.t_bw_step_s,
  88. self.action_spec.t_bw_step_s,
  89. )
  90. self.num_L = len(self.L_values)
  91. self.num_bw = len(self.t_bw_values)
  92. self.action_space = spaces.Discrete(self.num_L * self.num_bw)
  93. # -------- 状态空间 --------
  94. self.observation_space = spaces.Box(
  95. low=0.0,
  96. high=1.0,
  97. shape=(8,),
  98. dtype=np.float32,
  99. )
  100. self.state_bounds = statebounds # 状态边界
  101. self.real_state_pool = real_state_pool
  102. self.reset_sampler = ResetSampler(
  103. bounds=self.state_bounds,
  104. physics=physics,
  105. real_state_pool=self.real_state_pool,
  106. max_resample_attempts=50,
  107. random_state=np.random.RandomState(RANDOM_SEED)
  108. )
  109. def _generate_initial_state(self) -> UFState | None:
  110. """
  111. 在 UFStateBounds 定义的范围内采样一个【合法】初始状态。
  112. 若采样失败(约束不满足)返回 None,由 reset() 负责重试。
  113. """
  114. b = self.state_bounds
  115. A = 128 * 40.0 # 有效膜面积
  116. # ---- 1. 基础工况 ----
  117. # ---- 随机生成 TMP、q_UF、温度 ----
  118. TMP0 = np.random.uniform(b.TMP0_min, b.TMP0_max)
  119. q_UF = np.random.uniform(b.q_UF_min, b.q_UF_max)
  120. temp = np.random.uniform(b.temp_min, b.temp_max)
  121. # ---- 2. 污染增长参数 ----
  122. slope = np.random.uniform(b.slope_min, b.slope_max)
  123. power = np.random.uniform(b.power_min, b.power_max)
  124. # ---- 3. 约束:污染增长速率可实现 ----
  125. t_max = 60 if power >= 1 else 1
  126. required_nuK_min = slope * power * (t_max ** (power - 1)) * (A / q_UF)
  127. # 若 required_nuK_min 超过可选范围 → 初始状态非法
  128. if required_nuK_min > b.nuK_max:
  129. return None
  130. # 在可行范围中采样 nuK
  131. nuK = np.random.uniform(
  132. max(required_nuK_min, b.nuK_min),
  133. b.nuK_max
  134. )
  135. # ---- 4. CEB 去除率 ----
  136. ceb_removal = np.random.uniform(
  137. b.ceb_removal_min,
  138. b.ceb_removal_max
  139. )
  140. # ---- 5. 初始膜阻力(物理模型) ----
  141. R0 = self.physics.calculate_initial_resistance(
  142. TMP=TMP0,
  143. q_UF=q_UF,
  144. temp=temp
  145. )
  146. return UFState(
  147. TMP=TMP0,
  148. q_UF=q_UF,
  149. temp=temp,
  150. R=R0,
  151. slope=slope,
  152. power=power,
  153. nuK=nuK,
  154. ceb_removal=ceb_removal,
  155. )
  156. def _get_training_progress(self) -> float:
  157. """
  158. 返回训练进度,用于 reset_sampler 的 curriculum sampling
  159. """
  160. return min(1.0, self.current_step / self.max_episode_steps )
  161. def reset(self, seed=None, options=None, max_attempts: int = 1000):
  162. super().reset(seed=seed)
  163. progress = self._get_training_progress()
  164. for _ in range(max_attempts):
  165. state = self.reset_sampler.sample(progress)
  166. if state is None:
  167. continue
  168. ok_run = self.physics.check_dead_initial_state(
  169. init_state=state,
  170. max_steps=self.max_episode_steps,
  171. L_s=self.action_spec.L_min_s,
  172. t_bw_s=self.action_spec.t_bw_max_s
  173. )
  174. if ok_run:
  175. self.state = state
  176. break
  177. else:
  178. raise RuntimeError("无法生成可行初始状态")
  179. self.current_step = 0
  180. self.last_action = None
  181. self.max_TMP_during_filtration = self.state.TMP
  182. return self._get_obs(), {}
  183. def _get_state_copy(self):
  184. return copy.deepcopy(self.state)
  185. def _get_obs(self):
  186. """
  187. 构建当前环境归一化状态向量
  188. """
  189. # === 1. 从 self.state 读取动态参数 ===
  190. TMP = self.state.TMP
  191. q_UF = self.state.q_UF
  192. temp = self.state.temp
  193. # === 2. 计算本周期初始膜阻力 ===
  194. R = self.state.R
  195. # === 3. 从 self.state 读取膜阻力增长模型参数 ===
  196. nuk = self.state.nuK
  197. slope = self.state.slope
  198. power = self.state.power
  199. ceb_removal = self.state.ceb_removal
  200. # === 4. 从 current_params 动态读取上下限 ===
  201. TMP0_min, TMP0_max = self.state_bounds.TMP0_min, self.state_bounds.global_TMP_hard_limit
  202. q_UF_min, q_UF_max = self.state_bounds.q_UF_min, self.state_bounds.q_UF_max
  203. temp_min, temp_max = self.state_bounds.temp_min, self.state_bounds.temp_max
  204. nuK_min, nuK_max = self.state_bounds.nuK_min, self.state_bounds.nuK_max
  205. slope_min, slope_max = self.state_bounds.slope_min, self.state_bounds.slope_max
  206. power_min, power_max = self.state_bounds.power_min, self.state_bounds.power_max
  207. ceb_min, ceb_max = self.state_bounds.ceb_removal_min, self.state_bounds.ceb_removal_max
  208. # === 5. 归一化计算(clip防止越界) ===
  209. TMP0_norm = np.clip((TMP - TMP0_min) / (TMP0_max - TMP0_min), 0, 1)
  210. q_UF_norm = np.clip((q_UF - q_UF_min) / (q_UF_max - q_UF_min), 0, 1)
  211. temp_norm = np.clip((temp - temp_min) / (temp_max - temp_min), 0, 1)
  212. # R0 不在 current_params 中定义上下限,设定经验范围
  213. R0_norm = np.clip((R - 100.0) / (800.0 - 100.0), 0, 1)
  214. short_term_norm = np.clip((nuk - nuK_min) / (nuK_max - nuK_min), 0, 1)
  215. long_term_slope_norm = np.clip((slope - slope_min) / (slope_max - slope_min), 0, 1)
  216. long_term_power_norm = np.clip((power - power_min) / (power_max - power_min), 0, 1)
  217. ceb_removal_norm = np.clip((ceb_removal - ceb_min) / (ceb_max - ceb_min), 0, 1)
  218. # === 6. 构建观测向量 ===
  219. obs = np.array([
  220. TMP0_norm,
  221. q_UF_norm,
  222. temp_norm,
  223. R0_norm,
  224. short_term_norm,
  225. long_term_slope_norm,
  226. long_term_power_norm,
  227. ceb_removal_norm
  228. ], dtype=np.float32)
  229. return obs
  230. def _get_action_values(self, action):
  231. """
  232. 将动作还原为实际时长
  233. """
  234. L_idx = action // self.num_bw
  235. t_bw_idx = action % self.num_bw
  236. return self.L_values[L_idx], self.t_bw_values[t_bw_idx]
  237. def step(self, action):
  238. self.current_step += 1
  239. L_s, t_bw_s = self._get_action_values(action)
  240. L_s = np.clip(L_s, self.action_spec.L_min_s, self.action_spec.L_max_s)
  241. t_bw_s = np.clip(t_bw_s, self.action_spec.t_bw_min_s, self.action_spec.t_bw_max_s)
  242. # 模拟超级周期
  243. info, next_state = self.physics.simulate_one_supercycle(state=self.state,L_s=L_s, t_bw_s=t_bw_s)
  244. # 根据 info 判断是否成功
  245. feasible = self.physics.is_dead_cycle(info) # True 表示成功循环,False 表示失败
  246. if feasible:
  247. # 每步奖励
  248. reward, rec_reward, energy_reward, res_penalty = self._calculate_reward(info)
  249. info["rec_reward"] = rec_reward
  250. info["energy_reward"] = energy_reward
  251. info["res_penalty"] = res_penalty
  252. self.state = next_state
  253. terminated = False
  254. else:
  255. # 中途失败惩罚
  256. reward = -10
  257. info["rec_reward"] = None
  258. info["energy_reward"] = None
  259. info["res_penalty"] = None
  260. terminated = True
  261. # 判断是否到达最大步数
  262. truncated = self.current_step >= self.max_episode_steps
  263. self.last_action = (L_s, t_bw_s)
  264. next_obs = self._get_obs()
  265. info["feasible"] = feasible
  266. info["step"] = self.current_step
  267. info["L_s"] = L_s.copy()
  268. info["t_bw_s"] = t_bw_s.copy()
  269. # # ===================== 测试终末奖励:鼓励 TMP 接近初始状态 =====================
  270. # # 仅在 episode 自然结束(满步但未提前失败)时触发
  271. # if truncated and not terminated:
  272. # TMP_initial = self.TMP0 # reset 时记录的初始 TMP
  273. # TMP_final = next_obs[0] # next_obs 提供的最终 TMP
  274. #
  275. # delta_ratio = abs((TMP_final - TMP_initial) / TMP_initial)
  276. #
  277. # alpha = 4.0 # TMP 偏差敏感度
  278. # gamma = 5.0 # 奖励幅度
  279. # stability_reward = gamma * (np.exp(-alpha * delta_ratio) - 1) # 量级在0到-5之间
  280. #
  281. # reward += stability_reward
  282. # terminated = True # episode 正式结束
  283. # # ===================== 测试结果 =====================
  284. # 增加该奖励后强化学习依然能保证奖励收敛,但是损失函数在2-3之间反复震荡,无法降低,见reward_test&loss_test
  285. # 原设想是只能听在大额偏移发生前能通过该奖励学习到提前减小偏移步伐,但是实际训练时该惩罚反复被触发
  286. # 推测是终末的大额奖惩无法有效传递回过往时间步引导智能体学习,可能由于状态中缺少预测值,智能体会将其观测为不可控事件,暂时不添加该奖励,TODO:等待优化
  287. return next_obs, reward, terminated, truncated, info
  288. def _calculate_reward(self, info: dict) -> float:
  289. """
  290. 计算强化学习奖励函数(扩展版)
  291. 功能:
  292. - 平衡回收率、残余污染和吨水电耗三个目标
  293. - TMP不直接参与奖励计算(通过失败判定间接影响)
  294. - 使用 tanh 函数实现平滑的非线性奖励
  295. 参数:
  296. info (dict): 周期性能指标字典,需包含
  297. - recovery: 回收率 [0-1]
  298. - R_after_ceb: 本周期结束膜阻力
  299. - initial_R: 本周期初始膜阻力
  300. - delta_R_allow: 本周期允许最大阻力上升
  301. - ton_water_energy_kWh_per_m3: 本周期吨水电耗
  302. 返回:
  303. float: 奖励值(通常在 -3 到 +3 之间)
  304. 设计思想:
  305. - 高回收率 → 水资源利用率高 → 正奖励
  306. - 低残余污染 → 膜长期稳定运行 → 正奖励
  307. - 低吨水电耗 → 节能 → 正奖励
  308. - 三者需要权衡:过短的过滤时间提高回收率但污染去除不彻底;过长时间污染控制好但回收率下降,过高功率增加耗能
  309. 参考点设计:
  310. - 残余污染:
  311. - 高污染参考点 = 1 / self.max_episode_steps
  312. - 平衡点 = 0.5 / self.max_episode_steps
  313. - 吨水电耗:
  314. - 高点 = 0.1034 kWh/m³
  315. - 平衡点 = 0.1011 kWh/m³
  316. - 低点 = 0.0993 kWh/m³
  317. - 回收率参考点保持原有设计
  318. """
  319. # ========== 提取性能指标 ==========
  320. recovery = info["recovery"] # 回收率 [0-1]
  321. # 污染比例:实际上升的阻力 / 允许上升的阻力
  322. # 允许上升的阻力值 = 当前阻力值软上限 - 当前阻力
  323. residual_ratio = info['residual_ratio']
  324. # 吨水电耗指标
  325. energy = info["ton_water_energy_kWh_per_m3"]
  326. # ========== 回收率奖励项 ==========
  327. # 将回收率归一化到 [0, 1] 区间(基于预期范围)
  328. rec_norm = (recovery - self.reward_params.rec_low) / (self.reward_params.rec_high - self.reward_params.rec_low)
  329. # 使用 tanh 函数构建平滑的 S 型奖励曲线
  330. # - rec_norm = 0.5 时(回收率处于中间值),rec_reward = 0
  331. # - rec_norm > 0.5 时,rec_reward > 0(鼓励高回收率)
  332. # - rec_norm < 0.5 时,rec_reward < 0(惩罚低回收率)
  333. # - k_rec 控制曲线陡峭程度,越大变化越陡
  334. rec_reward = np.clip(np.tanh(self.reward_params.k_rec * (rec_norm - 0.5)), -1, 1)
  335. # ========== 残余污染惩罚项 ==========
  336. # 新参考点:每步允许上升比例 = 1 / max_episode_steps
  337. # 平衡点 = 0.5 / max_episode_steps
  338. ref_residual = 0.5 / self.max_episode_steps
  339. # 使用 tanh 构建惩罚曲线
  340. # - residual_ratio < 平衡点时,res_penalty > 0(奖励低污染)
  341. # - residual_ratio > 平衡点时,res_penalty < 0(惩罚高污染)
  342. # - k_res 控制曲线陡峭程度
  343. res_penalty = -np.tanh(self.reward_params.k_res * (residual_ratio / ref_residual - 1))
  344. # ========== 吨水电耗奖励项 ==========
  345. # 设置高/平衡/低点
  346. energy_low = 0.0993
  347. energy_high = 0.1034
  348. # 将能耗归一化到 [0, 1],平衡点对应 energy_norm = 0.5
  349. energy_norm = (energy - energy_low) / (energy_high - energy_low)
  350. # 使用 tanh 构建平滑奖励
  351. # - energy_norm < 0.5 时,energy_reward > 0(节能奖励)
  352. # - energy_norm > 0.5 时,energy_reward < 0(高能耗惩罚)
  353. # - k_energy 控制曲线陡峭程度
  354. energy_reward = -np.tanh(self.reward_params.k_energy * (energy_norm - 0.5))
  355. # ========== 组合奖励 ==========
  356. # 简单线性组合三项(为污染项加权)
  357. total_reward = rec_reward + 2.0 * res_penalty + energy_reward
  358. # 可选:添加平移项使特定点的奖励为零(当前未使用)
  359. # total_reward -= offset
  360. return total_reward, rec_reward, energy_reward, res_penalty