uf_env.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472
"""
Ultrafiltration reinforcement-learning environment module
==========================================================
This module defines the RL environment for an ultrafiltration (UF) system:
1. UFParams: UF system parameter configuration class
2. Membrane-resistance / trans-membrane-pressure conversion functions
3. simulate_one_supercycle: super-cycle simulation function
4. calculate_reward: reward function
5. is_dead_cycle: failure-detection function
6. UFSuperCycleEnv: Gymnasium environment class

Design notes:
- Built on the Gymnasium (formerly OpenAI Gym) standard interface
- Simulates "super cycles" of a UF membrane (several physical backwashes
  followed by one chemical backwash)
- The RL agent maximises water recovery while limiting fouling accumulation
  by optimising the filtration and backwash durations
"""
  16. import os
  17. import torch
  18. import numpy as np
  19. import gymnasium as gym
  20. from gymnasium import spaces
  21. from uf_train.env.env_params import UFState, UFPhysicsParams, UFStateBounds, UFRewardParams, UFActionSpec
  22. from uf_train.env.uf_physics import UFPhysicsModel
  23. from uf_train.env.env_reset import ResetSampler
  24. import copy
  25. class UFSuperCycleEnv(gym.Env):
  26. """
  27. 超滤系统强化学习环境(Gymnasium标准接口)
  28. 功能:
  29. - 模拟超滤膜的超级周期运行
  30. - 智能体在每个超级周期选择过滤时长和反洗时长
  31. - 目标:最大化回收率同时控制污染累积
  32. 状态空间 (8维,归一化到 [0,1]):
  33. 1. TMP0: 初始跨膜压差
  34. 2. q_UF: 过滤流量
  35. 3. temp: 水温
  36. 4. R0: 初始膜阻力
  37. 5. nuK: 短期污染系数
  38. 6. slope: 长期污染斜率
  39. 7. power: 长期污染幂次
  40. 8. ceb_removal: CEB去除能力
  41. 动作空间 (离散):
  42. - 二维离散动作组合:(过滤时长, 反洗时长)
  43. - 过滤时长: L_min_s ~ L_max_s,步长 L_step_s
  44. - 反洗时长: t_bw_min_s ~ t_bw_max_s,步长 t_bw_step_s
  45. - 总动作数 = len(L_values) × len(t_bw_values)
  46. 奖励机制:
  47. - 基于回收率和残余污染的平衡
  48. - 失败 (TMP超限、回收率过低、污染过快) 时给予大负奖励 (-10)
  49. 终止条件:
  50. - terminated: 违反运行约束(失败)
  51. - truncated: 达到最大步数 (max_episode_steps)
  52. """
  53. metadata = {"render_modes": ["human"]}
  54. def __init__(
  55. self,
  56. physics: UFPhysicsModel,
  57. reward_params: UFRewardParams,
  58. action_spec:UFActionSpec,
  59. statebounds:UFStateBounds,
  60. real_state_pool,
  61. max_episode_steps: int = 45,
  62. RANDOM_SEED = 1024
  63. ):
  64. """
  65. 超滤强化学习环境
  66. 参数:
  67. physics(UFPhysicsModel): 超滤物理模型
  68. reward_params(UFRewardParams): 奖励函数参数
  69. max_episode_steps (int): 每个episode的最大步数,默认45
  70. 注:每步代表一个超级周期(约2-3天),45步约三个月
  71. """
  72. super(UFSuperCycleEnv, self).__init__()
  73. self.RANDOM_SEED = RANDOM_SEED
  74. self.physics = physics
  75. self.reward_params = reward_params
  76. self.max_episode_steps = max_episode_steps
  77. self.current_step = 0
  78. # -------- 动作空间 --------
  79. self.action_spec = action_spec
  80. self.L_values = np.arange(
  81. self.action_spec.L_min_s,
  82. self.action_spec.L_max_s + self.action_spec.L_step_s,
  83. self.action_spec.L_step_s,
  84. )
  85. self.t_bw_values = np.arange(
  86. self.action_spec.t_bw_min_s,
  87. self.action_spec.t_bw_max_s + self.action_spec.t_bw_step_s,
  88. self.action_spec.t_bw_step_s,
  89. )
  90. self.num_L = len(self.L_values)
  91. self.num_bw = len(self.t_bw_values)
  92. self.action_space = spaces.Discrete(self.num_L * self.num_bw)
  93. # -------- 状态空间 --------
  94. self.observation_space = spaces.Box(
  95. low=0.0,
  96. high=1.0,
  97. shape=(8,),
  98. dtype=np.float32,
  99. )
  100. self.state_bounds = statebounds # 状态边界
  101. self.real_state_pool = real_state_pool
  102. self.reset_sampler = ResetSampler(
  103. bounds=self.state_bounds,
  104. physics=physics,
  105. real_state_pool=self.real_state_pool,
  106. max_resample_attempts=50,
  107. random_state=np.random.RandomState(RANDOM_SEED)
  108. )
  109. def _generate_initial_state(self) -> UFState | None:
  110. """
  111. 在 UFStateBounds 定义的范围内采样一个【合法】初始状态。
  112. 若采样失败(约束不满足)返回 None,由 reset() 负责重试。
  113. """
  114. b = self.state_bounds
  115. A = 128 * 40.0 # 有效膜面积
  116. # ---- 1. 基础工况 ----
  117. # ---- 随机生成 TMP、q_UF、温度 ----
  118. TMP0 = np.random.uniform(b.TMP0_min, b.TMP0_max)
  119. q_UF = np.random.uniform(b.q_UF_min, b.q_UF_max)
  120. temp = np.random.uniform(b.temp_min, b.temp_max)
  121. # ---- 2. 污染增长参数 ----
  122. slope = np.random.uniform(b.slope_min, b.slope_max)
  123. power = np.random.uniform(b.power_min, b.power_max)
  124. # ---- 3. 约束:污染增长速率可实现 ----
  125. t_max = 60 if power >= 1 else 1
  126. required_nuK_min = slope * power * (t_max ** (power - 1)) * (A / q_UF)
  127. # 若 required_nuK_min 超过可选范围 → 初始状态非法
  128. if required_nuK_min > b.nuK_max:
  129. return None
  130. # 在可行范围中采样 nuK
  131. nuK = np.random.uniform(
  132. max(required_nuK_min, b.nuK_min),
  133. b.nuK_max
  134. )
  135. # ---- 4. CEB 去除率 ----
  136. ceb_removal = np.random.uniform(
  137. b.ceb_removal_min,
  138. b.ceb_removal_max
  139. )
  140. # ---- 5. 初始膜阻力(物理模型) ----
  141. R0 = self.physics.calculate_initial_resistance(
  142. TMP=TMP0,
  143. q_UF=q_UF,
  144. temp=temp
  145. )
  146. return UFState(
  147. TMP=TMP0,
  148. q_UF=q_UF,
  149. temp=temp,
  150. R=R0,
  151. slope=slope,
  152. power=power,
  153. nuK=nuK,
  154. ceb_removal=ceb_removal,
  155. )
  156. def _get_training_progress(self) -> float:
  157. """
  158. 返回训练进度,用于 reset_sampler 的 curriculum sampling
  159. """
  160. return min(1.0, self.current_step / self.max_episode_steps )
  161. def reset(self, seed=None, options=None, max_attempts: int = 1000):
  162. super().reset(seed=seed)
  163. progress = self._get_training_progress()
  164. for _ in range(max_attempts):
  165. state = self.reset_sampler.sample(progress)
  166. if state is None:
  167. continue
  168. ok_run = self.physics.check_dead_initial_state(
  169. init_state=state,
  170. max_steps=self.max_episode_steps,
  171. L_s=self.action_spec.L_min_s,
  172. t_bw_s=self.action_spec.t_bw_max_s
  173. )
  174. if ok_run:
  175. self.state = state
  176. break
  177. else:
  178. raise RuntimeError("无法生成可行初始状态")
  179. self.current_step = 0
  180. self.tmp_over_limit_flag = False
  181. self.last_action = None
  182. self.max_TMP_during_filtration = self.state.TMP
  183. return self.get_obs(self.state), {}
  184. def _get_state_copy(self):
  185. return copy.deepcopy(self.state)
  186. def get_obs(self, state):
  187. """
  188. 构建当前环境归一化状态向量
  189. """
  190. # === 1. 从 state 读取动态参数 ===
  191. TMP = state.TMP
  192. q_UF = state.q_UF
  193. temp = state.temp
  194. # === 2. 计算本周期初始膜阻力 ===
  195. R = state.R
  196. # === 3. 从 self.state 读取膜阻力增长模型参数 ===
  197. nuk = state.nuK
  198. slope = state.slope
  199. power = state.power
  200. ceb_removal = state.ceb_removal
  201. # === 4. 从 current_params 动态读取上下限 ===
  202. TMP0_min, TMP0_max = self.state_bounds.TMP0_min, self.state_bounds.global_TMP_hard_limit
  203. q_UF_min, q_UF_max = self.state_bounds.q_UF_min, self.state_bounds.q_UF_max
  204. temp_min, temp_max = self.state_bounds.temp_min, self.state_bounds.temp_max
  205. nuK_min, nuK_max = self.state_bounds.nuK_min, self.state_bounds.nuK_max
  206. slope_min, slope_max = self.state_bounds.slope_min, self.state_bounds.slope_max
  207. power_min, power_max = self.state_bounds.power_min, self.state_bounds.power_max
  208. ceb_min, ceb_max = self.state_bounds.ceb_removal_min, self.state_bounds.ceb_removal_max
  209. # === 5. 归一化计算(clip防止越界) ===
  210. TMP0_norm = np.clip((TMP - TMP0_min) / (TMP0_max - TMP0_min), 0, 1)
  211. q_UF_norm = np.clip((q_UF - q_UF_min) / (q_UF_max - q_UF_min), 0, 1)
  212. temp_norm = np.clip((temp - temp_min) / (temp_max - temp_min), 0, 1)
  213. # R0 不在 current_params 中定义上下限,设定经验范围
  214. R0_norm = np.clip((R - 100.0) / (800.0 - 100.0), 0, 1)
  215. short_term_norm = np.clip((nuk - nuK_min) / (nuK_max - nuK_min), 0, 1)
  216. long_term_slope_norm = np.clip((slope - slope_min) / (slope_max - slope_min), 0, 1)
  217. long_term_power_norm = np.clip((power - power_min) / (power_max - power_min), 0, 1)
  218. ceb_removal_norm = np.clip((ceb_removal - ceb_min) / (ceb_max - ceb_min), 0, 1)
  219. # === 6. 构建观测向量 ===
  220. obs = np.array([
  221. TMP0_norm,
  222. q_UF_norm,
  223. temp_norm,
  224. R0_norm,
  225. short_term_norm,
  226. long_term_slope_norm,
  227. long_term_power_norm,
  228. ceb_removal_norm
  229. ], dtype=np.float32)
  230. return obs
  231. def get_action_values(self, action):
  232. """
  233. 将动作还原为实际时长
  234. """
  235. L_idx = action // self.num_bw
  236. t_bw_idx = action % self.num_bw
  237. return self.L_values[L_idx], self.t_bw_values[t_bw_idx]
  238. def step(self, action):
  239. self.current_step += 1
  240. L_s, t_bw_s = self.get_action_values(action)
  241. L_s = np.clip(L_s, self.action_spec.L_min_s, self.action_spec.L_max_s)
  242. t_bw_s = np.clip(t_bw_s, self.action_spec.t_bw_min_s, self.action_spec.t_bw_max_s)
  243. # 模拟超级周期
  244. info, next_state = self.physics.simulate_one_supercycle(state=self.state,L_s=L_s, t_bw_s=t_bw_s)
  245. # 根据 info 判断是否成功
  246. feasible = self.physics.is_dead_cycle(info) # True 表示成功循环,False 表示失败
  247. if info["max_TMP_during_filtration"] >= self.reward_params.global_TMP_hard_limit:
  248. self.tmp_over_limit_flag = True
  249. # ================== 孤立观察下一周期 ==================
  250. info_next = None
  251. if info["max_TMP_during_filtration"] > self.reward_params.global_TMP_soft_limit:
  252. info_next, _ = self.physics.simulate_one_supercycle(state=next_state,L_s=L_s,t_bw_s=t_bw_s)
  253. reward, tmp_penalty, rec_reward, energy_reward, res_penalty = self._calculate_reward(info, info_next)
  254. info["tmp_penalty"] = tmp_penalty
  255. info["rec_reward"] = rec_reward
  256. info["energy_reward"] = energy_reward
  257. info["res_penalty"] = res_penalty
  258. self.state = next_state
  259. terminated = False
  260. # 判断是否到达最大步数
  261. truncated = self.current_step >= self.max_episode_steps
  262. self.last_action = (L_s, t_bw_s)
  263. next_obs = self.get_obs(next_state)
  264. info["feasible"] = feasible
  265. info["step"] = self.current_step
  266. info["L_s"] = L_s.copy()
  267. info["t_bw_s"] = t_bw_s.copy()
  268. # # ===================== 测试终末奖励:鼓励 TMP 接近初始状态 =====================
  269. # # 仅在 episode 自然结束(满步但未提前失败)时触发
  270. # if truncated and not terminated:
  271. # TMP_initial = self.TMP0 # reset 时记录的初始 TMP
  272. # TMP_final = next_obs[0] # next_obs 提供的最终 TMP
  273. #
  274. # delta_ratio = abs((TMP_final - TMP_initial) / TMP_initial)
  275. #
  276. # alpha = 4.0 # TMP 偏差敏感度
  277. # gamma = 5.0 # 奖励幅度
  278. # stability_reward = gamma * (np.exp(-alpha * delta_ratio) - 1) # 量级在0到-5之间
  279. #
  280. # reward += stability_reward
  281. # terminated = True # episode 正式结束
  282. # # ===================== 测试结果 =====================
  283. # 增加该奖励后强化学习依然能保证奖励收敛,但是损失函数在2-3之间反复震荡,无法降低,见reward_test&loss_test
  284. # 原设想是只能听在大额偏移发生前能通过该奖励学习到提前减小偏移步伐,但是实际训练时该惩罚反复被触发
  285. # 推测是终末的大额奖惩无法有效传递回过往时间步引导智能体学习,可能由于状态中缺少预测值,智能体会将其观测为不可控事件,暂时不添加该奖励,TODO:等待优化
  286. return next_obs, reward, terminated, truncated, info
  287. def _calculate_reward(self, info: dict, info_next) -> float:
  288. """
  289. 计算强化学习奖励函数(扩展版)
  290. 功能:
  291. - 平衡回收率、残余污染和吨水电耗三个目标
  292. - TMP不直接参与奖励计算(通过失败判定间接影响)
  293. - 使用 tanh 函数实现平滑的非线性奖励
  294. 参数:
  295. info (dict): 周期性能指标字典,需包含
  296. - recovery: 回收率 [0-1]
  297. - R_after_ceb: 本周期结束膜阻力
  298. - initial_R: 本周期初始膜阻力
  299. - delta_R_allow: 本周期允许最大阻力上升
  300. - ton_water_energy_kWh_per_m3: 本周期吨水电耗
  301. 返回:
  302. float: 奖励值(通常在 -3 到 +3 之间)
  303. 设计思想:
  304. - 高回收率 → 水资源利用率高 → 正奖励
  305. - 低残余污染 → 膜长期稳定运行 → 正奖励
  306. - 低吨水电耗 → 节能 → 正奖励
  307. - 三者需要权衡:过短的过滤时间提高回收率但污染去除不彻底;过长时间污染控制好但回收率下降,过高功率增加耗能
  308. 参考点设计:
  309. - 残余污染:
  310. - 高污染参考点 = 1 / self.max_episode_steps
  311. - 平衡点 = 0.5 / self.max_episode_steps
  312. - 吨水电耗:
  313. - 高点 = 0.1034 kWh/m³
  314. - 平衡点 = 0.1011 kWh/m³
  315. - 低点 = 0.0993 kWh/m³
  316. - 回收率参考点保持原有设计
  317. """
  318. #新增:将TMP超限改为持续惩罚
  319. # ========== TMP 超标风险惩罚项 ==========
  320. tmp = info["max_TMP_during_filtration"]
  321. tmp_soft = self.reward_params.global_TMP_soft_limit
  322. tmp_hard = self.reward_params.global_TMP_hard_limit
  323. if self.tmp_over_limit_flag:
  324. tmp_state_penalty = -self.reward_params.w_tmp_hard
  325. elif tmp <= tmp_soft:
  326. tmp_state_penalty = 0.0
  327. elif tmp < tmp_hard:
  328. x = (tmp - tmp_soft) / (tmp_hard - tmp_soft)
  329. tmp_state_penalty = -self.reward_params.w_tmp * x ** self.reward_params.p
  330. # -------- TMP 趋势惩罚 --------
  331. tmp_trend_penalty = 0.0
  332. if info_next is not None:
  333. delta_tmp = info_next["max_TMP_during_filtration"] - tmp
  334. tmp_trend_penalty = -self.reward_params.w_trend * delta_tmp
  335. tmp_penalty = tmp_state_penalty + tmp_trend_penalty
  336. # ========== 提取性能指标 ==========
  337. recovery = info["recovery"] # 回收率 [0-1]
  338. # 污染比例:实际上升的阻力 / 允许上升的阻力
  339. # 允许上升的阻力值 = 当前阻力值软上限 - 当前阻力
  340. residual_ratio = info['residual_ratio']
  341. # 吨水电耗指标
  342. energy = info["ton_water_energy_kWh_per_m3"]
  343. # ========== 回收率奖励项 ==========
  344. # 将回收率归一化到 [0, 1] 区间(基于预期范围)
  345. rec_norm = (recovery - self.reward_params.rec_low) / (self.reward_params.rec_high - self.reward_params.rec_low)
  346. # 使用 tanh 函数构建平滑的 S 型奖励曲线
  347. # - rec_norm = 0.5 时(回收率处于中间值),rec_reward = 0
  348. # - rec_norm > 0.5 时,rec_reward > 0(鼓励高回收率)
  349. # - rec_norm < 0.5 时,rec_reward < 0(惩罚低回收率)
  350. # - k_rec 控制曲线陡峭程度,越大变化越陡
  351. rec_reward = np.clip(np.tanh(self.reward_params.k_rec * (rec_norm - 0.5)), -1, 1)
  352. # ========== 残余污染惩罚项 ==========
  353. # 新参考点:每步允许上升比例 = 1 / max_episode_steps
  354. # 平衡点 = 0.8 / max_episode_steps
  355. ref_residual = 0.8 / self.max_episode_steps
  356. # 使用 tanh 构建惩罚曲线
  357. # - residual_ratio < 平衡点时,res_penalty > 0(奖励低污染)
  358. # - residual_ratio > 平衡点时,res_penalty < 0(惩罚高污染)
  359. # - k_res 控制曲线陡峭程度
  360. res_penalty = -np.tanh(self.reward_params.k_res * (residual_ratio / ref_residual - 1))
  361. # ========== 吨水电耗奖励项 ==========
  362. # 设置高/平衡/低点
  363. energy_low = 0.0993
  364. energy_high = 0.1034
  365. # 将能耗归一化到 [0, 1],平衡点对应 energy_norm = 0.5
  366. energy_norm = (energy - energy_low) / (energy_high - energy_low)
  367. # 使用 tanh 构建平滑奖励
  368. # - energy_norm < 0.5 时,energy_reward > 0(节能奖励)
  369. # - energy_norm > 0.5 时,energy_reward < 0(高能耗惩罚)
  370. # - k_energy 控制曲线陡峭程度
  371. energy_reward = -np.tanh(self.reward_params.k_energy * (energy_norm - 0.5))
  372. # ========== 组合奖励 ==========
  373. # 简单线性组合三项(为污染项加权)
  374. total_reward = rec_reward + 2.0 * res_penalty + energy_reward + tmp_penalty
  375. # 可选:添加平移项使特定点的奖励为零(当前未使用)
  376. # total_reward -= offset
  377. return total_reward, tmp_penalty, rec_reward, energy_reward, res_penalty