uf_env.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470
  1. """
  2. 超滤强化学习环境模块
  3. ========================
  4. 本模块定义了超滤系统的强化学习环境,包括:
  5. 1. UFParams: 超滤系统参数配置类
  6. 2. 膜阻力与跨膜压差转换函数
  7. 3. simulate_one_supercycle: 超级周期模拟函数
  8. 4. calculate_reward: 奖励函数
  9. 5. is_dead_cycle: 失败判定函数
  10. 6. UFSuperCycleEnv: Gymnasium环境类
  11. 模块设计说明:
  12. - 基于 Gymnasium (原OpenAI Gym) 标准接口
  13. - 模拟超滤膜的"超级周期"运行(多次物理反洗 + 一次化学反洗)
  14. - 强化学习智能体通过优化过滤时长和反洗时长来最大化回收率并控制污染累积
  15. """
  16. import numpy as np
  17. import gymnasium as gym
  18. from gymnasium import spaces
  19. from env.env_params import UFState, UFStateBounds, UFRewardParams, UFActionSpec
  20. from env.uf_physics import UFPhysicsModel
  21. from env.env_reset import ResetSampler
  22. import copy
  23. class UFSuperCycleEnv(gym.Env):
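    """
    Reinforcement-learning environment for an ultrafiltration (UF) system
    (standard Gymnasium interface).

    Purpose:
    - Simulates the UF membrane one "super-cycle" per step.
    - In every super-cycle the agent chooses a filtration duration and a
      backwash duration.
    - Goal: maximise recovery while keeping fouling accumulation under control.

    Observation space (8 values, each normalised to [0, 1]):
    1. TMP0: initial transmembrane pressure
    2. q_UF: filtration flow
    3. temp: water temperature
    4. R0: initial membrane resistance
    5. nuK: short-term fouling coefficient
    6. slope: long-term fouling slope
    7. power: long-term fouling exponent
    8. ceb_removal: CEB removal capability

    Action space (discrete):
    - Flattened 2-D grid of (filtration duration, backwash duration).
    - Filtration duration: L_min_s ~ L_max_s in steps of L_step_s.
    - Backwash duration: t_bw_min_s ~ t_bw_max_s in steps of t_bw_step_s.
    - Number of actions = len(L_values) × len(t_bw_values).

    Reward:
    - Combines a recovery term, a residual-fouling term, an energy term and a
      TMP penalty term.
    - TMP above the soft limit is penalised progressively; once TMP has
      reached the hard limit, a fixed penalty (reward_params.w_tmp_hard) is
      applied on every remaining step of the episode.

    Episode end:
    - terminated: step() currently always reports False (a constraint
      violation is penalised, it does not end the episode).
    - truncated: True once current_step reaches max_episode_steps.
    """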
  51. metadata = {"render_modes": ["human"]}
  52. def __init__(
  53. self,
  54. physics: UFPhysicsModel,
  55. reward_params: UFRewardParams,
  56. action_spec:UFActionSpec,
  57. statebounds:UFStateBounds,
  58. real_state_pool,
  59. max_episode_steps: int = 45,
  60. RANDOM_SEED = 1024
  61. ):
  62. """
  63. 超滤强化学习环境
  64. 参数:
  65. physics(UFPhysicsModel): 超滤物理模型
  66. reward_params(UFRewardParams): 奖励函数参数
  67. max_episode_steps (int): 每个episode的最大步数,默认45
  68. 注:每步代表一个超级周期(约2-3天),45步约三个月
  69. """
  70. super(UFSuperCycleEnv, self).__init__()
  71. self.RANDOM_SEED = RANDOM_SEED
  72. self.physics = physics
  73. self.reward_params = reward_params
  74. self.max_episode_steps = max_episode_steps
  75. self.current_step = 0
  76. # -------- 动作空间 --------
  77. self.action_spec = action_spec
  78. self.L_values = np.arange(
  79. self.action_spec.L_min_s,
  80. self.action_spec.L_max_s + self.action_spec.L_step_s,
  81. self.action_spec.L_step_s,
  82. )
  83. self.t_bw_values = np.arange(
  84. self.action_spec.t_bw_min_s,
  85. self.action_spec.t_bw_max_s + self.action_spec.t_bw_step_s,
  86. self.action_spec.t_bw_step_s,
  87. )
  88. self.num_L = len(self.L_values)
  89. self.num_bw = len(self.t_bw_values)
  90. self.action_space = spaces.Discrete(self.num_L * self.num_bw)
  91. # -------- 状态空间 --------
  92. self.observation_space = spaces.Box(
  93. low=0.0,
  94. high=1.0,
  95. shape=(8,),
  96. dtype=np.float32,
  97. )
  98. self.state_bounds = statebounds # 状态边界
  99. self.real_state_pool = real_state_pool
  100. self.reset_sampler = ResetSampler(
  101. bounds=self.state_bounds,
  102. physics=physics,
  103. real_state_pool=self.real_state_pool,
  104. max_resample_attempts=50,
  105. random_state=np.random.RandomState(RANDOM_SEED)
  106. )
  107. def _generate_initial_state(self) -> UFState | None:
  108. """
  109. 在 UFStateBounds 定义的范围内采样一个【合法】初始状态。
  110. 若采样失败(约束不满足)返回 None,由 reset() 负责重试。
  111. """
  112. b = self.state_bounds
  113. A = 128 * 40.0 # 有效膜面积
  114. # ---- 1. 基础工况 ----
  115. # ---- 随机生成 TMP、q_UF、温度 ----
  116. TMP0 = np.random.uniform(b.TMP0_min, b.TMP0_max)
  117. q_UF = np.random.uniform(b.q_UF_min, b.q_UF_max)
  118. temp = np.random.uniform(b.temp_min, b.temp_max)
  119. # ---- 2. 污染增长参数 ----
  120. slope = np.random.uniform(b.slope_min, b.slope_max)
  121. power = np.random.uniform(b.power_min, b.power_max)
  122. # ---- 3. 约束:污染增长速率可实现 ----
  123. t_max = 60 if power >= 1 else 1
  124. required_nuK_min = slope * power * (t_max ** (power - 1)) * (A / q_UF)
  125. # 若 required_nuK_min 超过可选范围 → 初始状态非法
  126. if required_nuK_min > b.nuK_max:
  127. return None
  128. # 在可行范围中采样 nuK
  129. nuK = np.random.uniform(
  130. max(required_nuK_min, b.nuK_min),
  131. b.nuK_max
  132. )
  133. # ---- 4. CEB 去除率 ----
  134. ceb_removal = np.random.uniform(
  135. b.ceb_removal_min,
  136. b.ceb_removal_max
  137. )
  138. # ---- 5. 初始膜阻力(物理模型) ----
  139. R0 = self.physics.calculate_initial_resistance(
  140. TMP=TMP0,
  141. q_UF=q_UF,
  142. temp=temp
  143. )
  144. return UFState(
  145. TMP=TMP0,
  146. q_UF=q_UF,
  147. temp=temp,
  148. R=R0,
  149. slope=slope,
  150. power=power,
  151. nuK=nuK,
  152. ceb_removal=ceb_removal,
  153. )
  154. def _get_training_progress(self) -> float:
  155. """
  156. 返回训练进度,用于 reset_sampler 的 curriculum sampling
  157. """
  158. return min(1.0, self.current_step / self.max_episode_steps )
  159. def reset(self, seed=None, options=None, max_attempts: int = 1000):
  160. super().reset(seed=seed)
  161. progress = self._get_training_progress()
  162. for _ in range(max_attempts):
  163. state = self.reset_sampler.sample(progress)
  164. if state is None:
  165. continue
  166. ok_run = self.physics.check_dead_initial_state(
  167. init_state=state,
  168. max_steps=self.max_episode_steps,
  169. L_s=self.action_spec.L_min_s,
  170. t_bw_s=self.action_spec.t_bw_max_s
  171. )
  172. if ok_run:
  173. self.state = state
  174. break
  175. else:
  176. raise RuntimeError("无法生成可行初始状态")
  177. self.current_step = 0
  178. self.tmp_over_limit_flag = False
  179. self.last_action = None
  180. self.max_TMP_during_filtration = self.state.TMP
  181. return self.get_obs(self.state), {}
  182. def _get_state_copy(self):
  183. return copy.deepcopy(self.state)
  184. def get_obs(self, state):
  185. """
  186. 构建当前环境归一化状态向量
  187. """
  188. # === 1. 从 state 读取动态参数 ===
  189. TMP = state.TMP
  190. q_UF = state.q_UF
  191. temp = state.temp
  192. # === 2. 计算本周期初始膜阻力 ===
  193. R = state.R
  194. # === 3. 从 self.state 读取膜阻力增长模型参数 ===
  195. nuk = state.nuK
  196. slope = state.slope
  197. power = state.power
  198. ceb_removal = state.ceb_removal
  199. # === 4. 从 current_params 动态读取上下限 ===
  200. TMP0_min, TMP0_max = self.state_bounds.TMP0_min, self.state_bounds.global_TMP_hard_limit
  201. q_UF_min, q_UF_max = self.state_bounds.q_UF_min, self.state_bounds.q_UF_max
  202. temp_min, temp_max = self.state_bounds.temp_min, self.state_bounds.temp_max
  203. nuK_min, nuK_max = self.state_bounds.nuK_min, self.state_bounds.nuK_max
  204. slope_min, slope_max = self.state_bounds.slope_min, self.state_bounds.slope_max
  205. power_min, power_max = self.state_bounds.power_min, self.state_bounds.power_max
  206. ceb_min, ceb_max = self.state_bounds.ceb_removal_min, self.state_bounds.ceb_removal_max
  207. # === 5. 归一化计算(clip防止越界) ===
  208. TMP0_norm = np.clip((TMP - TMP0_min) / (TMP0_max - TMP0_min), 0, 1)
  209. q_UF_norm = np.clip((q_UF - q_UF_min) / (q_UF_max - q_UF_min), 0, 1)
  210. temp_norm = np.clip((temp - temp_min) / (temp_max - temp_min), 0, 1)
  211. # R0 不在 current_params 中定义上下限,设定经验范围
  212. R0_norm = np.clip((R - 100.0) / (800.0 - 100.0), 0, 1)
  213. short_term_norm = np.clip((nuk - nuK_min) / (nuK_max - nuK_min), 0, 1)
  214. long_term_slope_norm = np.clip((slope - slope_min) / (slope_max - slope_min), 0, 1)
  215. long_term_power_norm = np.clip((power - power_min) / (power_max - power_min), 0, 1)
  216. ceb_removal_norm = np.clip((ceb_removal - ceb_min) / (ceb_max - ceb_min), 0, 1)
  217. # === 6. 构建观测向量 ===
  218. obs = np.array([
  219. TMP0_norm,
  220. q_UF_norm,
  221. temp_norm,
  222. R0_norm,
  223. short_term_norm,
  224. long_term_slope_norm,
  225. long_term_power_norm,
  226. ceb_removal_norm
  227. ], dtype=np.float32)
  228. return obs
  229. def get_action_values(self, action):
  230. """
  231. 将动作还原为实际时长
  232. """
  233. L_idx = action // self.num_bw
  234. t_bw_idx = action % self.num_bw
  235. return self.L_values[L_idx], self.t_bw_values[t_bw_idx]
  236. def step(self, action):
  237. self.current_step += 1
  238. L_s, t_bw_s = self.get_action_values(action)
  239. L_s = np.clip(L_s, self.action_spec.L_min_s, self.action_spec.L_max_s)
  240. t_bw_s = np.clip(t_bw_s, self.action_spec.t_bw_min_s, self.action_spec.t_bw_max_s)
  241. # 模拟超级周期
  242. info, next_state = self.physics.simulate_one_supercycle(state=self.state,L_s=L_s, t_bw_s=t_bw_s)
  243. # 根据 info 判断是否成功
  244. feasible = self.physics.is_dead_cycle(info) # True 表示成功循环,False 表示失败
  245. if info["max_TMP_during_filtration"] >= self.reward_params.global_TMP_hard_limit:
  246. self.tmp_over_limit_flag = True
  247. # ================== 孤立观察下一周期 ==================
  248. info_next = None
  249. if info["max_TMP_during_filtration"] > self.reward_params.global_TMP_soft_limit:
  250. info_next, _ = self.physics.simulate_one_supercycle(state=next_state,L_s=L_s,t_bw_s=t_bw_s)
  251. reward, tmp_penalty, rec_reward, energy_reward, res_penalty = self._calculate_reward(info, info_next)
  252. info["tmp_penalty"] = tmp_penalty
  253. info["rec_reward"] = rec_reward
  254. info["energy_reward"] = energy_reward
  255. info["res_penalty"] = res_penalty
  256. self.state = next_state
  257. terminated = False
  258. # 判断是否到达最大步数
  259. truncated = self.current_step >= self.max_episode_steps
  260. self.last_action = (L_s, t_bw_s)
  261. next_obs = self.get_obs(next_state)
  262. info["feasible"] = feasible
  263. info["step"] = self.current_step
  264. info["L_s"] = L_s.copy()
  265. info["t_bw_s"] = t_bw_s.copy()
  266. # # ===================== 测试终末奖励:鼓励 TMP 接近初始状态 =====================
  267. # # 仅在 episode 自然结束(满步但未提前失败)时触发
  268. # if truncated and not terminated:
  269. # TMP_initial = self.TMP0 # reset 时记录的初始 TMP
  270. # TMP_final = next_obs[0] # next_obs 提供的最终 TMP
  271. #
  272. # delta_ratio = abs((TMP_final - TMP_initial) / TMP_initial)
  273. #
  274. # alpha = 4.0 # TMP 偏差敏感度
  275. # gamma = 5.0 # 奖励幅度
  276. # stability_reward = gamma * (np.exp(-alpha * delta_ratio) - 1) # 量级在0到-5之间
  277. #
  278. # reward += stability_reward
  279. # terminated = True # episode 正式结束
  280. # # ===================== 测试结果 =====================
  281. # 增加该奖励后强化学习依然能保证奖励收敛,但是损失函数在2-3之间反复震荡,无法降低,见reward_test&loss_test
  282. # 原设想是只能听在大额偏移发生前能通过该奖励学习到提前减小偏移步伐,但是实际训练时该惩罚反复被触发
  283. # 推测是终末的大额奖惩无法有效传递回过往时间步引导智能体学习,可能由于状态中缺少预测值,智能体会将其观测为不可控事件,暂时不添加该奖励,TODO:等待优化
  284. return next_obs, reward, terminated, truncated, info
  285. def _calculate_reward(self, info: dict, info_next) -> float:
  286. """
  287. 计算强化学习奖励函数(扩展版)
  288. 功能:
  289. - 平衡回收率、残余污染和吨水电耗三个目标
  290. - TMP不直接参与奖励计算(通过失败判定间接影响)
  291. - 使用 tanh 函数实现平滑的非线性奖励
  292. 参数:
  293. info (dict): 周期性能指标字典,需包含
  294. - recovery: 回收率 [0-1]
  295. - R_after_ceb: 本周期结束膜阻力
  296. - initial_R: 本周期初始膜阻力
  297. - delta_R_allow: 本周期允许最大阻力上升
  298. - ton_water_energy_kWh_per_m3: 本周期吨水电耗
  299. 返回:
  300. float: 奖励值(通常在 -3 到 +3 之间)
  301. 设计思想:
  302. - 高回收率 → 水资源利用率高 → 正奖励
  303. - 低残余污染 → 膜长期稳定运行 → 正奖励
  304. - 低吨水电耗 → 节能 → 正奖励
  305. - 三者需要权衡:过短的过滤时间提高回收率但污染去除不彻底;过长时间污染控制好但回收率下降,过高功率增加耗能
  306. 参考点设计:
  307. - 残余污染:
  308. - 高污染参考点 = 1 / self.max_episode_steps
  309. - 平衡点 = 0.5 / self.max_episode_steps
  310. - 吨水电耗:
  311. - 高点 = 0.1034 kWh/m³
  312. - 平衡点 = 0.1011 kWh/m³
  313. - 低点 = 0.0993 kWh/m³
  314. - 回收率参考点保持原有设计
  315. """
  316. #新增:将TMP超限改为持续惩罚
  317. # ========== TMP 超标风险惩罚项 ==========
  318. tmp = info["max_TMP_during_filtration"]
  319. tmp_soft = self.reward_params.global_TMP_soft_limit
  320. tmp_hard = self.reward_params.global_TMP_hard_limit
  321. if self.tmp_over_limit_flag:
  322. tmp_state_penalty = -self.reward_params.w_tmp_hard
  323. elif tmp <= tmp_soft:
  324. tmp_state_penalty = 0.0
  325. elif tmp < tmp_hard:
  326. x = (tmp - tmp_soft) / (tmp_hard - tmp_soft)
  327. tmp_state_penalty = -self.reward_params.w_tmp * x ** self.reward_params.p
  328. # -------- TMP 趋势惩罚 --------
  329. tmp_trend_penalty = 0.0
  330. if info_next is not None:
  331. delta_tmp = info_next["max_TMP_during_filtration"] - tmp
  332. tmp_trend_penalty = -self.reward_params.w_trend * delta_tmp
  333. tmp_penalty = tmp_state_penalty + tmp_trend_penalty
  334. # ========== 提取性能指标 ==========
  335. recovery = info["recovery"] # 回收率 [0-1]
  336. # 污染比例:实际上升的阻力 / 允许上升的阻力
  337. # 允许上升的阻力值 = 当前阻力值软上限 - 当前阻力
  338. residual_ratio = info['residual_ratio']
  339. # 吨水电耗指标
  340. energy = info["ton_water_energy_kWh_per_m3"]
  341. # ========== 回收率奖励项 ==========
  342. # 将回收率归一化到 [0, 1] 区间(基于预期范围)
  343. rec_norm = (recovery - self.reward_params.rec_low) / (self.reward_params.rec_high - self.reward_params.rec_low)
  344. # 使用 tanh 函数构建平滑的 S 型奖励曲线
  345. # - rec_norm = 0.5 时(回收率处于中间值),rec_reward = 0
  346. # - rec_norm > 0.5 时,rec_reward > 0(鼓励高回收率)
  347. # - rec_norm < 0.5 时,rec_reward < 0(惩罚低回收率)
  348. # - k_rec 控制曲线陡峭程度,越大变化越陡
  349. rec_reward = np.clip(np.tanh(self.reward_params.k_rec * (rec_norm - 0.5)), -1, 1)
  350. # ========== 残余污染惩罚项 ==========
  351. # 新参考点:每步允许上升比例 = 1 / max_episode_steps
  352. # 平衡点 = 0.8 / max_episode_steps
  353. ref_residual = 0.8 / self.max_episode_steps
  354. # 使用 tanh 构建惩罚曲线
  355. # - residual_ratio < 平衡点时,res_penalty > 0(奖励低污染)
  356. # - residual_ratio > 平衡点时,res_penalty < 0(惩罚高污染)
  357. # - k_res 控制曲线陡峭程度
  358. res_penalty = -np.tanh(self.reward_params.k_res * (residual_ratio / ref_residual - 1))
  359. # ========== 吨水电耗奖励项 ==========
  360. # 设置高/平衡/低点
  361. energy_low = 0.0993
  362. energy_high = 0.1034
  363. # 将能耗归一化到 [0, 1],平衡点对应 energy_norm = 0.5
  364. energy_norm = (energy - energy_low) / (energy_high - energy_low)
  365. # 使用 tanh 构建平滑奖励
  366. # - energy_norm < 0.5 时,energy_reward > 0(节能奖励)
  367. # - energy_norm > 0.5 时,energy_reward < 0(高能耗惩罚)
  368. # - k_energy 控制曲线陡峭程度
  369. energy_reward = -np.tanh(self.reward_params.k_energy * (energy_norm - 0.5))
  370. # ========== 组合奖励 ==========
  371. # 简单线性组合三项(为污染项加权)
  372. total_reward = rec_reward + 2.0 * res_penalty + energy_reward + tmp_penalty
  373. # 可选:添加平移项使特定点的奖励为零(当前未使用)
  374. # total_reward -= offset
  375. return total_reward, tmp_penalty, rec_reward, energy_reward, res_penalty