uf_env.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472
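"""
Ultrafiltration (UF) reinforcement-learning environment module
===============================================================
This module defines the reinforcement-learning environment of the UF system.
The original overview lists the following components:
1. UFParams: UF system parameter configuration class
2. Conversion functions between membrane resistance and transmembrane
   pressure (TMP)
3. simulate_one_supercycle: super-cycle simulation function
4. calculate_reward: reward function
5. is_dead_cycle: failure check
6. UFSuperCycleEnv: Gymnasium environment class

NOTE(review): of the items above, only UFSuperCycleEnv is defined in this
file. The state/bounds/reward/action parameter types are imported from
env.env_params, simulate_one_supercycle and is_dead_cycle are called as
methods of the UFPhysicsModel imported from env.uf_physics, and the reward is
computed by UFSuperCycleEnv._calculate_reward. Confirm and update this list.

Design notes:
- Built on the standard Gymnasium (formerly OpenAI Gym) interface.
- Simulates "super-cycle" operation of a UF membrane (several physical
  backwashes followed by one chemical backwash).
- The RL agent optimises the filtration duration and the backwash duration in
  order to maximise recovery while keeping fouling accumulation under control.
"""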
  16. import numpy as np
  17. import gymnasium as gym
  18. from gymnasium import spaces
  19. from env.env_params import UFState, UFStateBounds, UFRewardParams, UFActionSpec
  20. from env.uf_physics import UFPhysicsModel
  21. from env.env_reset import ResetSampler
  22. import copy
class UFSuperCycleEnv(gym.Env):
    """
    Reinforcement-learning environment for an ultrafiltration (UF) system,
    exposed through the standard Gymnasium interface.

    Purpose:
    - Simulates super-cycle operation of a UF membrane.
    - In every super cycle the agent chooses a filtration duration and a
      backwash duration.
    - Stated goal: maximise recovery while keeping fouling accumulation
      under control.

    Observation space (8 dimensions, each normalised to [0, 1]):
    1. TMP0: initial transmembrane pressure
    2. q_UF: filtration flow
    3. temp: water temperature
    4. R0: initial membrane resistance
    5. nuK: short-term fouling coefficient
    6. slope: long-term fouling slope
    7. power: long-term fouling exponent
    8. ceb_removal: CEB removal capability

    Action space (discrete):
    - A flattened two-dimensional combination
      (filtration duration, backwash duration).
    - Filtration duration: L_min_s ~ L_max_s in steps of L_step_s.
    - Backwash duration: t_bw_min_s ~ t_bw_max_s in steps of t_bw_step_s.
    - Number of actions = len(L_values) x len(t_bw_values).

    Reward:
    - As implemented in _calculate_reward: the sum of an economic term
      (energy plus chemical cost per ton of water), a residual-fouling term
      and a TMP penalty.
    - NOTE(review): the original description promised a fixed large negative
      reward (-10) on failure (TMP over limit, recovery too low, fouling too
      fast). In the code shown here the hard-limit penalty is
      -reward_params.w_tmp_hard; confirm which is intended.

    Episode end:
    - terminated: documented as "operating constraint violated (failure)".
      NOTE(review): step() as shown always returns terminated = False;
      confirm whether failure should end the episode.
    - truncated: the maximum number of steps (max_episode_steps) is reached.
    """
    metadata = {"render_modes": ["human"]}
  52. def __init__(
  53. self,
  54. physics: UFPhysicsModel,
  55. reward_params: UFRewardParams,
  56. action_spec:UFActionSpec,
  57. statebounds:UFStateBounds,
  58. real_state_pool,
  59. max_episode_steps: int = 45,
  60. RANDOM_SEED = 1024
  61. ):
  62. """
  63. 超滤强化学习环境
  64. 参数:
  65. physics(UFPhysicsModel): 超滤物理模型
  66. reward_params(UFRewardParams): 奖励函数参数
  67. max_episode_steps (int): 每个episode的最大步数,默认45
  68. 注:每步代表一个超级周期(约2-3天),45步约三个月
  69. """
  70. super(UFSuperCycleEnv, self).__init__()
  71. self.RANDOM_SEED = RANDOM_SEED
  72. self.physics = physics
  73. self.reward_params = reward_params
  74. self.max_episode_steps = max_episode_steps
  75. self.current_step = 0
  76. # -------- 动作空间 --------
  77. self.action_spec = action_spec
  78. self.L_values = np.arange(
  79. self.action_spec.L_min_s,
  80. self.action_spec.L_max_s + self.action_spec.L_step_s,
  81. self.action_spec.L_step_s,
  82. )
  83. self.t_bw_values = np.arange(
  84. self.action_spec.t_bw_min_s,
  85. self.action_spec.t_bw_max_s + self.action_spec.t_bw_step_s,
  86. self.action_spec.t_bw_step_s,
  87. )
  88. self.num_L = len(self.L_values)
  89. self.num_bw = len(self.t_bw_values)
  90. self.action_space = spaces.Discrete(self.num_L * self.num_bw)
  91. # -------- 状态空间 --------
  92. self.observation_space = spaces.Box(
  93. low=0.0,
  94. high=1.0,
  95. shape=(8,),
  96. dtype=np.float32,
  97. )
  98. self.state_bounds = statebounds # 状态边界
  99. self.real_state_pool = real_state_pool
  100. self.reset_sampler = ResetSampler(
  101. bounds=self.state_bounds,
  102. physics=physics,
  103. real_state_pool=self.real_state_pool,
  104. max_resample_attempts=50,
  105. random_state=np.random.RandomState(RANDOM_SEED)
  106. )
  107. def _generate_initial_state(self) -> UFState | None:
  108. """
  109. 在 UFStateBounds 定义的范围内采样一个【合法】初始状态。
  110. 若采样失败(约束不满足)返回 None,由 reset() 负责重试。
  111. """
  112. b = self.state_bounds
  113. A = 128 * 40.0 # 有效膜面积
  114. # ---- 1. 基础工况 ----
  115. # ---- 随机生成 TMP、q_UF、温度 ----
  116. TMP0 = np.random.uniform(b.TMP0_min, b.TMP0_max)
  117. q_UF = np.random.uniform(b.q_UF_min, b.q_UF_max)
  118. temp = np.random.uniform(b.temp_min, b.temp_max)
  119. # ---- 2. 污染增长参数 ----
  120. slope = np.random.uniform(b.slope_min, b.slope_max)
  121. power = np.random.uniform(b.power_min, b.power_max)
  122. # ---- 3. 约束:污染增长速率可实现 ----
  123. t_max = 60 if power >= 1 else 1
  124. required_nuK_min = slope * power * (t_max ** (power - 1)) * (A / q_UF)
  125. # 若 required_nuK_min 超过可选范围 → 初始状态非法
  126. if required_nuK_min > b.nuK_max:
  127. return None
  128. # 在可行范围中采样 nuK
  129. nuK = np.random.uniform(
  130. max(required_nuK_min, b.nuK_min),
  131. b.nuK_max
  132. )
  133. # ---- 4. CEB 去除率 ----
  134. ceb_removal = np.random.uniform(
  135. b.ceb_removal_min,
  136. b.ceb_removal_max
  137. )
  138. # ---- 5. 初始膜阻力(物理模型) ----
  139. R0 = self.physics.calculate_initial_resistance(
  140. TMP=TMP0,
  141. q_UF=q_UF,
  142. temp=temp
  143. )
  144. return UFState(
  145. TMP=TMP0,
  146. q_UF=q_UF,
  147. temp=temp,
  148. R=R0,
  149. slope=slope,
  150. power=power,
  151. nuK=nuK,
  152. ceb_removal=ceb_removal,
  153. )
  154. def _get_training_progress(self) -> float:
  155. """
  156. 返回训练进度,用于 reset_sampler 的 curriculum sampling
  157. """
  158. return min(1.0, self.current_step / self.max_episode_steps )
  159. def reset(self, seed=None, options=None, max_attempts: int = 1000):
  160. super().reset(seed=seed)
  161. progress = self._get_training_progress()
  162. for _ in range(max_attempts):
  163. state = self.reset_sampler.sample(progress)
  164. if state is None:
  165. continue
  166. ok_run = self.physics.check_dead_initial_state(
  167. init_state=state,
  168. max_steps=self.max_episode_steps,
  169. L_s=self.action_spec.L_min_s,
  170. t_bw_s=self.action_spec.t_bw_max_s
  171. )
  172. if ok_run:
  173. self.state = state
  174. break
  175. else:
  176. raise RuntimeError("无法生成可行初始状态")
  177. self.current_step = 0
  178. self.tmp_over_limit_flag = False
  179. self.last_action = None
  180. self.max_TMP_during_filtration = self.state.TMP
  181. return self.get_obs(self.state), {}
  182. def _get_state_copy(self):
  183. return copy.deepcopy(self.state)
  184. def get_obs(self, state):
  185. """
  186. 构建当前环境归一化状态向量
  187. """
  188. # === 1. 从 state 读取动态参数 ===
  189. TMP = state.TMP
  190. q_UF = state.q_UF
  191. temp = state.temp
  192. # === 2. 计算本周期初始膜阻力 ===
  193. R = state.R
  194. # === 3. 从 self.state 读取膜阻力增长模型参数 ===
  195. nuk = state.nuK
  196. slope = state.slope
  197. power = state.power
  198. ceb_removal = state.ceb_removal
  199. # === 4. 从 current_params 动态读取上下限 ===
  200. TMP0_min, TMP0_max = self.state_bounds.TMP0_min, self.state_bounds.global_TMP_hard_limit
  201. q_UF_min, q_UF_max = self.state_bounds.q_UF_min, self.state_bounds.q_UF_max
  202. temp_min, temp_max = self.state_bounds.temp_min, self.state_bounds.temp_max
  203. nuK_min, nuK_max = self.state_bounds.nuK_min, self.state_bounds.nuK_max
  204. slope_min, slope_max = self.state_bounds.slope_min, self.state_bounds.slope_max
  205. power_min, power_max = self.state_bounds.power_min, self.state_bounds.power_max
  206. ceb_min, ceb_max = self.state_bounds.ceb_removal_min, self.state_bounds.ceb_removal_max
  207. # === 5. 归一化计算(clip防止越界) ===
  208. TMP0_norm = np.clip((TMP - TMP0_min) / (TMP0_max - TMP0_min), 0, 1)
  209. q_UF_norm = np.clip((q_UF - q_UF_min) / (q_UF_max - q_UF_min), 0, 1)
  210. temp_norm = np.clip((temp - temp_min) / (temp_max - temp_min), 0, 1)
  211. # R0 不在 current_params 中定义上下限,设定经验范围
  212. R0_norm = np.clip((R - 100.0) / (800.0 - 100.0), 0, 1)
  213. short_term_norm = np.clip((nuk - nuK_min) / (nuK_max - nuK_min), 0, 1)
  214. long_term_slope_norm = np.clip((slope - slope_min) / (slope_max - slope_min), 0, 1)
  215. long_term_power_norm = np.clip((power - power_min) / (power_max - power_min), 0, 1)
  216. ceb_removal_norm = np.clip((ceb_removal - ceb_min) / (ceb_max - ceb_min), 0, 1)
  217. # === 6. 构建观测向量 ===
  218. obs = np.array([
  219. TMP0_norm,
  220. q_UF_norm,
  221. temp_norm,
  222. R0_norm,
  223. short_term_norm,
  224. long_term_slope_norm,
  225. long_term_power_norm,
  226. ceb_removal_norm
  227. ], dtype=np.float32)
  228. return obs
  229. def get_action_values(self, action):
  230. """
  231. 将动作还原为实际时长
  232. """
  233. L_idx = action // self.num_bw
  234. t_bw_idx = action % self.num_bw
  235. return self.L_values[L_idx], self.t_bw_values[t_bw_idx]
  236. def step(self, action):
  237. self.current_step += 1
  238. L_s, t_bw_s = self.get_action_values(action)
  239. L_s = np.clip(L_s, self.action_spec.L_min_s, self.action_spec.L_max_s)
  240. t_bw_s = np.clip(t_bw_s, self.action_spec.t_bw_min_s, self.action_spec.t_bw_max_s)
  241. # 模拟超级周期
  242. info, next_state = self.physics.simulate_one_supercycle(state=self.state,L_s=L_s, t_bw_s=t_bw_s)
  243. # 根据 info 判断是否成功
  244. feasible = self.physics.is_dead_cycle(info) # True 表示成功循环,False 表示失败
  245. if info["max_TMP_during_filtration"] >= self.reward_params.global_TMP_hard_limit:
  246. self.tmp_over_limit_flag = True
  247. # ================== 孤立观察下一周期 ==================
  248. info_next = None
  249. if info["max_TMP_during_filtration"] > self.reward_params.global_TMP_soft_limit:
  250. info_next, _ = self.physics.simulate_one_supercycle(state=next_state,L_s=L_s,t_bw_s=t_bw_s)
  251. reward,tmp_penalty,econ_reward,res_penalty= self._calculate_reward(info, info_next)
  252. info["tmp_penalty"] = tmp_penalty
  253. info["econ_reward"] = econ_reward
  254. info["res_penalty"] = res_penalty
  255. self.state = next_state
  256. terminated = False
  257. # 判断是否到达最大步数
  258. truncated = self.current_step >= self.max_episode_steps
  259. self.last_action = (L_s, t_bw_s)
  260. next_obs = self.get_obs(next_state)
  261. info["feasible"] = feasible
  262. info["step"] = self.current_step
  263. info["L_s"] = L_s.copy()
  264. info["t_bw_s"] = t_bw_s.copy()
  265. # # ===================== 测试终末奖励:鼓励 TMP 接近初始状态 =====================
  266. # # 仅在 episode 自然结束(满步但未提前失败)时触发
  267. # if truncated and not terminated:
  268. # TMP_initial = self.TMP0 # reset 时记录的初始 TMP
  269. # TMP_final = next_obs[0] # next_obs 提供的最终 TMP
  270. #
  271. # delta_ratio = abs((TMP_final - TMP_initial) / TMP_initial)
  272. #
  273. # alpha = 4.0 # TMP 偏差敏感度
  274. # gamma = 5.0 # 奖励幅度
  275. # stability_reward = gamma * (np.exp(-alpha * delta_ratio) - 1) # 量级在0到-5之间
  276. #
  277. # reward += stability_reward
  278. # terminated = True # episode 正式结束
  279. # # ===================== 测试结果 =====================
  280. # 增加该奖励后强化学习依然能保证奖励收敛,但是损失函数在2-3之间反复震荡,无法降低,见reward_test&loss_test
  281. # 原设想是只能听在大额偏移发生前能通过该奖励学习到提前减小偏移步伐,但是实际训练时该惩罚反复被触发
  282. # 推测是终末的大额奖惩无法有效传递回过往时间步引导智能体学习,可能由于状态中缺少预测值,智能体会将其观测为不可控事件,暂时不添加该奖励,TODO:等待优化
  283. return next_obs, reward, terminated, truncated, info
  284. def _calculate_reward(self, info: dict, info_next=None):
  285. """
  286. 计算强化学习奖励函数(经济性 + 系统稳定性)
  287. 奖励结构:
  288. Reward = 经济奖励 + 污染控制奖励 + TMP风险惩罚
  289. 经济奖励:
  290. 基于吨水电耗 + 吨水药耗
  291. 稳定性奖励:
  292. - 残余污染控制
  293. - TMP软限制
  294. - TMP增长趋势
  295. 返回:
  296. total_reward,
  297. tmp_penalty,
  298. econ_reward,
  299. res_penalty
  300. """
  301. # ==============================
  302. # TMP 状态惩罚
  303. # ==============================
  304. tmp = info["max_TMP_during_filtration"]
  305. tmp_soft = self.reward_params.global_TMP_soft_limit
  306. tmp_hard = self.reward_params.global_TMP_hard_limit
  307. if self.tmp_over_limit_flag:
  308. tmp_state_penalty = -self.reward_params.w_tmp_hard
  309. elif tmp <= tmp_soft:
  310. tmp_state_penalty = 0.0
  311. elif tmp < tmp_hard:
  312. x = (tmp - tmp_soft) / (tmp_hard - tmp_soft)
  313. tmp_state_penalty = -self.reward_params.w_tmp * (
  314. x ** self.reward_params.p
  315. )
  316. else:
  317. tmp_state_penalty = -self.reward_params.w_tmp_hard
  318. # ==============================
  319. # TMP 趋势惩罚
  320. # ==============================
  321. tmp_trend_penalty = 0.0
  322. if info_next is not None:
  323. delta_tmp = (
  324. info_next["max_TMP_during_filtration"] - tmp
  325. )
  326. # 只惩罚TMP上升
  327. delta_tmp = max(delta_tmp, 0)
  328. tmp_trend_penalty = -self.reward_params.w_trend * delta_tmp
  329. tmp_penalty = tmp_state_penalty + tmp_trend_penalty
  330. # ==============================
  331. # 残余污染惩罚
  332. # ==============================
  333. residual_ratio = info["residual_ratio"]
  334. ref_residual = 1 / self.max_episode_steps
  335. res_penalty = -np.tanh(
  336. self.reward_params.k_res *
  337. (residual_ratio / ref_residual - 1)
  338. )
  339. # ==============================
  340. # 经济成本(电耗 + 药耗)
  341. # ==============================
  342. energy = info["ton_water_energy"]
  343. chemical = info["ton_water_chem"]
  344. alpha = self.reward_params.alpha_chemical
  345. cost = energy + alpha * chemical
  346. # 成本归一化范围
  347. cost_low = self.reward_params.cost_low
  348. cost_high = self.reward_params.cost_high
  349. cost_norm = (
  350. (cost - cost_low) /
  351. (cost_high - cost_low)
  352. )
  353. econ_reward = -np.tanh(
  354. self.reward_params.k_cost *
  355. (cost_norm - 0.5)
  356. )
  357. # ==============================
  358. # 总奖励
  359. # ==============================
  360. total_reward = (
  361. econ_reward
  362. + res_penalty
  363. + tmp_penalty
  364. )
  365. return (
  366. total_reward,
  367. tmp_penalty,
  368. econ_reward,
  369. res_penalty
  370. )