""" 超滤强化学习环境模块 ======================== 本模块定义了超滤系统的强化学习环境,包括: 1. UFParams: 超滤系统参数配置类 2. 膜阻力与跨膜压差转换函数 3. simulate_one_supercycle: 超级周期模拟函数 4. calculate_reward: 奖励函数 5. is_dead_cycle: 失败判定函数 6. UFSuperCycleEnv: Gymnasium环境类 模块设计说明: - 基于 Gymnasium (原OpenAI Gym) 标准接口 - 模拟超滤膜的"超级周期"运行(多次物理反洗 + 一次化学反洗) - 强化学习智能体通过优化过滤时长和反洗时长来最大化回收率并控制污染累积 """ import numpy as np import gymnasium as gym from gymnasium import spaces from env.env_params import UFState, UFStateBounds, UFRewardParams, UFActionSpec from env.uf_physics import UFPhysicsModel from env.env_reset import ResetSampler import copy class UFSuperCycleEnv(gym.Env): """ 超滤系统强化学习环境(Gymnasium标准接口) 功能: - 模拟超滤膜的超级周期运行 - 智能体在每个超级周期选择过滤时长和反洗时长 - 目标:最大化回收率同时控制污染累积 状态空间 (8维,归一化到 [0,1]): 1. TMP0: 初始跨膜压差 2. q_UF: 过滤流量 3. temp: 水温 4. R0: 初始膜阻力 5. nuK: 短期污染系数 6. slope: 长期污染斜率 7. power: 长期污染幂次 8. ceb_removal: CEB去除能力 动作空间 (离散): - 二维离散动作组合:(过滤时长, 反洗时长) - 过滤时长: L_min_s ~ L_max_s,步长 L_step_s - 反洗时长: t_bw_min_s ~ t_bw_max_s,步长 t_bw_step_s - 总动作数 = len(L_values) × len(t_bw_values) 奖励机制: - 基于回收率和残余污染的平衡 - 失败 (TMP超限、回收率过低、污染过快) 时给予大负奖励 (-10) 终止条件: - terminated: 违反运行约束(失败) - truncated: 达到最大步数 (max_episode_steps) """ metadata = {"render_modes": ["human"]} def __init__( self, physics: UFPhysicsModel, reward_params: UFRewardParams, action_spec:UFActionSpec, statebounds:UFStateBounds, real_state_pool, max_episode_steps: int = 45, RANDOM_SEED = 1024 ): """ 超滤强化学习环境 参数: physics(UFPhysicsModel): 超滤物理模型 reward_params(UFRewardParams): 奖励函数参数 max_episode_steps (int): 每个episode的最大步数,默认45 注:每步代表一个超级周期(约2-3天),45步约三个月 """ super(UFSuperCycleEnv, self).__init__() self.RANDOM_SEED = RANDOM_SEED self.physics = physics self.reward_params = reward_params self.max_episode_steps = max_episode_steps self.current_step = 0 # -------- 动作空间 -------- self.action_spec = action_spec self.L_values = np.arange( self.action_spec.L_min_s, self.action_spec.L_max_s , self.action_spec.L_step_s, ) self.t_bw_values = np.arange( self.action_spec.t_bw_min_s, self.action_spec.t_bw_max_s, self.action_spec.t_bw_step_s, ) self.num_L = len(self.L_values) self.num_bw = len(self.t_bw_values) self.action_space = spaces.Discrete(self.num_L * self.num_bw) # -------- 状态空间 -------- self.observation_space = spaces.Box( low=0.0, high=1.0, shape=(8,), dtype=np.float32, ) self.state_bounds = statebounds # 状态边界 self.real_state_pool = real_state_pool self.reset_sampler = ResetSampler( bounds=self.state_bounds, physics=physics, real_state_pool=self.real_state_pool, max_resample_attempts=50, random_state=np.random.RandomState(RANDOM_SEED) ) def _generate_initial_state(self) -> UFState | None: """ 在 UFStateBounds 定义的范围内采样一个【合法】初始状态。 若采样失败(约束不满足)返回 None,由 reset() 负责重试。 """ b = self.state_bounds A = 128 * 40.0 # 有效膜面积 # ---- 1. 基础工况 ---- # ---- 随机生成 TMP、q_UF、温度 ---- TMP0 = np.random.uniform(b.TMP0_min, b.TMP0_max) q_UF = np.random.uniform(b.q_UF_min, b.q_UF_max) temp = np.random.uniform(b.temp_min, b.temp_max) # ---- 2. 污染增长参数 ---- slope = np.random.uniform(b.slope_min, b.slope_max) power = np.random.uniform(b.power_min, b.power_max) # ---- 3. 约束:污染增长速率可实现 ---- t_max = 60 if power >= 1 else 1 required_nuK_min = slope * power * (t_max ** (power - 1)) * (A / q_UF) # 若 required_nuK_min 超过可选范围 → 初始状态非法 if required_nuK_min > b.nuK_max: return None # 在可行范围中采样 nuK nuK = np.random.uniform( max(required_nuK_min, b.nuK_min), b.nuK_max ) # ---- 4. CEB 去除率 ---- ceb_removal = np.random.uniform( b.ceb_removal_min, b.ceb_removal_max ) # ---- 5. 初始膜阻力(物理模型) ---- R0 = self.physics.calculate_initial_resistance( TMP=TMP0, q_UF=q_UF, temp=temp ) return UFState( TMP=TMP0, q_UF=q_UF, temp=temp, R=R0, slope=slope, power=power, nuK=nuK, ceb_removal=ceb_removal, ) def _get_training_progress(self) -> float: """ 返回训练进度,用于 reset_sampler 的 curriculum sampling """ return min(1.0, self.current_step / self.max_episode_steps ) def reset(self, seed=None, options=None, max_attempts: int = 10000): super().reset(seed=seed) progress = self._get_training_progress() for _ in range(max_attempts): state = self.reset_sampler.sample(progress) if state is None: continue ok_run = self.physics.check_dead_initial_state( init_state=state, max_steps=self.max_episode_steps, L_s=self.action_spec.L_min_s, t_bw_s=self.action_spec.t_bw_max_s ) if ok_run: self.state = state break else: raise RuntimeError("无法生成可行初始状态") self.current_step = 0 self.tmp_over_limit_flag = False self.last_action = None self.max_TMP_during_filtration = self.state.TMP return self.get_obs(self.state), {} def _get_state_copy(self): return copy.deepcopy(self.state) def get_obs(self, state): """ 构建当前环境归一化状态向量 """ # === 1. 从 state 读取动态参数 === TMP = state.TMP q_UF = state.q_UF temp = state.temp # === 2. 计算本周期初始膜阻力 === R = state.R # === 3. 从 self.state 读取膜阻力增长模型参数 === nuk = state.nuK slope = state.slope power = state.power ceb_removal = state.ceb_removal # === 4. 从 current_params 动态读取上下限 === TMP0_min, TMP0_max = self.state_bounds.TMP0_min, self.state_bounds.global_TMP_hard_limit q_UF_min, q_UF_max = self.state_bounds.q_UF_min, self.state_bounds.q_UF_max temp_min, temp_max = self.state_bounds.temp_min, self.state_bounds.temp_max nuK_min, nuK_max = self.state_bounds.nuK_min, self.state_bounds.nuK_max slope_min, slope_max = self.state_bounds.slope_min, self.state_bounds.slope_max power_min, power_max = self.state_bounds.power_min, self.state_bounds.power_max ceb_min, ceb_max = self.state_bounds.ceb_removal_min, self.state_bounds.ceb_removal_max # === 5. 归一化计算(clip防止越界) === TMP0_norm = np.clip((TMP - TMP0_min) / (TMP0_max - TMP0_min), 0, 1) q_UF_norm = np.clip((q_UF - q_UF_min) / (q_UF_max - q_UF_min), 0, 1) temp_norm = np.clip((temp - temp_min) / (temp_max - temp_min), 0, 1) # R0 不在 current_params 中定义上下限,设定经验范围 R0_norm = np.clip((R - 100.0) / (800.0 - 100.0), 0, 1) short_term_norm = np.clip((nuk - nuK_min) / (nuK_max - nuK_min), 0, 1) long_term_slope_norm = np.clip((slope - slope_min) / (slope_max - slope_min), 0, 1) long_term_power_norm = np.clip((power - power_min) / (power_max - power_min), 0, 1) ceb_removal_norm = np.clip((ceb_removal - ceb_min) / (ceb_max - ceb_min), 0, 1) # === 6. 构建观测向量 === obs = np.array([ TMP0_norm, q_UF_norm, temp_norm, R0_norm, short_term_norm, long_term_slope_norm, long_term_power_norm, ceb_removal_norm ], dtype=np.float32) return obs def get_action_values(self, action): """ 将动作还原为实际时长 """ L_idx = action // self.num_bw t_bw_idx = action % self.num_bw return self.L_values[L_idx], self.t_bw_values[t_bw_idx] def step(self, action): self.current_step += 1 L_s, t_bw_s = self.get_action_values(action) L_s = np.clip(L_s, self.action_spec.L_min_s, self.action_spec.L_max_s) t_bw_s = np.clip(t_bw_s, self.action_spec.t_bw_min_s, self.action_spec.t_bw_max_s) # 模拟超级周期 info, next_state = self.physics.simulate_one_supercycle(state=self.state,L_s=L_s, t_bw_s=t_bw_s) # 根据 info 判断是否成功 feasible = self.physics.is_dead_cycle(info) # True 表示成功循环,False 表示失败 if info["max_TMP_during_filtration"] >= self.reward_params.global_TMP_hard_limit: self.tmp_over_limit_flag = True # ================== 孤立观察下一周期 ================== info_next = None if info["max_TMP_during_filtration"] > self.reward_params.global_TMP_soft_limit: info_next, _ = self.physics.simulate_one_supercycle(state=next_state,L_s=L_s,t_bw_s=t_bw_s) reward,tmp_penalty,econ_reward,res_penalty= self._calculate_reward(info, info_next) info["tmp_penalty"] = tmp_penalty info["econ_reward"] = econ_reward info["res_penalty"] = res_penalty self.state = next_state terminated = False # 判断是否到达最大步数 truncated = self.current_step >= self.max_episode_steps self.last_action = (L_s, t_bw_s) next_obs = self.get_obs(next_state) info["feasible"] = feasible info["step"] = self.current_step info["L_s"] = L_s.copy() info["t_bw_s"] = t_bw_s.copy() # # ===================== 测试终末奖励:鼓励 TMP 接近初始状态 ===================== # # 仅在 episode 自然结束(满步但未提前失败)时触发 # if truncated and not terminated: # TMP_initial = self.TMP0 # reset 时记录的初始 TMP # TMP_final = next_obs[0] # next_obs 提供的最终 TMP # # delta_ratio = abs((TMP_final - TMP_initial) / TMP_initial) # # alpha = 4.0 # TMP 偏差敏感度 # gamma = 5.0 # 奖励幅度 # stability_reward = gamma * (np.exp(-alpha * delta_ratio) - 1) # 量级在0到-5之间 # # reward += stability_reward # terminated = True # episode 正式结束 # # ===================== 测试结果 ===================== # 增加该奖励后强化学习依然能保证奖励收敛,但是损失函数在2-3之间反复震荡,无法降低,见reward_test&loss_test # 原设想是只能听在大额偏移发生前能通过该奖励学习到提前减小偏移步伐,但是实际训练时该惩罚反复被触发 # 推测是终末的大额奖惩无法有效传递回过往时间步引导智能体学习,可能由于状态中缺少预测值,智能体会将其观测为不可控事件,暂时不添加该奖励,TODO:等待优化 return next_obs, reward, terminated, truncated, info def _calculate_reward(self, info: dict, info_next=None): """ 计算强化学习奖励函数(经济性 + 系统稳定性) 奖励结构: Reward = 经济奖励 + 污染控制奖励 + TMP风险惩罚 经济奖励: 基于吨水电耗 + 吨水药耗 稳定性奖励: - 残余污染控制 - TMP软限制 - TMP增长趋势 返回: total_reward, tmp_penalty, econ_reward, res_penalty """ # ============================== # TMP 状态惩罚 # ============================== tmp = info["max_TMP_during_filtration"] tmp_soft = self.reward_params.global_TMP_soft_limit tmp_hard = self.reward_params.global_TMP_hard_limit if self.tmp_over_limit_flag: tmp_state_penalty = -self.reward_params.w_tmp_hard elif tmp <= tmp_soft: tmp_state_penalty = 0.0 elif tmp < tmp_hard: x = (tmp - tmp_soft) / (tmp_hard - tmp_soft) tmp_state_penalty = -self.reward_params.w_tmp * ( x ** self.reward_params.p ) else: tmp_state_penalty = -self.reward_params.w_tmp_hard # ============================== # TMP 趋势惩罚 # ============================== tmp_trend_penalty = 0.0 if info_next is not None: delta_tmp = ( info_next["max_TMP_during_filtration"] - tmp ) # 只惩罚TMP上升 delta_tmp = max(delta_tmp, 0) tmp_trend_penalty = -self.reward_params.w_trend * delta_tmp tmp_penalty = tmp_state_penalty + tmp_trend_penalty # ============================== # 残余污染惩罚 # ============================== residual_ratio = info["residual_ratio"] ref_residual = 1 / self.max_episode_steps res_penalty = -np.tanh( self.reward_params.k_res * (residual_ratio / ref_residual - 1) ) # ============================== # 经济成本(电耗 + 药耗) # ============================== energy = info["ton_water_energy"] chemical = info["ton_water_chem"] chemical_price = self.reward_params.chemical_price energy_price =self.reward_params.energy_price cost = energy * energy_price + chemical * chemical_price * 100 # 成本归一化范围 cost_low = self.reward_params.cost_low cost_high = self.reward_params.cost_high cost_norm = ( (cost - cost_low) / (cost_high - cost_low) ) econ_reward = -np.tanh( self.reward_params.k_cost * (cost_norm - 0.5) ) # ============================== # 总奖励 # ============================== total_reward = ( econ_reward + res_penalty + tmp_penalty ) return ( total_reward, tmp_penalty, econ_reward, res_penalty )