|
|
@@ -303,12 +303,19 @@ class UFSuperCycleEnv(gym.Env):
|
|
|
|
|
|
if feasible:
|
|
|
# 每步奖励
|
|
|
- reward = self._calculate_reward(info)
|
|
|
+ reward, rec_reward, energy_reward, res_penalty = self._calculate_reward(info)
|
|
|
+ info["rec_reward"] = rec_reward
|
|
|
+ info["energy_reward"] = energy_reward
|
|
|
+ info["res_penalty"] = res_penalty
|
|
|
+
|
|
|
self.state = next_state
|
|
|
terminated = False
|
|
|
else:
|
|
|
# 中途失败惩罚
|
|
|
reward = -10
|
|
|
+ info["rec_reward"] = None
|
|
|
+ info["energy_reward"] = None
|
|
|
+ info["res_penalty"] = None
|
|
|
terminated = True
|
|
|
|
|
|
# 判断是否到达最大步数
|
|
|
@@ -319,6 +326,8 @@ class UFSuperCycleEnv(gym.Env):
|
|
|
|
|
|
info["feasible"] = feasible
|
|
|
info["step"] = self.current_step
|
|
|
+ info["L_s"] = L_s.copy()
|
|
|
+ info["t_bw_s"] = t_bw_s.copy()
|
|
|
|
|
|
# # ===================== 测试终末奖励:鼓励 TMP 接近初始状态 =====================
|
|
|
# # 仅在 episode 自然结束(满步但未提前失败)时触发
|
|
|
@@ -344,36 +353,49 @@ class UFSuperCycleEnv(gym.Env):
|
|
|
|
|
|
def _calculate_reward(self, info: dict) -> float:
|
|
|
"""
|
|
|
- 计算强化学习奖励函数
|
|
|
+ 计算强化学习奖励函数(扩展版)
|
|
|
|
|
|
功能:
|
|
|
- - 平衡回收率和残余污染两个目标
|
|
|
+ - 平衡回收率、残余污染和吨水电耗三个目标
|
|
|
- TMP不直接参与奖励计算(通过失败判定间接影响)
|
|
|
- 使用 tanh 函数实现平滑的非线性奖励
|
|
|
|
|
|
参数:
|
|
|
- info (dict): 周期性能指标字典
|
|
|
+ info (dict): 周期性能指标字典,需包含
|
|
|
+ - recovery: 回收率 [0-1]
|
|
|
+            - residual_ratio: 残余污染比例
|
|
|
+              (即 (R_after_ceb - initial_R) / delta_R_allow,
|
|
|
+              由环境预先计算并写入 info)
|
|
|
+ - ton_water_energy_kWh_per_m3: 本周期吨水电耗
|
|
|
|
|
|
返回:
|
|
|
- float: 奖励值(通常在 -2 到 +2 之间)
|
|
|
+        tuple[float, float, float, float]: (总奖励, 回收率奖励, 能耗奖励, 污染惩罚),总奖励通常在 -3 到 +3 之间
|
|
|
|
|
|
设计思想:
|
|
|
- 高回收率 → 水资源利用率高 → 正奖励
|
|
|
- 低残余污染 → 膜长期稳定运行 → 正奖励
|
|
|
- - 两者需要权衡:过短的过滤时间提高回收率但污染去除不彻底;
|
|
|
- 过长的过滤时间污染控制好但回收率下降
|
|
|
+ - 低吨水电耗 → 节能 → 正奖励
|
|
|
+ - 三者需要权衡:过短的过滤时间提高回收率但污染去除不彻底;过长时间污染控制好但回收率下降,过高功率增加耗能
|
|
|
|
|
|
参考点设计:
|
|
|
- - (recovery=0.97, residual_ratio=0.1) → reward ≈ 0(高回收但污染高)
|
|
|
- - (recovery=0.90, residual_ratio=0.0) → reward ≈ 0(低污染但回收率低)
|
|
|
- - (recovery≈0.94, residual_ratio≈0.05) → reward > 0(平衡点)
|
|
|
+ - 残余污染:
|
|
|
+ - 高污染参考点 = 1 / self.max_episode_steps
|
|
|
+ - 平衡点 = 0.5 / self.max_episode_steps
|
|
|
+ - 吨水电耗:
|
|
|
+ - 高点 = 0.1034 kWh/m³
|
|
|
+ - 平衡点 = 0.1011 kWh/m³
|
|
|
+ - 低点 = 0.0993 kWh/m³
|
|
|
+ - 回收率参考点保持原有设计
|
|
|
"""
|
|
|
# ========== 提取性能指标 ==========
|
|
|
recovery = info["recovery"] # 回收率 [0-1]
|
|
|
|
|
|
# 污染比例:实际上升的阻力 / 允许上升的阻力
|
|
|
# 允许上升的阻力值 = 当前阻力值软上限 - 当前阻力
|
|
|
- residual_ratio = (info["R_after_ceb"] - info["initial_R"]) / info["delta_R_allow"]
|
|
|
+        residual_ratio = info["residual_ratio"]
|
|
|
+
|
|
|
+ # 吨水电耗指标
|
|
|
+ energy = info["ton_water_energy_kWh_per_m3"]
|
|
|
|
|
|
# ========== 回收率奖励项 ==========
|
|
|
# 将回收率归一化到 [0, 1] 区间(基于预期范围)
|
|
|
@@ -386,22 +408,39 @@ class UFSuperCycleEnv(gym.Env):
|
|
|
# - k_rec 控制曲线陡峭程度,越大变化越陡
|
|
|
rec_reward = np.clip(np.tanh(self.reward_params.k_rec * (rec_norm - 0.5)), -1, 1)
|
|
|
|
|
|
- # ========== 污染惩罚项 ==========
|
|
|
- # 使用 tanh 函数构建惩罚曲线
|
|
|
- # - residual_ratio < rr0 时,res_penalty > 0(奖励低污染)
|
|
|
- # - residual_ratio > rr0 时,res_penalty < 0(惩罚高污染)
|
|
|
+ # ========== 残余污染惩罚项 ==========
|
|
|
+ # 新参考点:每步允许上升比例 = 1 / max_episode_steps
|
|
|
+ # 平衡点 = 0.5 / max_episode_steps
|
|
|
+ ref_residual = 0.5 / self.max_episode_steps
|
|
|
+
|
|
|
+ # 使用 tanh 构建惩罚曲线
|
|
|
+ # - residual_ratio < 平衡点时,res_penalty > 0(奖励低污染)
|
|
|
+ # - residual_ratio > 平衡点时,res_penalty < 0(惩罚高污染)
|
|
|
# - k_res 控制曲线陡峭程度
|
|
|
- res_penalty = -np.tanh(self.reward_params.k_res * (residual_ratio / self.reward_params.rr0 - 1))
|
|
|
+ res_penalty = -np.tanh(self.reward_params.k_res * (residual_ratio / ref_residual - 1))
|
|
|
+
|
|
|
+ # ========== 吨水电耗奖励项 ==========
|
|
|
+ # 设置高/平衡/低点
|
|
|
+ energy_low = 0.0993
|
|
|
+ energy_high = 0.1034
|
|
|
+
|
|
|
+ # 将能耗归一化到 [0, 1],平衡点对应 energy_norm = 0.5
|
|
|
+ energy_norm = (energy - energy_low) / (energy_high - energy_low)
|
|
|
+
|
|
|
+ # 使用 tanh 构建平滑奖励
|
|
|
+ # - energy_norm < 0.5 时,energy_reward > 0(节能奖励)
|
|
|
+ # - energy_norm > 0.5 时,energy_reward < 0(高能耗惩罚)
|
|
|
+ # - k_energy 控制曲线陡峭程度
|
|
|
+ energy_reward = -np.tanh(self.reward_params.k_energy * (energy_norm - 0.5))
|
|
|
|
|
|
# ========== 组合奖励 ==========
|
|
|
- # 简单线性组合两项(也可以加权)
|
|
|
- total_reward = rec_reward + res_penalty
|
|
|
+ # 简单线性组合三项(为污染项加权)
|
|
|
+ total_reward = rec_reward + 2.0 * res_penalty + energy_reward
|
|
|
+
|
|
|
|
|
|
# 可选:添加平移项使特定点的奖励为零(当前未使用)
|
|
|
# total_reward -= offset
|
|
|
|
|
|
- return total_reward
|
|
|
-
|
|
|
-
|
|
|
+ return total_reward, rec_reward, energy_reward, res_penalty
|
|
|
|
|
|
|