
feat: test version with real plant data
- optimized rendering

junc_WHU, 3 months ago
Parent commit: eba5aa9521

+ 26 - 9
models/uf-rl/uf_train/env/env_params.py

@@ -281,30 +281,47 @@ class UFActionSpec:
 @dataclass(frozen=True)
 class UFRewardParams:
     """
-    [Reward function and safety-constraint parameters]
-    Used for reward computation and episode-termination checks.
+    [Reward function and safety-constraint parameters] Used for reward computation and episode-termination checks.
     """
 
+    # ------------------ TMP limits ------------------
     global_TMP_hard_limit: float = 0.08
     # TMP hard upper limit (MPa)
-    # Note: exceeding this value fails the episode; immediate shutdown required
+    # Note: exceeding this value fails the episode and requires an immediate shutdown
 
     global_TMP_soft_limit: float = 0.06
     # TMP soft upper limit (MPa)
-    # Note: this limit guides the allowed rise in membrane resistance within the reward function; the closer the system gets to it, the more strictly resistance growth is controlled
+    # Note: this limit guides the allowed rise in membrane resistance within the reward function;
+    # the closer the system gets to it, the more strictly resistance growth is controlled
 
+    # ------------------ Recovery-rate reward parameters ------------------
     k_rec: float = 5.0
     # Recovery-rate sensitivity coefficient (controls the steepness of the recovery reward)
 
-    k_res: float = 10.0
-    # Residual-fouling sensitivity coefficient (controls the steepness of the fouling penalty)
-
     rec_low: float = 0.92
     rec_high: float = 0.99
-    # Normal recovery-rate range
+    # Normal recovery-rate range, used to normalize the reward
+
+    # ------------------ Residual-fouling reward parameters ------------------
+    k_res: float = 10.0
+    # Residual-fouling sensitivity coefficient (controls the steepness of the fouling penalty)
 
     rr0: float = 0.08
-    # Residual-fouling ratio reference value
+    # Residual-fouling ratio reference value (the original design's reference point)
+    # In the new design, the allowed per-step rise is set by 1/max_episode_steps, which can override this value
+
+    # ------------------ Energy-per-ton reward parameters ------------------
+    k_energy: float = 5.0
+    # Energy-per-ton sensitivity coefficient (controls the steepness of the energy reward/penalty)
+
+    energy_low: float = 0.0993
+    energy_high: float = 0.1034
+    energy_ref: float = 0.1011
+    # Energy-consumption reference values
+    # energy_low: low consumption, higher reward
+    # energy_high: high consumption, negative reward
+    # energy_ref: balance point, corresponding to a reward of 0
+
 
 @dataclass(frozen=True)
 class UFStateBounds:
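
A quick standalone sketch of how these parameters shape the tanh reward terms. The constants are copied from UFRewardParams above; the script itself is only illustrative and not part of the commit:

import numpy as np

# Constants copied from UFRewardParams above
k_rec, rec_low, rec_high = 5.0, 0.92, 0.99
k_energy, energy_low, energy_high = 5.0, 0.0993, 0.1034

def rec_reward(recovery: float) -> float:
    # Normalize recovery over its expected range; the tanh is centered at the midpoint
    rec_norm = (recovery - rec_low) / (rec_high - rec_low)
    return float(np.clip(np.tanh(k_rec * (rec_norm - 0.5)), -1, 1))

def energy_reward(energy: float) -> float:
    # Energy below the midpoint of [energy_low, energy_high] earns a positive reward
    energy_norm = (energy - energy_low) / (energy_high - energy_low)
    return float(-np.tanh(k_energy * (energy_norm - 0.5)))

print(rec_reward(0.955))      # 0.0 at the midpoint of [0.92, 0.99]
print(rec_reward(0.99))       # ≈ +0.987 at the top of the range
print(energy_reward(0.1034))  # ≈ -0.987 at the high-consumption end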

+ 15 - 1
models/uf-rl/uf_train/env/env_visual.py

@@ -2,6 +2,7 @@ import numpy as np
 from stable_baselines3.common.callbacks import BaseCallback
 
 
+
 class UFEpisodeRecorder:
     """记录episode中的决策和结果"""
 
@@ -69,10 +70,23 @@ class UFTrainingCallback(BaseCallback):
                 step_reward = rewards[0] if rewards is not None else 0.0
                 step_done = dones[0] if dones is not None else False
                 step_info = infos[0] if infos is not None else {}
+                L_s = step_info["L_s"]
+                t_bw_s = step_info["t_bw_s"]
+                initial_tmp = step_info["initial_tmp"]
+                max_TMP_during_filtration = step_info["max_TMP_during_filtration"]
+                tmp_after_ceb = step_info["tmp_after_ceb"]
+                residual_ratio = step_info["residual_ratio"]
+                rec_reward = step_info["rec_reward"]
+                energy_reward = step_info["energy_reward"]
+                recovery = step_info["recovery"]
+                res_penalty = step_info["res_penalty"]
 
                 # Print the current step's information
                 if self.verbose:
-                    print(f"[Step {self.num_timesteps}] action={step_action}, reward={step_reward:.3f}, Done={step_done}")
+                    print(f"[Step {self.num_timesteps}] action={step_action}, reward={step_reward:.3f}, Done={step_done}, L_s={L_s}, t_bw_s={t_bw_s}, "
+                          f"residual_ratio={residual_ratio:.4f}, recovery={recovery:.4f}, "
+                          f"rec_reward={rec_reward:.4f}, energy_reward={energy_reward:.4f}, res_penalty={res_penalty:.4f}, "
+                          f"initial_tmp={initial_tmp:.4f}, max_TMP_during_filtration={max_TMP_during_filtration:.4f}, tmp_after_ceb={tmp_after_ceb:.4f}")
 
                 # Record the data
                 self.recorder.record_step(

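One caveat on the callback changes above: when a step is infeasible, uf_env.py (next file) stores None for rec_reward, energy_reward, and res_penalty, so the :.4f format specifiers would raise a TypeError on that step. A minimal defensive sketch, assuming the same info keys; the helper is ours, not part of the commit:

def get_metric(step_info: dict, key: str, default: float = 0.0) -> float:
    # Fall back to a printable float when the env recorded None (infeasible step)
    value = step_info.get(key)
    return default if value is None else float(value)

rec_reward = get_metric(step_info, "rec_reward")
energy_reward = get_metric(step_info, "energy_reward")
res_penalty = get_metric(step_info, "res_penalty")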
+ 60 - 21
models/uf-rl/uf_train/env/uf_env.py

@@ -303,12 +303,19 @@ class UFSuperCycleEnv(gym.Env):
 
         if feasible:
             # Per-step reward
-            reward = self._calculate_reward(info)
+            reward, rec_reward, energy_reward, res_penalty = self._calculate_reward(info)
+            info["rec_reward"] = rec_reward
+            info["energy_reward"] = energy_reward
+            info["res_penalty"] = res_penalty
+
             self.state = next_state
             terminated = False
         else:
             # Penalty for mid-episode failure
             reward = -10
+            info["rec_reward"] = None
+            info["energy_reward"] = None
+            info["res_penalty"] = None
             terminated = True
 
         # Check whether the maximum step count has been reached
@@ -319,6 +326,8 @@ class UFSuperCycleEnv(gym.Env):
 
         info["feasible"] = feasible
         info["step"] = self.current_step
+        info["L_s"] = L_s.copy()
+        info["t_bw_s"] = t_bw_s.copy()
 
         # # ===================== Test terminal reward: encourage TMP to return toward its initial value =====================
         # # Triggered only when the episode ends naturally (full step count without early failure)
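For reference, the terminal reward hinted at in the commented block above could look like the sketch below. It is purely illustrative: the function, its name, and the linear falloff are our assumptions, not the commit's:

def terminal_tmp_bonus(tmp_now: float, tmp_initial: float,
                       soft_limit: float = 0.06, scale: float = 1.0) -> float:
    """Illustrative terminal bonus: largest when TMP returns to its initial value."""
    gap = abs(tmp_now - tmp_initial)
    return scale * max(0.0, 1.0 - gap / soft_limit)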
@@ -344,36 +353,49 @@ class UFSuperCycleEnv(gym.Env):
 
     def _calculate_reward(self, info: dict) -> float:
         """
-        Compute the reinforcement-learning reward
+        Compute the reinforcement-learning reward (extended version)
 
         Features:
-        - Balances two objectives: recovery rate and residual fouling
+        - Balances three objectives: recovery rate, residual fouling, and energy per ton of water
         - TMP does not enter the reward directly (it acts only through the failure check)
         - Uses tanh for smooth, nonlinear rewards
 
         Args:
-            info (dict): dictionary of cycle performance metrics
+            info (dict): dictionary of cycle performance metrics; must contain
+                - recovery: recovery rate [0-1]
+                - R_after_ceb: membrane resistance at the end of this cycle
+                - initial_R: membrane resistance at the start of this cycle
+                - delta_R_allow: maximum allowed resistance rise for this cycle
+                - ton_water_energy_kWh_per_m3: energy per ton of water for this cycle
 
         Returns:
-            float: reward value (typically between -2 and +2)
+            tuple: (total_reward, rec_reward, energy_reward, res_penalty);
+                total_reward typically falls between -3 and +3
 
         Design rationale:
         - High recovery rate → efficient water use → positive reward
         - Low residual fouling → stable long-term membrane operation → positive reward
-        - The two must be traded off: a short filtration time raises recovery but removes fouling poorly;
-                                      a long filtration time controls fouling well but lowers recovery
+        - Low energy per ton → energy savings → positive reward
+        - The three must be traded off: a short filtration time raises recovery but removes fouling poorly;
+          a long filtration time controls fouling well but lowers recovery; higher power raises energy use
 
         Reference-point design:
-        - (recovery=0.97, residual_ratio=0.1) → reward ≈ 0 (high recovery but high fouling)
-        - (recovery=0.90, residual_ratio=0.0) → reward ≈ 0 (low fouling but low recovery)
-        - (recovery≈0.94, residual_ratio≈0.05) → reward > 0 (balance point)
+        - Residual fouling:
+            - high-fouling reference point = 1 / self.max_episode_steps
+            - balance point = 0.5 / self.max_episode_steps
+        - Energy per ton:
+            - high point = 0.1034 kWh/m³
+            - balance point = 0.1011 kWh/m³
+            - low point = 0.0993 kWh/m³
+        - The recovery-rate reference points keep the original design
         """
         # ========== Extract performance metrics ==========
         recovery = info["recovery"]  # recovery rate [0-1]
 
         # Fouling ratio: actual resistance rise / allowed resistance rise
         # Allowed resistance rise = soft upper limit on resistance - current resistance
-        residual_ratio = (info["R_after_ceb"] - info["initial_R"]) / info["delta_R_allow"]
+        residual_ratio = info["residual_ratio"]
+
+        # Energy-per-ton metric
+        energy = info["ton_water_energy_kWh_per_m3"]
 
         # ========== Recovery-rate reward term ==========
         # Normalize the recovery rate to the [0, 1] interval (based on its expected range)
@@ -386,22 +408,39 @@ class UFSuperCycleEnv(gym.Env):
         # - k_rec controls the curve steepness; larger values make it steeper
         rec_reward = np.clip(np.tanh(self.reward_params.k_rec * (rec_norm - 0.5)), -1, 1)
 
-        # ========== Fouling penalty term ==========
-        # Build the penalty curve with tanh
-        # - residual_ratio < rr0 → res_penalty > 0 (rewards low fouling)
-        # - residual_ratio > rr0 → res_penalty < 0 (penalizes high fouling)
+        # ========== Residual-fouling penalty term ==========
+        # New reference point: allowed per-step rise = 1 / max_episode_steps
+        # Balance point = 0.5 / max_episode_steps
+        ref_residual = 0.5 / self.max_episode_steps
+
+        # Build the penalty curve with tanh
+        # - residual_ratio < balance point → res_penalty > 0 (rewards low fouling)
+        # - residual_ratio > balance point → res_penalty < 0 (penalizes high fouling)
         # - k_res controls the curve steepness
-        res_penalty = -np.tanh(self.reward_params.k_res * (residual_ratio / self.reward_params.rr0 - 1))
+        res_penalty = -np.tanh(self.reward_params.k_res * (residual_ratio / ref_residual - 1))
+
+        # ========== Energy-per-ton reward term ==========
+        # High / balance / low points (values mirror UFRewardParams.energy_low / energy_high)
+        energy_low = self.reward_params.energy_low
+        energy_high = self.reward_params.energy_high
+
+        # Normalize energy to [0, 1]; the midpoint of the range maps to energy_norm = 0.5
+        energy_norm = (energy - energy_low) / (energy_high - energy_low)
+
+        # Build a smooth reward with tanh
+        # - energy_norm < 0.5 → energy_reward > 0 (energy-saving reward)
+        # - energy_norm > 0.5 → energy_reward < 0 (high-consumption penalty)
+        # - k_energy controls the curve steepness
+        energy_reward = -np.tanh(self.reward_params.k_energy * (energy_norm - 0.5))
 
         # ========== Combine the reward terms ==========
-        # Simple linear combination of the two terms (weighting optional)
-        total_reward = rec_reward + res_penalty
+        # Simple linear combination of the three terms (the fouling term is weighted)
+        total_reward = rec_reward + 2.0 * res_penalty + energy_reward
+
 
         # Optional: add an offset so a chosen point has zero reward (currently unused)
         # total_reward -= offset
 
-        return total_reward
-
-
+        return total_reward, rec_reward, energy_reward, res_penalty
 
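As a sanity check on the new reference points, here are the three terms evaluated at one hypothetical operating point. max_episode_steps = 45 is an assumption (matching the new check_dead_initial_state default in uf_physics.py below); the other constants come from the code above:

import numpy as np

max_episode_steps = 45  # assumed; see check_dead_initial_state below
k_rec, k_res, k_energy = 5.0, 10.0, 5.0

recovery = 0.96
residual_ratio = 0.5 / max_episode_steps  # exactly at the balance point
energy = (0.0993 + 0.1034) / 2            # midpoint of the range, 0.10135 kWh/m³

rec_norm = (recovery - 0.92) / (0.99 - 0.92)
rec_reward = np.clip(np.tanh(k_rec * (rec_norm - 0.5)), -1, 1)       # ≈ +0.34

ref_residual = 0.5 / max_episode_steps
res_penalty = -np.tanh(k_res * (residual_ratio / ref_residual - 1))  # 0.0

energy_norm = (energy - 0.0993) / (0.1034 - 0.0993)                  # 0.5
energy_reward = -np.tanh(k_energy * (energy_norm - 0.5))             # 0.0

total_reward = rec_reward + 2.0 * res_penalty + energy_reward        # ≈ +0.34
print(round(float(total_reward), 4))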
 

+ 6 - 4
models/uf-rl/uf_train/env/uf_physics.py

@@ -355,6 +355,7 @@ class UFPhysicsModel:
             self.resistance_from_tmp(max_tmp_during_filtration, state.q_UF, state.temp),
             1e-6
         )
+        residual_ratio = (R_after_ceb - initial_R) / delta_R_allow
 
         # ========== Build the performance-metrics dictionary ==========
         info = {
@@ -384,6 +385,7 @@ class UFPhysicsModel:
             "R_after_ceb": R_after_ceb,  # CEB后膜阻力
             "max_residual_increase_per_run": max_residual_increase,  # 最大残余污染增量
             "delta_R_allow": delta_R_allow,  # 污染允许增长空间
+            "residual_ratio" : residual_ratio, # 污染上升比例
 
             # Energy metrics
             "ton_water_energy_kWh_per_m3": ton_water_energy,  # energy per ton of water
@@ -394,12 +396,12 @@ class UFPhysicsModel:
         next_state.TMP = tmp_after_ceb
         next_state.R = R_after_ceb
 
-        # ========== Optionally updated parameters (currently kept unchanged) ==========
+        # ========== Optionally updated parameters ==========
         # These can be adjusted dynamically to match real conditions; kept as an extension hook
+        next_state.nuK = state.nuK  # short-term fouling coefficient
         next_state.slope = state.slope  # long-term fouling slope
         next_state.power = state.power  # long-term fouling exponent
         next_state.ceb_removal = state.ceb_removal  # CEB removal capacity
-        next_state.nuK = state.nuK  # short-term fouling coefficient
         next_state.q_UF = state.q_UF  # filtration flow rate
         next_state.temp = state.temp  # water temperature
 
@@ -452,7 +454,7 @@ class UFPhysicsModel:
 
         # Condition 3: the fouling growth ratio exceeds the allowed range
         residual_increase = (R_after_ceb - R0) / delta_R_allow
-        if residual_increase > 1 / 45:
+        if residual_increase > 1 / 30:
             return False  # fail
 
         # All conditions passed
@@ -461,7 +463,7 @@ class UFPhysicsModel:
     def check_dead_initial_state(
             self,
             init_state: UFState,
-            max_steps: int = 15,
+            max_steps: int = 45,
             L_s: float = 3800.0,
             t_bw_s: float = 60.0
     ) -> bool:

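Condition 3 above caps the per-cycle fouling rise at 1/30 of the allowed headroom (loosened from 1/45). A standalone restatement of that check, with a function name of our choosing:

def fouling_growth_ok(R_after_ceb: float, R0: float, delta_R_allow: float) -> bool:
    """Condition 3: the per-cycle fouling rise must stay within 1/30 of the headroom."""
    residual_increase = (R_after_ceb - R0) / delta_R_allow
    return residual_increase <= 1 / 30

Note that 45 worst-case cycles at the 1/30 cap would consume 1.5× the total headroom, so it is the reward's per-step balance point of 0.5 / max_episode_steps that actually keeps a full episode within budget.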
+ 20 - 4
models/uf-rl/uf_train/rl_model/DQN/dqn_trainer.py

@@ -14,7 +14,7 @@ class DQNTrainer:
     - Evaluate the policy on the test-set environment
     """
 
-    def __init__(self, env, params, callback=None):
+    def __init__(self, env, params, callback=None, PROJECT_ROOT=None):
         """
         Initialize the trainer
 
@@ -26,25 +26,41 @@ class DQNTrainer:
         self.env = env
         self.params = params
         self.callback = callback
+        self.PROJECT_ROOT = PROJECT_ROOT
         self.log_dir = self._create_log_dir()  # create the TensorBoard log directory
         self.model = self._create_model()      # create the DQN model
 
     # ------------------- Private methods -------------------
     def _create_log_dir(self):
         """
-        Create the TensorBoard log directory
+        Create the TensorBoard log directory (fixed under PROJECT_ROOT/model_result/uf_dqn_tensorboard)
         """
+        import os
+        import time
+
+        # 1) Timestamp used to distinguish each training run
         timestamp = time.strftime("%Y%m%d-%H%M%S")
+
+        # 2) Convert float parameters to integers for cleaner names
         lr_int = int(self.params.learning_rate * 1e4)
         gamma_int = int(self.params.gamma * 100)
         exp_int = int(self.params.exploration_fraction * 100)
 
-        log_name = f"DQN_lr{lr_int}_buf{self.params.buffer_size}_bs{self.params.batch_size}_gamma{gamma_int}_exp{exp_int}_{self.params.remark}_{timestamp}"
+        # 3) Build the log-directory name
+        log_name = (
+            f"DQN_lr{lr_int}_buf{self.params.buffer_size}_bs{self.params.batch_size}"
+            f"_gamma{gamma_int}_exp{exp_int}_{self.params.remark}_{timestamp}"
+        )
 
-        base_dir = os.path.join(os.getcwd(), "uf_dqn_tensorboard")
+        # 4) Fixed log location: PROJECT_ROOT/model_result/uf_dqn_tensorboard
+        # Assumes run_dqn_train.py defines PROJECT_ROOT = "models/uf-rl"
+        base_dir = os.path.join(self.PROJECT_ROOT, "model_result", "uf_dqn_tensorboard")
         os.makedirs(base_dir, exist_ok=True)
+
+        # 5) Full path of the log directory
         log_dir = os.path.join(base_dir, log_name)
         os.makedirs(log_dir, exist_ok=True)
+
         return log_dir
 
     def _create_model(self):

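For concreteness, with hypothetical parameters learning_rate=1e-3, buffer_size=100000, batch_size=64, gamma=0.99, exploration_fraction=0.1, remark="baseline", and a timestamp of 20250101-120000, _create_log_dir would create:

models/uf-rl/model_result/uf_dqn_tensorboard/DQN_lr10_buf100000_bs64_gamma99_exp10_baseline_20250101-120000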
+ 2 - 1
models/uf-rl/uf_train/rl_model/DQN/run_dqn_train.py

@@ -161,6 +161,7 @@ def main():
         env=train_env,
         params=dqn_params,
         callback=callback,
+        PROJECT_ROOT=PROJECT_ROOT
     )
 
 
@@ -208,7 +209,7 @@ if __name__ == "__main__":
     # 2. Global configuration
     # ============================================================
     RANDOM_SEED = 2025
-    TOTAL_TIMESTEPS = 1000000
+    TOTAL_TIMESTEPS = 1500000
 
     RESET_STATE_CSV = (
             PROJECT_ROOT