Browse Source

1: Split the UF training and development versions

wmy 5 months ago
parent
commit
f9163094cb
36 changed files with 7937 additions and 0 deletions
  1. 188 0   models/uf-rl/超滤开发/DEPLOYMENT_PLAN.md
  2. 264 0   models/uf-rl/超滤开发/DQN_decide.py
  3. 566 0   models/uf-rl/超滤开发/DQN_env.py
  4. 198 0   models/uf-rl/超滤开发/config.json
  5. 23 0    models/uf-rl/超滤开发/device_states.json
  6. BIN     models/uf-rl/超滤开发/dqn_model.zip
  7. 1 0     models/uf-rl/超滤开发/dqn_model/_stable_baselines3_version
  8. 46 0    models/uf-rl/超滤开发/dqn_model/data
  9. BIN     models/uf-rl/超滤开发/dqn_model/policy.optimizer.pth
  10. BIN    models/uf-rl/超滤开发/dqn_model/policy.pth
  11. BIN    models/uf-rl/超滤开发/dqn_model/pytorch_variables.pth
  12. 9 0    models/uf-rl/超滤开发/dqn_model/system_info.txt
  13. 771 0  models/uf-rl/超滤开发/loop_main.py
  14. 0 0    models/uf-rl/超滤开发/monitor_service.log
  15. 120 0  models/uf-rl/超滤开发/plc_test_dry_run.py
  16. 18 0   models/uf-rl/超滤开发/requirements.txt
  17. 73 0   models/uf-rl/超滤开发/save_uf_models.py
  18. 393 0  models/uf-rl/超滤开发/test_callback.py
  19. 120 0  models/uf-rl/超滤开发/test_plc_update.py
  20. BIN    models/uf-rl/超滤开发/uf_bw.pth
  21. BIN    models/uf-rl/超滤开发/uf_dqn_tensorboard/DQN_lr0.0001_buf2000_bs16_gamma0.95_exp0.6_default_20251017-114220/DQN_1/events.out.tfevents.1760672541.MacBook-Pro-2.local.85900.0
  22. BIN    models/uf-rl/超滤开发/uf_fp.pth
  23. 246 0  models/uf-rl/超滤训练源码/DQN_decide.py
  24. 340 0  models/uf-rl/超滤训练源码/DQN_env.py
  25. 244 0  models/uf-rl/超滤训练源码/DQN_train.py
  26. 500 0  models/uf-rl/超滤训练源码/README.md
  27. 1076 0 models/uf-rl/超滤训练源码/UF_RL_架构问题与优化方案.md
  28. 1082 0 models/uf-rl/超滤训练源码/UF_RL_训练与预测流程详解.md
  29. 1160 0 models/uf-rl/超滤训练源码/UF_RL_详细技术文档.md
  30. 405 0  models/uf-rl/超滤训练源码/UF_decide.py
  31. 33 0   models/uf-rl/超滤训练源码/UF_models.py
  32. BIN    models/uf-rl/超滤训练源码/resistance_model_bw.pth
  33. BIN    models/uf-rl/超滤训练源码/resistance_model_fp.pth
  34. BIN    models/uf-rl/超滤训练源码/uf_bw.pth
  35. BIN    models/uf-rl/超滤训练源码/uf_fp.pth
  36. 61 0   models/uf-rl/超滤训练源码/uf_resistance_models.py

+ 188 - 0
models/uf-rl/超滤开发/DEPLOYMENT_PLAN.md

@@ -0,0 +1,188 @@
+# UF AI Control System Deployment Plan
+
+## 1. Test Procedure
+### Control two devices (1-2 weeks)
+**Goal**: control real devices and observe the effect.
+
+**Steps**:
+0. Start the program: `nohup python loop_main.py > /dev/null 2>&1 &`
+1. **Pick two devices**: e.g. UF1 and UF2 (keep the others switched off)
+2. **Turn on the model switch** for UF1 and UF2 in the front end
+3. **Check twice a day**: once in the morning, once in the afternoon
+4. **Watch for**:
+   - device alarms
+   - whether the TMP trend is stable (check the history saved in device_states.json)
+   - PLC parameters jumping erratically
+
+**Acceptance criteria**:
+- the decided water-production time stays within 3600-6000 s
+- the decided backwash time stays within 40-60 s
+- no visible anomalies on inspection
+
+**Stop immediately if anything goes wrong**:
+- a device raises an alarm → switch the model off in the front end at once
+- TMP keeps rising or keeps falling → switch off
+- anything feels wrong → switch off
+
+**If there are no problems, keep running**:
+- run for 1-2 weeks
+- once stability is confirmed, relax the inspection frequency
+
+---
+
+## 2. Daily Checks
+
+### Check the logs
+```bash
+# Recent activity
+tail -n 50 monitor_service.log
+
+# Any errors?
+tail -n 200 monitor_service.log | grep ERROR
+
+# Follow in real time
+tail -f monitor_service.log
+```
+
+### Check device state
+```bash
+# Inspect the last 5 TMP records to judge the trend
+cat device_states.json
+
+# Pretty-printed
+cat device_states.json | python -m json.tool
+```
+
+**Judging the TMP trend** (a code sketch is given in Section 5):
+- keeps rising: membrane fouling may be severe, watch closely
+- keeps falling: backwash may be too frequent
+- normal fluctuation: fine
+
+### Check the devices themselves
+- device status in the SCADA system
+- on-site alarms
+- compare water production and energy consumption against history
+
+---
+
+## 3. Troubleshooting
+
+### Device alarm
+1. Switch the model off in the front end immediately
+2. Determine whether the model caused it
+
+### Program crash
+1. Read the last few log lines to find the error
+2. Fix, then restart
+
+### TMP keeps rising
+1. Switch the model off in the front end
+2. Check whether feed-water quality is the cause
+3. A manual CEB may be needed
+
+### Decisions jumping around
+1. Check whether the input data is sane
+2. Data acquisition may be at fault
+3. Switch off first, then investigate
+
+---
+
+## 4. Key Points
+
+### 1. Detecting abnormal transmembrane pressure (TMP)
+**What the system can judge on its own**:
+- trend detection from the saved TMP history
+- the last 5 TMP values rising continuously → possible problem
+- a sudden large TMP jump → anomaly
+
+**What needs the SCADA system**:
+- absolute-threshold alarms (>0.08 bar) must be configured in SCADA
+- monitoring of specific values depends on SCADA
+
+**Recommendation**:
+- record the TMP trend in the program
+- set alarm thresholds in SCADA
+- use both together
+
+### 2. Judging water production
+**The system cannot judge this directly**:
+- it needs flow-meter data
+- which must come from the SCADA system
+
+**Workaround**:
+- inspect SCADA data manually at regular intervals
+- compare against the same period in history
+
+---
+
+## 5. Configuration Notes (new)
+
+### Contents of device_states.json
+```json
+{
+  "UF1": {
+    "model_prev_L_s": 3860,           // water-production time from the last decision
+    "model_prev_t_bw_s": 42,          // backwash time from the last decision
+    "last_cycle_end_time": "2025-10-27 14:30:00",  // time of the last decision
+    "recent_tmp_values": [            // last N TMP averages (new)
+      0.0312,
+      0.0318,
+      0.0325,
+      0.0331,
+      0.0328
+    ]
+  }
+}
+```
+
+### Configuration parameters (config.json)
+```json
+{
+  "system": {
+    "tmp_history_count": 5,  // keep the last N TMP values, adjustable
+    ...
+  }
+}
+```
+
+### TMP trend detection logic
+- keep the last 5 (configurable) TMP averages
+- update the list after every decision
+- judge from this list (a minimal sketch follows):
+  - continuous rise: may need attention
+  - sudden jump: data may be abnormal
+  - normal fluctuation: running normally
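+
+A minimal sketch of this check, assuming `recent_tmp_values` is stored in device_states.json as shown above (the helper name and thresholds are illustrative, not part of the shipped code):
+
+```python
+import json
+
+def classify_tmp_trend(values, rise_eps=0.0005, jump_eps=0.005):
+    """Classify a short TMP history (MPa) as 'rising', 'jump', or 'normal'."""
+    if len(values) < 2:
+        return "normal"  # not enough history to judge
+    diffs = [b - a for a, b in zip(values, values[1:])]
+    if any(abs(d) > jump_eps for d in diffs):
+        return "jump"    # sudden change: possible data anomaly
+    if all(d > rise_eps for d in diffs):
+        return "rising"  # monotone rise: possible fouling
+    return "normal"
+
+with open("device_states.json", encoding="utf-8") as f:
+    states = json.load(f)
+print(classify_tmp_trend(states["UF1"].get("recent_tmp_values", [])))
+```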
+
+---
+
+## 6. Success Criteria
+
+**Check these points**:
+1. The program runs stably for 2 weeks
+2. No device has problems caused by model control
+3. The front-end switch controls the system correctly
+4. No obvious anomaly in the TMP trend
+
+---
+
+## 7. Emergency Plan
+
+### Switch off from the front end
+- turn off the system auto-control button in the mobile client
+- the program keeps running but stops issuing commands
+
+### Fallback: stop the program
+```bash
+ps aux | grep loop_main.py
+kill <PID>
+```
+
+### Last resort: manual PLC operation
+- switch the PLC to manual mode
+
+---
+
+**Version**: v1.0  
+**Date**: 2025-10-27

+ 264 - 0
models/uf-rl/超滤开发/DQN_decide.py

@@ -0,0 +1,264 @@
+import numpy as np
+from stable_baselines3 import DQN
+from DQN_env import UFSuperCycleEnv
+from DQN_env import UFParams
+
+# Model path
+MODEL_PATH = "dqn_model.zip"
+
+# Create an environment instance to recover the observation and action spaces
+def _get_model_spaces():
+    """Return the model's observation space and action space."""
+    env = UFSuperCycleEnv(UFParams())
+    obs_space = env.observation_space
+    action_space = env.action_space
+    env.close()
+    return obs_space, action_space
+
+# Load the model once at import time for efficiency
+try:
+    # Try loading directly
+    model = DQN.load(MODEL_PATH)
+except KeyError:
+    # On failure, supply the observation and action spaces explicitly
+    obs_space, action_space = _get_model_spaces()
+    model = DQN.load(MODEL_PATH, custom_objects={
+        'observation_space': obs_space,
+        'action_space': action_space
+    })
+
+def run_uf_DQN_decide(uf_params, TMP0_value: float):
+    """
+    Single-step decision: take a raw TMP0, predict and execute an action.
+
+    Args:
+        uf_params (UFParams): base UF system parameters
+        TMP0_value (float): current TMP0 value (same unit as the environment)
+
+    Returns:
+        dict: the action chosen by the model, its parameters, the new state, the reward, etc.
+    """
+    # 1. Instantiate the environment
+    base_params = uf_params
+    env = UFSuperCycleEnv(base_params)
+
+    # 2. Write the input TMP0 into the environment
+    env.current_params.TMP0 = TMP0_value
+
+    # 3. Get the normalized state
+    obs = env._get_obs().reshape(1, -1)
+
+    # 4. Let the model predict an action
+    action, _ = model.predict(obs, deterministic=True)
+
+    # 5. Decode the action into L_s and t_bw_s
+    L_s, t_bw_s = env._get_action_values(action[0])
+
+    # 6. Execute the action in the environment
+    next_obs, reward, terminated, truncated, info = env.step(action[0])
+
+    # 7. Collect the results
+    result = {
+        "action": int(action[0]),
+        "L_s": float(L_s),
+        "t_bw_s": float(t_bw_s),
+        "next_obs": next_obs,
+        "reward": reward,
+        "terminated": terminated,
+        "truncated": truncated,
+        "info": info
+    }
+
+    # 8. Close the environment
+    env.close()
+
+    return result
+
+def generate_plc_instructions(current_L_s, current_t_bw_s, model_prev_L_s, model_prev_t_bw_s, model_L_s, model_t_bw_s):
+    """
+    Generate PLC instructions from the plant's current values, the model's
+    previous decision, and the model's current decision.
+
+    Added behaviour:
+    1. None handling: if the model's previous value is None, fall back to the
+       plant's current value; if that is also None, return None and report
+       the error.
+    """
+    # Parameter configuration (unchanged)
+    params = UFParams(
+        L_min_s=3600.0, L_max_s=6000.0, L_step_s=60.0,
+        t_bw_min_s=40.0, t_bw_max_s=60.0, t_bw_step_s=2.0,
+    )
+
+    # Unpack parameters
+    L_step_s = params.L_step_s
+    t_bw_step_s = params.t_bw_step_s
+    L_min_s = params.L_min_s
+    L_max_s = params.L_max_s
+    t_bw_min_s = params.t_bw_min_s
+    t_bw_max_s = params.t_bw_max_s
+    adjustment_threshold = 1.0
+
+    # Handle None values
+    if model_prev_L_s is None:
+        if current_L_s is None:
+            print("Error: both the plant's current filtration time and the model's previous value are None")
+            return None, None
+        else:
+            # Use the plant's current value as the baseline
+            effective_current_L = current_L_s
+            source_L = "plant current value (model's previous value is None)"
+    else:
+        # The model's previous value exists; check the plant's current value
+        if current_L_s is None:
+            effective_current_L = model_prev_L_s
+            source_L = "model's previous value (plant current value is None)"
+        else:
+            effective_current_L = model_prev_L_s
+            source_L = "model's previous value"
+
+    # Same handling for the backwash time
+    if model_prev_t_bw_s is None:
+        if current_t_bw_s is None:
+            print("Error: both the plant's current backwash time and the model's previous value are None")
+            return None, None
+        else:
+            effective_current_t_bw = current_t_bw_s
+            source_t_bw = "plant current value (model's previous value is None)"
+    else:
+        if current_t_bw_s is None:
+            effective_current_t_bw = model_prev_t_bw_s
+            source_t_bw = "model's previous value (plant current value is None)"
+        else:
+            effective_current_t_bw = model_prev_t_bw_s
+            source_t_bw = "model's previous value"
+
+    # Range-check all inputs (non-None values only)
+    # Plant current values (warnings)
+    if current_L_s is not None and not (L_min_s <= current_L_s <= L_max_s):
+        print(f"Warning: current filtration time {current_L_s} s is outside the allowed range [{L_min_s}, {L_max_s}]")
+    if current_t_bw_s is not None and not (t_bw_min_s <= current_t_bw_s <= t_bw_max_s):
+        print(f"Warning: current backwash time {current_t_bw_s} s is outside the allowed range [{t_bw_min_s}, {t_bw_max_s}]")
+
+    # Model's previous decision (warnings)
+    if model_prev_L_s is not None and not (L_min_s <= model_prev_L_s <= L_max_s):
+        print(f"Warning: model's previous filtration time {model_prev_L_s} s is outside the allowed range [{L_min_s}, {L_max_s}]")
+    if model_prev_t_bw_s is not None and not (t_bw_min_s <= model_prev_t_bw_s <= t_bw_max_s):
+        print(f"Warning: model's previous backwash time {model_prev_t_bw_s} s is outside the allowed range [{t_bw_min_s}, {t_bw_max_s}]")
+
+    # Model's current decision (errors)
+    if model_L_s is None:
+        raise ValueError("Error: the filtration time suggested by the decision model must not be None")
+    elif not (L_min_s <= model_L_s <= L_max_s):
+        raise ValueError(f"Error: the suggested filtration time {model_L_s} s is outside the allowed range [{L_min_s}, {L_max_s}]")
+
+    if model_t_bw_s is None:
+        raise ValueError("Error: the backwash time suggested by the decision model must not be None")
+    elif not (t_bw_min_s <= model_t_bw_s <= t_bw_max_s):
+        raise ValueError(f"Error: the suggested backwash time {model_t_bw_s} s is outside the allowed range [{t_bw_min_s}, {t_bw_max_s}]")
+
+    print(f"Filtration-time baseline: {source_L}, value: {effective_current_L}")
+    print(f"Backwash-time baseline: {source_t_bw}, value: {effective_current_t_bw}")
+
+    # Adjust from the chosen baseline: move at most one step per decision
+    # toward the model's target value
+    L_diff = model_L_s - effective_current_L
+    L_adjustment = 0
+    if abs(L_diff) >= adjustment_threshold * L_step_s:
+        if L_diff >= 0:
+            L_adjustment = L_step_s
+        else:
+            L_adjustment = -L_step_s
+    next_L_s = effective_current_L + L_adjustment
+
+    t_bw_diff = model_t_bw_s - effective_current_t_bw
+    t_bw_adjustment = 0
+    if abs(t_bw_diff) >= adjustment_threshold * t_bw_step_s:
+        if t_bw_diff >= 0:
+            t_bw_adjustment = t_bw_step_s
+        else:
+            t_bw_adjustment = -t_bw_step_s
+    next_t_bw_s = effective_current_t_bw + t_bw_adjustment
+
+    return next_L_s, next_t_bw_s
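+
+# Worked example of the one-step rate limiter above (illustrative numbers):
+# with effective_current_L = 4040, model_L_s = 4200 and L_step_s = 60,
+# L_diff = 160 >= 60, so next_L_s = 4040 + 60 = 4100. The setpoint moves by
+# at most one step per decision cycle, no matter how far away the model's
+# target is.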
+
+
+from DQN_env import simulate_one_supercycle
+def calc_uf_cycle_metrics(p, TMP0, max_tmp_during_filtration, min_tmp_during_filtration, L_s: float, t_bw_s: float):
+    """
+    Compute the core performance metrics of the UF system.
+
+    Args:
+        p (UFParams): UF system parameters
+        TMP0 (float): initial transmembrane pressure (MPa)
+        max_tmp_during_filtration / min_tmp_during_filtration: plant-measured
+            TMP extremes for the cycle; pass None to fall back to the
+            simulated values
+        L_s (float): filtration time per cycle (s)
+        t_bw_s (float): backwash time per cycle (s)
+
+    Returns:
+        dict: {
+            "k_bw_per_ceb": number of small cycles per CEB,
+            "ton_water_energy_kWh_per_m3": energy per ton of water,
+            "recovery": recovery rate,
+            "net_delivery_rate_m3ph": net delivery rate (m³/h),
+            "daily_prod_time_h": daily production time (h/day),
+            "max_permeability": peak permeability over the cycle (LMH/bar)
+        }
+    """
+    # Write the transmembrane pressure into the parameters
+    p.TMP0 = TMP0
+
+    # Simulate a supercycle under these parameters
+    feasible, info = simulate_one_supercycle(p, L_s, t_bw_s)
+
+    # Pull the simulated cycle metrics
+    k_bw_per_ceb = info["k_bw_per_ceb"]
+    ton_water_energy_kWh_per_m3 = info["ton_water_energy_kWh_per_m3"]
+    recovery = info["recovery"]
+    net_delivery_rate_m3ph = info["net_delivery_rate_m3ph"]
+    daily_prod_time_h = info["daily_prod_time_h"]
+
+    # Fall back to the simulated TMP extremes when no plant data is supplied
+    if max_tmp_during_filtration is None:
+        max_tmp_during_filtration = info["max_TMP_during_filtration"]
+    if min_tmp_during_filtration is None:
+        min_tmp_during_filtration = info["min_TMP_during_filtration"]
+
+    # Peak permeability; the 128*40 factor is presumably the total membrane
+    # area (modules x m² per module), but it is not documented in the source
+    max_permeability = 100 * p.q_UF / (128*40) / min_tmp_during_filtration
+
+
+    return {
+        "k_bw_per_ceb": k_bw_per_ceb,
+        "ton_water_energy_kWh_per_m3": ton_water_energy_kWh_per_m3,
+        "recovery": recovery,
+        "net_delivery_rate_m3ph": net_delivery_rate_m3ph,
+        "daily_prod_time_h": daily_prod_time_h,
+        "max_permeability": max_permeability
+    }
+
+
+# ==============================
+# Example invocation
+# ==============================
+if __name__ == "__main__":
+    uf_params = UFParams()
+    TMP0 = 0.03  # raw TMP0
+    model_decide_result = run_uf_DQN_decide(uf_params, TMP0)  # query the model for an action
+    model_L_s = model_decide_result['L_s']  # water-production time decided by the model
+    model_t_bw_s = model_decide_result['t_bw_s']  # backwash time decided by the model
+
+    current_L_s = 3800
+    current_t_bw_s = 40
+    model_prev_L_s = 4040
+    model_prev_t_bw_s = 60
+    L_s, t_bw_s = generate_plc_instructions(current_L_s, current_t_bw_s, model_prev_L_s, model_prev_t_bw_s, model_L_s, model_t_bw_s)  # instructions to dispatch
+
+    L_s = 4100
+    t_bw_s = 96
+    max_tmp_during_filtration = 0.050176  # new plant-data interface: cycle max/min TMP; pass None when no plant data is available and calc_uf_cycle_metrics() falls back to the simulated extremes
+    min_tmp_during_filtration = 0.012496
+    execution_result = calc_uf_cycle_metrics(uf_params, TMP0, max_tmp_during_filtration, min_tmp_during_filtration, L_s, t_bw_s)
+    print("\n===== Single-step decision result =====")
+    print(f"Action chosen by the model: {model_decide_result['action']}")
+    print(f"Model L_s: {model_L_s} s, model t_bw_s: {model_t_bw_s} s")
+    print(f"Dispatched L_s: {L_s} s, dispatched t_bw_s: {t_bw_s} s")
+    print(f"Backwash count for this instruction: {execution_result['k_bw_per_ceb']}")
+    print(f"Energy per ton of water: {execution_result['ton_water_energy_kWh_per_m3']}")
+    print(f"Recovery rate: {execution_result['recovery']}")
+    print(f"Daily production time: {execution_result['daily_prod_time_h']}")
+    print(f"Peak permeability: {execution_result['max_permeability']}")

+ 566 - 0
models/uf-rl/超滤开发/DQN_env.py

@@ -0,0 +1,566 @@
+import os
+import time
+import random
+import numpy as np
+import gymnasium as gym
+from gymnasium import spaces
+from stable_baselines3 import DQN
+from stable_baselines3.common.monitor import Monitor
+from stable_baselines3.common.vec_env import DummyVecEnv
+from stable_baselines3.common.callbacks import BaseCallback
+from typing import Dict, Tuple, Optional
+import torch
+import torch.nn as nn
+from dataclasses import dataclass, asdict
+from save_uf_models import TMPIncreaseModel, TMPDecreaseModel  # import the surrogate model classes
+import copy
+
+
+# ==== Base membrane operating parameters ====
+@dataclass
+class UFParams:
+    # -- Membrane and operating parameters --
+    q_UF: float = 360.0  # filtration feed flow (m^3/h)
+    TMP0: float = 0.03  # initial TMP (MPa)
+    TMP_max: float = 0.06  # hard TMP ceiling (MPa)
+
+    # -- Membrane fouling kinetics --
+    alpha: float = 1e-6  # TMP growth coefficient
+    belta: float = 1.1  # power exponent
+
+    # -- Backwash parameters (fixed) --
+    q_bw_m3ph: float = 1000.0  # physical backwash flow (m^3/h)
+
+    # -- CEB parameters (fixed) --
+    T_ceb_interval_h: float = 48.0  # fixed CEB every k hours
+    v_ceb_m3: float = 30.0  # CEB water volume (m^3)
+    t_ceb_s: float = 40 * 60.0  # CEB duration (s)
+    phi_ceb: float = 1.0  # CEB removal fraction (simplified: full recovery to TMP0)
+
+    # -- Constraints and convergence --
+    dTMP: float = 0.001  # max TMP rise relative to TMP0 at the end of one production run (MPa)
+
+    # -- Search range (seconds) --
+    L_min_s: float = 3800.0  # filtration time lower bound (s)
+    L_max_s: float = 6000.0  # filtration time upper bound (s)
+    t_bw_min_s: float = 40.0  # backwash time lower bound (s)
+    t_bw_max_s: float = 60.0  # backwash time upper bound (s)
+
+    # -- Physical backwash recovery function parameters --
+    phi_bw_min: float = 0.7  # minimum backwash removal fraction
+    phi_bw_max: float = 1.0  # maximum backwash removal fraction
+    L_ref_s: float = 4000.0  # time scale of the filtration-time effect
+    tau_bw_s: float = 20.0  # time scale of the backwash-time effect
+    gamma_t: float = 1.0  # backwash-time exponent
+
+    # -- Grid --
+    L_step_s: float = 60.0  # filtration time step (s)
+    t_bw_step_s: float = 5.0  # backwash time step (s)
+
+    # Multi-objective weights and high-TMP penalty
+    w_rec: float = 0.8  # recovery weight
+    w_rate: float = 0.2  # net delivery rate weight
+    w_headroom: float = 0.2  # near-ceiling (headroom) penalty weight
+    r_headroom: float = 2.0  # headroom penalty exponent
+    headroom_hardcap: float = 0.98  # above this ratio the action is rejected outright
+
+# ==== Reinforcement-learning hyperparameters ====
+@dataclass
+class DQNParams:
+    """
+    DQN hyperparameter definitions,
+    used to manage training parameters in one place.
+    """
+    # Learning rate: controls the network update step size
+    learning_rate: float = 1e-4
+
+    # Replay buffer size (steps)
+    buffer_size: int = 10000
+
+    # Steps to collect before learning starts
+    learning_starts: int = 200
+
+    # Samples drawn from the replay buffer per update
+    batch_size: int = 32
+
+    # Discount factor; closer to 1 weights long-term reward more
+    gamma: float = 0.95
+
+    # Train once every this many steps
+    train_freq: int = 4
+
+    # Target network update interval
+    target_update_interval: int = 2000
+
+    # Initial exploration rate ε
+    exploration_initial_eps: float = 1.0
+
+    # Fraction of training over which ε decays from initial to final
+    exploration_fraction: float = 0.3
+
+    # Final exploration rate ε
+    exploration_final_eps: float = 0.02
+
+    # Log remark (to distinguish experiments)
+    remark: str = "default"
+
+# ==== Load the simulation-environment surrogate models ====
+# Instantiate the models
+model_fp = TMPIncreaseModel()
+model_bw = TMPDecreaseModel()
+
+# Load the weights
+model_fp.load_state_dict(torch.load("uf_fp.pth"))
+model_bw.load_state_dict(torch.load("uf_bw.pth"))
+
+# Switch to inference mode
+model_fp.eval()
+model_bw.eval()
+
+
+def _delta_tmp(p, L_h: float) -> float:
+    """
+    TMP rise during filtration: delegates to the uf_fp.pth model
+    """
+    return model_fp(p, L_h)
+
+def phi_bw_of(p, L_s: float, t_bw_s: float) -> float:
+    """
+    Backwash removal fraction: delegates to the uf_bw.pth model
+    """
+    return model_bw(p, L_s, t_bw_s)
+
+def _tmp_after_ceb(p, L_s: float, t_bw_s: float) -> float:
+    """
+    TMP after chemical cleaning (CEB); currently restores the initial TMP
+    """
+    return p.TMP0
+
+def _v_bw_m3(p, t_bw_s: float) -> float:
+    """
+    Physical backwash water consumption (m^3)
+    """
+    return float(p.q_bw_m3ph * (float(t_bw_s) / 3600.0))
+
+def simulate_one_supercycle(p: UFParams, L_s: float, t_bw_s: float):
+    """
+    Returns (feasible, metrics dict)
+    - supports a dynamic CEB count: fixed 48 h interval
+    - adds daily production time and energy per ton of water
+    - adds minimum-TMP tracking
+    """
+    L_h = float(L_s) / 3600.0  # filtration time per small cycle (h)
+
+    tmp = p.TMP0
+    max_tmp_during_filtration = tmp
+    min_tmp_during_filtration = tmp  # new: initialize the minimum TMP
+    max_residual_increase = 0.0
+
+    # Total small-cycle duration (h)
+    t_small_cycle_h = (L_s + t_bw_s) / 3600.0
+
+    # Number of small cycles per supercycle (i.e. per CEB)
+    k_bw_per_ceb = int(np.floor(p.T_ceb_interval_h / t_small_cycle_h))
+    if k_bw_per_ceb < 1:
+        k_bw_per_ceb = 1  # at least one small cycle
+
+    # Lookup table: energy per ton of water vs filtration time
+    energy_lookup = {
+        3600: 0.1034, 3660: 0.1031, 3720: 0.1029, 3780: 0.1026,
+        3840: 0.1023, 3900: 0.1021, 3960: 0.1019, 4020: 0.1017,
+        4080: 0.1015, 4140: 0.1012, 4200: 0.1011
+    }
+
+    for _ in range(k_bw_per_ceb):
+        tmp_run_start = tmp
+
+        # TMP growth during filtration
+        dtmp = _delta_tmp(p, L_h)
+        tmp_peak = tmp_run_start + dtmp
+
+        # Constraint 1: the peak must not exceed the hard ceiling
+        if tmp_peak > p.TMP_max + 1e-12:
+            return False, {"reason": "TMP_max violated during filtration", "TMP_peak": tmp_peak}
+
+        # Track the maximum and minimum TMP
+        if tmp_peak > max_tmp_during_filtration:
+            max_tmp_during_filtration = tmp_peak
+        if tmp_run_start < min_tmp_during_filtration:  # new: record the minimum TMP at run start
+            min_tmp_during_filtration = tmp_run_start
+
+        # Physical backwash
+        phi = phi_bw_of(p, L_s, t_bw_s)
+        tmp_after_bw = tmp_peak - phi * (tmp_peak - tmp_run_start)
+
+        # Constraint 2: the residual increase per run is bounded
+        residual_inc = tmp_after_bw - tmp_run_start
+        if residual_inc > p.dTMP + 1e-12:
+            return False, {
+                "reason": "residual TMP increase after BW exceeded dTMP",
+                "residual_increase": residual_inc,
+                "limit_dTMP": p.dTMP
+            }
+        if residual_inc > max_residual_increase:
+            max_residual_increase = residual_inc
+
+        tmp = tmp_after_bw
+
+    # CEB
+    tmp_after_ceb = p.TMP0
+
+    # Volumes and recovery
+    V_feed_super = k_bw_per_ceb * p.q_UF * L_h
+    V_loss_super = k_bw_per_ceb * _v_bw_m3(p, t_bw_s) + p.v_ceb_m3
+    V_net = max(0.0, V_feed_super - V_loss_super)
+    recovery = max(0.0, V_net / max(V_feed_super, 1e-12))
+
+    # Time and net delivery rate
+    T_super_h = k_bw_per_ceb * (L_s + t_bw_s) / 3600.0 + p.t_ceb_s / 3600.0
+    net_delivery_rate_m3ph = V_net / max(T_super_h, 1e-12)
+
+    # Headroom ratio and hard cap
+    headroom_ratio = max_tmp_during_filtration / max(p.TMP_max, 1e-12)
+    if headroom_ratio > p.headroom_hardcap + 1e-12:
+        return False, {"reason": "headroom hardcap exceeded", "headroom_ratio": headroom_ratio}
+
+    # -- New metric 1: daily production time (h/d) --
+    daily_prod_time_h = k_bw_per_ceb * L_h / T_super_h * 24.0
+
+    # -- New metric 2: energy per ton of water (kWh/m³), nearest table entry --
+    closest_L = min(energy_lookup.keys(), key=lambda x: abs(x - L_s))
+    ton_water_energy = energy_lookup[closest_L]
+
+    info = {
+        "recovery": recovery,
+        "V_feed_super_m3": V_feed_super,
+        "V_loss_super_m3": V_loss_super,
+        "V_net_super_m3": V_net,
+        "supercycle_time_h": T_super_h,
+        "net_delivery_rate_m3ph": net_delivery_rate_m3ph,
+        "max_TMP_during_filtration": max_tmp_during_filtration,
+        "min_TMP_during_filtration": min_tmp_during_filtration,  # new: minimum TMP
+        "max_residual_increase_per_run": max_residual_increase,
+        "phi_bw_effective": phi,
+        "TMP_after_ceb": tmp_after_ceb,
+        "headroom_ratio": headroom_ratio,
+        "daily_prod_time_h": daily_prod_time_h,
+        "ton_water_energy_kWh_per_m3": ton_water_energy,
+        "k_bw_per_ceb": k_bw_per_ceb
+    }
+
+    return True, info
+
+def _score(p: UFParams, rec: dict) -> float:
+    """Composite score: larger is better. Nonlinear amplification widens the gap between good and bad actions."""
+
+    # -- Dimensionless net delivery rate --
+    rate_norm = rec["net_delivery_rate_m3ph"] / max(p.q_UF, 1e-12)
+
+    # -- TMP soft penalty (sigmoid) --
+    tmp_ratio = rec["max_TMP_during_filtration"] / max(p.TMP_max, 1e-12)
+    k = 10.0
+    headroom_penalty = 1.0 / (1.0 + np.exp(-k * (tmp_ratio - 1.0)))
+
+    # -- Base reward (roughly 0.6-0.9) --
+    base_reward = (
+        p.w_rec * rec["recovery"]
+        + p.w_rate * rate_norm
+        - p.w_headroom * headroom_penalty
+    )
+
+    # -- Nonlinear amplification: squared mapping + scaling --
+    # Widens the gap between good and bad actions while capping the maximum,
+    # so TD errors do not blow up
+    amplified_reward = (base_reward - 0.5) ** 2 * 5.0
+
+    # -- Optional: keep the sign to distinguish negative rewards --
+    if base_reward < 0.5:
+        amplified_reward = -amplified_reward
+
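+    # Worked example (illustrative): base_reward = 0.9 -> (0.4)**2 * 5 = 0.80,
+    # while base_reward = 0.6 -> (0.1)**2 * 5 = 0.05, so a 0.3 gap in base
+    # reward becomes a 16x gap after amplification.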
+    return amplified_reward
+
+
+def set_global_seed(seed: int):
+    """Fix all global random seeds so training is reproducible"""
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)  # if using a GPU
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+class UFSuperCycleEnv(gym.Env):
+    """UF system environment (decisions at supercycle granularity)"""
+
+    metadata = {"render_modes": ["human"]}
+
+    def __init__(self, base_params, max_episode_steps: int = 20):
+        super(UFSuperCycleEnv, self).__init__()
+
+        self.base_params = base_params
+        self.current_params = copy.deepcopy(base_params)
+        self.max_episode_steps = max_episode_steps
+        self.current_step = 0
+
+        # Build the discrete action grid
+        self.L_values = np.arange(
+            self.base_params.L_min_s,
+            self.base_params.L_max_s + self.base_params.L_step_s,
+            self.base_params.L_step_s
+        )
+        self.t_bw_values = np.arange(
+            self.base_params.t_bw_min_s,
+            self.base_params.t_bw_max_s + self.base_params.t_bw_step_s,
+            self.base_params.t_bw_step_s
+        )
+
+        self.num_L = len(self.L_values)
+        self.num_bw = len(self.t_bw_values)
+
+        # Single flat discrete action space
+        self.action_space = spaces.Discrete(self.num_L * self.num_bw)
+
+        # The state holds TMP0, the previous action (L_s, t_bw_s), and the cycle's max TMP;
+        # all normalization is handled inside _get_obs
+        self.observation_space = spaces.Box(
+            low=np.zeros(4, dtype=np.float32),
+            high=np.ones(4, dtype=np.float32),
+            dtype=np.float32
+        )
+
+        # Initialize state
+        self.last_action = (self.base_params.L_min_s, self.base_params.t_bw_min_s)
+        self.max_TMP_during_filtration = self.current_params.TMP0
+        self.reset(seed=None)
+
+    def _get_obs(self):
+        TMP0 = self.current_params.TMP0
+        TMP0_norm = (TMP0 - 0.01) / (0.05 - 0.01)
+
+        L_s, t_bw_s = self.last_action
+        L_norm = (L_s - self.base_params.L_min_s) / (self.base_params.L_max_s - self.base_params.L_min_s)
+        t_bw_norm = (t_bw_s - self.base_params.t_bw_min_s) / (self.base_params.t_bw_max_s - self.base_params.t_bw_min_s)
+
+        max_TMP_norm = (self.max_TMP_during_filtration - 0.01) / (0.05 - 0.01)
+
+        return np.array([TMP0_norm, L_norm, t_bw_norm, max_TMP_norm], dtype=np.float32)
+
+    def _get_action_values(self, action):
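+        # Decode the flat Discrete index into grid coordinates: actions are
+        # laid out row-major, with num_bw columns per L value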
+        L_idx = action // self.num_bw
+        t_bw_idx = action % self.num_bw
+        return self.L_values[L_idx], self.t_bw_values[t_bw_idx]
+
+    def reset(self, seed=None, options=None):
+        super().reset(seed=seed)
+        self.current_params.TMP0 = np.random.uniform(0.01, 0.03)
+        self.current_step = 0
+        self.last_action = (self.base_params.L_min_s, self.base_params.t_bw_min_s)
+        self.max_TMP_during_filtration = self.current_params.TMP0
+        return self._get_obs(), {}
+
+    def step(self, action):
+        self.current_step += 1
+        L_s, t_bw_s = self._get_action_values(action)
+        L_s = np.clip(L_s, self.base_params.L_min_s, self.base_params.L_max_s)
+        t_bw_s = np.clip(t_bw_s, self.base_params.t_bw_min_s, self.base_params.t_bw_max_s)
+
+        # Simulate one supercycle
+        feasible, info = simulate_one_supercycle(self.current_params, L_s, t_bw_s)
+
+        if feasible:
+            reward = _score(self.current_params, info)
+            self.current_params.TMP0 = info["TMP_after_ceb"]
+            self.max_TMP_during_filtration = info["max_TMP_during_filtration"]
+            terminated = False
+        else:
+            reward = -20
+            terminated = True
+
+        truncated = self.current_step >= self.max_episode_steps
+        self.last_action = (L_s, t_bw_s)
+        next_obs = self._get_obs()
+
+        info["feasible"] = feasible
+        info["step"] = self.current_step
+
+        return next_obs, reward, terminated, truncated, info
+
+
+class UFEpisodeRecorder:
+    """Records decisions and outcomes within an episode"""
+
+    def __init__(self):
+        self.episode_data = []
+        self.current_episode = []
+
+    def record_step(self, obs, action, reward, done, info):
+        """Record one step"""
+        step_data = {
+            "obs": obs.copy(),
+            "action": action.copy(),
+            "reward": reward,
+            "done": done,
+            "info": info.copy() if info else {}
+        }
+        self.current_episode.append(step_data)
+
+        if done:
+            self.episode_data.append(self.current_episode)
+            self.current_episode = []
+
+    def get_episode_stats(self, episode_idx=-1):
+        """Return summary statistics for an episode"""
+        if not self.episode_data:
+            return {}
+
+        episode = self.episode_data[episode_idx]
+        total_reward = sum(step["reward"] for step in episode)
+        avg_recovery = np.mean([step["info"].get("recovery", 0) for step in episode if "recovery" in step["info"]])
+        feasible_steps = sum(1 for step in episode if step["info"].get("feasible", False))
+
+        return {
+            "total_reward": total_reward,
+            "avg_recovery": avg_recovery,
+            "feasible_steps": feasible_steps,
+            "total_steps": len(episode)
+        }
+
+
+class UFTrainingCallback(BaseCallback):
+    """
+    Training callback that records each step's data into the recorder.
+    More robust than the earlier RecordingCallback:
+    1. does not depend on the environment's internal last_* attributes
+    2. uses the obs, actions, rewards, dones, and infos that SB3 exposes via `locals`
+    3. handles end-of-episode statistics automatically
+    """
+
+    def __init__(self, recorder, verbose=0):
+        super(UFTrainingCallback, self).__init__(verbose)
+        self.recorder = recorder
+
+    def _on_step(self) -> bool:
+        try:
+            new_obs = self.locals.get("new_obs")
+            actions = self.locals.get("actions")
+            rewards = self.locals.get("rewards")
+            dones = self.locals.get("dones")
+            infos = self.locals.get("infos")
+
+            if len(new_obs) > 0:
+                step_obs = new_obs[0]
+                step_action = actions[0] if actions is not None else None
+                step_reward = rewards[0] if rewards is not None else 0.0
+                step_done = dones[0] if dones is not None else False
+                step_info = infos[0] if infos is not None else {}
+
+                # Print the current step's information
+                if self.verbose:
+                    print(f"[Step {self.num_timesteps}] action={step_action}, reward={step_reward:.3f}, done={step_done}")
+
+                # Record the data
+                self.recorder.record_step(
+                    obs=step_obs,
+                    action=step_action,
+                    reward=step_reward,
+                    done=step_done,
+                    info=step_info,
+                )
+
+        except Exception as e:
+            if self.verbose:
+                print(f"[Callback Error] {e}")
+
+        return True
+
+
+
+
+class DQNTrainer:
+    def __init__(self, env, params, callback=None):
+        self.env = env
+        self.params = params
+        self.callback = callback
+        self.log_dir = self._create_log_dir()
+        self.model = self._create_model()
+
+    def _create_log_dir(self):
+        timestamp = time.strftime("%Y%m%d-%H%M%S")
+        log_name = (
+            f"DQN_lr{self.params.learning_rate}_buf{self.params.buffer_size}_bs{self.params.batch_size}"
+            f"_gamma{self.params.gamma}_exp{self.params.exploration_fraction}"
+            f"_{self.params.remark}_{timestamp}"
+        )
+        log_dir = os.path.join("./uf_dqn_tensorboard", log_name)
+        os.makedirs(log_dir, exist_ok=True)
+        return log_dir
+
+    def _create_model(self):
+        return DQN(
+            policy="MlpPolicy",
+            env=self.env,
+            learning_rate=self.params.learning_rate,
+            buffer_size=self.params.buffer_size,  # large buffer for experience diversity
+            learning_starts=self.params.learning_starts,
+            batch_size=self.params.batch_size,
+            gamma=self.params.gamma,
+            train_freq=self.params.train_freq,
+            target_update_interval=1,  # note: DQNParams.target_update_interval is not used here; soft updates (tau) run every step
+            tau=0.005,
+            exploration_initial_eps=self.params.exploration_initial_eps,
+            exploration_fraction=self.params.exploration_fraction,
+            exploration_final_eps=self.params.exploration_final_eps,
+            verbose=1,
+            tensorboard_log=self.log_dir
+            # replay_buffer_class left unset; the default ReplayBuffer is used
+        )
+
+    def train(self, total_timesteps: int):
+        if self.callback:
+            self.model.learn(total_timesteps=total_timesteps, callback=self.callback)
+        else:
+            self.model.learn(total_timesteps=total_timesteps)
+        print(f"Training finished; logs saved in {self.log_dir}")
+
+    def save(self, path=None):
+        if path is None:
+            path = os.path.join(self.log_dir, "dqn_model.zip")
+        self.model.save(path)
+        print(f"Model saved to {path}")
+
+    def load(self, path):
+        self.model = DQN.load(path, env=self.env)
+        print(f"Model loaded from {path}")
+
+
+def train_uf_rl_agent(params: UFParams, total_timesteps: int = 10000, seed: int = 2025):
+    set_global_seed(seed)
+    recorder = UFEpisodeRecorder()
+    callback = UFTrainingCallback(recorder, verbose=1)
+
+    def make_env():
+        env = UFSuperCycleEnv(params)
+        env = Monitor(env)
+        return env
+
+    env = DummyVecEnv([make_env])
+
+    dqn_params = DQNParams()
+    trainer = DQNTrainer(env, dqn_params, callback=callback)
+    trainer.train(total_timesteps)
+    trainer.save()
+
+    stats = callback.recorder.get_episode_stats()
+    print(f"训练完成 - 总奖励: {stats.get('total_reward', 0):.2f}, 平均回收率: {stats.get('avg_recovery', 0):.3f}")
+
+    return trainer.model
+
+
+# Training and testing example
+if __name__ == "__main__":
+    # Initialize parameters
+    params = UFParams()
+
+    # Train the RL agent
+    print("Starting RL agent training...")
+    train_uf_rl_agent(params, total_timesteps=50000)
+

+ 198 - 0
models/uf-rl/超滤开发/config.json

@@ -0,0 +1,198 @@
+{
+  "_comment_api": "API接口配置",
+  "api": {
+    "base_url": "http://120.55.44.4:8900",
+    "current_data_endpoint": "/api/v1/jinke-cloud/device/current-data",
+    "callback_endpoint": "/api/dtgateway/v1/decision/data",
+    "plc_endpoint": "/api/v1/plc/set-var-values",
+    "jwt_token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJJRCI6NywiVXNlcm5hbWUiOiJhZG1pbiIsIkRlcCI6IjEzNSIsImV4cCI6MTc3NjExOTExNCwiaXNzIjoiZ2luLWJsb2cifQ.0HTtzHZjyd2mHo8VCy8icYROxmntRMuQhyoZsAYRL_M"
+  },
+  "_comment_database": "MySQL数据库连接配置",
+  "database": {
+    "host": "222.130.26.206",
+    "port": 4000,
+    "user": "whu",
+    "password": "09093f4e6b33ddd",
+    "database": "ws_data",
+    "table_name": "dc_item_history_data_minute"
+  },
+  "_comment_scada": "SCADA系统配置 - PLC通信签名验证",
+  "scada": {
+    "secret": "237c92d2-8795-1094-11ef-00e2e48fce4a",
+    "project_id": 92
+  },
+  "_comment_system": "系统运行参数配置",
+  "system": {
+    "use_model": 0,
+    "_use_model_desc": "模型开关: 1=启用模型决策, 0=禁用模型仅记录数据 (支持运行时修改)",
+    "trigger_value": 95,
+    "_trigger_value_desc": "触发监控的控制值",
+    "num_values_to_collect": 10,
+    "_num_values_to_collect_desc": "每次收集的数据点数量",
+    "poll_interval": 2,
+    "_poll_interval_desc": "轮询间隔时间(秒)",
+    "backwash_time": 100,
+    "_backwash_time_desc": "默认反洗时间(秒)",
+    "ceb_count": 45,
+    "tmp_history_count": 5,
+    "_tmp_history_count_desc": "保存最近N次TMP平均值,用于趋势判断(可调整为任意次数)"
+  },
+  "_comment_devices": "设备配置列表 - 每个设备的API调用参数",
+  "devices": [
+    {
+      "_comment": "UF1超滤设备配置",
+      "name": "UF1",
+      "press_pv_item": "C.M.UF1_DB@press_PV",
+      "_press_pv_item_desc": "用于历史数据查询的压差",
+      "control_payload": {
+        "_desc": "控制字读取配置 - 用于触发条件检测",
+        "deviceId": "1",
+        "deviceItems": "C.M.UF1_DB@word_control",
+        "deviceName": "UF1_control_word",
+        "project_id": 92
+      },
+      "target_payload": {
+        "_desc": "跨膜压差读取配置 - 用于数据收集",
+        "deviceId": "1",
+        "deviceItems": "UF1_BW_After_TMP",
+        "deviceName": "UF1_backwash_pressure_diff",
+        "project_id": 92
+      },
+      "production_time_payload": {
+        "_desc": "产水时长读取配置 - 用于模型输入",
+        "deviceId": "1",
+        "deviceItems": "C.M.UF1_DB@time_production",
+        "deviceName": "UF1过滤时长",
+        "project_id": 92
+      },
+      "backwashing_payload": {
+        "_desc": "反洗时长读取配置 - 用于模型输入",
+        "deviceId": "1",
+        "deviceItems": "C.M.UF1_DB@time_BW_SP",
+        "deviceName": "UF1反洗时长",
+        "project_id": 92
+      },
+      "ceb_payload": {
+        "_desc": "CEB次数读取配置 - 用于下发",
+        "deviceId": "1",
+        "deviceItems": "C.M.UF1_DB@cycle_sp",
+        "deviceName": "UF1CEB次数设定",
+        "project_id": 92
+      }
+    },
+    {
+      "_comment": "UF2超滤设备配置",
+      "name": "UF2",
+      "press_pv_item": "C.M.UF2_DB@press_PV",
+      "control_payload": {
+        "deviceId": "1",
+        "deviceItems": "C.M.UF2_DB@word_control",
+        "deviceName": "UF2_control_word",
+        "project_id": 92
+      },
+      "target_payload": {
+        "deviceId": "1",
+        "deviceItems": "UF2_BW_After_TMP",
+        "deviceName": "UF2_backwash_pressure_diff",
+        "project_id": 92
+      },
+      "production_time_payload": {
+        "deviceId": "1",
+        "deviceItems": "C.M.UF2_DB@time_production",
+        "deviceName": "UF2过滤时长",
+        "project_id": 92
+      },
+      "backwashing_payload": {
+        "deviceId": "1",
+        "deviceItems": "C.M.UF2_DB@time_BW_SP",
+        "deviceName": "UF2反洗时长",
+        "project_id": 92
+      },
+      "ceb_payload": {
+        "_desc": "CEB次数读取配置 - 用于下发",
+        "deviceId": "1",
+        "deviceItems": "C.M.UF2_DB@cycle_sp",
+        "deviceName": "UF2CEB次数设定",
+        "project_id": 92
+      }
+    },
+    {
+      "_comment": "UF3超滤设备配置",
+      "name": "UF3",
+      "press_pv_item": "C.M.UF3_DB@press_PV",
+      "control_payload": {
+        "deviceId": "1",
+        "deviceItems": "C.M.UF3_DB@word_control",
+        "deviceName": "UF3_control_word",
+        "project_id": 92
+      },
+      "target_payload": {
+        "deviceId": "1",
+        "deviceItems": "UF3_BW_After_TMP",
+        "deviceName": "UF3_backwash_pressure_diff",
+        "project_id": 92
+      },
+      "production_time_payload": {
+        "deviceId": "1",
+        "deviceItems": "C.M.UF3_DB@time_production",
+        "deviceName": "UF3过滤时长",
+        "project_id": 92
+      },
+      "backwashing_payload": {
+        "deviceId": "1",
+        "deviceItems": "C.M.UF3_DB@time_BW_SP",
+        "deviceName": "UF3反洗时长",
+        "project_id": 92
+      },
+      "ceb_payload": {
+        "_desc": "CEB次数读取配置 - 用于下发",
+        "deviceId": "1",
+        "deviceItems": "C.M.UF3_DB@cycle_sp",
+        "deviceName": "UF3CEB次数设定",
+        "project_id": 92
+      }
+    },
+    {
+      "_comment": "UF4超滤设备配置",
+      "name": "UF4",
+      "press_pv_item": "C.M.UF4_DB@press_PV",
+      "control_payload": {
+        "deviceId": "1",
+        "deviceItems": "C.M.UF4_DB@word_control",
+        "deviceName": "UF4_control_word",
+        "project_id": 92
+      },
+      "target_payload": {
+        "deviceId": "1",
+        "deviceItems": "UF4_BW_After_TMP",
+        "deviceName": "UF4_backwash_pressure_diff",
+        "project_id": 92
+      },
+      "production_time_payload": {
+        "deviceId": "1",
+        "deviceItems": "C.M.UF4_DB@time_production",
+        "deviceName": "UF4反洗时长",
+        "project_id": 92
+      },
+      "backwashing_payload": {
+        "deviceId": "1",
+        "deviceItems": "C.M.UF4_DB@time_BW_SP",
+        "deviceName": "UF4反洗时长",
+        "project_id": 92
+      },
+      "ceb_payload": {
+        "_desc": "CEB次数读取配置 - 用于下发",
+        "deviceId": "1",
+        "deviceItems": "C.M.UF4_DB@cycle_sp",
+        "deviceName": "UF4CEB次数设定",
+        "project_id": 92
+      }
+    }
+  ],
+  "_comment_usage": "配置文件使用说明",
+  "_usage_notes": {
+    "1_动态配置": "use_model支持运行时修改,无需重启程序",
+    "2_签名验证": "PLC通信使用MD5签名验证,确保scada.secret与服务器端一致",
+    "3_设备扩展": "新增设备时,复制现有设备配置并修改相应的deviceItems参数"
+  }
+}

+ 23 - 0
models/uf-rl/超滤开发/device_states.json

@@ -0,0 +1,23 @@
+{
+    "_comment": "此文件用于存储每个设备的运行时状态。时间格式为 YYYY-MM-DD HH:MM:SS",
+    "UF1": {
+        "model_prev_L_s": 4220.0,
+        "model_prev_t_bw_s": 90.0,
+        "last_cycle_end_time": "2025-10-29 09:29:48"
+    },
+    "UF2": {
+        "model_prev_L_s": 4220.0,
+        "model_prev_t_bw_s": 90.0,
+        "last_cycle_end_time": "2025-10-26 15:34:23"
+    },
+    "UF3": {
+        "model_prev_L_s": 4220.0,
+        "model_prev_t_bw_s": 90.0,
+        "last_cycle_end_time": "2025-10-26 18:17:29"
+    },
+    "UF4": {
+        "model_prev_L_s": 4220.0,
+        "model_prev_t_bw_s": 90.0,
+        "last_cycle_end_time": "2025-10-27 13:44:35"
+    }
+}

BIN
models/uf-rl/超滤开发/dqn_model.zip


+ 1 - 0
models/uf-rl/超滤开发/dqn_model/_stable_baselines3_version

@@ -0,0 +1 @@
+2.6.0

File diff suppressed because it is too large
+ 46 - 0
models/uf-rl/超滤开发/dqn_model/data


BIN
models/uf-rl/超滤开发/dqn_model/policy.optimizer.pth


BIN
models/uf-rl/超滤开发/dqn_model/policy.pth


BIN
models/uf-rl/超滤开发/dqn_model/pytorch_variables.pth


+ 9 - 0
models/uf-rl/超滤开发/dqn_model/system_info.txt

@@ -0,0 +1,9 @@
+- OS: Windows-10-10.0.26100-SP0 10.0.26100
+- Python: 3.10.9
+- Stable-Baselines3: 2.6.0
+- PyTorch: 2.8.0+cpu
+- GPU Enabled: False
+- Numpy: 1.26.4
+- Cloudpickle: 3.1.1
+- Gymnasium: 1.0.0
+- OpenAI Gym: 0.26.2

+ 771 - 0
models/uf-rl/超滤开发/loop_main.py

@@ -0,0 +1,771 @@
+# Standard library imports
+import time
+import json
+import os
+import threading
+import hashlib
+from datetime import datetime, timedelta
+import logging
+from logging.handlers import RotatingFileHandler
+
+# Third-party imports
+import pymysql
+import requests
+
+# Local module imports
+from DQN_env import UFParams
+from DQN_decide import run_uf_DQN_decide, generate_plc_instructions, calc_uf_cycle_metrics
+
+# Logging configuration
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+# Log output format
+formatter = logging.Formatter(
+    '%(asctime)s - %(threadName)s - %(levelname)s - %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S'
+)
+
+# File handler: 5 MB per file, 3 backups
+file_handler = RotatingFileHandler('monitor_service.log', maxBytes=5 * 1024 * 1024, backupCount=3, encoding='utf-8')
+file_handler.setFormatter(formatter)
+
+# Console handler
+console_handler = logging.StreamHandler()
+console_handler.setFormatter(formatter)
+
+# Attach the handlers
+logger.addHandler(file_handler)
+logger.addHandler(console_handler)
+
+
+# Configuration loading
+def load_config(config_file='config.json'):
+    """
+    Load the system configuration from a JSON file.
+
+    Args:
+        config_file: path to the configuration file
+
+    Returns:
+        configuration dict
+
+    Raises:
+        an exception if the file is missing or malformed
+    """
+    try:
+        with open(config_file, 'r', encoding='utf-8') as f:
+            return json.load(f)
+    except FileNotFoundError:
+        logger.critical(f"Configuration file not found: {config_file}")
+        raise
+    except json.JSONDecodeError as e:
+        logger.critical(f"Malformed configuration file {config_file}: {e}")
+        raise
+
+
+def get_current_config():
+    """
+    Return the current configuration; supports dynamic changes at runtime.
+    """
+    return load_config()
+
+
+# Initialize the configuration
+config = load_config()
+
+# Global configuration parameters
+
+# API endpoints
+API_BASE_URL = config['api']['base_url']
+API_URL = API_BASE_URL + config['api']['current_data_endpoint']
+CALLBACK_URL = API_BASE_URL + config['api']['callback_endpoint']
+PLC_URL = API_BASE_URL + config['api']['plc_endpoint']
+
+# HTTP request headers
+HEADERS = {
+    "Content-Type": "application/json",
+    "JWT-TOKEN": config['api']['jwt_token']
+}
+
+# MySQL configuration; environment variables take precedence
+DB_USER = os.getenv('DB_USERNAME', config['database']['user'])
+DB_PASSWORD = os.getenv('DB_PASSWORD', config['database']['password'])
+DB_HOST = os.getenv('DB_HOST', config['database']['host'])
+DB_NAME = os.getenv('DB_DATABASE', config['database']['database'])
+DB_PORT = int(os.getenv('DB_PORT', str(config['database']['port'])))
+HISTORY_TABLE_NAME = config['database']['table_name']
+
+# UF system parameters
+uf_params = UFParams()
+PROJECT_ID_FOR_CALLBACK = config['scada']['project_id']
+SCADA_SECRET = config['scada']['secret']
+
+# Monitoring loop parameters
+TRIGGER_VALUE = config['system']['trigger_value']
+NUM_VALUES_TO_COLLECT = config['system']['num_values_to_collect']
+POLL_INTERVAL = config['system']['poll_interval']
+
+# Device list
+DEVICE_SEQUENCE = config['devices']
+
+# State persistence
+STATE_FILE = 'device_states.json'
+_state_lock = threading.Lock()
+device_states = {}
+DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"
+
+
+# State persistence functions
+def load_device_states():
+    """
+    Load every device's runtime state from the state file.
+    """
+    global device_states
+    with _state_lock:
+        try:
+            if os.path.exists(STATE_FILE):
+                with open(STATE_FILE, 'r', encoding='utf-8') as f:
+                    content = f.read()
+                    if content:
+                        device_states = json.loads(content)
+                        logger.info(f"State file loaded: {STATE_FILE}")
+                    else:
+                        logger.warning(f"State file is empty: {STATE_FILE}")
+                        device_states = {}
+            else:
+                logger.info(f"State file does not exist, first run: {STATE_FILE}")
+                device_states = {}
+        except (json.JSONDecodeError, IOError) as e:
+            logger.error(f"Failed to load state file {STATE_FILE}: {e}")
+            device_states = {}
+
+
+def save_device_state(device_name, state_data):
+    """
+    Persist one device's runtime state to the state file.
+
+    Args:
+        device_name: device name
+        state_data: device state dict
+    """
+    with _state_lock:
+        try:
+            # Read the existing state
+            full_states = {}
+            if os.path.exists(STATE_FILE):
+                with open(STATE_FILE, 'r', encoding='utf-8') as f:
+                    content = f.read()
+                    if content:
+                        full_states = json.loads(content)
+
+            # Update this device's state
+            full_states[device_name] = state_data
+
+            # Write back to the file
+            with open(STATE_FILE, 'w', encoding='utf-8') as f:
+                json.dump(full_states, f, indent=4, ensure_ascii=False)
+
+            # Update the in-memory cache
+            global device_states
+            device_states[device_name] = state_data
+            logger.info(f"[{device_name}] state saved")
+        except (json.JSONDecodeError, IOError) as e:
+            logger.error(f"[{device_name}] failed to save state: {e}")
+
+
+# Core business functions
+
+def create_db_connection():
+    """
+    Create a MySQL database connection.
+
+    Returns:
+        connection object or None
+    """
+    try:
+        connection = pymysql.connect(
+            host=DB_HOST, user=DB_USER, password=DB_PASSWORD, database=DB_NAME,
+            port=DB_PORT, charset='utf8mb4',
+            cursorclass=pymysql.cursors.DictCursor
+        )
+        logger.debug("Database connection established")
+        return connection
+    except pymysql.MySQLError as e:
+        logger.error(f"Database connection failed: {e}")
+        return None
+
+
+def get_tmp_extremes(item_name, start_time, end_time, word_control):
+    """
+    Query the history API for the TMP extremes within a time range.
+
+    Args:
+        item_name: data item name
+        start_time: range start
+        end_time: range end
+        word_control: control-word item name
+
+    Returns:
+        (max value, min value) or (None, None)
+    """
+    # Convert times to millisecond timestamps
+    start_timestamp = int(start_time.timestamp() * 1000)
+    end_timestamp = int(end_time.timestamp() * 1000)
+
+    logger.info(f"Querying history extremes for {item_name} from {start_time.strftime(DATETIME_FORMAT)} to {end_time.strftime(DATETIME_FORMAT)}")
+
+    # History API base URL
+    api_base_url = "http://120.55.44.4:8900/api/v1/jinke-cloud/db/device/history-data"
+
+    try:
+        # First call: query the extremes of item_name
+        params1 = {
+            "deviceid": "1",
+            "dataitemid": item_name,
+            "project_id": "92",
+            "stime": start_timestamp,
+            "etime": end_timestamp,
+            "size": "1",
+            "interval": "minute",
+            "aggregator": "new"
+        }
+
+        logger.info(f"First API call: {api_base_url} params: {params1}")
+        response1 = requests.get(api_base_url, params=params1, headers=HEADERS, timeout=30)
+        response1.raise_for_status()
+        data1 = response1.json()
+        logger.debug(f"First API response: {data1}")
+
+        # Second call: query the extremes of word_control
+        params2 = {
+            "deviceid": "1",
+            "dataitemid": word_control,
+            "project_id": "92",
+            "stime": start_timestamp,
+            "etime": end_timestamp,
+            "size": "1",
+            "interval": "minute",
+            "aggregator": "new"
+        }
+
+        logger.info(f"Second API call: {api_base_url} params: {params2}")
+        response2 = requests.get(api_base_url, params=params2, headers=HEADERS, timeout=30)
+        response2.raise_for_status()
+        data2 = response2.json()
+        logger.debug(f"Second API response: {data2}")
+
+        # Combine the two responses
+        max_val = None
+        min_val = None
+
+        # Extract the 'UF1跨膜压差' (UF1 TMP) values from the first call, keyed by
+        # time (note: the item name is hard-coded to UF1 here)
+        uf1_diff_values = {}
+        if data1.get("code") == 200 and data1.get("data"):
+            for item in data1["data"]:
+                if item.get("name") == "UF1跨膜压差" and item.get("val") is not None:
+                    ts = item.get("htime_at")
+                    uf1_diff_values[ts] = float(item.get("val"))
+            if uf1_diff_values:
+                logger.info(f"First API query succeeded; extracted {len(uf1_diff_values)} TMP data points")
+
+        # Extract the points where 'UF1控制字' (UF1 control word) equals 26 and match them by time
+        if data2.get("code") == 200 and data2.get("data"):
+            control_26_values = []
+            for item in data2["data"]:
+                if item.get("name") == "UF1控制字" and item.get("val") == '26':
+                    ts = item.get("htime_at")
+                    # A matching TMP value exists in the first call's data
+                    if ts in uf1_diff_values:
+                        control_26_values.append(uf1_diff_values[ts])
+
+            if control_26_values:
+                logger.info("Found data points with control word 26; merging TMP data")
+                max_val = max(control_26_values)
+                min_val = min(control_26_values)
+                # Apply a lower bound to the minimum TMP
+                if min_val < 0.01:
+                    min_val = 0.01
+                logger.info(f"With control word 26: max TMP={max_val}, min TMP={min_val}")
+
+        if max_val is not None and min_val is not None:
+            logger.info(f"API query succeeded: max TMP={max_val}, min TMP={min_val}")
+            return max_val, min_val
+        else:
+            logger.warning("No valid TMP data found for control word 26")
+            return None, None
+
+    except requests.exceptions.RequestException as e:
+        logger.error(f"API request error: {e}")
+        return None, None
+    except (json.JSONDecodeError, ValueError, KeyError) as e:
+        logger.error(f"API response parsing error: {e}")
+        return None, None
+    except Exception as e:
+        logger.error(f"Unknown API query error: {e}")
+        return None, None
+
+
+def generate_md5_signature(record_data, secret, timestamp):
+    """
+    Generate the MD5 signature for a PLC request:
+    MD5(record_data + secret + timestamp), as uppercase hex.
+    """
+    cal_str = f"{record_data}{secret}{timestamp}"
+    return hashlib.md5(cal_str.encode('utf-8')).hexdigest().upper()
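+
+# Illustrative check of the signing scheme (hypothetical values): for
+#   record_data = '[{"project_id": 92, ...}]', secret = 'abc', timestamp = 1700000000
+# the signed string is the plain concatenation record_data + secret + timestamp,
+# and its uppercase hex MD5 digest is sent as the `sign` query parameter
+# together with the same `timestamp` (see send_plc_update below), so the
+# server can recompute and compare it.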
+
+
+def send_plc_update(device_name, item, old_value, new_value, command_type):
+    """
+    Send a parameter update command to the PLC.
+
+    Args:
+        device_name: device name
+        item: parameter item name
+        old_value: old value
+        new_value: new value
+        command_type: command type
+
+    Returns:
+        True if the command was sent successfully
+    """
+    # Build the signature and request payload
+    timestamp = int(time.time())  # current timestamp
+    record_obj = {
+        "project_id": PROJECT_ID_FOR_CALLBACK,  # project ID
+        "item": item,  # parameter item name
+        "old_value": old_value,  # old value
+        "new_value": new_value,  # new value
+        "command_type": command_type  # command type
+    }
+    record_data = json.dumps([record_obj])  # data to sign
+    signature = generate_md5_signature(record_data, SCADA_SECRET, timestamp)  # signature
+    url = f"{PLC_URL}?sign={signature}&timestamp={timestamp}"  # request URL
+    payload = [record_obj]
+
+    logger.info(f"[{device_name}] PLC command: {item} from {old_value} to {new_value}")
+    logger.debug(f"[{device_name}] signed data: {record_data}")
+    logger.debug(f"[{device_name}] signature: {signature}")
+
+    # Retry loop
+    max_retries, retry_interval = 3, 60  # attempts / seconds between attempts
+    for attempt in range(1, max_retries + 1):
+        try:
+            logger.info(f"[{device_name}] sending PLC command, attempt {attempt}/{max_retries}")
+            response = requests.post(url, json=payload, timeout=15)
+            response_json = response.json()
+            if response_json.get('code') == 200:
+                logger.info(f"[{device_name}] PLC command sent, response {response_json}")
+                return True
+            else:
+                logger.error(f"[{device_name}] PLC command failed: {response_json.get('msg', 'unknown error')}")
+        except requests.exceptions.RequestException as e:
+            logger.error(f"[{device_name}] PLC command network error: {e}")
+        except Exception as e:
+            logger.error(f"[{device_name}] PLC command unknown error: {e}")
+
+        if attempt < max_retries:
+            logger.info(f"[{device_name}] retrying in {retry_interval} s")
+            time.sleep(retry_interval)
+
+    logger.error(f"[{device_name}] PLC command failed after the maximum number of retries")
+    return False
+
+
+def send_decision_to_callback(type_name, **kwargs):
+    """
+    Send the decision result to the callback endpoint.
+
+    Args:
+        type_name: device type name
+        **kwargs: decision result payload
+
+    Returns:
+        use_model status: 1 = model enabled, 0 = model disabled, None = send failed
+    """
+    payload = {"list": [{"type": type_name, "project_id": PROJECT_ID_FOR_CALLBACK, **kwargs}]}
+
+    logger.info(f"[{type_name}] sending decision data\n{json.dumps(payload, indent=2, ensure_ascii=False)}")
+
+    max_retries, retry_interval = 3, 60  # attempts / seconds between attempts
+    for attempt in range(1, max_retries + 1):
+        try:
+            logger.info(f"[{type_name}] sending callback, attempt {attempt}/{max_retries}")
+            response = requests.post(CALLBACK_URL, headers=HEADERS, json=payload, timeout=15)
+            response.raise_for_status()
+            response_json = response.json()
+            logger.info(f"[{type_name}] callback sent, response {response.text}")
+
+            # The returned data field carries the use_model status (1 = on, 0 = off)
+            use_model_status = response_json.get('data')
+            logger.info(f"[{type_name}] server returned use_model status: {use_model_status}")
+            return use_model_status
+        except requests.exceptions.RequestException as e:
+            logger.error(f"[{type_name}] callback send failed: {e}")
+        except (json.JSONDecodeError, ValueError) as e:
+            logger.error(f"[{type_name}] response parsing failed: {e}")
+
+        if attempt < max_retries:
+            logger.info(f"[{type_name}] retrying in {retry_interval} s")
+            time.sleep(retry_interval)
+
+    logger.error(f"[{type_name}] callback send failed after the maximum number of retries")
+    return None
+
+
+def get_device_value(payload, device_name):
+    """
+    Fetch the current value of a device data item from the API.
+
+    Args:
+        payload: request payload
+        device_name: device name
+
+    Returns:
+        the value as float, or None
+    """
+    try:
+        response = requests.post(API_URL, headers=HEADERS, json=[payload], timeout=10)
+        response.raise_for_status()
+        api_response = response.json()
+        if api_response.get("code") == 200 and api_response.get("data"):
+            val_str = api_response["data"][0].get("val")
+            if val_str is not None:
+                return float(val_str)
+        else:
+            logger.error(f"[{device_name}] failed to fetch {payload['deviceItems']}: {api_response.get('msg', 'unknown error')}")
+    except requests.exceptions.RequestException as e:
+        logger.error(f"[{device_name}] API network error for {payload['deviceItems']}: {e}")
+    except (json.JSONDecodeError, ValueError, IndexError) as e:
+        logger.error(f"[{device_name}] API parsing error for {payload['deviceItems']}: {e}")
+    return None
+
+
+# 设备监控主循环
+
+def monitor_device(device):
+    """
+    单个设备的监控循环
+    
+    完整流程:
+    1. 等待触发条件
+    2. 收集稳定数据
+    3. 执行决策计算
+    4. 发送控制指令
+    5. 等待重置信号
+    
+    参数:
+        device: 设备配置字典
+    """
+    name = device["name"]
+    threading.current_thread().name = name
+    logger.info("监控线程启动")
+
+    # 加载设备历史状态
+    device_state = device_states.get(name, {})  # 设备状态 
+    model_prev_L_s = device_state.get('model_prev_L_s')  # 过滤时间 上一轮
+    model_prev_t_bw_s = device_state.get('model_prev_t_bw_s')  # 反洗时间 上一轮
+    last_cycle_end_time_str = device_state.get('last_cycle_end_time')  # 上次运行结束时间
+
+    # 解析上次运行结束时间
+    last_cycle_end_time = None  # 上次运行结束时间
+    if last_cycle_end_time_str:
+        try:
+            last_cycle_end_time = datetime.strptime(last_cycle_end_time_str, DATETIME_FORMAT)  # 上次运行结束时间
+            logger.info(f"历史状态加载成功,上次运行时间 {last_cycle_end_time.strftime(DATETIME_FORMAT)}")
+        except ValueError:
+            logger.warning(f"时间戳解析失败 {last_cycle_end_time_str}")
+    else:
+        logger.info("首次运行,无历史状态")
+
+    # Main loop
+    while True:
+        try:
+            # Phase 1: wait for the trigger condition (control word == 95)
+            logger.info(f"Waiting for trigger; control word must equal {TRIGGER_VALUE}")
+            while True:
+                control_value = get_device_value(device["control_payload"], name)  # control word
+                if control_value is not None and int(control_value) == TRIGGER_VALUE:  # control word equals the trigger value 95
+                    logger.info("Trigger condition met; waiting for control word to become 26")
+                    break
+                time.sleep(POLL_INTERVAL)
+
+            # Phase 1.5: wait for the control word to become 26
+            logger.info("Waiting for control word to become 26")
+            while True:
+                control_value = get_device_value(device["control_payload"], name)  # control word
+                if control_value is not None and int(control_value) == 26:  # control word equals 26
+                    logger.info("Control word is 26; starting the 10-minute data collection")
+                    break
+                time.sleep(POLL_INTERVAL)
+
+            # Phase 2: collect data for 10 minutes and compute the average
+            logger.info("Starting 10-minute TMP data collection")
+            collected_values = []
+            start_collection_time = datetime.now()
+            collection_duration = timedelta(minutes=10)
+            
+            # Log counter: print once every 60 samples to avoid flooding the log
+            log_interval = 60
+            
+            while datetime.now() - start_collection_time < collection_duration:
+                current_value = get_device_value(device["target_payload"], name)  # current value
+                control_value = get_device_value(device["control_payload"], name)  # check the control word
+                
+                # Verify the control word is still 26
+                if control_value is not None and int(control_value) != 26:
+                    logger.warning(f"Control word changed during data collection: {control_value}; stopping collection")
+                    # If the control word became 95, the system was reset and monitoring must restart
+                    if int(control_value) == TRIGGER_VALUE:
+                        logger.info("Control word is 95; system reset, restarting monitoring")
+                        break
+                    else:
+                        logger.info("Control word changed to another value; waiting for reset")
+                        break
+                    
+                if current_value is not None:
+                    collected_values.append(current_value)
+                    # Log the first sample and then every 60th one to keep the log small
+                    if len(collected_values) == 1 or len(collected_values) % log_interval == 0:
+                        logger.info(f"Collected TMP value {current_value:.4f}; {len(collected_values)} data points so far")
+                time.sleep(POLL_INTERVAL)
+            
+            if not collected_values:
+                logger.warning("No valid data collected within 10 minutes; skipping this round")
+                # Check the control word; if it is already 95, start a new round immediately
+                control_value = get_device_value(device["control_payload"], name)
+                if control_value is not None and int(control_value) == TRIGGER_VALUE:
+                    logger.info("Control word is already 95; starting a new round")
+                    continue
+                else:
+                    # Wait for the control word to reset before continuing
+                    logger.info("Waiting for the control word to reset...")
+                    time.sleep(10)
+                    continue
+
+            # Phase 3: decision computation
+            logger.info(f"Data collection finished; {len(collected_values)} data points collected, starting decision computation")
+            if collected_values:
+                # Use the average as the representative value
+                average_value = sum(collected_values) / len(collected_values)
+                logger.info(f"Average TMP {average_value:.4f}")
+
+                # Determine the query window for historical data
+                current_decision_time = datetime.now()
+                start_query_time = last_cycle_end_time if last_cycle_end_time else current_decision_time - timedelta(hours=48)
+                _word_controldevice = device["control_payload"]["deviceItems"]
+
+                # Query historical extremes
+                max_tmp, min_tmp = get_tmp_extremes(device["press_pv_item"], start_query_time, current_decision_time, _word_controldevice)
+
+                # Ask the DQN model for a decision
+                logger.info("Calling the DQN decision model")
+                uf_bw_dict = run_uf_DQN_decide(uf_params, average_value)
+                logger.info(f"Model decision result {uf_bw_dict}")
+
+                # Read the current PLC parameters
+                prod_time = get_device_value(device["production_time_payload"], name) or 3800  # production time, default 3800
+                bw_time = get_device_value(device["backwashing_payload"], name) or 100  # backwash time, default 100
+                bw_per_ceb = get_device_value(device["ceb_payload"], name) or 40  # backwashes per CEB, default 40
+
+                # Generate PLC instructions
+                L_s, t_bw_s = generate_plc_instructions(
+                    prod_time, bw_time,  # current production time / backwash time
+                    model_prev_L_s, model_prev_t_bw_s,  # previous-round filtration / backwash time
+                    uf_bw_dict["L_s"], uf_bw_dict["t_bw_s"]  # suggested filtration / backwash time
+                )
+                
+                # Compute operating metrics
+                logger.info(f"Computing operating metrics TMP={average_value} L_s={L_s} t_bw_s={t_bw_s}")
+                metrics = calc_uf_cycle_metrics(uf_params, average_value, max_tmp, min_tmp, L_s, t_bw_s)
+                ceb_backwash_frequency = int(metrics["k_bw_per_ceb"])
+                
+                # Send the decision result and read the use_model flag returned by the server
+                use_model_status = send_decision_to_callback(
+                    type_name=name,  # device name
+                    water_production_time=int(L_s),  # filtration time
+                    physical_backwash=int(t_bw_s),  # backwash time
+                    ceb_backwash_frequency=ceb_backwash_frequency,  # chemical backwash frequency
+                    duration_system=int(prod_time),  # current system production time
+                    tmp_action=average_value,  # TMP action value
+                    recovery_rate=metrics["recovery"],  # recovery rate
+                    ton_water_energy_kWh=metrics['ton_water_energy_kWh_per_m3'],  # energy per ton of water
+                    max_permeability=metrics['max_permeability'],  # peak permeability
+                    daily_prod_time_h=metrics['daily_prod_time_h'],  # average daily production time
+                    ctime=current_decision_time.strftime(DATETIME_FORMAT)  # timestamp
+                )
+
+                # Decide whether to push PLC instructions based on the returned use_model flag
+                if use_model_status == 1:
+                    logger.info("Model switch is on; checking PLC instructions")
+                    
+                    # Record the current PLC values and the model's decision values
+                    current_plc_values = {
+                        'prod_time': int(prod_time),
+                        'bw_time': int(bw_time),
+                        'bw_per_ceb': int(bw_per_ceb)
+                    }
+                    model_decision_values = {
+                        'L_s': int(L_s),
+                        't_bw_s': int(t_bw_s),
+                        'ceb_frequency': int(ceb_backwash_frequency)
+                    }
+                    
+                    logger.info(f"Current PLC values: production time={current_plc_values['prod_time']}, backwash time={current_plc_values['bw_time']}, CEB count={current_plc_values['bw_per_ceb']}")
+                    logger.info(f"Model decision values: L_s={model_decision_values['L_s']}, t_bw_s={model_decision_values['t_bw_s']}, ceb_frequency={model_decision_values['ceb_frequency']}")
+                    
+                    # Check each parameter and dispatch an instruction only when it changed
+                    
+                    # Production time
+                    if current_plc_values['prod_time'] != model_decision_values['L_s']:
+                        logger.info(f"Production time needs updating: {current_plc_values['prod_time']} -> {model_decision_values['L_s']}")
+                        send_plc_update(name, device["production_time_payload"]["deviceItems"], str(prod_time), str(model_decision_values['L_s']), 1)
+                    else:
+                        logger.info(f"Production time unchanged: {current_plc_values['prod_time']}")
+                    
+                    # Backwash time
+                    if current_plc_values['bw_time'] != model_decision_values['t_bw_s']:
+                        logger.info(f"Backwash time needs updating: {current_plc_values['bw_time']} -> {model_decision_values['t_bw_s']}")
+                        send_plc_update(name, device["backwashing_payload"]["deviceItems"], str(bw_time), str(model_decision_values['t_bw_s']), 4)
+                    else:
+                        logger.info(f"Backwash time unchanged: {current_plc_values['bw_time']}")
+                    
+                    # CEB count
+                    if current_plc_values['bw_per_ceb'] != model_decision_values['ceb_frequency']:
+                        logger.info(f"CEB count needs updating: {current_plc_values['bw_per_ceb']} -> {model_decision_values['ceb_frequency']}")
+                        send_plc_update(name, device["ceb_payload"]["deviceItems"], str(bw_per_ceb), str(model_decision_values['ceb_frequency']), 2)
+                    else:
+                        logger.info(f"CEB count unchanged: {current_plc_values['bw_per_ceb']}")
+
+                elif use_model_status == 0:
+                    logger.info("Server returned use_model=0; model switch is off, skipping PLC instructions")
+                else:
+                    logger.warning("Callback failed; cannot determine use_model status, skipping PLC instructions")
+
+                # Persist the run state
+                model_prev_L_s = L_s  # filtration time for the next round's baseline
+                model_prev_t_bw_s = t_bw_s  # backwash time for the next round's baseline
+                last_cycle_end_time = current_decision_time  # end time of this cycle
+                
+                # Read the configured TMP history length
+                current_config = get_current_config()
+                tmp_history_count = current_config.get('system', {}).get('tmp_history_count', 5)
+                
+                # Re-read the device state from the in-memory cache to get the latest history
+                current_device_state = device_states.get(name, {})
+                recent_tmp_values = current_device_state.get('recent_tmp_values', [])
+                recent_tmp_values.append(round(average_value, 4))
+                # Keep only the most recent N entries
+                recent_tmp_values = recent_tmp_values[-tmp_history_count:]
+
+                state_to_save = {
+                    'model_prev_L_s': model_prev_L_s,
+                    'model_prev_t_bw_s': model_prev_t_bw_s,
+                    'last_cycle_end_time': last_cycle_end_time.strftime(DATETIME_FORMAT),
+                    'recent_tmp_values': recent_tmp_values  # most recent N TMP averages
+                }
+                save_device_state(name, state_to_save)
+                logger.info(f"State saved; next query window starts at {last_cycle_end_time.strftime(DATETIME_FORMAT)}")
+                logger.info(f"Most recent {tmp_history_count} TMP records: {recent_tmp_values}")
+
+            # Phase 4: wait for reset
+            logger.info(f"Waiting for reset; control word must equal {TRIGGER_VALUE} again")
+            # Wait briefly so a new round does not start immediately
+            time.sleep(5)
+            while True:
+                control_value = get_device_value(device["control_payload"], name)  # control word
+                if control_value is not None and int(control_value) == TRIGGER_VALUE:  # control word equals the trigger value
+                    logger.info("Full cycle finished; starting a new round")
+                    break
+                time.sleep(POLL_INTERVAL)
+
+            logger.info(f"{name} round complete\n")
+
+        except Exception as e:
+            logger.critical(f"Monitoring loop error {e}", exc_info=True)
+            logger.info("Retrying in 60 seconds")
+            time.sleep(60)
+
+
+# Program entry point
+
+def main():
+    """
+    Main function
+    
+    Responsibilities:
+    1. Load persisted device states
+    2. Start an independent monitoring thread for each device
+    3. Keep the main thread alive
+    """
+    logger.info("========================================")
+    logger.info("UF parallel monitoring service starting")
+    logger.info("========================================")
+
+    # Load persisted device states
+    load_device_states()
+
+    # Create one monitoring thread per device
+    threads = []
+    for device_config in DEVICE_SEQUENCE:
+        thread = threading.Thread(target=monitor_device, args=(device_config,), daemon=True)
+        threads.append(thread)
+        thread.start()
+        logger.info(f"Monitoring thread for device {device_config['name']} started")
+
+    # Keep the main thread alive
+    try:
+        while any(t.is_alive() for t in threads):
+            time.sleep(1)
+    except KeyboardInterrupt:
+        logger.info("Interrupt received; exiting")
+
+
+def test_get_tmp_extremes():
+    """
+    Test the API call made by get_tmp_extremes
+    """
+    print("=" * 50)
+    print("Testing the get_tmp_extremes API call")
+    print("=" * 50)
+    
+    # Test parameters
+    test_item_name = "C.M.UF1_DB@press_PV"  # data item under test
+    test_word_control = "C.M.UF1_DB@word_control"  # control-word field under test
+    
+    # Query window: the last 24 hours
+    end_time = datetime.now()
+    start_time = end_time - timedelta(hours=24)
+    
+    print("Test parameters:")
+    print(f"  Data item: {test_item_name}")
+    print(f"  Control field: {test_word_control}")
+    print(f"  Start time: {start_time.strftime(DATETIME_FORMAT)}")
+    print(f"  End time: {end_time.strftime(DATETIME_FORMAT)}")
+    print()
+    
+    try:
+        # Call the function
+        max_val, min_val = get_tmp_extremes(test_item_name, start_time, end_time, test_word_control)
+        
+        print("Test result:")
+        if max_val is not None and min_val is not None:
+            print("  API call succeeded")
+            print(f"  Maximum: {max_val}")
+            print(f"  Minimum: {min_val}")
+        else:
+            print("  API call failed or returned no valid data")
+            print(f"  Maximum: {max_val}")
+            print(f"  Minimum: {min_val}")
+            
+    except Exception as e:
+        print(f"  Exception during test: {e}")
+    
+    print("=" * 50)
+
+
+if __name__ == "__main__":
+    # Run the test case
+    # test_get_tmp_extremes()
+    
+    # Run the main program
+    main()
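+
+# For reference, after one decision round the entry persisted per device in
+# device_states.json has this shape (illustrative values; the keys match
+# state_to_save in monitor_device above):
+#
+#   {
+#     "UF1": {
+#       "model_prev_L_s": 4100.0,
+#       "model_prev_t_bw_s": 55.0,
+#       "last_cycle_end_time": "2025-10-17 11:42:20",
+#       "recent_tmp_values": [0.0412, 0.0418, 0.0409]
+#     }
+#   }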

+ 0 - 0
models/uf-rl/超滤开发/monitor_service.log


+ 120 - 0
models/uf-rl/超滤开发/plc_test_dry_run.py

@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+PLC instruction dry-run tool
+Only prints the request details without sending anything; for debugging and verification
+"""
+
+import json
+import hashlib
+import time
+
+
+def load_config(config_file='config.json'):
+    """Load the config file"""
+    with open(config_file, 'r', encoding='utf-8') as f:
+        return json.load(f)
+
+
+def generate_md5_signature(record_data, secret, timestamp):
+    """Generate the MD5 signature"""
+    cal_str = f"{record_data}{secret}{timestamp}"
+    cal_md5 = hashlib.md5(cal_str.encode('utf-8')).hexdigest()
+    return cal_md5.upper()
+
+
+def prepare_plc_request(device_name, item, old_value, new_value, command_type):
+    """
+    """
+    Prepare the PLC request parameters
+    
+    Args:
+        device_name: device name
+        item: parameter item name
+        old_value: current value
+        new_value: target value
+        command_type: command type
+        
+    Returns:
+        dict with the request details
+    """
+    config = load_config()
+    
+    PLC_URL = config['api']['base_url'] + config['api']['plc_endpoint']
+    PROJECT_ID = config['scada']['project_id']
+    SCADA_SECRET = config['scada']['secret']
+    
+    timestamp = int(time.time())
+    
+    record_dict = {
+        "project_id": PROJECT_ID,
+        "item": item,
+        "old_value": old_value,
+        "new_value": new_value,
+        "command_type": command_type
+    }
+    record_data = json.dumps(record_dict, separators=(',', ':'))
+    
+    signature = generate_md5_signature(record_data, SCADA_SECRET, timestamp)
+    full_url = f"{PLC_URL}?sign={signature}&timestamp={timestamp}"
+    payload = [record_dict]
+    
+    return {
+        'url': full_url,
+        'payload': payload,
+        'signature_data': record_data,
+        'signature': signature,
+        'timestamp': timestamp,
+        'secret': SCADA_SECRET
+    }
+
+if __name__ == "__main__":
+    print("=== PLC instruction test - dry run ===")
+    print()
+    
+    # Test parameters
+    device_name = "UF2"
+    item = "C.M.UF2_DB@time_production"
+    old_value = "3800"
+    new_value = "3801"
+    command_type = 1
+    
+    # Prepare the request
+    request_info = prepare_plc_request(device_name, item, old_value, new_value, command_type)
+    
+    print("📋 Test scenario:")
+    print(f"   Device: {device_name}")
+    print(f"   Item: {item}")
+    print(f"   Current value: {old_value}")
+    print(f"   Target value: {new_value}")
+    print(f"   Command type: {command_type}")
+    print()
+    
+    print("🔧 Request details:")
+    print(f"   Full URL: {request_info['url']}")
+    print()
+    
+    print("📝 Request headers:")
+    print("   Content-Type: application/json")
+    print()
+    
+    print("📦 Request body:")
+    print(json.dumps(request_info['payload'], indent=4, ensure_ascii=False))
+    print()
+    
+    print("🔐 Signature computation:")
+    print(f"   SCADA secret: {request_info['secret']}")
+    print(f"   Timestamp: {request_info['timestamp']}")
+    print(f"   Data to sign: {request_info['signature_data']}")
+    print(f"   Concatenated string: {request_info['signature_data']}{request_info['secret']}{request_info['timestamp']}")
+    print(f"   MD5 signature: {request_info['signature']}")
+    print()
+    
+    print("✨ curl command:")
+    curl_cmd = f"""curl -X POST '{request_info['url']}' \\
+  -H 'Content-Type: application/json' \\
+  -d '{json.dumps(request_info['payload'], separators=(',', ':'), ensure_ascii=False)}'"""
+    print(curl_cmd)
+    print()
+    
+    print("🚀 This is the complete request that would be sent to the PLC system!")
+    print("   If it looks correct, run test_plc_update.py to actually send it.")
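+
+# To cross-check the signing scheme by hand: the signature is the uppercase
+# MD5 of the compact JSON record, the SCADA secret and the Unix timestamp
+# concatenated in that order. A minimal sketch (hypothetical secret and
+# timestamp values):
+#
+#   import hashlib
+#   record = '{"project_id":1,"item":"C.M.UF2_DB@time_production","old_value":"3800","new_value":"3801","command_type":1}'
+#   secret, ts = "my-secret", 1760672541
+#   sign = hashlib.md5(f"{record}{secret}{ts}".encode("utf-8")).hexdigest().upper()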

+ 18 - 0
models/uf-rl/超滤开发/requirements.txt

@@ -0,0 +1,18 @@
+# UF reinforcement-learning decision system - dependencies
+
+# Scientific computing
+numpy>=1.23.0
+
+# Deep learning framework
+torch>=2.0.0
+
+# Reinforcement learning framework
+gymnasium>=1.2.0
+stable-baselines3>=2.6.0
+
+# Database connector
+pymysql>=1.0.0
+
+# HTTP requests
+requests>=2.28.0
+

+ 73 - 0
models/uf-rl/超滤开发/save_uf_models.py

@@ -0,0 +1,73 @@
+import torch
+import numpy as np
+
+
+class TMPIncreaseModel(torch.nn.Module):
+    """
+    Transmembrane pressure (TMP) rise model
+    
+    Computes the TMP increase during the filtration phase
+    """
+    def __init__(self):
+        super().__init__()
+    
+    def forward(self, p, L_h):
+        """
+        Compute the TMP increase
+        
+        Args:
+            p: system parameter object
+            L_h: filtration duration (hours)
+            
+        Returns:
+            TMP increase
+        """
+        return float(p.alpha * (p.q_UF ** p.belta) * L_h)
+
+
+class TMPDecreaseModel(torch.nn.Module):
+    """
+    Transmembrane pressure (TMP) recovery model
+    
+    Computes the fraction of TMP recovered during the backwash phase
+    """
+    def __init__(self):
+        super().__init__()
+    
+    def forward(self, p, L_s, t_bw_s):
+        """
+        Compute the backwash recovery fraction
+        
+        Args:
+            p: system parameter object
+            L_s: filtration duration (seconds)
+            t_bw_s: backwash duration (seconds)
+            
+        Returns:
+            TMP recovery fraction (between 0 and 1)
+        """
+        L = max(float(L_s), 1.0)
+        t = max(float(t_bw_s), 1e-6)
+        
+        # Upper bound on the recovery fraction (decreases with filtration duration)
+        upper_L = p.phi_bw_min + (p.phi_bw_max - p.phi_bw_min) * np.exp(-L / p.L_ref_s)
+        
+        # Time-gain factor (a longer backwash recovers more)
+        time_gain = 1.0 - np.exp(-(t / p.tau_bw_s) ** p.gamma_t)
+        
+        # Combined recovery fraction
+        phi = upper_L * time_gain
+        
+        return float(np.clip(phi, 0.0, 0.999))
+
+
+if __name__ == "__main__":
+    # Create model instances
+    model_fp = TMPIncreaseModel()
+    model_bw = TMPDecreaseModel()
+
+    # Save the model parameters
+    torch.save(model_fp.state_dict(), "uf_fp.pth")
+    torch.save(model_bw.state_dict(), "uf_bw.pth")
+
+    print("Models saved: uf_fp.pth, uf_bw.pth")
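+
+# The counterpart load path, as used by DQN_env.py. Both modules are
+# parameter-free, so the saved state dicts are empty; saving and loading
+# simply keeps the workflow uniform with trainable models:
+#
+#   model_fp = TMPIncreaseModel()
+#   model_fp.load_state_dict(torch.load("uf_fp.pth"))
+#   model_fp.eval()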

+ 393 - 0
models/uf-rl/超滤开发/test_callback.py

@@ -0,0 +1,393 @@
+"""
+Test script for send_decision_to_callback
+
+Covers:
+1. Sending a normal decision payload
+2. Different parameter combinations
+3. Simulating a realistic usage scenario
+4. PLC instruction dispatch (when use_model_status=1)
+"""
+
+import json
+from datetime import datetime
+from loop_main import send_decision_to_callback, send_plc_update, DATETIME_FORMAT, load_config, get_device_value
+
+def test_basic_callback():
+    """
+    Basic test: send a standard decision payload; if use_model_status=1, also test PLC dispatch
+    """
+    print("=" * 60)
+    print("Test 1: basic callback test")
+    print("=" * 60)
+    
+    # Simulated decision result
+    device_name = "UF1"
+    test_data = {
+        "type_name": device_name,
+        "water_production_time": 4201,  # production time (s)
+        "physical_backwash": 120,  # physical backwash time (s)
+        "ceb_backwash_frequency": 35,  # CEB backwash frequency
+        "duration_system": 3800,  # system production time (s)
+        "tmp_action": 0.045,  # TMP action value
+        "recovery_rate": 0.95,  # recovery rate
+        "ton_water_energy_kWh": 0.28,  # energy per ton of water
+        "max_permeability": 850.5,  # peak permeability
+        "daily_prod_time_h": 22.5,  # average daily production time (h)
+        "ctime": datetime.now().strftime(DATETIME_FORMAT)  # current time
+    }
+    
+    print("Payload:")
+    print(json.dumps(test_data, indent=2, ensure_ascii=False))
+    print()
+    
+    # Call the function
+    try:
+        use_model_status = send_decision_to_callback(**test_data)
+        print(f"Returned use_model status: {use_model_status}")
+        
+        if use_model_status == 1:
+            print("Result: success - model switch is on")
+            print()
+            print("-" * 60)
+            print("Starting PLC dispatch test")
+            print("-" * 60)
+            
+            # Load the config to find the device entry
+            config = load_config()
+            device_config = None
+            for dev in config['devices']:
+                if dev['name'] == device_name:
+                    device_config = dev
+                    break
+            
+            if device_config:
+                # First read the current production time from the PLC
+                print("Reading the current production time from the PLC...")
+                current_prod_time = get_device_value(device_config["production_time_payload"], device_name)
+                
+                if current_prod_time is not None:
+                    old_prod_time = str(int(current_prod_time))
+                    new_prod_time = str(int(current_prod_time) + 1)  # current value + 1
+                    prod_time_item = device_config["production_time_payload"]["deviceItems"]
+                    
+                    print(f"✓ Read OK: current production time = {old_prod_time}")
+                    print()
+                    print("Test parameters:")
+                    print(f"  Device: {device_name}")
+                    print(f"  Item: {prod_time_item}")
+                    print(f"  Old value: {old_prod_time} (read from PLC)")
+                    print(f"  New value: {new_prod_time} (old value + 1)")
+                    print(f"  Command type: 1 (production time)")
+                    print()
+                    
+                    # Send the PLC instruction
+                    print("Sending the PLC instruction...")
+                    plc_result = send_plc_update(
+                        device_name=device_name,
+                        item=prod_time_item,
+                        old_value=old_prod_time,
+                        new_value=new_prod_time,
+                        command_type=1  # command type for production time
+                    )
+                    
+                    if plc_result:
+                        print("✓ PLC instruction sent successfully")
+                    else:
+                        print("✗ PLC instruction dispatch failed")
+                else:
+                    print("✗ Error: cannot read the current production time; skipping the PLC test")
+            else:
+                print(f"⚠ Warning: no configuration found for device {device_name}")
+            
+            print("-" * 60)
+            
+        elif use_model_status == 0:
+            print("Result: success - model switch is off")
+            print("Skipping the PLC dispatch test")
+        else:
+            print("Result: failure - could not obtain use_model status")
+    except Exception as e:
+        print(f"Test exception: {e}")
+        import traceback
+        traceback.print_exc()
+    
+    print("=" * 60)
+    print()
+
+
+def test_minimal_callback():
+    """
+    Minimal-parameter test: pass only the required arguments
+    """
+    print("=" * 60)
+    print("Test 2: minimal-parameter test")
+    print("=" * 60)
+    
+    # Minimal payload
+    test_data = {
+        "type_name": "UF2",
+        "water_production_time": 3500,
+        "physical_backwash": 100,
+        "ctime": datetime.now().strftime(DATETIME_FORMAT)
+    }
+    
+    print("Payload:")
+    print(json.dumps(test_data, indent=2, ensure_ascii=False))
+    print()
+    
+    try:
+        use_model_status = send_decision_to_callback(**test_data)
+        print(f"Returned use_model status: {use_model_status}")
+        if use_model_status == 1:
+            print("Result: success - model switch is on")
+        elif use_model_status == 0:
+            print("Result: success - model switch is off")
+        else:
+            print("Result: failure - could not obtain use_model status")
+    except Exception as e:
+        print(f"Test exception: {e}")
+    
+    print("=" * 60)
+    print()
+
+
+def test_multiple_devices():
+    """
+    Multi-device test: send decision payloads for several devices
+    """
+    print("=" * 60)
+    print("Test 3: multi-device test")
+    print("=" * 60)
+    
+    devices = ["UF1", "UF2", "UF3"]
+    
+    for device_name in devices:
+        print(f"\nSending decision data for device {device_name}...")
+        
+        test_data = {
+            "type_name": device_name,
+            "water_production_time": 3600 + (devices.index(device_name) * 100),
+            "physical_backwash": 100 + (devices.index(device_name) * 10),
+            "ceb_backwash_frequency": 40 - devices.index(device_name),
+            "duration_system": 3800,
+            "tmp_action": 0.040 + (devices.index(device_name) * 0.005),
+            "recovery_rate": 0.95,
+            "ton_water_energy_kWh": 0.25 + (devices.index(device_name) * 0.02),
+            "max_permeability": 850.0,
+            "daily_prod_time_h": 22.0,
+            "ctime": datetime.now().strftime(DATETIME_FORMAT)
+        }
+        
+        try:
+            use_model_status = send_decision_to_callback(**test_data)
+            print(f"{device_name} returned use_model status: {use_model_status}")
+            if use_model_status == 1:
+                print(f"{device_name} result: success - model switch is on")
+            elif use_model_status == 0:
+                print(f"{device_name} result: success - model switch is off")
+            else:
+                print(f"{device_name} result: failure - could not obtain use_model status")
+        except Exception as e:
+            print(f"{device_name} test exception: {e}")
+    
+    print("\n" + "=" * 60)
+    print()
+
+
+def test_custom_scenario():
+    """
+    Custom-scenario test: adjust the parameters as needed
+    """
+    print("=" * 60)
+    print("Test 4: custom-scenario test")
+    print("=" * 60)
+    
+    # Customize the test parameters here
+    test_data = {
+        "type_name": "UF1",  # device name
+        "water_production_time": 4000,  # production time
+        "physical_backwash": 150,  # backwash time
+        "ceb_backwash_frequency": 30,  # CEB frequency
+        "duration_system": 4200,
+        "tmp_action": 0.055,
+        "recovery_rate": 0.92,
+        "ton_water_energy_kWh": 0.30,
+        "max_permeability": 800.0,
+        "daily_prod_time_h": 21.5,
+        "ctime": datetime.now().strftime(DATETIME_FORMAT)
+    }
+    
+    print("Custom test payload:")
+    print(json.dumps(test_data, indent=2, ensure_ascii=False))
+    print()
+    
+    try:
+        use_model_status = send_decision_to_callback(**test_data)
+        print(f"Returned use_model status: {use_model_status}")
+        if use_model_status == 1:
+            print("Result: success - model switch is on")
+        elif use_model_status == 0:
+            print("Result: success - model switch is off")
+        else:
+            print("Result: failure - could not obtain use_model status")
+    except Exception as e:
+        print(f"Test exception: {e}")
+    
+    print("=" * 60)
+    print()
+
+
+def test_plc_update_with_callback():
+    """
+    Test 5: callback + PLC instruction (full workflow)
+    """
+    print("=" * 60)
+    print("Test 5: full callback + PLC instruction workflow")
+    print("=" * 60)
+    
+    device_name = "UF1"
+    
+    # Step 1: send the callback payload
+    test_data = {
+        "type_name": device_name,
+        "water_production_time": 4201,  # production time suggested by the decision
+        "physical_backwash": 120,
+        "ceb_backwash_frequency": 35,
+        "duration_system": 3800,
+        "tmp_action": 0.045,
+        "recovery_rate": 0.95,
+        "ton_water_energy_kWh": 0.28,
+        "max_permeability": 850.5,
+        "daily_prod_time_h": 22.5,
+        "ctime": datetime.now().strftime(DATETIME_FORMAT)
+    }
+    
+    print("Step 1: send decision data to the callback endpoint")
+    print("=" * 60)
+    print(json.dumps(test_data, indent=2, ensure_ascii=False))
+    print()
+    
+    try:
+        # use_model_status = 1
+        use_model_status = send_decision_to_callback(**test_data)
+        print(f"✓ Callback response: use_model_status = {use_model_status}")
+        print()
+        
+        # Step 2: decide whether to dispatch a PLC instruction based on the returned status
+        print("Step 2: decide on PLC dispatch from use_model_status")
+        print("=" * 60)
+        
+        if use_model_status == 1:
+            print("✓ Model switch is on; preparing the PLC instruction")
+            print()
+            
+            # Load the configuration
+            config = load_config()
+            device_config = None
+            for dev in config['devices']:
+                if dev['name'] == device_name:
+                    device_config = dev
+                    break
+            
+            if device_config:
+                print("Step 3: read the current production time from the PLC")
+                print("=" * 60)
+                
+                # First read the current production time from the PLC
+                current_prod_time = get_device_value(device_config["production_time_payload"], device_name)
+                
+                if current_prod_time is not None:
+                    old_prod_time = str(int(current_prod_time))
+                    new_prod_time = str(int(current_prod_time) + 1)  # current value + 1
+                    prod_time_item = device_config["production_time_payload"]["deviceItems"]
+                    
+                    print(f"✓ Read OK: current production time = {old_prod_time} s")
+                    print()
+                    
+                    print("Step 4: dispatch the PLC instruction (update the production time)")
+                    print("=" * 60)
+                    print(f"Device: {device_name}")
+                    print(f"Item: {prod_time_item}")
+                    print(f"Change: {old_prod_time} -> {new_prod_time} (current value + 1)")
+                    print(f"Type: command_type=1 (production time)")
+                    print()
+                    
+                    # Send the PLC instruction
+                    plc_result = send_plc_update(
+                        device_name=device_name,
+                        item=prod_time_item,
+                        old_value=old_prod_time,
+                        new_value=new_prod_time,
+                        command_type=1
+                    )
+                    
+                    print()
+                    if plc_result:
+                        print("✓✓✓ Test passed: full workflow executed")
+                        print("  1. Callback endpoint call succeeded")
+                        print("  2. use_model_status=1")
+                        print("  3. Current value read from the PLC")
+                        print("  4. PLC instruction sent")
+                    else:
+                        print("✗ Test failed: PLC instruction dispatch failed")
+                else:
+                    print("✗ Error: cannot read the current production time")
+                    print("Test failed: could not obtain the current PLC value")
+            else:
+                print(f"✗ Error: no configuration found for device {device_name}")
+        
+        elif use_model_status == 0:
+            print("⚠ Model switch is off; skipping PLC dispatch")
+            print("  This is expected behavior (use_model_status=0)")
+        
+        else:
+            print("✗ Test failed: no valid use_model_status obtained")
+            
+    except Exception as e:
+        print(f"✗ Test exception: {e}")
+        import traceback
+        traceback.print_exc()
+    
+    print()
+    print("=" * 60)
+    print()
+
+
+def main():
+    """
+    Test entry point
+    """
+    print("\n")
+    print("*" * 60)
+    print("send_decision_to_callback + PLC instruction tests")
+    print("*" * 60)
+    print()
+    
+    # Run the individual tests
+    # The basic callback test (which includes the PLC dispatch test) is disabled by default
+    # test_basic_callback()
+    
+    # Uncomment the lines below to run the other tests
+    # test_minimal_callback()
+    # test_multiple_devices()
+    # test_custom_scenario()
+    
+    # Full workflow test (recommended)
+    test_plc_update_with_callback()
+    
+    print("\n")
+    print("*" * 60)
+    print("Tests finished")
+    print("*" * 60)
+    print()
+    print("Notes:")
+    print("  - If use_model_status=1, a PLC instruction is sent automatically")
+    print("  - The test first reads the current production time from the PLC, then sets it to the current value + 1")
+    print("  - This keeps old_value in sync with the actual PLC value and avoids the 'failed to set SCADA variable' error")
+    print("  - Run test_plc_update_with_callback() to see the full workflow")
+    print()
+
+if __name__ == "__main__":
+    main()
+

+ 120 - 0
models/uf-rl/超滤开发/test_plc_update.py

@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+PLC instruction test tool
+Actually sends a PLC parameter-update request, for end-to-end testing
+"""
+
+import requests
+import json
+import hashlib
+import time
+
+
+def load_config(config_file='config.json'):
+    """Load the config file"""
+    with open(config_file, 'r', encoding='utf-8') as f:
+        return json.load(f)
+
+
+def generate_md5_signature(record_data, secret, timestamp):
+    """Generate the MD5 signature"""
+    cal_str = f"{record_data}{secret}{timestamp}"
+    cal_md5 = hashlib.md5(cal_str.encode('utf-8')).hexdigest()
+    return cal_md5.upper()
+
+
+def send_plc_update_test(device_name, item, old_value, new_value, command_type):
+    """
+    Send a test PLC parameter update
+    
+    Args:
+        device_name: device name
+        item: parameter item name
+        old_value: current value
+        new_value: target value
+        command_type: command type
+        
+    Returns:
+        whether the request was sent successfully
+    """
+    config = load_config()
+    
+    PLC_URL = config['api']['base_url'] + config['api']['plc_endpoint']
+    PROJECT_ID = config['scada']['project_id']
+    SCADA_SECRET = config['scada']['secret']
+    
+    timestamp = int(time.time())
+    
+    record_data = json.dumps({
+        "project_id": PROJECT_ID,
+        "item": item,
+        "old_value": old_value,
+        "new_value": new_value,
+        "command_type": command_type
+    }, separators=(',', ':'))
+    
+    signature = generate_md5_signature(record_data, SCADA_SECRET, timestamp)
+    url = f"{PLC_URL}?sign={signature}&timestamp={timestamp}"
+    
+    payload = [{
+        "project_id": PROJECT_ID,
+        "item": item,
+        "old_value": old_value,
+        "new_value": new_value,
+        "command_type": command_type
+    }]
+    
+    print("PLC test")
+    print(f"Device {device_name}")
+    print(f"Item {item}")
+    print(f"Old value {old_value}")
+    print(f"New value {new_value}")
+    print(f"Type {command_type}")
+    print(f"Timestamp {timestamp}")
+    print(f"URL {url}")
+    print(f"Request body {json.dumps(payload, indent=2, ensure_ascii=False)}")
+    print(f"Data to sign {record_data}")
+    print(f"Signature {signature}")
+    print("-" * 50)
+    
+    try:
+        headers = {"Content-Type": "application/json"}
+        response = requests.post(url, headers=headers, json=payload, timeout=15)
+        
+        print(f"Response status code {response.status_code}")
+        print(f"Response headers {dict(response.headers)}")
+        
+        try:
+            response_json = response.json()
+            print(f"Response JSON {json.dumps(response_json, indent=2, ensure_ascii=False)}")
+        except ValueError:
+            print(f"Response text {response.text}")
+            
+        response.raise_for_status()
+        print("Request sent successfully")
+        return True
+        
+    except requests.exceptions.RequestException as e:
+        print(f"Request failed {e}")
+        return False
+    except Exception as e:
+        print(f"Unexpected error {e}")
+        return False
+
+
+if __name__ == "__main__":
+    # Test configuration
+    device_name = "UF2"
+    item = "C.M.UF2_DB@time_production"
+    old_value = "3800"
+    new_value = "3801"
+    command_type = 1
+    
+    print("Starting the PLC instruction test")
+    success = send_plc_update_test(device_name, item, old_value, new_value, command_type)
+    
+    if success:
+        print("\nTest finished; verify the result on the PLC system")
+    else:
+        print("\nTest failed; check the network and configuration")
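+
+# command_type values observed in this codebase (from the dispatch logic in
+# loop_main.py): 1 = production time, 4 = physical backwash time, 2 = CEB
+# count. Treat this mapping as an assumption until confirmed against the
+# SCADA API documentation:
+#
+#   COMMAND_TYPES = {1: "production time", 4: "backwash time", 2: "CEB count"}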

BIN
models/uf-rl/超滤开发/uf_bw.pth


BIN
models/uf-rl/超滤开发/uf_dqn_tensorboard/DQN_lr0.0001_buf2000_bs16_gamma0.95_exp0.6_default_20251017-114220/DQN_1/events.out.tfevents.1760672541.MacBook-Pro-2.local.85900.0


BIN
models/uf-rl/超滤开发/uf_fp.pth


+ 246 - 0
models/uf-rl/超滤训练源码/DQN_decide.py

@@ -0,0 +1,246 @@
+import numpy as np
+from stable_baselines3 import DQN
+from UF_super_RL.DQN_env import UFSuperCycleEnv
+from UF_super_RL.DQN_env import UFParams
+from UF_super_RL.DQN_env import simulate_one_supercycle
+
+# Model path
+MODEL_PATH = "dqn_model.zip"
+
+# Load the model once at import time, for efficiency
+model = DQN.load(MODEL_PATH)
+
+def run_uf_DQN_decide(uf_params, TMP0_value: float):
+    """
+    Single-step decision: take a raw TMP0, predict an action and execute it
+
+    Args:
+        TMP0_value (float): current TMP0 (same unit as the environment)
+
+    Returns:
+        dict: the chosen action, its parameter values, the new state, the reward, etc.
+    """
+    # 1. Instantiate the environment
+    base_params = uf_params
+    env = UFSuperCycleEnv(base_params)
+
+    # 2. Write the input TMP0 into the environment
+    env.current_params.TMP0 = TMP0_value
+
+    # 3. Read the normalized state
+    obs = env._get_obs().reshape(1, -1)
+
+    # 4. Predict the action
+    action, _ = model.predict(obs, deterministic=True)
+
+    # 5. Decode the action into L_s and t_bw_s
+    L_s, t_bw_s = env._get_action_values(action[0])
+
+    # 6. Execute the action in the environment
+    next_obs, reward, terminated, truncated, info = env.step(action[0])
+
+    # 7. Assemble the result
+    result = {
+        "action": int(action[0]),
+        "L_s": float(L_s),
+        "t_bw_s": float(t_bw_s),
+        "next_obs": next_obs,
+        "reward": reward,
+        "terminated": terminated,
+        "truncated": truncated,
+        "info": info
+    }
+
+    # 8. Close the environment
+    env.close()
+
+    return result
+
+def generate_plc_instructions(current_L_s, current_t_bw_s, model_prev_L_s, model_prev_t_bw_s, model_L_s, model_t_bw_s):
+    """
+    Generate PLC instructions from the plant's current values, the model's previous-round
+    decision and the model's current-round decision.
+
+    Added behavior:
+    1. None handling: if the model's previous-round value is None, fall back to the plant's
+       current value; if that is also None, return None and report an error.
+    """
+    # Parameter configuration (unchanged)
+    params = UFParams(
+        L_min_s=3600.0, L_max_s=6000.0, L_step_s=60.0,
+        t_bw_min_s=40.0, t_bw_max_s=60.0, t_bw_step_s=5.0,
+    )
+
+    # Unpack parameters
+    L_step_s = params.L_step_s
+    t_bw_step_s = params.t_bw_step_s
+    L_min_s = params.L_min_s
+    L_max_s = params.L_max_s
+    t_bw_min_s = params.t_bw_min_s
+    t_bw_max_s = params.t_bw_max_s
+    adjustment_threshold = 1.0
+
+    # Handle None values
+    if model_prev_L_s is None:
+        if current_L_s is None:
+            print("Error: both the plant's current filtration time and the model's previous value are None")
+            return None, None
+        else:
+            # Use the plant's current value as the baseline
+            effective_current_L = current_L_s
+            source_L = "plant current value (model's previous value is None)"
+    else:
+        # The model's previous value is available; check the plant's current value
+        if current_L_s is None:
+            effective_current_L = model_prev_L_s
+            source_L = "model's previous value (plant current value is None)"
+        else:
+            effective_current_L = model_prev_L_s
+            source_L = "model's previous value"
+
+    # Apply the same handling to the backwash time
+    if model_prev_t_bw_s is None:
+        if current_t_bw_s is None:
+            print("Error: both the plant's current backwash time and the model's previous value are None")
+            return None, None
+        else:
+            effective_current_t_bw = current_t_bw_s
+            source_t_bw = "plant current value (model's previous value is None)"
+    else:
+        if current_t_bw_s is None:
+            effective_current_t_bw = model_prev_t_bw_s
+            source_t_bw = "model's previous value (plant current value is None)"
+        else:
+            effective_current_t_bw = model_prev_t_bw_s
+            source_t_bw = "model's previous value"
+
+    # Range-check all inputs (non-None values only)
+    # Plant current values (warnings)
+    if current_L_s is not None and not (L_min_s <= current_L_s <= L_max_s):
+        print(f"Warning: current filtration time {current_L_s} s is outside the allowed range [{L_min_s}, {L_max_s}]")
+    if current_t_bw_s is not None and not (t_bw_min_s <= current_t_bw_s <= t_bw_max_s):
+        print(f"Warning: current backwash time {current_t_bw_s} s is outside the allowed range [{t_bw_min_s}, {t_bw_max_s}]")
+
+    # Model previous-round values (warnings)
+    if model_prev_L_s is not None and not (L_min_s <= model_prev_L_s <= L_max_s):
+        print(f"Warning: previous model filtration time {model_prev_L_s} s is outside the allowed range [{L_min_s}, {L_max_s}]")
+    if model_prev_t_bw_s is not None and not (t_bw_min_s <= model_prev_t_bw_s <= t_bw_max_s):
+        print(f"Warning: previous model backwash time {model_prev_t_bw_s} s is outside the allowed range [{t_bw_min_s}, {t_bw_max_s}]")
+
+    # Model current-round values (errors)
+    if model_L_s is None:
+        raise ValueError("Error: the filtration time suggested by the decision model must not be None")
+    elif not (L_min_s <= model_L_s <= L_max_s):
+        raise ValueError(f"Error: the suggested filtration time {model_L_s} s is outside the allowed range [{L_min_s}, {L_max_s}]")
+
+    if model_t_bw_s is None:
+        raise ValueError("Error: the backwash time suggested by the decision model must not be None")
+    elif not (t_bw_min_s <= model_t_bw_s <= t_bw_max_s):
+        raise ValueError(f"Error: the suggested backwash time {model_t_bw_s} s is outside the allowed range [{t_bw_min_s}, {t_bw_max_s}]")
+
+    print(f"Filtration-time baseline: {source_L}, value: {effective_current_L}")
+    print(f"Backwash-time baseline: {source_t_bw}, value: {effective_current_t_bw}")
+
+    # Compute the adjustment relative to the chosen baseline, one step at most
+    L_diff = model_L_s - effective_current_L
+    L_adjustment = 0
+    if abs(L_diff) >= adjustment_threshold * L_step_s:
+        if L_diff >= 0:
+            L_adjustment = L_step_s
+        else:
+            L_adjustment = -L_step_s
+    next_L_s = effective_current_L + L_adjustment
+
+    t_bw_diff = model_t_bw_s - effective_current_t_bw
+    t_bw_adjustment = 0
+    if abs(t_bw_diff) >= adjustment_threshold * t_bw_step_s:
+        if t_bw_diff >= 0:
+            t_bw_adjustment = t_bw_step_s
+        else:
+            t_bw_adjustment = -t_bw_step_s
+    next_t_bw_s = effective_current_t_bw + t_bw_adjustment
+
+    return next_L_s, next_t_bw_s
+
+
+def calc_uf_cycle_metrics(p, TMP0, max_tmp_during_filtration, min_tmp_during_filtration, L_s: float, t_bw_s: float):
+    """
+    Compute the core performance metrics of the UF system
+
+    Args:
+        p (UFParams): UF system parameters
+        TMP0 (float): initial transmembrane pressure
+        max_tmp_during_filtration / min_tmp_during_filtration: plant-measured TMP
+            extremes for the cycle, or None to fall back to the simulated values
+        L_s (float): single filtration duration (seconds)
+        t_bw_s (float): single backwash duration (seconds)
+
+    Returns:
+        dict: {
+            "k_bw_per_ceb": number of small cycles,
+            "ton_water_energy_kWh_per_m3": energy per ton of water,
+            "recovery": recovery rate,
+            "net_delivery_rate_m3ph": net delivery rate (m³/h),
+            "daily_prod_time_h": average daily production time (h/day),
+            "max_permeability": peak permeability over the cycle (LMH/bar)
+        }
+    """
+    # Write the transmembrane pressure into the parameters
+    p.TMP0 = TMP0
+
+    # Simulate a supercycle under these parameters
+    feasible, info = simulate_one_supercycle(p, L_s, t_bw_s)
+
+    # Read the simulated cycle metrics
+    k_bw_per_ceb = info["k_bw_per_ceb"]
+    ton_water_energy_kWh_per_m3 = info["ton_water_energy_kWh_per_m3"]
+    recovery = info["recovery"]
+    net_delivery_rate_m3ph = info["net_delivery_rate_m3ph"]
+    daily_prod_time_h = info["daily_prod_time_h"]
+
+    # Fall back to the simulated TMP extremes when no plant data is supplied
+    if max_tmp_during_filtration is None:
+        max_tmp_during_filtration = info["max_TMP_during_filtration"]
+    if min_tmp_during_filtration is None:
+        min_tmp_during_filtration = info["min_TMP_during_filtration"]
+
+    # Compute the peak permeability
+    max_permeability = 100 * p.q_UF / (128 * 40) / min_tmp_during_filtration
+
+    return {
+        "k_bw_per_ceb": k_bw_per_ceb,
+        "ton_water_energy_kWh_per_m3": ton_water_energy_kWh_per_m3,
+        "recovery": recovery,
+        "net_delivery_rate_m3ph": net_delivery_rate_m3ph,
+        "daily_prod_time_h": daily_prod_time_h,
+        "max_permeability": max_permeability
+    }
+
+
+# ==============================
+# Example usage
+# ==============================
+if __name__ == "__main__":
+    uf_params = UFParams()
+    TMP0 = 0.03  # raw TMP0
+    model_decide_result = run_uf_DQN_decide(uf_params, TMP0)  # query the model for an action
+    model_L_s = model_decide_result['L_s']  # production duration chosen by the model
+    model_t_bw_s = model_decide_result['t_bw_s']  # backwash duration chosen by the model
+
+    current_L_s = 3800
+    current_t_bw_s = 40
+    model_prev_L_s = 4040
+    model_prev_t_bw_s = 60
+    L_s, t_bw_s = generate_plc_instructions(current_L_s, current_t_bw_s, model_prev_L_s, model_prev_t_bw_s, model_L_s, model_t_bw_s)  # build the dispatched instruction
+
+    L_s = 4100
+    t_bw_s = 96
+    max_tmp_during_filtration = 0.050176  # plant-data hook: per-cycle TMP extremes; pass None when no plant data is connected and calc_uf_cycle_metrics() falls back to the simulated extremes
+    min_tmp_during_filtration = 0.012496
+    execution_result = calc_uf_cycle_metrics(uf_params, TMP0, max_tmp_during_filtration, min_tmp_during_filtration, L_s, t_bw_s)
+    print("\n===== Single-step decision result =====")
+    print(f"Action chosen by the model: {model_decide_result['action']}")
+    print(f"Model L_s: {model_L_s} s, model t_bw_s: {model_t_bw_s} s")
+    print(f"Dispatched L_s: {L_s} s, dispatched t_bw_s: {t_bw_s} s")
+    print(f"Backwashes per CEB for this instruction: {execution_result['k_bw_per_ceb']}")
+    print(f"Energy per ton of water: {execution_result['ton_water_energy_kWh_per_m3']}")
+    print(f"Recovery rate: {execution_result['recovery']}")
+    print(f"Average daily production time: {execution_result['daily_prod_time_h']}")
+    print(f"Peak permeability: {execution_result['max_permeability']}")
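+
+# Worked example of the single-step rate limiter in generate_plc_instructions:
+# with a baseline of 3800 s and a model suggestion of 4200 s, the difference
+# (400 s) exceeds one L_step_s (60 s), so the dispatched value moves exactly
+# one step to 3860 s; repeated rounds converge on the suggestion gradually.
+#
+#   generate_plc_instructions(3800, 40, None, None, 4200, 50)  # -> (3860, 45)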

+ 340 - 0
models/uf-rl/超滤训练源码/DQN_env.py

@@ -0,0 +1,340 @@
+import os
+import time
+import random
+import numpy as np
+import gymnasium as gym
+from gymnasium import spaces
+from stable_baselines3 import DQN
+from stable_baselines3.common.monitor import Monitor
+from stable_baselines3.common.vec_env import DummyVecEnv
+from stable_baselines3.common.callbacks import BaseCallback
+from typing import Dict, Tuple, Optional
+import torch
+import torch.nn as nn
+from dataclasses import dataclass, asdict
+from UF_models import TMPIncreaseModel, TMPDecreaseModel  # import the model classes
+import copy
+
+
+# ==== Base operating parameters of the membrane ====
+@dataclass
+class UFParams:
+    # -- Membrane and operating parameters --
+    q_UF: float = 360.0  # filtration feed flow (m^3/h)
+    TMP0: float = 0.03  # initial TMP (MPa)
+    TMP_max: float = 0.06  # hard TMP ceiling (MPa)
+
+    # -- Membrane fouling kinetics --
+    alpha: float = 1e-6  # TMP growth coefficient
+    belta: float = 1.1  # power exponent
+
+    # -- Backwash parameters (fixed) --
+    q_bw_m3ph: float = 1000.0  # physical backwash flow (m^3/h)
+
+    # -- CEB parameters (fixed) --
+    T_ceb_interval_h: float = 48.0  # fixed CEB interval (h)
+    v_ceb_m3: float = 30.0  # CEB water volume (m^3)
+    t_ceb_s: float = 40 * 60.0  # CEB duration (s)
+    phi_ceb: float = 1.0  # CEB removal fraction (simplified: full recovery to TMP0)
+
+    # -- Constraints and convergence --
+    dTMP: float = 0.001  # max residual TMP rise over TMP0 per production run (MPa)
+
+    # -- Search ranges (seconds) --
+    L_min_s: float = 3800.0  # lower bound on filtration duration (s)
+    L_max_s: float = 6000.0  # upper bound on filtration duration (s)
+    t_bw_min_s: float = 40.0  # lower bound on backwash duration (s)
+    t_bw_max_s: float = 60.0  # upper bound on backwash duration (s)
+
+    # -- Physical-backwash recovery function parameters --
+    phi_bw_min: float = 0.7  # minimum backwash removal fraction
+    phi_bw_max: float = 1.0  # maximum backwash removal fraction
+    L_ref_s: float = 4000.0  # time scale of the filtration-duration effect
+    tau_bw_s: float = 20.0  # time scale of the backwash-duration effect
+    gamma_t: float = 1.0  # backwash-duration exponent
+
+    # -- Grid --
+    L_step_s: float = 60.0  # filtration-duration step (s)
+    t_bw_step_s: float = 5.0  # backwash-duration step (s)
+
+    # Multi-objective weights and high-TMP penalty
+    w_rec: float = 0.8  # recovery weight
+    w_rate: float = 0.2  # net-delivery-rate weight
+    w_headroom: float = 0.2  # headroom-penalty weight
+    r_headroom: float = 2.0  # headroom-penalty exponent
+    headroom_hardcap: float = 0.98  # above this ratio the action is rejected outright
+
+# ==== Load the simulation-environment models ====
+# Instantiate the models
+model_fp = TMPIncreaseModel()
+model_bw = TMPDecreaseModel()
+
+# Load parameters
+model_fp.load_state_dict(torch.load("uf_fp.pth"))
+model_bw.load_state_dict(torch.load("uf_bw.pth"))
+
+# Switch to inference mode
+model_fp.eval()
+model_bw.eval()
+
+
+def _delta_tmp(p, L_h: float) -> float:
+    """
+    TMP rise during the filtration phase: delegates to the uf_fp.pth model
+    """
+    return model_fp(p, L_h)
+
+def phi_bw_of(p, L_s: float, t_bw_s: float) -> float:
+    """
+    Backwash removal fraction: delegates to the uf_bw.pth model
+    """
+    return model_bw(p, L_s, t_bw_s)
+
+def _tmp_after_ceb(p, L_s: float, t_bw_s: float) -> float:
+    """
+    TMP after chemical cleaning (CEB); currently restores the initial transmembrane pressure
+    """
+    return p.TMP0
+
+def _v_bw_m3(p, t_bw_s: float) -> float:
+    """
+    Physical backwash water consumption
+    """
+    return float(p.q_bw_m3ph * (float(t_bw_s) / 3600.0))
+
+def simulate_one_supercycle(p: UFParams, L_s: float, t_bw_s: float):
+    """
+    Returns (feasible, metrics dict)
+    - Supports a dynamic number of backwashes per CEB: fixed 48 h interval
+    - Adds average daily production time and energy per ton of water
+    - Also records the minimum TMP
+    """
+    L_h = float(L_s) / 3600.0  # filtration time of one small cycle (h)
+
+    tmp = p.TMP0
+    max_tmp_during_filtration = tmp
+    min_tmp_during_filtration = tmp  # initialize the minimum TMP
+    max_residual_increase = 0.0
+
+    # Total duration of one small cycle (h)
+    t_small_cycle_h = (L_s + t_bw_s) / 3600.0
+
+    # Number of small cycles per supercycle (between CEBs)
+    k_bw_per_ceb = int(np.floor(p.T_ceb_interval_h / t_small_cycle_h))
+    if k_bw_per_ceb < 1:
+        k_bw_per_ceb = 1  # at least one small cycle
+
+    # Lookup table for energy per ton of water
+    energy_lookup = {
+        3600: 0.1034, 3660: 0.1031, 3720: 0.1029, 3780: 0.1026,
+        3840: 0.1023, 3900: 0.1021, 3960: 0.1019, 4020: 0.1017,
+        4080: 0.1015, 4140: 0.1012, 4200: 0.1011
+    }
+
+    for _ in range(k_bw_per_ceb):
+        tmp_run_start = tmp
+
+        # TMP rise during filtration
+        dtmp = _delta_tmp(p, L_h)
+        tmp_peak = tmp_run_start + dtmp
+
+        # Constraint 1: the peak must not exceed the hard ceiling
+        if tmp_peak > p.TMP_max + 1e-12:
+            return False, {"reason": "TMP_max violated during filtration", "TMP_peak": tmp_peak}
+
+        # Update the maximum and minimum TMP
+        if tmp_peak > max_tmp_during_filtration:
+            max_tmp_during_filtration = tmp_peak
+        if tmp_run_start < min_tmp_during_filtration:  # track the minimum TMP at run start
+            min_tmp_during_filtration = tmp_run_start
+
+        # Physical backwash
+        phi = phi_bw_of(p, L_s, t_bw_s)
+        tmp_after_bw = tmp_peak - phi * (tmp_peak - tmp_run_start)
+
+        # Constraint 2: cap on the residual increase per run
+        residual_inc = tmp_after_bw - tmp_run_start
+        if residual_inc > p.dTMP + 1e-12:
+            return False, {
+                "reason": "residual TMP increase after BW exceeded dTMP",
+                "residual_increase": residual_inc,
+                "limit_dTMP": p.dTMP
+            }
+        if residual_inc > max_residual_increase:
+            max_residual_increase = residual_inc
+
+        tmp = tmp_after_bw
+
+    # CEB
+    tmp_after_ceb = p.TMP0
+
+    # Volumes and recovery
+    V_feed_super = k_bw_per_ceb * p.q_UF * L_h
+    V_loss_super = k_bw_per_ceb * _v_bw_m3(p, t_bw_s) + p.v_ceb_m3
+    V_net = max(0.0, V_feed_super - V_loss_super)
+    recovery = max(0.0, V_net / max(V_feed_super, 1e-12))
+
+    # Time and net delivery rate
+    T_super_h = k_bw_per_ceb * (L_s + t_bw_s) / 3600.0 + p.t_ceb_s / 3600.0
+    net_delivery_rate_m3ph = V_net / max(T_super_h, 1e-12)
+
+    # Headroom ratio and hard cap
+    headroom_ratio = max_tmp_during_filtration / max(p.TMP_max, 1e-12)
+    if headroom_ratio > p.headroom_hardcap + 1e-12:
+        return False, {"reason": "headroom hardcap exceeded", "headroom_ratio": headroom_ratio}
+
+    # -- Extra metric 1: average daily production time (h/day) --
+    daily_prod_time_h = k_bw_per_ceb * L_h / T_super_h * 24.0
+
+    # -- Extra metric 2: energy per ton of water (kWh/m³) --
+    closest_L = min(energy_lookup.keys(), key=lambda x: abs(x - L_s))
+    ton_water_energy = energy_lookup[closest_L]
+
+    info = {
+        "recovery": recovery,
+        "V_feed_super_m3": V_feed_super,
+        "V_loss_super_m3": V_loss_super,
+        "V_net_super_m3": V_net,
+        "supercycle_time_h": T_super_h,
+        "net_delivery_rate_m3ph": net_delivery_rate_m3ph,
+        "max_TMP_during_filtration": max_tmp_during_filtration,
+        "min_TMP_during_filtration": min_tmp_during_filtration,  # minimum TMP
+        "max_residual_increase_per_run": max_residual_increase,
+        "phi_bw_effective": phi,
+        "TMP_after_ceb": tmp_after_ceb,
+        "headroom_ratio": headroom_ratio,
+        "daily_prod_time_h": daily_prod_time_h,
+        "ton_water_energy_kWh_per_m3": ton_water_energy,
+        "k_bw_per_ceb": k_bw_per_ceb
+    }
+
+    return True, info
+
+def _score(p: UFParams, rec: dict) -> float:
+    """Composite score: larger is better. Nonlinear amplification widens the gap between good and bad actions."""
+
+    # -- Dimensionless net delivery rate --
+    rate_norm = rec["net_delivery_rate_m3ph"] / max(p.q_UF, 1e-12)
+
+    # -- TMP soft penalty (sigmoid) --
+    tmp_ratio = rec["max_TMP_during_filtration"] / max(p.TMP_max, 1e-12)
+    k = 10.0
+    headroom_penalty = 1.0 / (1.0 + np.exp(-k * (tmp_ratio - 1.0)))
+
+    # -- Base reward (roughly 0.6-0.9) --
+    base_reward = (
+        p.w_rec * rec["recovery"]
+        + p.w_rate * rate_norm
+        - p.w_headroom * headroom_penalty
+    )
+
+    # -- Nonlinear amplification: squared mapping + scaling --
+    # Widens the gap between good and bad actions while capping the magnitude to keep TD errors bounded
+    amplified_reward = (base_reward - 0.5) ** 2 * 5.0
+
+    # -- Optional: keep the sign, so negative outcomes stay distinguishable
+    if base_reward < 0.5:
+        amplified_reward = -amplified_reward
+
+    return amplified_reward
+
+
+class UFSuperCycleEnv(gym.Env):
+    """Ultrafiltration system environment (decisions at supercycle granularity)"""
+
+    metadata = {"render_modes": ["human"]}
+
+    def __init__(self, base_params, max_episode_steps: int = 20):
+        super(UFSuperCycleEnv, self).__init__()
+
+        self.base_params = base_params
+        self.current_params = copy.deepcopy(base_params)
+        self.max_episode_steps = max_episode_steps
+        self.current_step = 0
+
+        # Build the discrete action grid
+        self.L_values = np.arange(
+            self.base_params.L_min_s,
+            self.base_params.L_max_s + self.base_params.L_step_s,
+            self.base_params.L_step_s
+        )
+        self.t_bw_values = np.arange(
+            self.base_params.t_bw_min_s,
+            self.base_params.t_bw_max_s + self.base_params.t_bw_step_s,
+            self.base_params.t_bw_step_s
+        )
+
+        self.num_L = len(self.L_values)
+        self.num_bw = len(self.t_bw_values)
+
+        # A single discrete action space
+        self.action_space = spaces.Discrete(self.num_L * self.num_bw)
+
+        # The state holds TMP0, the previous action (L_s, t_bw_s) and the cycle's peak TMP
+        # All normalization is handled inside _get_obs
+        self.observation_space = spaces.Box(
+            low=np.zeros(4, dtype=np.float32),
+            high=np.ones(4, dtype=np.float32),
+            dtype=np.float32
+        )
+
+        # Initialize the state
+        self.last_action = (self.base_params.L_min_s, self.base_params.t_bw_min_s)
+        self.max_TMP_during_filtration = self.current_params.TMP0
+        self.reset(seed=None)
+
+    def _get_obs(self):
+        TMP0 = self.current_params.TMP0
+        TMP0_norm = (TMP0 - 0.01) / (0.05 - 0.01)
+
+        L_s, t_bw_s = self.last_action
+        L_norm = (L_s - self.base_params.L_min_s) / (self.base_params.L_max_s - self.base_params.L_min_s)
+        t_bw_norm = (t_bw_s - self.base_params.t_bw_min_s) / (self.base_params.t_bw_max_s - self.base_params.t_bw_min_s)
+
+        max_TMP_norm = (self.max_TMP_during_filtration - 0.01) / (0.05 - 0.01)
+
+        return np.array([TMP0_norm, L_norm, t_bw_norm, max_TMP_norm], dtype=np.float32)
+
+    def _get_action_values(self, action):
+        L_idx = action // self.num_bw
+        t_bw_idx = action % self.num_bw
+        return self.L_values[L_idx], self.t_bw_values[t_bw_idx]
+
+    def reset(self, seed=None, options=None):
+        super().reset(seed=seed)
+        self.current_params.TMP0 = np.random.uniform(0.01, 0.03)
+        self.current_step = 0
+        self.last_action = (self.base_params.L_min_s, self.base_params.t_bw_min_s)
+        self.max_TMP_during_filtration = self.current_params.TMP0
+        return self._get_obs(), {}
+
+    def step(self, action):
+        self.current_step += 1
+        L_s, t_bw_s = self._get_action_values(action)
+        L_s = np.clip(L_s, self.base_params.L_min_s, self.base_params.L_max_s)
+        t_bw_s = np.clip(t_bw_s, self.base_params.t_bw_min_s, self.base_params.t_bw_max_s)
+
+        # Simulate one supercycle
+        feasible, info = simulate_one_supercycle(self.current_params, L_s, t_bw_s)
+
+        if feasible:
+            reward = _score(self.current_params, info)
+            self.current_params.TMP0 = info["TMP_after_ceb"]
+            self.max_TMP_during_filtration = info["max_TMP_during_filtration"]
+            terminated = False
+        else:
+            reward = -20
+            terminated = True
+
+        truncated = self.current_step >= self.max_episode_steps
+        self.last_action = (L_s, t_bw_s)
+        next_obs = self._get_obs()
+
+        info["feasible"] = feasible
+        info["step"] = self.current_step
+
+        return next_obs, reward, terminated, truncated, info
+
+
+
+
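+# Minimal usage sketch. Note that uf_fp.pth/uf_bw.pth must be on the current
+# path, since they are loaded at import time:
+#
+#   env = UFSuperCycleEnv(UFParams())
+#   obs, _ = env.reset(seed=0)
+#   obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
+#   print(obs, reward, info.get("recovery"))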

+ 244 - 0
models/uf-rl/超滤训练源码/DQN_train.py

@@ -0,0 +1,244 @@
+import os
+import time
+import random
+import numpy as np
+import torch
+
+import gymnasium as gym
+from gymnasium import spaces
+from stable_baselines3 import DQN
+from stable_baselines3.common.monitor import Monitor
+from stable_baselines3.common.vec_env import DummyVecEnv
+from stable_baselines3.common.callbacks import BaseCallback
+
+from DQN_env import UFParams, UFSuperCycleEnv
+
+
+# ==== Reinforcement-learning hyperparameters ====
+class DQNParams:
+    """
+    DQN hyperparameter definitions
+    Centralizes the model training parameters
+    """
+    # Learning rate: controls the network update step size
+    learning_rate: float = 1e-4
+
+    # Replay buffer size (steps)
+    buffer_size: int = 10000
+
+    # Steps to collect before learning starts
+    learning_starts: int = 200
+
+    # Samples drawn from the replay buffer per update
+    batch_size: int = 32
+
+    # Discount factor; closer to 1 weights long-term reward more heavily
+    gamma: float = 0.95
+
+    # Train once every this many steps
+    train_freq: int = 4
+
+    # Target-network update interval
+    target_update_interval: int = 2000
+
+    # Initial exploration rate ε
+    exploration_initial_eps: float = 1.0
+
+    # Fraction of training over which ε decays from its initial to its final value
+    exploration_fraction: float = 0.3
+
+    # Final exploration rate ε
+    exploration_final_eps: float = 0.02
+
+    # Log remark (distinguishes experiments)
+    remark: str = "default"
+
+class UFEpisodeRecorder:
+    """Records the decisions and outcomes within each episode"""
+
+    def __init__(self):
+        self.episode_data = []
+        self.current_episode = []
+
+    def record_step(self, obs, action, reward, done, info):
+        """Record a single step"""
+        step_data = {
+            "obs": obs.copy(),
+            "action": action.copy(),
+            "reward": reward,
+            "done": done,
+            "info": info.copy() if info else {}
+        }
+        self.current_episode.append(step_data)
+
+        if done:
+            self.episode_data.append(self.current_episode)
+            self.current_episode = []
+
+    def get_episode_stats(self, episode_idx=-1):
+        """Compute statistics for one episode"""
+        if not self.episode_data:
+            return {}
+
+        episode = self.episode_data[episode_idx]
+        total_reward = sum(step["reward"] for step in episode)
+        avg_recovery = np.mean([step["info"].get("recovery", 0) for step in episode if "recovery" in step["info"]])
+        feasible_steps = sum(1 for step in episode if step["info"].get("feasible", False))
+
+        return {
+            "total_reward": total_reward,
+            "avg_recovery": avg_recovery,
+            "feasible_steps": feasible_steps,
+            "total_steps": len(episode)
+        }
+
+
+# ==== RL training callback ====
+class UFTrainingCallback(BaseCallback):
+    """
+    Training callback that records each step into the recorder.
+    1. Does not rely on the environment's internal last_* attributes
+    2. Uses the obs, actions, rewards, dones and infos exposed by the environment interface
+    3. Handles end-of-episode statistics automatically
+    """
+
+    def __init__(self, recorder, verbose=0):
+        super(UFTrainingCallback, self).__init__(verbose)
+        self.recorder = recorder
+
+    def _on_step(self) -> bool:
+        try:
+            new_obs = self.locals.get("new_obs")
+            actions = self.locals.get("actions")
+            rewards = self.locals.get("rewards")
+            dones = self.locals.get("dones")
+            infos = self.locals.get("infos")
+
+            if len(new_obs) > 0:
+                step_obs = new_obs[0]
+                step_action = actions[0] if actions is not None else None
+                step_reward = rewards[0] if rewards is not None else 0.0
+                step_done = dones[0] if dones is not None else False
+                step_info = infos[0] if infos is not None else {}
+
+                # Print the current step
+                if self.verbose:
+                    print(f"[Step {self.num_timesteps}] action={step_action}, reward={step_reward:.3f}, done={step_done}")
+
+                # Record the data
+                self.recorder.record_step(
+                    obs=step_obs,
+                    action=step_action,
+                    reward=step_reward,
+                    done=step_done,
+                    info=step_info,
+                )
+
+        except Exception as e:
+            if self.verbose:
+                print(f"[Callback Error] {e}")
+
+        return True
+
+
+
+
+class DQNTrainer:
+    def __init__(self, env, params, callback=None):
+        self.env = env
+        self.params = params
+        self.callback = callback
+        self.log_dir = self._create_log_dir()
+        self.model = self._create_model()
+
+    def _create_log_dir(self):
+        # Create the training log directory
+        timestamp = time.strftime("%Y%m%d-%H%M%S")
+        log_name = (
+            f"DQN_lr{self.params.learning_rate}_buf{self.params.buffer_size}_bs{self.params.batch_size}"
+            f"_gamma{self.params.gamma}_exp{self.params.exploration_fraction}"
+            f"_{self.params.remark}_{timestamp}"
+        )
+        log_dir = os.path.join("./uf_dqn_tensorboard", log_name)
+        os.makedirs(log_dir, exist_ok=True)
+        return log_dir
+
+    def _create_model(self):
+        return DQN(
+            policy="MlpPolicy",
+            env=self.env,
+            learning_rate=self.params.learning_rate,
+            buffer_size=self.params.buffer_size,
+            learning_starts=self.params.learning_starts,
+            batch_size=self.params.batch_size,
+            gamma=self.params.gamma,
+            train_freq=self.params.train_freq,
+            # Note: params.target_update_interval is overridden here; soft target
+            # updates (tau=0.005) are applied at every step instead
+            target_update_interval=1,
+            tau=0.005,
+            exploration_initial_eps=self.params.exploration_initial_eps,
+            exploration_fraction=self.params.exploration_fraction,
+            exploration_final_eps=self.params.exploration_final_eps,
+            verbose=1,
+            tensorboard_log=self.log_dir
+        )
+
+    def train(self, total_timesteps: int):
+        if self.callback:
+            self.model.learn(total_timesteps=total_timesteps, callback=self.callback)
+        else:
+            self.model.learn(total_timesteps=total_timesteps)
+        print(f"Training finished; logs saved to: {self.log_dir}")
+
+    def save(self, path=None):
+        if path is None:
+            path = os.path.join(self.log_dir, "dqn_model.zip")
+        self.model.save(path)
+        print(f"Model saved to: {path}")
+
+    def load(self, path):
+        self.model = DQN.load(path, env=self.env)
+        print(f"模型已从 {path} 加载")
+
+
+def set_global_seed(seed: int):
+    """固定全局随机种子,保证训练可复现"""
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)  # 如果使用GPU
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+
+def train_uf_rl_agent(params: UFParams, total_timesteps: int = 10000, seed: int = 2025):
+    set_global_seed(seed)
+    recorder = UFEpisodeRecorder()
+    callback = UFTrainingCallback(recorder, verbose=1)
+
+    def make_env():
+        env = UFSuperCycleEnv(params)
+        env = Monitor(env)
+        return env
+
+    env = DummyVecEnv([make_env])
+
+    dqn_params = DQNParams()
+    trainer = DQNTrainer(env, dqn_params, callback=callback)
+    trainer.train(total_timesteps)
+    trainer.save()
+
+    stats = callback.recorder.get_episode_stats()
+    print(f"训练完成 - 总奖励: {stats.get('total_reward', 0):.2f}, 平均回收率: {stats.get('avg_recovery', 0):.3f}")
+
+    return trainer.model
+
+
+# 训练
+if __name__ == "__main__":
+    # 初始化参数
+    params = UFParams()
+
+    # 训练RL代理
+    print("开始训练RL代理...")
+    train_uf_rl_agent(params, total_timesteps=50000)
+

+ 500 - 0
models/uf-rl/超滤训练源码/README.md

@@ -0,0 +1,500 @@
+# UF超滤系统强化学习决策模型训练逻辑说明
+
+## 模型概述
+
+这是一个基于**深度强化学习(DQN)**的超滤系统运行参数优化模型。不同于前两个"预测模型",这个模型的目标是**决策**:在给定当前跨膜压差(TMP)的情况下,自动决定最优的产水时长和反洗时长。
+
+**核心问题**:如何平衡产水量、回收率、能耗和膜寿命?
+
+## 问题背景
+
+### 超滤运行周期
+
+超滤系统运行遵循"小周期"模式:
+```
+[产水L秒] → [反洗t_bw秒] → [产水L秒] → [反洗t_bw秒] → ... → [化学清洗CEB]
+```
+
+- **产水阶段**:过滤原水,TMP逐渐升高(膜污染)
+- **反洗阶段**:反向冲洗,TMP部分恢复
+- **化学清洗(CEB)**:每48小时一次,TMP完全恢复
+
+### 决策难题
+
+**调节杠杆**:
+- `L_s`:单次产水时长(3600-6000秒)
+- `t_bw_s`:单次反洗时长(40-60秒)
+
+**矛盾目标**:
+1. **产水量↑**:希望L_s长、t_bw_s短(多产水、少反洗)
+2. **回收率↑**:希望t_bw_s短(减少反洗水耗)
+3. **膜保护↑**:希望L_s短、t_bw_s长(频繁反洗、TMP不升太高)
+4. **能耗↓**:产水时间越长,单位吨水的泵能耗越低
+
+**传统方法**:人工经验+固定参数,难以在复杂约束下找到最优解  
+**强化学习方法**:让AI自己探索,学习在不同TMP下的最佳决策
+
+## 核心思路:强化学习框架
+
+### 1. 强化学习是什么?
+
+把决策问题想象成玩游戏:
+```
+游戏状态(TMP)→ AI选择动作(L_s, t_bw_s)→ 执行动作 → 获得奖励(回收率、净供水率)→ 新状态(TMP更新)
+```
+
+AI通过**反复试错**,学习哪些动作能获得高奖励。
+
+### 2. Markov决策过程(MDP)建模
+
+#### 状态(State)
+```python
+state = [
+    TMP0_normalized,           # 当前初始TMP(归一化到0-1)
+    last_L_s_normalized,       # 上一次产水时长(归一化)
+    last_t_bw_s_normalized,    # 上一次反洗时长(归一化)
+    max_TMP_normalized         # 本周期最高TMP(归一化)
+]
+```
+**4维状态向量**描述当前系统状态
+
+#### 动作(Action)
+```python
+# 离散动作空间:L_s × t_bw_s的网格
+L_s范围:3800-6000秒,步长60秒 → 37个选项(3800、3860、…、5960)
+t_bw_s范围:40-60秒,步长5秒 → 5个选项
+
+总动作数 = 37 × 5 = 185个
+```
+
+每个动作对应一个`(L_s, t_bw_s)`组合
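+
+为便于对照,下面给出一个与后文示例一致的索引编码/解码示意(假设索引规则为 `a = L_idx * 5 + t_bw_idx`,这是根据文中 92 → (4880, 50) 等例子反推的约定,并非框架源码):
+
+```python
+# 动作索引与 (L_s, t_bw_s) 的互相转换示意
+L_MIN, L_STEP, N_T = 3800, 60, 5
+T_MIN, T_STEP = 40, 5
+
+def decode_action(a: int):
+    """动作索引 -> (产水时长秒, 反洗时长秒)"""
+    L_idx, t_idx = divmod(a, N_T)
+    return L_MIN + L_idx * L_STEP, T_MIN + t_idx * T_STEP
+
+def encode_action(L_s: float, t_bw_s: float) -> int:
+    """(产水时长秒, 反洗时长秒) -> 动作索引"""
+    return int((L_s - L_MIN) // L_STEP) * N_T + int((t_bw_s - T_MIN) // T_STEP)
+
+assert decode_action(92) == (4880, 50)   # 92 // 5 = 18 -> 3800 + 18*60
+assert encode_action(4880, 50) == 92
+```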
+
+#### 奖励(Reward)
+```python
+# 多目标加权奖励
+reward = 0.8 × recovery           # 回收率(主要目标)
+       + 0.2 × rate_normalized    # 净供水率
+       - 0.2 × headroom_penalty   # TMP贴边惩罚
+```
+
+**奖励设计原则**:
+- 高回收率 → 高奖励
+- 高净供水率 → 高奖励
+- TMP接近上限 → 负奖励(膜风险)
+- 违反约束 → 大负奖励(-20)
+
+#### 状态转移
+```python
+# 模拟器:根据物理模型计算下一个状态
+def simulate_one_supercycle(TMP0, L_s, t_bw_s):
+    # 1. 计算产水阶段TMP上升
+    delta_TMP = model_fp(L_s)  # 调用TMP增长模型
+    TMP_peak = TMP0 + delta_TMP
+    
+    # 2. 计算反洗恢复
+    phi = model_bw(L_s, t_bw_s)  # 调用反洗恢复模型
+    TMP_after_bw = TMP_peak - phi × (TMP_peak - TMP0)
+    
+    # 3. 多次小周期后CEB
+    TMP_new = TMP0  # 化学清洗后完全恢复
+    
+    # 4. 计算指标
+    recovery = (产水 - 反洗水耗 - CEB水耗) / 产水
+    net_rate = 净产水 / 总时间
+    
+    return TMP_new, recovery, net_rate, ...
+```
+
+## DQN算法详解
+
+### 什么是DQN?
+
+**Deep Q-Network(深度Q网络)**:
+- 用神经网络估计**Q值函数**:`Q(state, action) = 预期累积奖励`
+- 最优策略:在每个状态选择Q值最大的动作
+
+```
+状态 → [神经网络] → 每个动作的Q值 → 选择最大Q值的动作
+```
+
+### 神经网络结构
+
+```python
+# Stable-Baselines3的MlpPolicy默认结构
+输入层:4维状态
+隐藏层1:64神经元 + ReLU
+隐藏层2:64神经元 + ReLU
+输出层:185个动作的Q值
+```
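+
+按上述维度可以写出一个等价的 PyTorch 结构示意(仅帮助理解,SB3 内部并非直接使用这段代码):
+
+```python
+import torch.nn as nn
+
+# 4 维状态 -> 185 个动作的 Q 值
+q_net = nn.Sequential(
+    nn.Linear(4, 64),    # 输入层 -> 隐藏层1
+    nn.ReLU(),
+    nn.Linear(64, 64),   # 隐藏层1 -> 隐藏层2
+    nn.ReLU(),
+    nn.Linear(64, 185),  # 隐藏层2 -> 每个动作一个 Q 值
+)
+```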
+
+### 训练流程(`DQN_train.py`)
+
+#### 1. 经验回放(Experience Replay)
+```python
+buffer_size = 10000  # 存储10000条经验
+
+# 交互过程
+for step in range(total_timesteps):
+    action = model.select_action(state)        # ε-贪心选择动作
+    next_state, reward = env.step(action)      # 执行动作
+    buffer.store(state, action, reward, next_state)  # 存入缓冲区
+    
+    # 从缓冲区随机采样训练
+    if step > learning_starts:
+        batch = buffer.sample(batch_size=32)
+        model.train_on_batch(batch)
+```
+
+**为什么需要经验回放?**
+- 打破数据相关性(连续状态往往相似)
+- 提高样本利用效率(同一条经验可多次使用)
+
+#### 2. ε-贪心探索
+```python
+# 随机探索 vs 利用已学知识
+if random() < epsilon:
+    action = random_action()   # 探索:随机选
+else:
+    action = argmax(Q(state))  # 利用:选Q值最大的
+
+# epsilon从1.0衰减到0.02
+epsilon = 1.0 → 0.8 → ... → 0.02
+```
+
+**探索-利用权衡**:
+- 初期多探索(发现好动作)
+- 后期多利用(稳定在最优策略)
+
+#### 3. 目标网络(Target Network)
+```python
+# 两个网络:当前网络 + 目标网络
+Q_current(state, action)  # 每步更新
+Q_target(next_state, a')   # 每2000步同步一次
+
+# TD误差
+loss = MSE(Q_current(s,a), reward + γ × max(Q_target(s', a')))
+```
+
+**为什么需要目标网络?**
+- 稳定训练(避免"追逐移动目标"问题)
+- 减少Q值估计的震荡
+
+#### 4. 训练超参数
+
+```python
+class DQNParams:
+    learning_rate = 1e-4          # 学习率
+    buffer_size = 10000           # 经验池大小
+    learning_starts = 200         # 200步后开始学习
+    batch_size = 32               # 每次训练32个样本
+    gamma = 0.95                  # 折扣因子(重视长期奖励)
+    train_freq = 4                # 每4步训练一次
+    target_update_interval = 2000 # 每2000步更新目标网络
+    exploration_fraction = 0.3    # 前30%训练时间用于探索
+    exploration_final_eps = 0.02  # 最终保留2%探索
+```
+
+## 模拟环境(`DQN_env.py`)
+
+### UFSuperCycleEnv类
+
+```python
+class UFSuperCycleEnv(gym.Env):
+    def reset(self):
+        # 重置环境:随机初始TMP
+        self.TMP0 = random.uniform(0.01, 0.03)
+        return self._get_obs()
+    
+    def step(self, action):
+        # 执行动作
+        L_s, t_bw_s = self._decode_action(action)
+        
+        # 调用模拟器
+        feasible, info = simulate_one_supercycle(self.TMP0, L_s, t_bw_s)
+        
+        if feasible:
+            reward = _score(info)  # 计算奖励
+            self.TMP0 = info["TMP_after_ceb"]  # 更新TMP
+            done = False
+        else:
+            reward = -20  # 违反约束,大负奖励
+            done = True   # episode终止
+        
+        return next_state, reward, done, info
+```
+
+### 约束检查
+
+```python
+# 硬约束1:TMP峰值不得超过0.06 MPa
+if TMP_peak > 0.06:
+    return False
+
+# 硬约束2:单次残余增量不得超过0.001 MPa
+if (TMP_after_bw - TMP0) > 0.001:
+    return False
+
+# 硬约束3:TMP不得超过上限的98%
+if TMP_peak / TMP_max > 0.98:
+    return False
+```
+
+### 物理模型集成
+
+```python
+# TMP增长模型(uf_fp.pth)
+def _delta_tmp(L_h):
+    return model_fp(params, L_h)  # 产水时长 → TMP增量
+
+# 反洗恢复模型(uf_bw.pth)
+def phi_bw_of(L_s, t_bw_s):
+    return model_bw(params, L_s, t_bw_s)  # (产水时长, 反洗时长) → 恢复比例
+```
+
+这两个模型是基于数据拟合或物理建模得到的。
+
+## 决策使用(`DQN_decide.py`)
+
+### 单步决策接口
+
+```python
+def run_uf_DQN_decide(uf_params, TMP0_value):
+    # 1. 创建环境
+    env = UFSuperCycleEnv(uf_params)
+    env.current_params.TMP0 = TMP0_value  # 设置当前TMP
+    
+    # 2. 加载训练好的模型
+    model = DQN.load("dqn_model.zip")
+    
+    # 3. 预测动作(确定性,不探索)
+    action, _ = model.predict(state, deterministic=True)
+    
+    # 4. 解码动作
+    L_s, t_bw_s = decode_action(action)
+    
+    return {
+        "action": action,
+        "L_s": L_s,
+        "t_bw_s": t_bw_s,
+        "expected_recovery": info["recovery"],
+        ...
+    }
+```
+
+### PLC指令生成
+
+为了避免频繁大幅调整(工艺稳定性),使用**渐进式调整**:
+
+```python
+def generate_plc_instructions(current, model_prev, model_current):
+    # 计算差异
+    diff = model_current - effective_current
+    
+    # 渐进调整:每次只调整一个步长
+    if abs(diff) >= threshold:
+        adjustment = +step_size if diff > 0 else -step_size
+    else:
+        adjustment = 0
+    
+    next_value = effective_current + adjustment
+    return next_value
+```
+
+**示例**:
+```
+当前L_s = 4000秒
+模型建议 = 4300秒
+步长 = 60秒
+
+第1轮下发:4060秒(+60)
+第2轮下发:4120秒(+60)
+...
+第5轮下发:4300秒(到达目标)
+```
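+
+把上面的渐进逻辑写成一段可独立运行的示意代码(简化版,省略了源码中"基准值选择"的部分,函数名为示意自拟):
+
+```python
+def progressive_setpoint(current: float, target: float, step: float, threshold: float = 1.0) -> float:
+    """每轮最多向目标移动一个步长;差异小于 threshold×step 时不调整(简化示意)"""
+    diff = target - current
+    if abs(diff) < threshold * step:
+        return current
+    return current + step if diff > 0 else current - step
+
+# 复现上面的示例:4000 -> 4300,步长 60
+value = 4000
+for i in range(1, 6):
+    value = progressive_setpoint(value, 4300, 60)
+    print(f"第{i}轮下发:{value}秒")
+# 依次输出 4060 / 4120 / 4180 / 4240 / 4300,与上例一致
+```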
+
+## 性能指标计算(`DQN_decide.py`)
+
+```python
+def calc_uf_cycle_metrics(TMP0, L_s, t_bw_s):
+    # 模拟一个超级周期
+    feasible, info = simulate_one_supercycle(params, L_s, t_bw_s)
+    
+    return {
+        "k_bw_per_ceb": 小周期次数,
+        "recovery": 回收率,
+        "net_delivery_rate_m3ph": 净供水率(m³/h),
+        "daily_prod_time_h": 日均产水时间(h/天),
+        "ton_water_energy_kWh_per_m3": 吨水电耗(kWh/m³),
+        "max_permeability": 最高渗透率(lmh/bar)
+    }
+```
+
+## 文件结构说明
+
+```
+uf-rl/
+├── DQN_train.py         # 强化学习训练脚本(DQN算法)
+├── DQN_env.py           # 模拟环境(MDP定义、物理模拟)
+├── DQN_decide.py        # 决策接口(加载模型、生成指令)
+├── UF_decide.py         # 传统优化方法(网格搜索,用于对比)
+├── UF_models.py         # 物理模型定义(TMP增长、反洗恢复)
+├── uf_fp.pth            # TMP增长模型权重
+├── uf_bw.pth            # 反洗恢复模型权重
+└── dqn_model.zip        # 训练好的DQN模型
+```
+
+## 训练流程总结
+
+```mermaid
+graph LR
+    A[初始化环境] --> B[随机初始TMP]
+    B --> C{ε-贪心选择动作}
+    C -->|探索| D[随机动作]
+    C -->|利用| E[Q值最大动作]
+    D --> F[模拟执行]
+    E --> F
+    F --> G{约束检查}
+    G -->|可行| H[计算奖励]
+    G -->|不可行| I[负奖励-20]
+    H --> J[存入经验池]
+    I --> J
+    J --> K{达到学习步数?}
+    K -->|是| L[采样训练]
+    K -->|否| M[继续交互]
+    L --> N{episode结束?}
+    M --> N
+    N -->|否| C
+    N -->|是| B
+```
+
+## 与传统方法对比
+
+### 传统网格搜索(`UF_decide.py`)
+
+```python
+# 穷举所有(L_s, t_bw_s)组合
+for L_s in [3600, 3660, ..., 4200]:
+    for t_bw_s in [90, 92, ..., 100]:
+        feasible, metrics = simulate(L_s, t_bw_s)
+        if feasible and score > best_score:
+            best = (L_s, t_bw_s)
+```
+
+**优点**:简单、可解释、保证找到网格上的最优解  
+**缺点**:
+- 计算量大(数百次模拟)
+- 参数空间离散化(可能错过真正最优点)
+- 无法泛化(每个TMP都要重新搜索)
+
+### 强化学习(DQN)
+
+**优点**:
+- 训练后推理快(一次前向传播)
+- 能泛化到不同TMP(学到状态-动作映射)
+- 可处理更复杂的状态(如历史趋势)
+
+**缺点**:
+- 训练耗时(需要大量交互)
+- 黑盒性(难以解释为何选择某动作)
+- 性能受模拟器精度影响
+
+## 训练建议
+
+### 提升策略性能
+
+1. **改进奖励设计**:
+   ```python
+   # 添加渗透率奖励
+   reward += 0.1 × permeability
+   
+   # 添加稳定性奖励(动作变化小)
+   reward -= 0.05 × |action - last_action|
+   ```
+
+2. **增加状态信息**:
+   ```python
+   state = [
+       TMP0, last_L, last_t_bw, max_TMP,
+       water_quality,  # 水质指标
+       days_since_ceb, # 距上次CEB天数
+       ...
+   ]
+   ```
+
+3. **课程学习(Curriculum Learning)**:
+   ```python
+   # 阶段1:简单场景(TMP变化小)
+   env.TMP_range = [0.025, 0.035]
+   train(10000 steps)
+   
+   # 阶段2:中等场景
+   env.TMP_range = [0.01, 0.04]
+   train(20000 steps)
+   
+   # 阶段3:困难场景(全范围)
+   env.TMP_range = [0.01, 0.05]
+   train(20000 steps)
+   ```
+
+### 加速训练
+
+```python
+# 1. 减少训练步数
+total_timesteps = 10000  # 从50000降到10000
+
+# 2. 增大batch_size(如果内存足够)
+batch_size = 64
+
+# 3. 调高learning_rate(小心不稳定)
+learning_rate = 5e-4
+
+# 4. 预训练:从传统方法生成初始数据
+buffer.load_from_grid_search()
+```
+
+## 常见问题
+
+**Q:为什么用强化学习而不是监督学习?**  
+A:监督学习需要"正确答案"标签,但这里没有标准答案(最优策略本身就是要学习的)。强化学习通过奖励信号自己探索最优策略。
+
+**Q:模拟器不准确怎么办?**  
+A:这是强化学习最大风险。解决方法:
+- 用真实数据校准模拟器
+- Sim-to-Real迁移(在真实系统上微调)
+- 保守策略(加大安全裕度)
+
+**Q:能否用于在线学习?**  
+A:可以,但需谨慎:
+- 设置安全约束(避免危险动作)
+- 分阶段部署(先离线验证)
+- 人工监督(关键决策需人工确认)
+
+**Q:为什么动作空间是离散的?**  
+A:DQN擅长离散动作(每个动作一个Q值)。如果需要连续动作,可用DDPG、SAC等算法。
+
+**Q:如何评估策略好坏?**  
+A:
+- 离线:在验证集上计算平均回收率、净供水率
+- 在线:实际运行后对比历史数据
+- 对比基线:与传统固定参数、网格搜索比较
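+
+离线评估的一个最小示意(假设 `model` 与未向量化的 `env` 已按前文方式构造,接口为 gymnasium 风格):
+
+```python
+import numpy as np
+
+def evaluate_policy(model, env, n_episodes: int = 10):
+    """确定性地跑 n 个 episode,统计平均总奖励与平均回收率(示意)"""
+    rewards, recoveries = [], []
+    for _ in range(n_episodes):
+        obs, _ = env.reset()
+        done, ep_reward = False, 0.0
+        while not done:
+            action, _ = model.predict(obs, deterministic=True)  # 不探索
+            obs, reward, terminated, truncated, info = env.step(action)
+            ep_reward += reward
+            if "recovery" in info:
+                recoveries.append(info["recovery"])
+            done = terminated or truncated
+        rewards.append(ep_reward)
+    return float(np.mean(rewards)), float(np.mean(recoveries))
+```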
+
+## 未来优化方向
+
+1. **多智能体协同**:多个UF模组联合优化
+2. **分层强化学习**:高层决策策略,低层决策参数
+3. **模型预测控制(MPC)集成**:结合物理模型和学习策略
+4. **安全强化学习**:硬约束保证(Safety RL)
+5. **离线强化学习**:仅用历史数据训练(Offline RL)
+
+## 总结
+
+UF-RL模型是一个**决策优化系统**,通过深度强化学习学习在不同跨膜压差下的最优运行策略。相比传统方法:
+- **更智能**:能适应不同状态,无需人工调参
+- **更高效**:训练后推理快速
+- **更全面**:平衡多个矛盾目标
+
+但同时也需要:
+- **准确的模拟器**:保证学到的策略有效
+- **充分的训练**:探索足够多的状态-动作组合
+- **谨慎的部署**:实际应用前充分验证
+

+ 1076 - 0
models/uf-rl/超滤训练源码/UF_RL_架构问题与优化方案.md

@@ -0,0 +1,1076 @@
+# UF-RL 架构问题分析与优化方案
+
+## 目录
+1. [架构层面问题](#架构层面问题)
+2. [代码实现问题](#代码实现问题)
+3. [算法设计问题](#算法设计问题)
+4. [工程质量问题](#工程质量问题)
+5. [优化方案](#优化方案)
+6. [重构建议](#重构建议)
+
+---
+
+## 架构层面问题
+
+### 问题1:物理模型是伪神经网络 ⚠️⚠️⚠️
+
+**位置**:`UF_models.py`
+
+**问题描述**:
+```python
+class TMPIncreaseModel(torch.nn.Module):
+    def forward(self, p, L_h):
+        # 这不是神经网络,只是数学公式!
+        return float(p.alpha * (p.q_UF ** p.belta) * L_h)
+```
+
+**问题分析**:
+1. **没有可训练参数**:继承`nn.Module`但没有定义任何`nn.Parameter`
+2. **保存.pth无意义**:`state_dict()`是空字典
+3. **完全基于人工公式**:没有从数据中学习
+
+**验证问题**:
+```python
+model = TMPIncreaseModel()
+print(model.state_dict())  # 输出:OrderedDict()
+print(list(model.parameters()))  # 输出:[]
+```
+
+**影响**:
+- 模拟器精度完全取决于人工公式的准确性
+- 无法利用真实运行数据改进模型
+- 存在**Sim-to-Real Gap**(模拟与真实的差异)
+
+**评级**:🔴 严重问题
+
+---
+
+### 问题2:状态空间信息不足 ⚠️⚠️
+
+**位置**:`DQN_env.py` - `_get_obs()`
+
+**当前状态**:
+```python
+state = [
+    TMP0_norm,      # 当前TMP
+    L_norm,         # 上次产水时长
+    t_bw_norm,      # 上次反洗时长
+    max_TMP_norm    # 周期最高TMP
+]  # 仅4维
+```
+
+**缺失信息**:
+1. **水质特征**:浊度、COD、温度、pH等
+2. **历史趋势**:TMP变化速率、污染累积趋势
+3. **时间信息**:自上次CEB的时间、季节性
+4. **膜状态**:膜龄、历史清洗次数、累积运行时间
+5. **运行模式**:当前流量、压力、回收率
+
+**后果**:
+- 智能体难以学习长期策略
+- 无法适应不同水质条件
+- 泛化能力弱
+
+**评级**:🟡 中等问题
+
+---
+
+### 问题3:奖励函数设计不合理 ⚠️⚠️
+
+**位置**:`DQN_env.py` - `_score()`
+
+**问题代码**:
+```python
+def _score(p, rec):
+    base_reward = 0.8 × recovery + 0.2 × rate_norm - 0.2 × headroom_penalty
+    # 基础奖励范围:0.6 ~ 0.9
+    
+    # 非线性放大
+    amplified = (base_reward - 0.5) ** 2 * 5.0
+    if base_reward < 0.5:
+        amplified = -amplified
+    
+    return amplified
+```
+
+**问题1:非线性变换过于激进**
+
+奖励映射示例:
+| base_reward | amplified | 倍数变化 |
+|-------------|-----------|---------|
+| 0.85 | 0.613 | - |
+| 0.80 | 0.450 | ↓27% |
+| 0.75 | 0.313 | ↓31% |
+| 0.70 | 0.200 | ↓36% |
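+
+表中数值可用几行代码复现(示意):
+
+```python
+# 复现非线性放大映射:amplified = (base - 0.5)^2 * 5,base < 0.5 时取负
+for base in [0.85, 0.80, 0.75, 0.70]:
+    amplified = (base - 0.5) ** 2 * 5.0
+    if base < 0.5:
+        amplified = -amplified
+    print(f"base={base:.2f} -> amplified={amplified:.4f}")
+# 输出 0.6125 / 0.4500 / 0.3125 / 0.2000,即表中三位小数的来源
+```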
+
+**后果**:
+- Q值估计困难(奖励尺度不一致)
+- 梯度不稳定
+- 可能导致训练震荡
+
+**问题2:约束违反惩罚不合理**
+
+```python
+if not feasible:
+    reward = -20  # 硬编码的大惩罚
+```
+
+**分析**:
+- `-20`与正常奖励(0.2~0.8)相差25-100倍
+- 没有区分不同约束违反的严重程度
+- 可能导致智能体过度保守
+
+**问题3:sigmoid惩罚形式复杂**
+
+```python
+headroom_penalty = 1 / (1 + exp(-10 × (tmp_ratio - 1.0)))
+```
+
+- 参数k=10是硬编码的
+- TMP贴边惩罚与其他目标权重不匹配
+
+**评级**:🟡 中等问题
+
+---
+
+### 问题4:episode设置不合理 ⚠️
+
+**位置**:`DQN_env.py` - `__init__()`
+
+**问题代码**:
+```python
+class UFSuperCycleEnv:
+    def __init__(self, base_params, max_episode_steps=20):
+        self.max_episode_steps = 20  # 固定20步
+```
+
+**问题分析**:
+1. **太短**:20步约等于20个超级周期(40-60天)
+   - 智能体难以学习长期策略
+   - 无法捕捉膜长期劣化趋势
+
+2. **固定长度**:
+   - 不利于学习不同时间尺度的策略
+   - 没有自然终止条件(如膜完全失效)
+
+3. **截断vs终止混淆**:
+   ```python
+   truncated = self.current_step >= 20  # 强制截断
+   ```
+   - 截断的episode不应该视为失败
+   - 但当前代码没有区分处理
+
+**建议**:
+- 增加到50-100步
+- 添加自然终止条件(如TMP超限3次)
+
+**评级**:🟡 中等问题
+
+---
+
+## 代码实现问题
+
+### 问题5:目标网络更新策略冲突 ⚠️⚠️⚠️
+
+**位置**:`DQN_train.py` - `_create_model()`
+
+**冲突代码**:
+```python
+class DQNParams:
+    target_update_interval: int = 2000  # 参数说明:每2000步更新
+
+def _create_model(self):
+    return DQN(
+        ...
+        target_update_interval=1,  # 实际代码:每1步更新
+        tau=0.005,                # 软更新系数
+        ...
+    )
+```
+
+**问题分析**:
+1. **参数说明与实现不一致**
+2. **软更新与硬更新混淆**:
+   - `target_update_interval=1` + `tau=0.005` → 软更新
+   - 注释说的是硬更新
+   - 两种策略特性不同
+
+**soft update(当前实际使用)**:
+```python
+θ_target = 0.005 × θ_current + 0.995 × θ_target
+```
+- 优点:平滑收敛
+- 缺点:可能不够稳定(对DQN而言)
+
+**hard update(注释说明)**:
+```python
+Every 2000 steps: θ_target = θ_current
+```
+- 优点:稳定性好(DQN原始设计)
+- 缺点:更新滞后
+
+**建议**:
+```python
+# 改为经典DQN的硬更新
+target_update_interval=1000,  # 每1000步硬更新
+tau=1.0,                      # tau=1表示完全复制
+```
+
+**评级**:🔴 严重问题
+
+---
+
+### 问题6:经验池太小 ⚠️
+
+**位置**:`DQN_train.py` - `DQNParams`
+
+**问题代码**:
+```python
+buffer_size: int = 10000  # 仅10000条经验
+```
+
+**问题分析**:
+1. **相对于动作空间太小**:
+   - 185个动作
+   - 理想情况:每个动作至少100条经验 → 需要18500
+   - 考虑不同状态:需要更多
+
+2. **经验覆盖率低**:
+   - 10000步训练期间,大部分动作可能没被充分探索
+   - 导致Q值估计偏差
+
+3. **旧经验快速被覆盖**:
+   - 50000步训练,经验池会被覆盖5次
+   - 早期的好经验可能被丢弃
+
+**建议**:
+```python
+buffer_size: int = 50000  # 增加到50000
+```
+
+**评级**:🟡 中等问题
+
+---
+
+### 问题7:探索开始过早 ⚠️
+
+**位置**:`DQN_train.py` - `DQNParams`
+
+**问题代码**:
+```python
+learning_starts: int = 200  # 仅200步随机探索
+```
+
+**问题分析**:
+1. **预填充不足**:
+   - 200步 < 动作数量(185)
+   - 许多动作可能一次都没被采样
+
+2. **早期训练不稳定**:
+   - 经验池数据分布严重偏斜
+   - Q值初始估计误差大
+
+**标准实践**:
+- 至少 `buffer_size × 0.1`(按建议值50000计)= 5000步
+- 或 `action_space × 10` = 1850步
+
+**建议**:
+```python
+learning_starts: int = 5000  # 增加到5000
+```
+
+**评级**:🟡 中等问题
+
+---
+
+### 问题8:归一化范围硬编码 ⚠️
+
+**位置**:`DQN_env.py` - `_get_obs()`
+
+**问题代码**:
+```python
+def _get_obs(self):
+    TMP0_norm = (TMP0 - 0.01) / (0.05 - 0.01)  # 硬编码范围
+    ...
+```
+
+**问题分析**:
+1. **缺乏灵活性**:
+   - 如果TMP范围变化(新膜/旧膜),需要修改代码
+   - 不同工厂TMP范围可能不同
+
+2. **边界处理不当**:
+   ```python
+   TMP0 = 0.03  # TMP0 = 0.03 时归一化后是多少?
+   norm = (0.03 - 0.01) / 0.04 = 0.5  # 正好落在中间
+   ```
+   - 如果TMP超出范围[0.01, 0.05]会怎样?未做clipping
+
+3. **不同维度归一化不一致**:
+   - TMP: [0.01, 0.05]
+   - L_s: [3800, 6000]
+   - t_bw_s: [40, 60]
+   - 范围差异大,但都归一化到[0, 1]
+
+**建议**:
+```python
+class Normalizer:
+    def __init__(self):
+        self.tmp_min = 0.01
+        self.tmp_max = 0.05
+        # ...可配置
+    
+    def normalize_tmp(self, tmp):
+        return np.clip((tmp - self.tmp_min) / (self.tmp_max - self.tmp_min), 0, 1)
+```
+
+**评级**:🟢 轻微问题
+
+---
+
+### 问题9:全局模型加载 ⚠️
+
+**位置**:`DQN_env.py` - 顶层
+
+**问题代码**:
+```python
+# 全局加载(模块导入时执行)
+model_fp = TMPIncreaseModel()
+model_bw = TMPDecreaseModel()
+model_fp.load_state_dict(torch.load("uf_fp.pth"))
+model_bw.load_state_dict(torch.load("uf_bw.pth"))
+model_fp.eval()
+model_bw.eval()
+```
+
+**问题分析**:
+1. **不支持多环境并行**:
+   - 如果使用`SubprocVecEnv`(多进程),每个进程都会加载
+   - 浪费内存
+
+2. **路径硬编码**:
+   - 必须在当前目录下有`uf_fp.pth`
+   - 不利于部署
+
+3. **无法动态切换模型**:
+   - 如果想测试不同的物理模型,需要重启程序
+
+4. **测试困难**:
+   - 单元测试时无法mock这些模型
+
+**建议**:
+```python
+class UFSuperCycleEnv:
+    def __init__(self, base_params, model_dir="./"):
+        self.model_fp = TMPIncreaseModel()
+        self.model_bw = TMPDecreaseModel()
+        self.model_fp.load_state_dict(torch.load(f"{model_dir}/uf_fp.pth"))
+        self.model_bw.load_state_dict(torch.load(f"{model_dir}/uf_bw.pth"))
+```
+
+**评级**:🟡 中等问题
+
+---
+
+### 问题10:缺少模型checkpoint ⚠️⚠️
+
+**位置**:`DQN_train.py` - `train()`
+
+**问题代码**:
+```python
+def train(self, total_timesteps: int):
+    self.model.learn(total_timesteps=total_timesteps, callback=self.callback)
+    # 训练结束后才保存一次
+```
+
+**问题分析**:
+1. **训练中断风险**:
+   - 50000步训练可能需要数小时
+   - 如果中途崩溃,所有进度丢失
+
+2. **无法回滚到最佳模型**:
+   - 如果训练后期发散,无法恢复到中间的好模型
+
+3. **难以对比不同阶段**:
+   - 无法分析不同训练阶段的策略差异
+
+**建议**:
+```python
+from stable_baselines3.common.callbacks import CheckpointCallback
+
+checkpoint_callback = CheckpointCallback(
+    save_freq=5000,  # 每5000步保存
+    save_path='./checkpoints/',
+    name_prefix='uf_dqn'
+)
+
+model.learn(..., callback=[checkpoint_callback, training_callback])
+```
+
+**评级**:🟡 中等问题
+
+---
+
+## 算法设计问题
+
+### 问题11:DQN不是最佳选择 ⚠️⚠️
+
+**当前选择**:DQN(Deep Q-Network)
+
+**DQN特点**:
+- ✅ 简单、稳定
+- ✅ 离散动作空间
+- ❌ 样本效率低
+- ❌ 难以处理连续动作
+- ❌ 探索能力弱
+
+**问题分析**:
+
+1. **动作空间其实是连续的**:
+   - L_s ∈ [3800, 6000] 秒(连续)
+   - t_bw_s ∈ [40, 60] 秒(连续)
+   - 当前用网格离散化(37×5=185个点)
+   - 损失精度
+
+2. **更适合的算法**:
+
+| 算法 | 优点 | 缺点 | 适用性 |
+|------|------|------|--------|
+| **SAC** | 连续动作、样本高效、稳定 | 稍复杂 | ⭐⭐⭐⭐⭐ |
+| **TD3** | 连续动作、稳定 | 探索能力弱 | ⭐⭐⭐⭐ |
+| **PPO** | 稳定、易调参 | 样本效率低 | ⭐⭐⭐ |
+| **DQN** | 简单 | 连续动作支持差 | ⭐⭐ |
+
+**推荐改用SAC**:
+```python
+from stable_baselines3 import SAC
+
+model = SAC(
+    policy="MlpPolicy",
+    env=env,
+    learning_rate=3e-4,
+    buffer_size=100000,
+    batch_size=256,
+    tau=0.005,
+    gamma=0.99,
+    verbose=1
+)
+```
+
+**改用SAC的好处**:
+- 动作空间从185个离散点 → 连续范围
+- 样本效率提升2-3倍
+- 更适合精细控制
+
+**评级**:🟡 中等问题
+
+---
+
+### 问题12:缺少curriculum learning ⚠️
+
+**当前训练**:
+```python
+def reset(self):
+    self.TMP0 = uniform(0.01, 0.03)  # 固定范围
+```
+
+**问题**:
+- 从一开始就面对全部难度
+- 智能体需要同时学习:
+  - 低TMP下的最优策略
+  - 高TMP下的安全策略
+  - 约束边界的处理
+- 学习效率低
+
+**curriculum learning思路**:
+
+```python
+# 阶段1:简单场景(0-10k步)
+TMP_range = [0.025, 0.03]  # 窄范围
+constraint_relaxed = True  # 放宽约束
+
+# 阶段2:中等场景(10k-30k步)
+TMP_range = [0.02, 0.035]
+constraint_relaxed = False
+
+# 阶段3:困难场景(30k-50k步)
+TMP_range = [0.01, 0.04]  # 全范围
+add_noise = True  # 增加噪声
+```
+
+**实现示例**:
+```python
+class CurriculumEnv(UFSuperCycleEnv):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.difficulty = 1  # 难度等级
+    
+    def reset(self):
+        if self.difficulty == 1:
+            self.TMP0 = uniform(0.025, 0.03)
+        elif self.difficulty == 2:
+            self.TMP0 = uniform(0.02, 0.035)
+        else:
+            self.TMP0 = uniform(0.01, 0.04)
+        return super().reset()
+    
+    def increase_difficulty(self):
+        self.difficulty = min(3, self.difficulty + 1)
+```
+
+**评级**:🟢 轻微问题
+
+---
+
+## 工程质量问题
+
+### 问题13:缺少单元测试 ⚠️⚠️
+
+**当前状态**:无任何测试代码
+
+**关键模块应测试**:
+
+```python
+# tests/test_env.py
+import numpy as np
+
+from DQN_env import UFParams, UFSuperCycleEnv, simulate_one_supercycle
+
+def test_env_reset():
+    env = UFSuperCycleEnv(UFParams())
+    obs, info = env.reset()
+    assert obs.shape == (4,)
+    assert np.all((obs >= 0) & (obs <= 1))
+
+def test_env_step():
+    env = UFSuperCycleEnv(UFParams())
+    env.reset()
+    obs, reward, done, truncated, info = env.step(0)
+    assert isinstance(reward, float)
+    assert isinstance(done, bool)
+
+def test_simulate_feasibility():
+    p = UFParams()
+    # 测试可行动作
+    feasible, info = simulate_one_supercycle(p, 4000, 50)
+    assert feasible == True
+    
+    # 测试不可行动作(过长时间)
+    feasible, info = simulate_one_supercycle(p, 7000, 50)
+    assert feasible == False
+
+def test_reward_range():
+    """测试奖励是否在合理范围"""
+    rewards = []
+    for _ in range(1000):
+        # 采样不同状态和动作
+        reward = _score(params, info)
+        rewards.append(reward)
+    
+    assert min(rewards) > -30  # 避免过大负奖励
+    assert max(rewards) < 10   # 避免奖励爆炸
+```
+
+**评级**:🟡 中等问题
+
+---
+
+### 问题14:缺少配置管理 ⚠️
+
+**当前状态**:参数散落在多个类中
+
+**建议结构**:
+
+```python
+# config.yaml
+environment:
+  tmp_range: [0.01, 0.05]
+  action_range:
+    L_s: [3800, 6000]
+    t_bw_s: [40, 60]
+  constraints:
+    tmp_max: 0.06
+    dTMP: 0.001
+
+dqn:
+  learning_rate: 1e-4
+  buffer_size: 50000
+  batch_size: 64
+  gamma: 0.95
+
+training:
+  total_timesteps: 100000
+  checkpoint_freq: 5000
+  eval_freq: 2000
+```
+
+```python
+# config.py
+import yaml
+from dataclasses import dataclass
+
+@dataclass
+class Config:
+    @staticmethod
+    def from_yaml(path):
+        with open(path) as f:
+            data = yaml.safe_load(f)
+        return Config(**data)
+```
+
+**评级**:🟢 轻微问题
+
+---
+
+### 问题15:缺少日志系统 ⚠️
+
+**当前状态**:只有print语句
+
+**建议**:
+```python
+import logging
+
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s [%(levelname)s] %(message)s',
+    handlers=[
+        logging.FileHandler('training.log'),
+        logging.StreamHandler()
+    ]
+)
+
+logger = logging.getLogger(__name__)
+
+# 使用
+logger.info(f"Episode {ep} - reward: {reward:.3f}")
+logger.warning(f"Constraint violation at step {step}")
+logger.error(f"Training failed: {error}")
+```
+
+**评级**:🟢 轻微问题
+
+---
+
+## 优化方案
+
+### 方案1:物理模型升级 🔥
+
+**目标**:用真实神经网络替代数学公式
+
+#### 数据收集
+```python
+# 收集真实运行数据
+data = {
+    'L_h': [...],         # 产水时长
+    'q_UF': [...],        # 流量
+    'temp': [...],        # 温度
+    'delta_TMP': [...]    # 实测TMP增长
+}
+```
+
+#### 模型训练
+```python
+class RealTMPIncreaseModel(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(10, 64),   # 输入:L_h, q_UF, temp, ...
+            nn.ReLU(),
+            nn.Linear(64, 32),
+            nn.ReLU(),
+            nn.Linear(32, 1)     # 输出:delta_TMP
+        )
+    
+    def forward(self, features):
+        return self.net(features)
+
+# 监督学习训练
+model = RealTMPIncreaseModel()
+optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
+
+for epoch in range(100):
+    for batch in data_loader:
+        pred = model(batch['features'])
+        loss = F.mse_loss(pred, batch['delta_TMP'])
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+```
+
+#### 集成到环境
+```python
+def _delta_tmp(p, L_h):
+    features = torch.FloatTensor([
+        L_h,
+        p.q_UF,
+        p.temp,
+        ...
+    ])
+    with torch.no_grad():
+        delta = model_fp(features).item()
+    return delta
+```
+
+**收益**:
+- ✅ 模拟器精度大幅提升
+- ✅ 可持续改进(随数据积累)
+- ✅ 缩小Sim-to-Real Gap
+
+---
+
+### 方案2:状态空间扩展 🔥
+
+**新状态设计**:
+```python
+def _get_obs(self):
+    state = [
+        # 基础状态(4维)
+        TMP0_norm,
+        L_norm,
+        t_bw_norm,
+        max_TMP_norm,
+        
+        # 水质特征(3维)
+        turbidity_norm,      # 浊度
+        conductivity_norm,   # 电导率
+        temperature_norm,    # 温度
+        
+        # 历史趋势(4维)
+        tmp_change_rate,     # TMP变化速率
+        avg_L_last_5,        # 最近5次平均产水时长
+        avg_recovery_last_5, # 最近5次平均回收率
+        days_since_ceb,      # 距上次CEB天数
+        
+        # 膜状态(2维)
+        membrane_age,        # 膜龄(归一化)
+        total_cycles,        # 总运行周期数
+    ]
+    return np.array(state, dtype=np.float32)  # 13维
+```
+
+**实现历史追踪**:
+```python
+class UFSuperCycleEnv:
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.history = {
+            'L_s': deque(maxlen=5),
+            'recovery': deque(maxlen=5),
+            'TMP': deque(maxlen=10)
+        }
+    
+    def step(self, action):
+        ...
+        self.history['L_s'].append(L_s)
+        self.history['recovery'].append(info['recovery'])
+        self.history['TMP'].append(self.TMP0)
+        ...
+```
+
+**收益**:
+- ✅ 智能体感知能力增强
+- ✅ 能学习长期策略
+- ✅ 适应不同运行条件
+
+---
+
+### 方案3:奖励函数重构 🔥
+
+**新奖励设计**:
+```python
+def _score_v2(p, rec, constraint_violation=None):
+    # 1. 基础奖励(保持简单)
+    recovery_reward = rec['recovery']  # [0.9, 0.98]
+    rate_reward = rec['net_rate'] / p.q_UF  # [0.85, 0.95]
+    
+    # 2. TMP惩罚(线性,避免非线性)
+    tmp_penalty = max(0, (rec['max_TMP'] / p.TMP_max - 0.9)) * 2
+    # TMP<90%上限:无惩罚
+    # TMP=95%上限:惩罚0.1
+    # TMP=100%上限:惩罚0.2
+    
+    # 3. 约束违反惩罚(分级)
+    if constraint_violation:
+        if constraint_violation == 'tmp_peak':
+            penalty = -5  # 峰值超限
+        elif constraint_violation == 'residual':
+            penalty = -3  # 残余增量超限
+        elif constraint_violation == 'headroom':
+            penalty = -2  # 贴边过度
+    else:
+        penalty = 0
+    
+    # 4. 稳定性奖励(鼓励平稳操作)
+    stability_bonus = 0
+    if hasattr(env, 'last_action'):
+        action_change = abs(current_action - env.last_action)
+        if action_change < 0.1:  # 动作变化小
+            stability_bonus = 0.05
+    
+    # 5. 总奖励(加权和,无非线性变换)
+    reward = (
+        0.6 * recovery_reward 
+        + 0.3 * rate_reward 
+        - 0.2 * tmp_penalty
+        + penalty
+        + stability_bonus
+    )
+    
+    return reward
+```
+
+**关键改进**:
+1. **移除非线性变换**:保持奖励尺度一致
+2. **分级惩罚**:区分不同约束违反的严重性
+3. **稳定性奖励**:鼓励平滑控制
+4. **可解释性**:每项奖励含义清晰
+
+**收益**:
+- ✅ 训练更稳定
+- ✅ Q值估计更准确
+- ✅ 策略更合理
+
+---
+
+### 方案4:改用SAC算法 🔥
+
+**完整实现**:
+
+```python
+from stable_baselines3 import SAC
+from gymnasium import spaces
+
+class UFSuperCycleEnvContinuous(UFSuperCycleEnv):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        
+        # 改为连续动作空间
+        self.action_space = spaces.Box(
+            low=np.array([0.0, 0.0]),      # [L_norm, t_bw_norm]
+            high=np.array([1.0, 1.0]),
+            dtype=np.float32
+        )
+    
+    def step(self, action):
+        # 反归一化
+        L_s = self.L_min + action[0] * (self.L_max - self.L_min)
+        t_bw_s = self.t_bw_min + action[1] * (self.t_bw_max - self.t_bw_min)
+        
+        # 其余逻辑相同
+        ...
+
+# 训练
+model = SAC(
+    policy="MlpPolicy",
+    env=env,
+    learning_rate=3e-4,
+    buffer_size=100000,
+    batch_size=256,
+    tau=0.005,
+    gamma=0.99,
+    ent_coef='auto',  # 自动调整熵系数
+    target_entropy='auto',
+    verbose=1,
+    tensorboard_log="./sac_tensorboard/"
+)
+
+model.learn(total_timesteps=100000)
+```
+
+**SAC优势**:
+- ✅ 连续动作(无需离散化)
+- ✅ 样本效率高(off-policy)
+- ✅ 探索能力强(熵正则化)
+- ✅ 更稳定(twin Q-networks)
+
+**收益**:
+- 训练时间缩短30-50%
+- 策略精度提升(无网格限制)
+- 更好的泛化能力
+
+---
+
+### 方案5:增加evaluation循环 🔥
+
+**实现**:
+```python
+from stable_baselines3.common.callbacks import EvalCallback
+
+eval_env = UFSuperCycleEnv(UFParams())
+eval_callback = EvalCallback(
+    eval_env,
+    best_model_save_path='./best_model/',
+    log_path='./eval_logs/',
+    eval_freq=2000,       # 每2000步评估一次
+    n_eval_episodes=10,   # 每次评估10个episode
+    deterministic=True,   # 确定性策略评估
+    render=False
+)
+
+model.learn(
+    total_timesteps=100000,
+    callback=[eval_callback, checkpoint_callback, training_callback]
+)
+```
+
+**收益**:
+- ✅ 实时监控泛化性能
+- ✅ 自动保存最佳模型
+- ✅ 早期发现过拟合
+
+---
+
+## 重构建议
+
+### 重构方案A:渐进式改进(推荐)
+
+**阶段1:修复关键bug(1-2天)**
+```
+1. 修复目标网络更新冲突 → 硬更新
+2. 增大buffer_size到50000
+3. 增大learning_starts到5000
+4. 添加checkpoint保存
+5. 修复奖励函数(移除非线性)
+```
+
+**阶段2:优化训练(3-5天)**
+```
+1. 扩展状态空间(添加历史信息)
+2. 增加episode长度到50步
+3. 实现curriculum learning
+4. 添加evaluation循环
+```
+
+**阶段3:算法升级(1周)**
+```
+1. 改用SAC算法
+2. 连续动作空间
+3. 超参数调优
+```
+
+**阶段4:模型升级(2-4周)**
+```
+1. 收集真实运行数据
+2. 训练神经网络物理模型
+3. 集成到环境
+4. 验证Sim-to-Real性能
+```
+
+---
+
+### 重构方案B:全面重写(高风险)
+
+**新架构设计**:
+
+```
+项目结构:
+uf_rl_v2/
+├── config/
+│   ├── env_config.yaml
+│   ├── sac_config.yaml
+│   └── train_config.yaml
+├── models/
+│   ├── physics/
+│   │   ├── tmp_model.py       # 神经网络物理模型
+│   │   └── train_physics.py   # 物理模型训练脚本
+│   ├── policy/
+│   │   └── sac_policy.py      # SAC策略网络
+│   └── reward/
+│       └── reward_shaping.py  # 奖励函数设计
+├── envs/
+│   ├── uf_env_v2.py           # 重构的环境
+│   └── wrappers.py            # 环境包装器
+├── utils/
+│   ├── logger.py              # 日志系统
+│   ├── callbacks.py           # 训练回调
+│   └── evaluation.py          # 评估工具
+├── tests/
+│   ├── test_env.py
+│   ├── test_physics.py
+│   └── test_training.py
+├── train.py                   # 训练入口
+└── requirements.txt
+```
+
+**核心改进**:
+1. **模块化设计**:物理模型、策略、环境解耦
+2. **配置驱动**:所有参数外部化
+3. **完整测试**:覆盖所有关键模块
+4. **现代算法**:使用SAC
+5. **数据驱动**:神经网络物理模型
+
+---
+
+## 优先级排序
+
+### 🔴 高优先级(必须修复)
+
+1. **目标网络更新冲突** → 1小时
+2. **奖励函数非线性变换** → 2小时
+3. **增大buffer_size和learning_starts** → 0.5小时
+
+**总计**:半天可完成
+
+---
+
+### 🟡 中优先级(建议修复)
+
+4. **状态空间扩展(添加历史)** → 1天
+5. **改用SAC算法** → 2-3天
+6. **增加checkpoint和evaluation** → 1天
+7. **episode长度调整** → 0.5天
+
+**总计**:1周可完成
+
+---
+
+### 🟢 低优先级(可选)
+
+8. **curriculum learning** → 2天
+9. **配置文件管理** → 1天
+10. **单元测试** → 2-3天
+11. **日志系统** → 0.5天
+12. **神经网络物理模型** → 2-4周
+
+**总计**:1-2周可完成
+
+---
+
+## 总结
+
+### 核心问题
+1. **物理模型是假神经网络**(最严重)
+2. **目标网络更新策略冲突**
+3. **奖励函数设计不合理**
+4. **状态空间信息不足**
+5. **DQN不是最佳算法选择**
+
+### 最小可行改进方案(MVP)
+
+```python
+# 1. 修复目标网络(5分钟)
+target_update_interval=1000, tau=1.0
+
+# 2. 简化奖励函数(10分钟)
+reward = 0.6*recovery + 0.3*rate - 0.2*tmp_penalty + constraint_penalty
+
+# 3. 增大经验池(1行)
+buffer_size=50000, learning_starts=5000
+
+# 4. 添加checkpoint(5分钟)
+CheckpointCallback(save_freq=5000, ...)
+
+# 5. 添加evaluation(5分钟)
+EvalCallback(eval_freq=2000, ...)
+```
+
+**总时间**:不到1小时  
+**预期提升**:训练稳定性提升50%+,最终性能提升20%+
+
+### 理想改进方案
+
+1. 收集真实数据 → 训练神经网络物理模型
+2. 改用SAC + 连续动作空间
+3. 扩展状态空间(13维)
+4. 重构奖励函数
+5. 完善工程质量(测试、日志、配置)
+
+**总时间**:3-4周  
+**预期提升**:训练效率提升3倍+,策略性能提升50%+,工业可用性大幅提升
+

+ 1082 - 0
models/uf-rl/超滤训练源码/UF_RL_训练与预测流程详解.md

@@ -0,0 +1,1082 @@
+# UF-RL 训练与预测流程详解
+
+## 目录
+1. [训练阶段完整流程](#训练阶段完整流程)
+2. [预测阶段完整流程](#预测阶段完整流程)
+3. [从训练到部署的完整链路](#从训练到部署的完整链路)
+
+---
+
+## 训练阶段完整流程
+
+### 概述:智能体如何学会决策
+
+把训练过程想象成**培养一个工程师**:
+
+```
+新手工程师(随机决策)
+    ↓ 通过大量实践
+    ↓ 记住成功/失败的经验
+    ↓ 总结规律
+    ↓
+经验丰富的工程师(最优决策)
+```
+
+强化学习就是这个过程的数学化实现。
+
+---
+
+### 阶段0:准备工作(程序启动)
+
+#### 步骤0.1:固定随机种子
+```python
+def set_global_seed(seed=2025):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+```
+
+**作用**:保证每次训练结果一致,便于调试和复现
+
+---
+
+#### 步骤0.2:创建超滤系统参数
+```python
+params = UFParams(
+    q_UF=360.0,          # 进水流量:360 m³/h
+    TMP0=0.03,           # 初始TMP:0.03 MPa
+    TMP_max=0.06,        # TMP上限:0.06 MPa
+    L_min_s=3800.0,      # 产水时长下限:3800秒
+    L_max_s=6000.0,      # 产水时长上限:6000秒
+    t_bw_min_s=40.0,     # 反洗时长下限:40秒
+    t_bw_max_s=60.0,     # 反洗时长上限:60秒
+    ...
+)
+```
+
+**这些参数定义了**:
+- 物理系统的运行范围
+- 决策空间的边界
+- 约束条件
+
+---
+
+#### 步骤0.3:创建模拟环境
+```python
+env = UFSuperCycleEnv(params)
+env = Monitor(env)           # 包装:记录统计信息
+env = DummyVecEnv([env])     # 包装:向量化接口
+```
+
+**环境的作用**:
+- 模拟超滤系统的运行
+- 接收智能体的动作(产水时长、反洗时长)
+- 返回奖励和下一个状态
+
+**为什么需要包装?**
+- `Monitor`:自动记录每个episode的奖励、长度等
+- `DummyVecEnv`:统一单环境/多环境的接口(虽然这里只有1个)
+
+---
+
+#### 步骤0.4:创建DQN智能体
+```python
+model = DQN(
+    policy="MlpPolicy",        # 使用多层感知机策略网络
+    env=env,
+    learning_rate=1e-4,        # 学习率
+    buffer_size=10000,         # 经验回放池大小
+    learning_starts=200,       # 开始学习前的随机探索步数
+    batch_size=32,             # 每次训练的样本数
+    gamma=0.95,                # 折扣因子(重视长期奖励)
+    train_freq=4,              # 每4步训练一次
+    target_update_interval=1,  # 目标网络更新间隔
+    tau=0.005,                 # 软更新系数
+    exploration_initial_eps=1.0,   # 初始探索率100%
+    exploration_fraction=0.3,      # 前30%训练时间探索衰减
+    exploration_final_eps=0.02,    # 最终探索率2%
+    verbose=1,
+    tensorboard_log="./uf_dqn_tensorboard/"
+)
+```
+
+**DQN智能体包含**:
+1. **Q网络(当前网络)**:估计每个动作的价值
+2. **目标网络**:提供稳定的学习目标
+3. **经验回放池**:存储历史经验
+4. **优化器**:更新网络参数
+
+**Q网络结构**:
+```python
+输入层:4维状态 → [TMP0, last_L, last_t_bw, max_TMP]
+隐藏层1:4 → 64 (ReLU激活)
+隐藏层2:64 → 64 (ReLU激活)
+输出层:64 → 185 (每个动作的Q值)
+```
+
+---
+
+#### 步骤0.5:创建回调和记录器
+```python
+recorder = UFEpisodeRecorder()          # 记录每个episode的数据
+callback = UFTrainingCallback(recorder) # 训练回调
+```
+
+**记录内容**:
+- 每一步的状态、动作、奖励
+- 每个episode的总奖励、回收率等
+- 用于训练后分析
+
+---
+
+### 阶段1:初始化(第1个Episode开始)
+
+#### 环境重置
+```python
+state = env.reset()
+
+# 环境内部执行:
+self.TMP0 = np.random.uniform(0.01, 0.03)  # 随机初始TMP,例如0.025
+self.current_step = 0
+self.last_action = (3800, 40)  # 初始动作:最保守的选择
+self.max_TMP = self.TMP0
+
+# 计算初始状态
+state = [
+    (0.025 - 0.01) / 0.04,     # TMP0归一化 = 0.375
+    (3800 - 3800) / 2200,       # last_L归一化 = 0.0
+    (40 - 40) / 20,             # last_t_bw归一化 = 0.0
+    (0.025 - 0.01) / 0.04       # max_TMP归一化 = 0.375
+]
+# state = [0.375, 0.0, 0.0, 0.375]
+```
+
+**状态解释**:
+- 当前是一个"全新的超滤膜",TMP=0.025
+- 还没有历史操作(last_L和last_t_bw都是初始值)
+- 周期最高TMP就是当前TMP
+
+---
+
+### 阶段2:交互与学习循环(50000步)
+
+#### 完整的一步流程图
+
+```
+┌─────────────────────────────────────────────────────────┐
+│                    第 N 步                              │
+└─────────────────────────────────────────────────────────┘
+
+1. 当前状态
+   state = [0.375, 0.491, 0.5, 0.429]
+        ↓
+
+2. 动作选择(ε-贪心)
+   ┌────────────────────────────┐
+   │ if random() < epsilon:     │
+   │     action = random(0-184) │  ← 探索
+   │ else:                      │
+   │     Q值 = Q_network(state) │  ← 利用
+   │     action = argmax(Q值)   │
+   └────────────────────────────┘
+        ↓
+   假设选择 action = 92
+        ↓
+
+3. 动作解码
+   L_idx = 92 // 5 = 18
+   t_bw_idx = 92 % 5 = 2
+   L_s = 3800 + 18×60 = 4880秒
+   t_bw_s = 40 + 2×5 = 50秒
+        ↓
+
+4. 执行模拟
+   simulate_one_supercycle(TMP0=0.025, L_s=4880, t_bw_s=50)
+   ┌──────────────────────────────────┐
+   │  计算小周期次数:k = 35         │
+   │  For i in range(35):             │
+   │    产水:TMP增长                 │
+   │    反洗:TMP部分恢复             │
+   │  CEB:TMP完全恢复                │
+   │  计算指标:回收率、净供水率等   │
+   └──────────────────────────────────┘
+        ↓
+   返回:feasible=True, info={recovery:0.97, net_rate:338, ...}
+        ↓
+
+5. 计算奖励
+   reward = _score(info)
+   = 0.8×0.97 + 0.2×0.94 - 0.2×0.00
+   = 0.964(基础奖励)
+   放大后 = (0.964-0.5)² × 5 = 1.076
+        ↓
+
+6. 观察新状态
+   TMP0_new = 0.025(CEB后恢复)
+   next_state = [0.375, 0.491, 0.5, 0.429]
+        ↓
+
+7. 存储经验
+   buffer.add(
+       state = [0.375, 0.0, 0.0, 0.375],
+       action = 92,
+       reward = 1.076,
+       next_state = [0.375, 0.491, 0.5, 0.429],
+       done = False
+   )
+        ↓
+
+8. 训练网络(每4步,且步数>200)
+   ┌──────────────────────────────────────┐
+   │ if step % 4 == 0 and step > 200:    │
+   │    batch = buffer.sample(32)         │
+   │    训练Q网络(详见下方)             │
+   └──────────────────────────────────────┘
+        ↓
+
+9. 更新epsilon
+   epsilon = max(0.02, 1.0 - 0.98 * step / 15000)
+        ↓
+
+10. 记录数据
+    callback.record_step(state, action, reward, ...)
+        ↓
+
+11. 检查episode是否结束
+    if done or step >= 20:
+        state = env.reset()  # 开始新episode
+    else:
+        state = next_state   # 继续当前episode
+```
+
+---
+
+### 深入理解:Q网络训练(第204步首次训练)
+
+```python
+# ===== 第204步:终于可以开始学习了!=====
+
+# 1. 从经验池随机采样32条经验
+batch = buffer.sample(32)
+
+# 采样结果示例:
+batch = {
+    'state': [
+        [0.375, 0.0, 0.0, 0.375],      # 第1条经验的state
+        [0.425, 0.2, 0.3, 0.450],      # 第2条经验的state
+        ...                             # 共32条
+    ],
+    'action': [92, 105, 78, ...],      # 32个动作
+    'reward': [1.076, 0.845, -20, ...],# 32个奖励
+    'next_state': [...],                # 32个next_state
+    'done': [False, False, True, ...]   # 32个done标志
+}
+
+# 2. 转换为PyTorch张量
+state_tensor = torch.FloatTensor(batch['state'])       # [32, 4]
+action_tensor = torch.LongTensor(batch['action'])      # [32]
+reward_tensor = torch.FloatTensor(batch['reward'])     # [32]
+next_state_tensor = torch.FloatTensor(batch['next_state'])  # [32, 4]
+done_tensor = torch.FloatTensor(batch['done'])         # [32]
+
+# 3. 计算当前Q值(Q_current)
+q_values = Q_network(state_tensor)  # [32, 185]
+# 对于第1条经验,Q网络预测所有185个动作的Q值
+# 例如:[0.5, 0.6, ..., 1.2, ..., 0.8](185个值)
+
+q_current = q_values.gather(1, action_tensor.unsqueeze(1))  # [32, 1]
+# gather操作:取出实际执行的动作对应的Q值
+# 对于第1条经验,action=92,取出q_values[92] = 1.2
+# 结果:[1.2, 0.9, -5.0, ...](32个值)
+
+# 4. 计算目标Q值(Q_target)
+with torch.no_grad():  # 不计算梯度,加速
+    # 用目标网络预测next_state的Q值
+    next_q_values = Q_target(next_state_tensor)  # [32, 185]
+    
+    # 取每个next_state的最大Q值
+    next_q_max, _ = next_q_values.max(dim=1)  # [32]
+    # 例如:[1.5, 1.3, 0.0, ...]
+    
+    # 贝尔曼方程:Q_target = reward + gamma × max(Q_next) × (1-done)
+    target = reward_tensor + 0.95 × next_q_max × (1 - done_tensor)
+    # 对于第1条经验:
+    # target = 1.076 + 0.95 × 1.5 × (1-0) = 2.501
+    # 对于第3条经验(done=True):
+    # target = -20 + 0.95 × 0.0 × (1-1) = -20
+
+# 5. 计算TD误差(损失函数)
+loss = F.mse_loss(q_current.squeeze(), target)
+# MSE = mean((q_current - target)²)
+# 例如:mean([(1.2-2.501)², (0.9-2.135)², ...])
+# 假设 loss = 3.45
+
+# 6. 反向传播更新Q网络
+optimizer.zero_grad()   # 清空之前的梯度
+loss.backward()         # 计算梯度
+optimizer.step()        # 更新参数
+
+# 更新后,Q_network的参数被调整:
+# - 如果q_current < target,增大该动作的Q值
+# - 如果q_current > target,减小该动作的Q值
+
+# 7. 软更新目标网络
+for param, target_param in zip(Q_network.parameters(), Q_target.parameters()):
+    target_param.data.copy_(
+        0.005 × param.data + 0.995 × target_param.data
+    )
+# 目标网络缓慢追踪Q网络,tau=0.005表示每次只更新0.5%
+```
+
+---
+
+### 训练过程的关键时间点
+
+#### 时间线(50000步训练)
+
+```
+步数     | epsilon | 行为                    | 学习状态
+---------|---------|------------------------|------------------
+0-200    | 1.0     | 纯随机探索             | 填充经验池
+204      | 1.0     | 首次训练               | Q值从随机初始化开始学习
+500      | 0.97    | 探索为主               | Q值逐渐有意义
+2000     | 0.87    | 探索与利用并存         | 策略初步成型
+5000     | 0.67    | 开始偏向利用           | Q值趋于稳定
+15000    | 0.02    | 探索衰减完成           | 基本使用最优策略
+15000+   | 0.02    | 98%利用,2%探索        | 策略优化与稳定
+50000    | 0.02    | 训练结束               | 保存最终模型
+```
+
+#### 示例:第5000步时的决策过程
+
+```python
+# 当前状态:TMP稍高
+state = [0.625, 0.55, 0.6, 0.650]  # TMP0=0.035 MPa
+
+# epsilon=0.67,仍有67%概率随机探索
+if random() < 0.67:
+    action = random.randint(0, 184)  # 假设随机到115
+else:
+    # 33%概率使用Q网络
+    q_values = Q_network(state)
+    # Q值示例(部分):
+    # action 0 (L=3800, t_bw=40): Q=0.45  (保守,TMP低但产水少)
+    # action 92 (L=4880, t_bw=50): Q=0.78 (平衡)
+    # action 115 (L=5180, t_bw=40): Q=0.82 (激进,产水多但TMP升高)
+    # action 184 (L=5960, t_bw=60): Q=-2.5 (太激进,违反约束)
+    
+    action = argmax(q_values) = 115
+
+# 解码动作
+L_s = 5180, t_bw_s = 40
+
+# 执行模拟
+feasible, info = simulate_one_supercycle(0.035, 5180, 40)
+# 结果:feasible=True(刚好没违反约束)
+#       recovery=0.965, net_rate=330
+
+# 计算奖励
+reward = 0.75(较好但不是最优,因为TMP有点高)
+
+# 存储经验并训练
+# Q网络学到:在TMP=0.035时,action=115是一个还不错的选择
+```
+
+---
+
+### 示例:第25000步时的决策过程
+
+```python
+# 同样的状态
+state = [0.625, 0.55, 0.6, 0.650]  # TMP0=0.035 MPa
+
+# epsilon=0.02,只有2%概率随机探索
+if random() < 0.02:
+    action = random(0-184)
+else:
+    # 98%概率使用Q网络(此时Q值已经很准确)
+    q_values = Q_network(state)
+    # 经过25000步学习,Q值更精准:
+    # action 0: Q=0.52   (保守,稳定)
+    # action 85: Q=0.88  (最优!)✓
+    # action 92: Q=0.85  (次优)
+    # action 115: Q=0.65 (之前试过,风险高)
+    # action 184: Q=-15.0(确定违反约束)
+    
+    action = 85  # 选择最优动作
+
+# 解码
+L_s = 4820, t_bw_s = 40
+
+# 执行
+feasible, info = simulate(0.035, 4820, 40)
+# 结果:feasible=True
+#       recovery=0.972, net_rate=335
+#       TMP贴边度低,安全
+
+# 奖励
+reward = 0.92(接近最优)
+
+# 智能体学会了:
+# - 在TMP较高时,要稍微保守一点
+# - L_s=4820是在高TMP下的最佳平衡点
+```
+
+---
+
+### 阶段3:训练结束与保存
+
+```python
+# 训练完成
+model.learn(total_timesteps=50000)  # 循环结束
+
+# 保存模型
+model.save("dqn_model.zip")
+
+# 模型文件包含:
+# 1. Q_network的所有参数(权重和偏置)
+# 2. 优化器状态
+# 3. 训练配置(学习率等)
+# 不包含:经验回放池(太大且推理时不需要)
+```
+
+**训练日志统计**:
+```python
+stats = recorder.get_episode_stats()
+print(f"""
+训练完成统计:
+- 总步数:50000
+- 总episode数:约2500(平均每episode 20步)
+- 最终平均奖励:0.85
+- 约束违反率:5%(从初期80%大幅下降)
+- 平均回收率:0.968
+""")
+```
+
+---
+
+## 预测阶段完整流程
+
+### 概述:训练好的智能体如何工作
+
+训练完成后,模型已经学会了:
+```
+给定状态(TMP, 历史操作) → 选择最优动作(L_s, t_bw_s)
+```
+
+预测阶段不需要:
+- ✗ 探索(epsilon=0,总是选择最优)
+- ✗ 训练(不更新网络参数)
+- ✗ 经验池(不存储新经验)
+
+预测阶段只需要:
+- ✓ 加载训练好的Q网络
+- ✓ 输入当前状态
+- ✓ 输出最优动作
+
+---
+
+### 预测流程详解
+
+#### 场景:工厂实时决策
+
+假设当前时间:2025-01-15 10:00,超滤系统运行中,需要决定下一个周期的参数。
+
+---
+
+#### 步骤1:获取当前系统状态
+
+```python
+# 从工厂SCADA系统读取实时数据
+current_TMP0 = 0.032  # 当前TMP(MPa)
+last_L_s = 4500       # 上一周期产水时长(秒)
+last_t_bw_s = 50      # 上一周期反洗时长(秒)
+max_TMP_last = 0.045  # 上一周期最高TMP(MPa)
+
+# 也可以从数据库查询历史记录
+# 或者如果是第一次运行,使用默认值
+```
+
+---
+
+#### 步骤2:初始化决策环境和模型
+
+```python
+from DQN_decide import run_uf_DQN_decide
+from DQN_env import UFParams
+
+# 2.1 创建系统参数(与训练时一致)
+uf_params = UFParams(
+    q_UF=360.0,
+    TMP_max=0.06,
+    # ... 其他参数
+)
+
+# 2.2 加载训练好的模型(自动完成)
+# 模型文件:dqn_model.zip
+# 内部会执行:
+# model = DQN.load("dqn_model.zip")
+```
+
+---
+
+#### 步骤3:执行决策
+
+```python
+# 3.1 调用决策接口
+result = run_uf_DQN_decide(
+    uf_params=uf_params,
+    TMP0_value=0.032  # 输入当前TMP
+)
+
+# 3.2 决策内部流程详解
+def run_uf_DQN_decide(uf_params, TMP0_value):
+    # Step A: 创建环境实例
+    env = UFSuperCycleEnv(uf_params)
+    
+    # Step B: 设置当前TMP
+    env.current_params.TMP0 = 0.032
+    env.last_action = (4500, 50)      # 使用历史动作
+    env.max_TMP_during_filtration = 0.045
+    
+    # Step C: 获取归一化状态
+    obs = env._get_obs()
+    # obs = [
+    #     (0.032 - 0.01) / 0.04 = 0.55,   # TMP0
+    #     (4500 - 3800) / 2200 = 0.318,   # last_L
+    #     (50 - 40) / 20 = 0.5,           # last_t_bw
+    #     (0.045 - 0.01) / 0.04 = 0.875   # max_TMP
+    # ]
+    # obs = [0.55, 0.318, 0.5, 0.875]
+    
+    # Step D: 模型预测(确定性,不探索)
+    obs_reshaped = obs.reshape(1, -1)  # [1, 4]
+    action, _states = model.predict(obs_reshaped, deterministic=True)
+    
+    # 模型内部执行:
+    # q_values = Q_network(obs_reshaped)  # [1, 185]
+    # 例如 q_values = [[0.45, 0.67, ..., 0.89, ..., -3.2]]
+    # action = argmax(q_values) = 105
+    # (选择Q值最大的动作)
+    
+    action = action[0]  # 105
+    
+    # Step E: 解码动作
+    L_s, t_bw_s = env._get_action_values(105)
+    # L_idx = 105 // 5 = 21
+    # t_bw_idx = 105 % 5 = 0
+    # L_s = 3800 + 21×60 = 5060秒
+    # t_bw_s = 40 + 0×5 = 40秒
+    
+    # Step F: 模拟验证(可选,检查可行性)
+    next_obs, reward, terminated, truncated, info = env.step(105)
+    
+    # Step G: 返回决策结果
+    return {
+        "action": 105,
+        "L_s": 5060.0,
+        "t_bw_s": 40.0,
+        "next_obs": next_obs,
+        "reward": reward,
+        "terminated": terminated,
+        "truncated": truncated,
+        "info": info
+    }
+
+# 3.3 获取决策结果
+print(f"""
+模型决策结果:
+- 建议产水时长:{result['L_s']} 秒 (约{result['L_s']/60:.1f}分钟)
+- 建议反洗时长:{result['t_bw_s']} 秒
+- 预期回收率:{result['info']['recovery']:.3f}
+- 预期净供水率:{result['info']['net_delivery_rate_m3ph']:.1f} m³/h
+- 预期周期最高TMP:{result['info']['max_TMP_during_filtration']:.4f} MPa
+""")
+
+# 输出示例:
+# 模型决策结果:
+# - 建议产水时长:5060 秒 (约84.3分钟)
+# - 建议反洗时长:40 秒
+# - 预期回收率:0.968
+# - 预期净供水率:332.5 m³/h
+# - 预期周期最高TMP:0.0485 MPa
+```
+
+---
+
+#### 步骤4:生成PLC指令(渐进式调整)
+
+为了避免参数突变导致系统不稳定,使用渐进式调整策略:
+
+```python
+from DQN_decide import generate_plc_instructions
+
+# 4.1 准备输入
+current_L_s = 4500      # 工厂当前设定值
+current_t_bw_s = 50     # 工厂当前设定值
+model_prev_L_s = 4800   # 模型上一轮建议值(如果有)
+model_prev_t_bw_s = 45  # 模型上一轮建议值
+model_L_s = 5060        # 模型本轮建议值
+model_t_bw_s = 40       # 模型本轮建议值
+
+# 4.2 生成渐进式指令
+next_L_s, next_t_bw_s = generate_plc_instructions(
+    current_L_s,
+    current_t_bw_s,
+    model_prev_L_s,
+    model_prev_t_bw_s,
+    model_L_s,
+    model_t_bw_s
+)
+
+# 4.3 内部逻辑详解
+def generate_plc_instructions(...):
+    # Step 1: 选择基准值(上一轮模型值 vs 当前值)
+    # 选择更接近本轮模型建议的那个
+    if abs(current_L_s - model_L_s) <= abs(model_prev_L_s - model_L_s):
+        effective_current_L = 4500  # 当前值更接近
+    else:
+        effective_current_L = 4800  # 上轮值更接近
+    
+    # 假设选择了4800
+    
+    # Step 2: 计算差异
+    L_diff = model_L_s - effective_current_L
+    # = 5060 - 4800 = 260秒
+    
+    # Step 3: 渐进调整(每次最多变化1个步长)
+    L_step_s = 60  # 步长60秒
+    threshold = 1.0
+    
+    if abs(L_diff) >= threshold * L_step_s:
+        if L_diff > 0:
+            L_adjustment = +60  # 向上调整
+        else:
+            L_adjustment = -60  # 向下调整
+    else:
+        L_adjustment = 0  # 差异小,不调整
+    
+    next_L_s = 4800 + 60 = 4860秒
+    
+    # 同样处理t_bw_s
+    # t_bw_diff = 40 - 45 = -5秒
+    # abs(-5) >= 1.0 × 5 → True
+    # t_bw_adjustment = -5
+    next_t_bw_s = 45 - 5 = 40秒
+    
+    return 4860, 40
+
+# 4.4 结果
+print(f"""
+PLC指令:
+- 下发产水时长:{next_L_s} 秒(从{effective_current_L}秒逐步调整)
+- 下发反洗时长:{next_t_bw_s} 秒
+- 调整方向:向模型建议值({model_L_s}秒, {model_t_bw_s}秒)靠拢
+- 需要继续调整轮数:约{abs(model_L_s - next_L_s) // 60}轮
+""")
+
+# 输出:
+# PLC指令:
+# - 下发产水时长:4860 秒(从4800秒逐步调整)
+# - 下发反洗时长:40 秒
+# - 调整方向:向模型建议值(5060秒, 40秒)靠拢
+# - 需要继续调整轮数:约3轮
+```
+
+**渐进调整的好处**:
+- ✅ 避免参数突变导致TMP急剧波动
+- ✅ 给操作员时间观察和干预
+- ✅ 系统更平稳过渡
+
+---
+
+#### 步骤5:计算预期性能指标
+
+在实际下发指令前,先模拟计算性能:
+
+```python
+from DQN_decide import calc_uf_cycle_metrics
+
+# 5.1 计算指令对应的性能
+TMP0 = 0.032
+max_tmp = 0.048  # 如果工厂有实测数据
+min_tmp = 0.025
+L_s = 4860
+t_bw_s = 40
+
+metrics = calc_uf_cycle_metrics(
+    uf_params,
+    TMP0,
+    max_tmp,
+    min_tmp,
+    L_s,
+    t_bw_s
+)
+
+# 5.2 内部计算流程
+def calc_uf_cycle_metrics(...):
+    # 模拟一个完整超级周期
+    feasible, info = simulate_one_supercycle(params, L_s, t_bw_s)
+    
+    # 提取关键指标
+    k_bw_per_ceb = info["k_bw_per_ceb"]  # 小周期次数
+    recovery = info["recovery"]            # 回收率
+    net_rate = info["net_delivery_rate_m3ph"]  # 净供水率
+    daily_prod_time = info["daily_prod_time_h"]  # 日均产水时间
+    ton_water_energy = info["ton_water_energy_kWh_per_m3"]  # 吨水电耗
+    
+    # 计算渗透率
+    if min_tmp is not None:
+        max_permeability = 100 * q_UF / membrane_area / min_tmp  # membrane_area:膜面积
+    else:
+        max_permeability = info["max_permeability"]  # 由模拟结果给出
+    
+    return {
+        "k_bw_per_ceb": 35,
+        "recovery": 0.968,
+        "net_delivery_rate_m3ph": 332.8,
+        "daily_prod_time_h": 18.5,
+        "ton_water_energy_kWh_per_m3": 0.1019,
+        "max_permeability": 58.3  # lmh/bar
+    }
+
+# 5.3 展示结果
+print(f"""
+预期性能指标:
+- 小周期次数(48h内):{metrics['k_bw_per_ceb']}
+- 回收率:{metrics['recovery']:.2%}
+- 净供水率:{metrics['net_delivery_rate_m3ph']:.1f} m³/h
+- 日均产水时间:{metrics['daily_prod_time_h']:.1f} 小时/天
+- 吨水电耗:{metrics['ton_water_energy_kWh_per_m3']:.4f} kWh/m³
+- 最高渗透率:{metrics['max_permeability']:.1f} lmh/bar
+""")
+
+# 输出:
+# 预期性能指标:
+# - 小周期次数(48h内):35
+# - 回收率:96.80%
+# - 净供水率:332.8 m³/h
+# - 日均产水时间:18.5 小时/天
+# - 吨水电耗:0.1019 kWh/m³
+# - 最高渗透率:58.3 lmh/bar
+```
+
+---
+
+#### 步骤6:下发PLC指令
+
+```python
+# 6.1 准备PLC通信数据包
+plc_command = {
+    "timestamp": "2025-01-15 10:00:00",
+    "L_s": 4860,        # 产水时长(秒)
+    "t_bw_s": 40,       # 反洗时长(秒)
+    "source": "AI_DQN", # 指令来源
+    "confidence": 0.95  # 模型置信度
+}
+
+# 6.2 发送到PLC(伪代码)
+# plc_client.write_registers(
+#     address=1000,
+#     values=[4860, 40]
+# )
+
+# 6.3 记录操作日志
+log_decision(
+    timestamp="2025-01-15 10:00:00",
+    TMP0=0.032,
+    model_L_s=5060,
+    model_t_bw_s=40,
+    plc_L_s=4860,
+    plc_t_bw_s=40,
+    expected_recovery=0.968
+)
+```
+
+---
+
+#### 步骤7:周期性重复预测
+
+```python
+# 每个超级周期结束后(约48小时)重新预测
+
+while True:
+    # 等待当前周期结束
+    wait_for_cycle_end()
+    
+    # 获取最新系统状态
+    current_state = get_system_state()
+    
+    # 执行决策
+    result = run_uf_DQN_decide(uf_params, current_state.TMP0)
+    
+    # 渐进调整
+    next_L, next_t_bw = generate_plc_instructions(...)
+    
+    # 下发指令
+    send_to_plc(next_L, next_t_bw)
+    
+    # 记录日志
+    log_decision(...)
+    
+    # 等待下一周期
+    sleep(48_hours)
+```
+
+---
+
+## 从训练到部署的完整链路
+
+### 完整流程图
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│                    阶段1:离线训练                           │
+└─────────────────────────────────────────────────────────────┘
+
+1. 数据准备
+   ├─ 历史运行数据(可选,用于物理模型)
+   └─ 系统参数配置(UFParams)
+
+2. 模拟器开发
+   ├─ 物理模型:TMP增长、反洗恢复
+   └─ 约束检查:TMP上限、残余增量
+
+3. 强化学习训练
+   ├─ 环境:UFSuperCycleEnv
+   ├─ 算法:DQN
+   ├─ 训练:50000步,约数小时
+   └─ 输出:dqn_model.zip
+
+4. 模型验证
+   ├─ 测试不同TMP条件
+   ├─ 检查约束满足率
+   └─ 评估性能指标
+
+        ↓
+
+┌─────────────────────────────────────────────────────────────┐
+│                    阶段2:在线部署                           │
+└─────────────────────────────────────────────────────────────┘
+
+5. 部署准备
+   ├─ 将dqn_model.zip部署到服务器
+   ├─ 配置PLC通信接口
+   └─ 搭建监控系统
+
+6. 实时决策循环
+   Every 48 hours:
+   ├─ 从SCADA读取TMP0
+   ├─ 调用run_uf_DQN_decide()
+   ├─ 生成PLC指令(渐进式)
+   ├─ 下发到PLC
+   └─ 记录日志
+
+7. 持续监控
+   ├─ 实时性能追踪
+   ├─ 异常告警
+   └─ 人工干预接口
+
+        ↓
+
+┌─────────────────────────────────────────────────────────────┐
+│                    阶段3:持续优化                           │
+└─────────────────────────────────────────────────────────────┘
+
+8. 数据收集
+   ├─ 记录实际运行数据
+   ├─ 标注异常事件
+   └─ 构建真实数据集
+
+9. 模型迭代
+   ├─ 用真实数据训练物理模型
+   ├─ 重新训练强化学习策略
+   └─ A/B测试新旧模型
+
+10. 上线新版本
+    ├─ 灰度发布
+    ├─ 性能对比
+    └─ 全量替换
+```
+
+---
+
+### 训练vs预测对比表
+
+| 维度 | 训练阶段 | 预测阶段 |
+|------|---------|---------|
+| **目标** | 学习最优策略 | 应用最优策略 |
+| **输入** | 随机初始状态 | 真实系统状态 |
+| **动作选择** | ε-贪心(探索+利用) | 贪心(纯利用) |
+| **奖励** | 计算并用于学习 | 计算但不学习 |
+| **网络更新** | 每4步更新参数 | 不更新参数 |
+| **经验池** | 存储并采样 | 不使用 |
+| **epsilon** | 1.0 → 0.02 | 0(不探索) |
+| **时间** | 数小时(50000步) | 毫秒级(单次推理) |
+| **输出** | dqn_model.zip | (L_s, t_bw_s) |
+
+---
+
+### 关键差异示例
+
+#### 训练时的动作选择(第1000步)
+```python
+state = [0.55, 0.318, 0.5, 0.875]
+epsilon = 0.93  # 仍在高探索期
+
+if random() < 0.93:
+    action = 127  # 93%概率:随机探索
+else:
+    q_values = Q_network(state)
+    action = argmax(q_values)  # 7%概率:利用
+
+# 即使知道最优动作,也故意选择次优动作来探索
+```
+
+#### 预测时的动作选择
+```python
+state = [0.55, 0.318, 0.5, 0.875]
+epsilon = 0  # 预测时不探索
+
+# 总是选择Q值最大的动作
+q_values = Q_network(state)
+# q_values = [0.45, ..., 0.89, ..., 0.76, ...]
+action = argmax(q_values)  # 一定选择最优
+
+# 确定性决策,相同状态总是输出相同动作
+```
+
+---
+
+## 常见问题解答
+
+### Q1:为什么训练需要50000步?
+**A**:
+- 185个动作 × 多种状态 = 需要大量经验
+- 前30%(15000步)主要是探索,发现好动作
+- 后70%(35000步)是优化,微调策略
+- 太少(如5000步)策略不稳定,太多(如500000步)训练时间长且收益递减
+
+### Q2:预测速度有多快?
+**A**:
+```python
+import time
+
+start = time.time()
+result = run_uf_DQN_decide(uf_params, 0.032)
+end = time.time()
+
+print(f"预测耗时:{(end-start)*1000:.2f} 毫秒")
+# 典型输出:预测耗时:15.32 毫秒
+
+# 分解:
+# - 状态归一化:<1ms
+# - Q网络前向传播:5-10ms
+# - 动作解码:<1ms
+# - 模拟验证:5-10ms
+```
+
+### Q3:模型何时需要重新训练?
+**A**:以下情况需要重训:
+1. **系统参数变化**:更换膜组、改变流量范围
+2. **物理模型更新**:有了真实数据,改进模拟器
+3. **性能下降**:实际回收率持续低于预期
+4. **新约束**:增加了新的运行限制
+
+通常**3-6个月**重训一次。
+
+### Q4:如何确保预测的动作安全?
+**A**:多重保障:
+```python
+# 1. 训练时的约束学习
+# 模型在训练时已经学会避免违反约束
+
+# 2. 预测后的模拟验证
+feasible, info = simulate_one_supercycle(TMP0, L_s, t_bw_s)
+if not feasible:
+    # 回退到保守策略
+    L_s, t_bw_s = safe_default_action()
+
+# 3. 渐进式调整
+# 每次只调整60秒,避免突变
+
+# 4. 人工监督
+# 操作员可以随时覆盖模型决策
+```
+
+### Q5:训练时为什么要探索?
+**A**:
+```
+假设没有探索(epsilon=0):
+
+初始Q值是随机的,假设:
+- action 50 的Q值 = 0.8(最高)
+- action 92 的Q值 = 0.3
+- action 120 的Q值 = 0.5
+
+智能体会一直选择action 50,永远不会尝试92和120。
+
+但实际上:
+- action 50 的真实价值 = 0.6(不够好)
+- action 92 的真实价值 = 0.9(最优!)
+- action 120 的真实价值 = 0.7
+
+没有探索,永远发现不了action 92才是最优的。
+
+有探索:
+- 前期随机尝试各种动作
+- 发现action 92 获得高奖励
+- 更新Q值:Q(92) = 0.3 → 0.9
+- 后期选择action 92
+```
+
+### Q6:预测可以并行吗?
+**A**:可以,但要注意:
+```python
+# 单个预测(串行)
+result = run_uf_DQN_decide(uf_params, TMP0)
+
+# 批量预测(并行,需要修改代码)
+TMP0_batch = [0.025, 0.030, 0.035, 0.040]
+results = run_uf_DQN_decide_batch(uf_params, TMP0_batch)
+
+# 内部并行:
+obs_batch = np.array([
+    [0.375, 0.0, 0.0, 0.375],
+    [0.55, 0.2, 0.3, 0.6],
+    ...
+])  # [4, 4]
+
+q_values = Q_network(obs_batch)  # [4, 185]
+actions = q_values.argmax(dim=1)  # [4]
+
+# GPU加速:
+# PyTorch自动利用GPU并行计算
+# 批量预测速度几乎和单次一样快
+```
+
+---
+
+## 总结
+
+### 训练流程核心
+1. **准备阶段**:创建环境、初始化Q网络
+2. **探索阶段**(0-15000步):随机尝试,填充经验池
+3. **学习阶段**(200-50000步):从经验中学习,更新Q值
+4. **优化阶段**(15000-50000步):利用为主,微调策略
+5. **保存模型**:导出dqn_model.zip
+
+### 预测流程核心
+1. **加载模型**:读取训练好的Q网络
+2. **获取状态**:从系统读取TMP等信息
+3. **模型推理**:Q网络计算,选择最优动作
+4. **渐进调整**:生成PLC指令,逐步靠近目标
+5. **下发执行**:发送到PLC,控制超滤系统
+
+### 关键洞察
+- **训练是学习过程**:智能体从"一无所知"到"经验丰富"
+- **预测是应用过程**:智能体"发挥所学"解决实际问题
+- **探索是必要代价**:没有探索就没有发现
+- **渐进是安全保障**:避免参数突变引发风险
+

+ 1160 - 0
models/uf-rl/超滤训练源码/UF_RL_详细技术文档.md

@@ -0,0 +1,1160 @@
+# UF-RL 强化学习系统详细技术文档
+
+## 目录
+1. [整体架构](#整体架构)
+2. [代码结构分析](#代码结构分析)
+3. [训练流程详解](#训练流程详解)
+4. [关键代码解析](#关键代码解析)
+5. [数据流与控制流](#数据流与控制流)
+
+---
+
+## 整体架构
+
+### 系统组成
+
+```
+┌─────────────────────────────────────────────────────────┐
+│                     DQN训练系统                         │
+│                                                         │
+│  ┌──────────────┐      ┌──────────────┐              │
+│  │  DQN_train   │─────▶│   DQN Agent  │              │
+│  │  (训练脚本)  │      │ (神经网络)   │              │
+│  └──────────────┘      └──────────────┘              │
+│         │                      │                       │
+│         │                      │ predict               │
+│         ▼                      ▼                       │
+│  ┌──────────────────────────────────────┐             │
+│  │         UFSuperCycleEnv              │             │
+│  │         (强化学习环境)               │             │
+│  │                                      │             │
+│  │  ┌────────────────────────────────┐ │             │
+│  │  │   simulate_one_supercycle()    │ │             │
+│  │  │   (物理模拟器)                 │ │             │
+│  │  │                                │ │             │
+│  │  │  ┌──────────┐  ┌──────────┐  │ │             │
+│  │  │  │model_fp  │  │model_bw  │  │ │             │
+│  │  │  │TMP增长   │  │反洗恢复  │  │ │             │
+│  │  │  └──────────┘  └──────────┘  │ │             │
+│  │  └────────────────────────────────┘ │             │
+│  └──────────────────────────────────────┘             │
+│         │                                              │
+│         ▼                                              │
+│  ┌──────────────┐                                     │
+│  │  Callback    │                                     │
+│  │  (记录器)    │                                     │
+│  └──────────────┘                                     │
+└─────────────────────────────────────────────────────────┘
+```
+
+### 文件职责
+
+| 文件 | 职责 | 核心类/函数 |
+|------|------|------------|
+| `DQN_train.py` | 训练入口、参数配置、训练循环 | `DQNTrainer`, `train_uf_rl_agent()` |
+| `DQN_env.py` | 强化学习环境、物理模拟 | `UFSuperCycleEnv`, `simulate_one_supercycle()` |
+| `UF_models.py` | TMP动力学模型 | `TMPIncreaseModel`, `TMPDecreaseModel` |
+| `DQN_decide.py` | 推理决策接口 | `run_uf_DQN_decide()`, `generate_plc_instructions()` |
+
+---
+
+## 代码结构分析
+
+### 1. UF_models.py - 物理模型层
+
+#### TMPIncreaseModel:TMP增长模型
+
+```python
+class TMPIncreaseModel(torch.nn.Module):
+    def forward(self, p, L_h):
+        # 简化的膜污染动力学公式
+        return float(p.alpha * (p.q_UF ** p.belta) * L_h)
+```
+
+**公式解释**:
+```
+ΔTMP = α × Q^β × t
+
+其中:
+- α (alpha):污染系数(1e-6)
+- Q (q_UF):进水流量(360 m³/h)
+- β (belta):幂指数(1.1)
+- t (L_h):过滤时间(小时)
+```
+
+**物理含义**:
+- TMP增长与过滤时间线性相关
+- 流量越大,污染速率越快(幂律关系)
+- 假设污染速率恒定(实际会随时间变化)
+
+#### TMPDecreaseModel:反洗恢复模型
+
+```python
+class TMPDecreaseModel(torch.nn.Module):
+    def forward(self, p, L_s, t_bw_s):
+        # 反洗去除比例上界(随过滤时长衰减)
+        upper_L = phi_bw_min + (phi_bw_max - phi_bw_min) * exp(-L / L_ref)
+        
+        # 反洗时长增益(饱和曲线)
+        time_gain = 1 - exp(-(t / tau_bw) ^ gamma_t)
+        
+        # 实际去除比例
+        phi = upper_L × time_gain
+        return clip(phi, 0.0, 0.999)
+```
+
+**公式解释**:
+```
+φ(L, t) = [φ_min + (φ_max - φ_min) × e^(-L/L_ref)] × [1 - e^(-(t/τ)^γ)]
+
+其中:
+- φ_min = 0.7:最小去除比例(长时间过滤后)
+- φ_max = 1.0:最大去除比例(短时间过滤)
+- L_ref = 4000s:过滤时长影响的时间尺度
+- τ = 20s:反洗时长影响的时间尺度
+- γ = 1.0:反洗时长作用指数
+```
+
+**物理含义**:
+- 过滤时间越长,污染越难清除(upper_L下降)
+- 反洗时间越长,去除效果越好(time_gain上升)
+- 存在饱和效应:反洗超过一定时间后效果不再显著提升
+
+**关键问题**:
+⚠️ 这两个"模型"实际上是**数学公式**,不是神经网络!
+- 没有可训练的参数(state_dict为空)
+- 保存为.pth文件没有实际意义
+- 完全基于人工设定的公式,可能与真实系统有偏差
+
+---
+
+### 2. DQN_env.py - 环境层
+
+#### UFParams:系统参数配置
+
+```python
+@dataclass
+class UFParams:
+    # 膜运行参数
+    q_UF: float = 360.0        # 进水流量
+    TMP0: float = 0.03         # 初始TMP
+    TMP_max: float = 0.06      # TMP上限
+    
+    # 污染动力学参数
+    alpha: float = 1e-6        # TMP增长系数
+    belta: float = 1.1         # 幂指数
+    
+    # 反洗参数
+    q_bw_m3ph: float = 1000.0  # 反洗流量
+    
+    # CEB参数
+    T_ceb_interval_h: float = 48.0  # CEB间隔
+    v_ceb_m3: float = 30.0     # CEB用水
+    t_ceb_s: float = 2400.0    # CEB时长
+    
+    # 约束
+    dTMP: float = 0.001        # 单次残余增量上限
+    
+    # 动作空间
+    L_min_s: float = 3800.0    # 过滤时长下限
+    L_max_s: float = 6000.0    # 过滤时长上限
+    t_bw_min_s: float = 40.0   # 反洗时长下限
+    t_bw_max_s: float = 60.0   # 反洗时长上限
+    
+    # 奖励权重
+    w_rec: float = 0.8         # 回收率权重
+    w_rate: float = 0.2        # 净供水率权重
+    w_headroom: float = 0.2    # TMP贴边惩罚权重
+```
+
+#### simulate_one_supercycle():核心物理模拟器
+
+```python
+def simulate_one_supercycle(p: UFParams, L_s: float, t_bw_s: float):
+    """
+    模拟一个完整的超级周期(多个小周期 + 1次CEB)
+    
+    输入:
+        p: 系统参数
+        L_s: 单次产水时长(秒)
+        t_bw_s: 单次反洗时长(秒)
+    
+    输出:
+        (feasible, info)
+        - feasible: 是否满足所有约束
+        - info: 性能指标字典
+    """
+```
+
+**执行流程**:
+
+```
+1. 初始化
+   tmp = TMP0
+   max_tmp = TMP0
+   min_tmp = TMP0
+
+2. 计算小周期次数
+   小周期时长 = L_s + t_bw_s
+   k_bw_per_ceb = floor(48小时 / 小周期时长)
+
+3. 循环k_bw_per_ceb次(多个小周期)
+   For i in range(k_bw_per_ceb):
+       3.1 产水阶段
+           tmp_start = tmp
+           Δtmp = model_fp(L_h)  # 计算TMP增长
+           tmp_peak = tmp_start + Δtmp
+           
+           约束检查1:tmp_peak ≤ TMP_max
+           If 违反: return False
+           
+       3.2 反洗阶段
+           φ = model_bw(L_s, t_bw_s)  # 计算去除比例
+           tmp_after_bw = tmp_peak - φ × (tmp_peak - tmp_start)
+           
+           约束检查2:(tmp_after_bw - tmp_start) ≤ dTMP
+           If 违反: return False
+           
+       3.3 更新TMP
+           tmp = tmp_after_bw
+           更新max_tmp和min_tmp
+
+4. CEB阶段
+   tmp = TMP0  # 完全恢复
+
+5. 计算性能指标
+   V_feed = k × q_UF × L_h           # 总进水
+   V_loss = k × V_bw + V_ceb         # 总损失
+   V_net = V_feed - V_loss            # 净产水
+   
+   recovery = V_net / V_feed          # 回收率
+   net_rate = V_net / T_super         # 净供水率
+   
+   吨水电耗 = 查表(L_s)
+   日均产水时间 = (k × L_h / T_super) × 24
+
+6. 贴边检查
+   headroom_ratio = max_tmp / TMP_max
+   If headroom_ratio > 0.98: return False
+
+7. 返回结果
+   return True, {
+       "recovery": recovery,
+       "net_delivery_rate_m3ph": net_rate,
+       "max_TMP_during_filtration": max_tmp,
+       ...
+   }
+```
+
+**约束体系**:
+
+| 约束 | 检查点 | 物理含义 |
+|------|--------|---------|
+| TMP峰值 ≤ 0.06 MPa | 产水后 | 防止膜破裂 |
+| 单次残余增量 ≤ 0.001 MPa | 反洗后 | 控制污染累积速率 |
+| TMP贴边 < 98% | 周期结束 | 保留安全余量 |
+
+#### _score():奖励函数
+
+```python
+def _score(p: UFParams, rec: dict) -> float:
+    # 1. 归一化净供水率
+    rate_norm = rec["net_delivery_rate_m3ph"] / p.q_UF
+    
+    # 2. TMP软惩罚(sigmoid)
+    tmp_ratio = rec["max_TMP"] / p.TMP_max
+    k = 10.0
+    headroom_penalty = 1 / (1 + exp(-k × (tmp_ratio - 1.0)))
+    
+    # 3. 基础奖励(加权和)
+    base_reward = (
+        0.8 × recovery 
+        + 0.2 × rate_norm 
+        - 0.2 × headroom_penalty
+    )
+    # 典型范围:0.6 ~ 0.9
+    
+    # 4. 非线性放大
+    amplified = (base_reward - 0.5)² × 5.0
+    
+    # 5. 保留符号
+    if base_reward < 0.5:
+        amplified = -amplified
+    
+    return amplified
+```
+
+**奖励设计逻辑**:
+
+```
+目标1:高回收率(主要)
+    - 回收率接近1 → 高奖励
+    - 权重0.8
+
+目标2:高净供水率(次要)
+    - 净供水率/进水流量 → 归一化到0-1
+    - 权重0.2
+
+惩罚:TMP贴边
+    - TMP接近上限 → sigmoid惩罚
+    - TMP超过上限 → 惩罚急剧增大
+    - 权重0.2
+
+非线性变换目的:
+    - 放大好动作和坏动作的差异
+    - 让Q值学习更快
+    - 典型奖励范围:-1.25 ~ 0.8
+```
+
+**奖励曲线分析**:
+
+```python
+base_reward = 0.85 → amplified = (0.85-0.5)²×5 = 0.6125
+base_reward = 0.70 → amplified = (0.70-0.5)²×5 = 0.2000
+base_reward = 0.50 → amplified = 0.0000
+base_reward = 0.30 → amplified = -(0.30-0.5)²×5 = -0.2000
+```
+
+**问题**:
+⚠️ 非线性变换可能导致:
+- Q值估计不稳定
+- 梯度爆炸/消失
+- 不同状态下的奖励尺度差异过大
+
+#### UFSuperCycleEnv:强化学习环境
+
+```python
+class UFSuperCycleEnv(gym.Env):
+    """
+    Gym标准环境接口
+    """
+    
+    def __init__(self, base_params, max_episode_steps=20):
+        # 离散动作空间
+        L_values = arange(3800, 6001, 60)  # 37个选项
+        t_bw_values = arange(40, 61, 5)    # 5个选项
+        self.action_space = Discrete(37 × 5 = 185)
+        
+        # 连续状态空间(归一化到[0,1])
+        self.observation_space = Box(
+            low=0, high=1, shape=(4,)
+        )
+```
+
+**状态定义**:
+
+```python
+def _get_obs(self):
+    # 状态向量:[TMP0, last_L, last_t_bw, max_TMP]
+    return [
+        (TMP0 - 0.01) / (0.05 - 0.01),           # 当前初始TMP
+        (L_s - 3800) / (6000 - 3800),            # 上次产水时长
+        (t_bw_s - 40) / (60 - 40),               # 上次反洗时长
+        (max_TMP - 0.01) / (0.05 - 0.01)         # 本周期最高TMP
+    ]
+```
+
+**状态空间分析**:
+
+| 维度 | 物理意义 | 归一化范围 | 作用 |
+|------|---------|-----------|------|
+| TMP0 | 当前初始压差 | [0.01, 0.05] MPa | 主要状态,决定可行动作范围 |
+| last_L | 上次产水时长 | [3800, 6000] s | 历史信息,捕捉趋势 |
+| last_t_bw | 上次反洗时长 | [40, 60] s | 历史信息,捕捉趋势 |
+| max_TMP | 周期最高TMP | [0.01, 0.05] MPa | 安全信息,避免贴边 |
+
+**动作映射**:
+
+```python
+def _get_action_values(self, action):
+    # action ∈ [0, 184]
+    L_idx = action // 5       # 过滤时长索引
+    t_bw_idx = action % 5      # 反洗时长索引
+    
+    L_s = 3800 + L_idx × 60    # 3800, 3860, ..., 5960(60s步长的网格到不了6000)
+    t_bw_s = 40 + t_bw_idx × 5 # 40, 45, 50, 55, 60
+    
+    return (L_s, t_bw_s)
+```
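+
+动作编码/解码可以用一个小脚本做往返校验(示意):
+
+```python
+import numpy as np
+
+L_values = np.arange(3800, 6001, 60)   # 37 档过滤时长:3800, 3860, ..., 5960
+t_bw_values = np.arange(40, 61, 5)     # 5 档反洗时长:40, 45, 50, 55, 60
+assert len(L_values) * len(t_bw_values) == 185
+
+def decode(action):
+    return int(L_values[action // 5]), int(t_bw_values[action % 5])
+
+print(decode(0))     # (3800, 40)
+print(decode(92))    # (4880, 50)
+print(decode(184))   # (5960, 60)
+```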
+
+**训练循环**:
+
+```python
+def reset(self):
+    # 随机初始TMP(增加训练多样性)
+    self.TMP0 = uniform(0.01, 0.03)
+    self.current_step = 0
+    self.last_action = (3800, 40)  # 初始为最保守动作
+    return self._get_obs()
+
+def step(self, action):
+    self.current_step += 1
+    
+    # 1. 解码动作
+    L_s, t_bw_s = self._get_action_values(action)
+    
+    # 2. 执行模拟
+    feasible, info = simulate_one_supercycle(
+        self.current_params, L_s, t_bw_s
+    )
+    
+    # 3. 计算奖励
+    if feasible:
+        reward = _score(self.current_params, info)
+        self.TMP0 = info["TMP_after_ceb"]  # 更新状态
+        terminated = False
+    else:
+        reward = -20  # 约束违反大惩罚
+        terminated = True
+    
+    # 4. 检查截断
+    truncated = (self.current_step >= 20)
+    
+    # 5. 返回
+    return next_obs, reward, terminated, truncated, info
+```
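+
+按 Gym 接口用随机策略跑一个 episode 的最小示例如下(示意,假设 DQN_env.py 暴露 UFSuperCycleEnv 与 UFParams;reset/step 的返回值以仓库实际实现为准):
+
+```python
+from DQN_env import UFSuperCycleEnv, UFParams
+
+env = UFSuperCycleEnv(UFParams())
+obs = env.reset()                       # 文档中的 reset 仅返回 obs
+total_reward, done = 0.0, False
+while not done:
+    action = env.action_space.sample()  # 随机动作,等价于 epsilon=1 的纯探索
+    obs, reward, terminated, truncated, info = env.step(action)
+    total_reward += reward
+    done = terminated or truncated
+print(f"episode总奖励: {total_reward:.3f}")
+```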
+
+**Episode流程示意**:
+
+```
+reset() → TMP0=0.025
+  ↓
+step(action=92) → (L=4880, t_bw=50) → reward=0.45 → TMP0=0.025
+  ↓
+step(action=105) → (L=5060, t_bw=40) → reward=0.52 → TMP0=0.026
+  ↓
+...(最多20步)
+  ↓
+truncated=True → episode结束
+```
+
+---
+
+### 3. DQN_train.py - 训练层
+
+#### DQNParams:训练超参数
+
+```python
+class DQNParams:
+    learning_rate = 1e-4           # Adam学习率
+    buffer_size = 10000            # 经验回放池大小
+    learning_starts = 200          # 开始学习前的随机探索步数
+    batch_size = 32                # 每次训练采样数
+    gamma = 0.95                   # 折扣因子
+    train_freq = 4                 # 每4步训练一次
+    target_update_interval = 2000  # 目标网络更新间隔
+    exploration_initial_eps = 1.0  # 初始探索率
+    exploration_fraction = 0.3     # 探索衰减比例
+    exploration_final_eps = 0.02   # 最终探索率
+```
+
+**参数含义详解**:
+
+| 参数 | 作用 | 典型值 | 当前值 | 评价 |
+|------|------|--------|--------|------|
+| learning_rate | 梯度下降步长 | 1e-4~1e-3 | 1e-4 | ✓ 合理 |
+| buffer_size | 经验池容量 | 10k~1M | 10k | ⚠️ 偏小 |
+| learning_starts | 预填充步数 | 1k~10k | 200 | ⚠️ 太小 |
+| batch_size | SGD批大小 | 32~256 | 32 | ✓ 合理 |
+| gamma | 未来奖励折扣 | 0.9~0.99 | 0.95 | ✓ 合理 |
+| train_freq | 训练频率 | 1~16 | 4 | ✓ 合理 |
+| target_update | 目标网络同步 | 1k~10k | 2000 | ⚠️ 代码冲突 |
+| exploration | 探索策略 | 前20-50% | 前30% | ✓ 合理 |
+
+#### DQNTrainer:训练器
+
+```python
+class DQNTrainer:
+    def __init__(self, env, params, callback=None):
+        self.env = env
+        self.params = params
+        self.callback = callback
+        
+        # 创建日志目录
+        self.log_dir = self._create_log_dir()
+        
+        # 创建DQN模型
+        self.model = self._create_model()
+```
+
+**模型创建**:
+
+```python
+def _create_model(self):
+    return DQN(
+        policy="MlpPolicy",  # 多层感知机
+        env=self.env,
+        learning_rate=1e-4,
+        buffer_size=10000,
+        learning_starts=200,
+        batch_size=32,
+        gamma=0.95,
+        train_freq=4,
+        
+        # ⚠️ 注意:这里有冲突
+        target_update_interval=1,  # 硬编码为1
+        tau=0.005,                 # soft update参数
+        
+        exploration_initial_eps=1.0,
+        exploration_fraction=0.3,
+        exploration_final_eps=0.02,
+        verbose=1,
+        tensorboard_log=self.log_dir
+    )
+```
+
+**目标网络更新策略冲突**:
+
+```python
+# 参数说明的是:
+target_update_interval = 2000  # 每2000步硬更新
+
+# 但代码实际使用:
+target_update_interval = 1     # 每1步软更新
+tau = 0.005                    # 软更新系数
+
+# 软更新公式:
+θ_target = τ × θ_current + (1-τ) × θ_target
+```
+
+**两种更新策略对比**:
+
+| 策略 | 优点 | 缺点 | 适用场景 |
+|------|------|------|---------|
+| 硬更新 | 稳定性好 | 更新滞后 | 经典DQN |
+| 软更新 | 平滑收敛 | 可能不稳定 | DDPG/TD3 |
+
+当前代码实际使用**软更新**,但注释说明是硬更新,存在混淆。
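+
+若想与参数说明保持一致地使用"每2000步硬更新",可以在创建模型时显式传参(示意,均为 Stable-Baselines3 的公开参数;tau=1.0 表示同步时整体拷贝):
+
+```python
+from stable_baselines3 import DQN
+
+model = DQN(
+    policy="MlpPolicy",
+    env=env,                        # 假设 env 已按上文创建
+    learning_rate=1e-4,
+    buffer_size=10000,
+    learning_starts=200,
+    batch_size=32,
+    gamma=0.95,
+    train_freq=4,
+    target_update_interval=2000,    # 每2000步同步一次目标网络
+    tau=1.0,                        # 1.0 即硬更新(整体拷贝)
+)
+```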
+
+#### 训练主流程
+
+```python
+def train(self, total_timesteps: int):
+    self.model.learn(
+        total_timesteps=total_timesteps,
+        callback=self.callback
+    )
+```
+
+**Stable-Baselines3内部流程**(简化):
+
+```python
+# learn() 内部逻辑
+for step in range(total_timesteps):
+    # 1. ε-贪心选择动作
+    if random() < epsilon:
+        action = env.action_space.sample()  # 探索
+    else:
+        action = argmax(Q_network(state))    # 利用
+    
+    # 2. 执行动作
+    next_state, reward, done, info = env.step(action)
+    
+    # 3. 存入经验池
+    replay_buffer.add(state, action, reward, next_state, done)
+    
+    # 4. 训练(每train_freq=4步一次)
+    if step % 4 == 0 and step > learning_starts:
+        # 从经验池采样
+        batch = replay_buffer.sample(batch_size=32)
+        
+        # 计算TD目标
+        with torch.no_grad():
+            q_next = Q_target(next_state).max(dim=1)
+            target = reward + gamma × q_next × (1 - done)
+        
+        # 计算当前Q值
+        q_current = Q_network(state)[action]
+        
+        # 计算损失
+        loss = MSE(q_current, target)
+        
+        # 反向传播
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+    
+    # 5. 软更新目标网络(每1步)
+    Q_target = tau × Q_network + (1-tau) × Q_target
+    
+    # 6. 衰减epsilon
+    epsilon = max(
+        epsilon_final,
+        epsilon_initial - step / (total_steps × exploration_fraction)
+    )
+    
+    # 7. 回调记录
+    callback.on_step()
+    
+    # 8. Episode重置
+    if done:
+        state = env.reset()
+```
+
+#### UFTrainingCallback:训练回调
+
+```python
+class UFTrainingCallback(BaseCallback):
+    def __init__(self, recorder, verbose=0):
+        super().__init__(verbose)
+        self.recorder = recorder
+    
+    def _on_step(self) -> bool:
+        # 从locals获取当前步信息
+        obs = self.locals.get("new_obs")[0]
+        action = self.locals.get("actions")[0]
+        reward = self.locals.get("rewards")[0]
+        done = self.locals.get("dones")[0]
+        info = self.locals.get("infos")[0]
+        
+        # 记录到recorder
+        self.recorder.record_step(obs, action, reward, done, info)
+        
+        # 打印(如果verbose=1)
+        if self.verbose:
+            print(f"[Step {self.num_timesteps}] "
+                  f"action={action}, reward={reward:.3f}, done={done}")
+        
+        return True  # 继续训练
+```
+
+**记录器UFEpisodeRecorder**:
+
+```python
+class UFEpisodeRecorder:
+    def __init__(self):
+        self.episode_data = []      # 所有episode的记录
+        self.current_episode = []   # 当前episode的步数据
+    
+    def record_step(self, obs, action, reward, done, info):
+        step_data = {
+            "obs": obs,
+            "action": action,
+            "reward": reward,
+            "done": done,
+            "info": info
+        }
+        self.current_episode.append(step_data)
+        
+        if done:
+            self.episode_data.append(self.current_episode)
+            self.current_episode = []
+    
+    def get_episode_stats(self, episode_idx=-1):
+        episode = self.episode_data[episode_idx]
+        return {
+            "total_reward": sum(step["reward"] for step in episode),
+            "avg_recovery": mean([step["info"]["recovery"] for step in episode]),
+            "feasible_steps": sum(1 for s in episode if s["info"]["feasible"]),
+            "total_steps": len(episode)
+        }
+```
+
+---
+
+## 训练流程详解
+
+### 完整训练流程图
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│                      训练流程                               │
+└─────────────────────────────────────────────────────────────┘
+
+1. 初始化阶段
+   ├─ set_global_seed(2025)          # 固定随机种子
+   ├─ params = UFParams()             # 创建系统参数
+   ├─ env = UFSuperCycleEnv(params)   # 创建环境
+   ├─ env = Monitor(env)              # 包装监控
+   ├─ env = DummyVecEnv([env])        # 向量化环境
+   ├─ recorder = UFEpisodeRecorder()  # 创建记录器
+   ├─ callback = UFTrainingCallback() # 创建回调
+   └─ model = DQN(...)                # 创建DQN模型
+
+2. 训练循环(50000步)
+   For step = 1 to 50000:
+       ├─ 探索vs利用决策
+       │   If random() < epsilon(step):
+       │       action = random([0, 184])     # 探索
+       │   Else:
+       │       state_tensor = torch.FloatTensor(state)
+       │       q_values = Q_network(state_tensor)  # [185]
+       │       action = argmax(q_values)     # 利用
+       │
+       ├─ 执行动作
+       │   L_s, t_bw_s = decode_action(action)
+       │   next_state, reward, done, info = env.step(action)
+       │
+       ├─ 存储经验
+       │   replay_buffer.add(state, action, reward, next_state, done)
+       │
+       ├─ 训练网络(每4步,且step > 200)
+       │   If step % 4 == 0 and step > 200:
+       │       batch = replay_buffer.sample(32)
+       │       ├─ 前向传播
+       │       │   q_current = Q_network(batch.state)[batch.action]
+       │       │   q_next_max = Q_target(batch.next_state).max()
+       │       │   target = batch.reward + 0.95 × q_next_max × (1 - batch.done)
+       │       ├─ 计算损失
+       │       │   loss = MSE(q_current, target)
+       │       ├─ 反向传播
+       │       │   optimizer.zero_grad()
+       │       │   loss.backward()
+       │       │   optimizer.step()
+       │       └─ 软更新目标网络
+       │           Q_target ← 0.005×Q_network + 0.995×Q_target
+       │
+       ├─ 衰减epsilon
+       │   epsilon = max(0.02, 1.0 - 0.98×step/15000)  # 前30%线性衰减至0.02
+       │
+       ├─ 记录数据
+       │   callback.on_step()  # 记录obs, action, reward
+       │
+       └─ Episode结束处理
+           If done or truncated:
+               state = env.reset()  # 重置环境
+               recorder.save_episode()
+
+3. 保存模型
+   ├─ model.save("dqn_model.zip")
+   └─ print(statistics)
+```
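+
+把上述流程串成最小可跑的训练脚本大致如下(示意,模块与类名按本仓库文件组织假设):
+
+```python
+from stable_baselines3 import DQN
+from stable_baselines3.common.monitor import Monitor
+from stable_baselines3.common.vec_env import DummyVecEnv
+from DQN_env import UFSuperCycleEnv, UFParams
+
+env = DummyVecEnv([lambda: Monitor(UFSuperCycleEnv(UFParams()))])
+model = DQN("MlpPolicy", env, learning_rate=1e-4, buffer_size=10000,
+            learning_starts=200, batch_size=32, gamma=0.95, train_freq=4,
+            exploration_fraction=0.3, exploration_final_eps=0.02,
+            verbose=1, tensorboard_log="./uf_dqn_tensorboard")
+model.learn(total_timesteps=50000)
+model.save("dqn_model.zip")
+```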
+
+### 训练时间线分析
+
+假设训练50000步,每个episode平均10步:
+
+```
+步数范围     | epsilon | 训练行为           | 说明
+------------|---------|-------------------|------------------
+0-200       | 1.0     | 纯随机探索         | 预填充经验池
+200-15000   | 1.0→0.02| 探索衰减期         | 逐渐从探索转向利用
+15000-50000 | 0.02    | 基本利用,2%探索   | 稳定策略优化
+
+训练触发:
+- 0-200步:不训练,仅收集经验
+- 200-50000步:每4步训练1次 → 共12450次梯度更新
+
+目标网络更新:
+- 每1步软更新,tau=0.005
+- 相当于每200步目标网络更新约63%
+```
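+
+"每200步目标网络更新约63%"来自指数衰减的近似,可以直接验证:
+
+```python
+print((1 - 0.005) ** 200)   # ≈0.367 ≈ e^-1,即旧参数残留约37%,新参数占比约63%
+```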
+
+### 关键时刻详解
+
+#### Episode开始
+
+```python
+state = env.reset()
+# 环境内部执行:
+TMP0 = uniform(0.01, 0.03)  # 随机初始TMP
+current_step = 0
+last_action = (3800, 40)
+max_TMP = TMP0
+
+obs = [
+    (TMP0 - 0.01) / 0.04,    # 例:0.025 → 0.375
+    (3800 - 3800) / 2200,    # 0.0
+    (40 - 40) / 20,          # 0.0
+    (TMP0 - 0.01) / 0.04     # 0.375
+]
+```
+
+#### 第1步
+
+```python
+# 1. 动作选择(epsilon=1.0,纯探索)
+action = random.randint(0, 184)  # 假设选到92
+
+# 2. 解码动作
+L_idx = 92 // 5 = 18
+t_bw_idx = 92 % 5 = 2
+L_s = 3800 + 18×60 = 4880
+t_bw_s = 40 + 2×5 = 50
+
+# 3. 模拟执行
+simulate_one_supercycle(p, 4880, 50):
+    L_h = 4880 / 3600 = 1.356h
+    k_bw = floor(48 / ((4880+50)/3600)) = 35次
+    
+    For i in range(35):   # 下面给出第1个小周期的数值
+        # 产水
+        dtmp = 1e-6 × 360^1.1 × 1.356 = 0.00088 MPa
+        tmp_peak = 0.025 + 0.00088 = 0.02588 MPa
+        
+        # 检查约束
+        tmp_peak < 0.06 ✓
+        
+        # 反洗
+        phi = 0.789 × 0.918 = 0.724  # upper_L × time_gain,由model_bw计算
+        tmp_after = 0.02588 - 0.724×0.00088 = 0.02524 MPa
+        
+        # 检查约束
+        residual = 0.02524 - 0.025 = 0.00024 < 0.001 ✓
+        
+        tmp = 0.02524
+        # 残余增量逐周期累积:第35个小周期的峰值约为
+        # 0.025 + 34×0.00024 + 0.00088 ≈ 0.034 MPa,即 max_TMP
+    
+    # CEB
+    tmp = 0.025
+    
+    # 计算指标
+    V_feed = 35 × 360 × 1.356 = 17080 m³
+    V_loss = 35 × (1000×50/3600) + 30 = 516 m³
+    V_net = 16564 m³
+    recovery = 0.970
+    net_rate = 16564 / 48.6 = 340.8 m³/h
+    
+    return True, {recovery: 0.970, net_rate: 340.8, ...}
+
+# 4. 计算奖励
+rate_norm = 340.8 / 360 = 0.947
+headroom_penalty = 1/(1+exp(-10×(0.034/0.06-1))) ≈ 0.01 (TMP距上限尚远)
+base_reward = 0.8×0.970 + 0.2×0.947 - 0.2×0.01 = 0.963
+amplified = (0.963-0.5)² × 5 = 1.07
+
+reward = 1.07  # 非常好的奖励!
+
+# 5. 下一状态
+TMP0_new = 0.025  # CEB后恢复到初始TMP
+obs_new = [0.375, (4880-3800)/2200=0.491, (50-40)/20=0.5, (0.034-0.01)/0.04=0.60]
+
+# 6. 存储经验
+buffer.add(
+    state=[0.375, 0.0, 0.0, 0.375],
+    action=92,
+    reward=1.07,
+    next_state=[0.375, 0.491, 0.5, 0.60],
+    done=False
+)
+
+# 7. 不训练(step=1 < 200)
+```
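+
+上面的数字可以直接用仓库里的模拟器复核(示意;注意 UF_decide.py 的默认 dTMP=0.0005、搜索范围与 DQN_env 不同,这里显式传参对齐):
+
+```python
+from UF_decide import UFParams, simulate_one_supercycle
+
+p = UFParams(TMP0=0.025, TMP_max=0.06, alpha=1e-6, belta=1.1,
+             L_ref_s=4000.0, tau_bw_s=20.0, dTMP=0.001, L_max_s=6000.0)
+feasible, info = simulate_one_supercycle(p, 4880, 50)
+print(feasible,
+      round(info["recovery"], 3),                 # ≈0.97
+      round(info["net_delivery_rate_m3ph"], 1))   # ≈340.8
+```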
+
+#### 第204步(首次训练)
+
+```python
+# 此时经验池已有204条经验,开始训练
+
+# 1. 从经验池随机采样32条
+batch = buffer.sample(32)
+# batch.state: [32, 4]
+# batch.action: [32]
+# batch.reward: [32]
+# batch.next_state: [32, 4]
+# batch.done: [32]
+
+# 2. 计算当前Q值
+state_tensor = torch.FloatTensor(batch.state)  # [32, 4]
+q_values = Q_network(state_tensor)  # [32, 185]
+q_current = q_values.gather(1, batch.action.unsqueeze(1))  # [32, 1]
+
+# 3. 计算目标Q值
+with torch.no_grad():
+    next_q_values = Q_target(batch.next_state)  # [32, 185]
+    next_q_max = next_q_values.max(dim=1).values  # [32]
+    target = batch.reward + 0.95 × next_q_max × (1 - batch.done)  # [32]
+
+# 4. 计算TD误差
+loss = F.mse_loss(q_current.squeeze(), target)
+# 例:loss = 0.523
+
+# 5. 反向传播
+optimizer.zero_grad()
+loss.backward()
+optimizer.step()
+
+# 6. 软更新目标网络
+for param, target_param in zip(Q_network.parameters(), Q_target.parameters()):
+    target_param.data.copy_(0.005 × param.data + 0.995 × target_param.data)
+```
+
+---
+
+## 关键代码解析
+
+### Q网络结构(MlpPolicy默认)
+
+```python
+# Stable-Baselines3的MlpPolicy默认架构
+class QNetwork(nn.Module):
+    def __init__(self, state_dim=4, action_dim=185):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(4, 64),      # 输入层 → 隐藏层1
+            nn.ReLU(),
+            nn.Linear(64, 64),     # 隐藏层1 → 隐藏层2
+            nn.ReLU(),
+            nn.Linear(64, 185)     # 隐藏层2 → 输出层
+        )
+    
+    def forward(self, state):
+        # state: [batch, 4]
+        return self.net(state)  # [batch, 185]
+```
+
+**参数量**:
+```
+Layer 1: 4×64 + 64 = 320
+Layer 2: 64×64 + 64 = 4160
+Layer 3: 64×185 + 185 = 12025
+Total: 16505 参数
+```
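+
+参数量可以用 PyTorch 直接验证(示意):
+
+```python
+import torch.nn as nn
+
+net = nn.Sequential(nn.Linear(4, 64), nn.ReLU(),
+                    nn.Linear(64, 64), nn.ReLU(),
+                    nn.Linear(64, 185))
+print(sum(p.numel() for p in net.parameters()))   # 16505
+```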
+
+### ε-贪心策略实现
+
+```python
+def predict(self, observation, epsilon):
+    if np.random.random() < epsilon:
+        # 探索:均匀随机
+        return self.action_space.sample()
+    else:
+        # 利用:选择Q值最大的动作
+        with torch.no_grad():
+            obs_tensor = torch.FloatTensor(observation).unsqueeze(0)
+            q_values = self.q_network(obs_tensor)
+            return q_values.argmax(dim=1).item()
+```
+
+### 经验回放采样
+
+```python
+class ReplayBuffer:
+    def sample(self, batch_size):
+        # 均匀随机采样
+        indices = np.random.randint(0, len(self.buffer), size=batch_size)
+        
+        batch = {
+            'state': np.array([self.buffer[i][0] for i in indices]),
+            'action': np.array([self.buffer[i][1] for i in indices]),
+            'reward': np.array([self.buffer[i][2] for i in indices]),
+            'next_state': np.array([self.buffer[i][3] for i in indices]),
+            'done': np.array([self.buffer[i][4] for i in indices])
+        }
+        
+        return batch
+```
+
+### TensorBoard日志
+
+```python
+# 自动记录的指标(由Monitor包装)
+- rollout/ep_rew_mean: 平均episode奖励
+- rollout/ep_len_mean: 平均episode长度
+- time/fps: 训练速度(步/秒)
+- train/loss: TD误差
+- train/learning_rate: 当前学习率
+- train/n_updates: 梯度更新次数
+```
+
+---
+
+## 数据流与控制流
+
+### 数据流图
+
+```
+输入数据流:
+TMP0 (float) → [归一化] → state[0] (0~1)
+    ↓
+    ├─ last_L_s → state[1]
+    ├─ last_t_bw_s → state[2]
+    └─ max_TMP → state[3]
+    
+    state[4] → Q_network → q_values[185]
+              ↓
+          argmax → action (int)
+              ↓
+          decode → (L_s, t_bw_s)
+              ↓
+    simulate_one_supercycle()
+              ↓
+        ┌─────┴──────┐
+        │  model_fp   │ → ΔTMP
+        │  model_bw   │ → φ
+        └─────┬──────┘
+              ↓
+       约束检查 → feasible (bool)
+              ↓
+       指标计算 → {recovery, net_rate, ...}
+              ↓
+       _score() → reward (float)
+              ↓
+       ReplayBuffer
+```
+
+### 控制流图
+
+```
+main() 入口
+    ↓
+set_global_seed(2025)
+    ↓
+创建UFParams
+    ↓
+创建UFSuperCycleEnv
+    ↓
+包装Monitor & DummyVecEnv
+    ↓
+创建DQN模型
+    ↓
+┌─────────────────────────┐
+│   model.learn(50000)    │
+│                         │
+│   For step in range:    │
+│       ├─ select_action  │
+│       ├─ env.step()     │
+│       ├─ buffer.add()   │
+│       ├─ train_network  │← 每4步
+│       ├─ update_target  │← 每1步(软更新)
+│       └─ callback()     │
+│                         │
+│   If done: env.reset()  │
+└─────────────────────────┘
+    ↓
+model.save("dqn_model.zip")
+    ↓
+打印统计信息
+    ↓
+结束
+```
+
+### 并发与同步
+
+```
+主线程:
+    ├─ DQN训练循环
+    │   ├─ 网络前向传播
+    │   ├─ 环境交互
+    │   └─ 网络反向传播
+    │
+    ├─ 回调线程(可选)
+    │   └─ TensorBoard写入
+    │
+    └─ Monitor包装
+        └─ 统计信息累积
+
+注意:
+- DummyVecEnv是单进程向量化(伪并行)
+- 如需真正并行,应使用SubprocVecEnv
+- 当前代码未使用多进程/多线程
+```
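+
+如需真正的并行采样,可按 SB3 标准方式换用 SubprocVecEnv(示意,make_env 为假设的环境工厂函数):
+
+```python
+from stable_baselines3.common.monitor import Monitor
+from stable_baselines3.common.vec_env import SubprocVecEnv
+from DQN_env import UFSuperCycleEnv, UFParams
+
+def make_env():
+    return Monitor(UFSuperCycleEnv(UFParams()))
+
+if __name__ == "__main__":          # 子进程启动要求放在 main 保护内
+    env = SubprocVecEnv([make_env for _ in range(4)])   # 4 个并行环境进程
+```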
+
+### 内存管理
+
+```
+主要内存占用:
+1. 经验回放池:10000 × (4+1+1+4+1) × 4字节 ≈ 440KB
+2. Q网络参数:16505 × 4字节 ≈ 66KB
+3. 目标网络参数:16505 × 4字节 ≈ 66KB
+4. 梯度缓存:约等于参数量 ≈ 66KB
+5. 训练batch:32 × (4+4+1) × 4字节 ≈ 1.2KB
+
+总计:约650KB(非常小)
+
+峰值内存:
+- 反向传播时临时张量 +100KB
+- TensorBoard缓冲 +1MB
+- 总峰值 < 2MB
+```
+
+---
+
+## 训练监控与调试
+
+### TensorBoard可视化
+
+启动方式:
+```bash
+tensorboard --logdir=./uf_dqn_tensorboard
+```
+
+关键曲线:
+1. **rollout/ep_rew_mean**:episode平均奖励(核心指标)
+   - 期望:从负值逐渐上升到正值
+   - 收敛标志:曲线稳定在0.5以上
+
+2. **train/loss**:TD误差(训练稳定性)
+   - 期望:从高值逐渐下降
+   - 警告:如果持续震荡或发散,说明学习不稳定
+
+3. **rollout/ep_len_mean**:episode平均长度
+   - 期望:保持在10-20之间
+   - 异常:突然下降说明策略变差(频繁违反约束)
+
+### 调试技巧
+
+#### 检查奖励分布
+```python
+# 在callback中添加
+rewards_hist = []
+def _on_step(self):
+    rewards_hist.append(self.locals["rewards"][0])
+    if len(rewards_hist) == 1000:
+        print(f"Reward分布:min={min(rewards_hist)}, "
+              f"mean={np.mean(rewards_hist)}, "
+              f"max={max(rewards_hist)}")
+        rewards_hist.clear()
+```
+
+#### 检查约束违反率
+```python
+constraint_violations = 0
+total_steps = 0
+
+def _on_step(self):
+    global constraint_violations, total_steps
+    total_steps += 1
+    if self.locals["rewards"][0] == -20:
+        constraint_violations += 1
+    
+    if total_steps % 1000 == 0:
+        violation_rate = constraint_violations / total_steps
+        print(f"约束违反率:{violation_rate:.2%}")
+```
+
+#### 检查Q值范围
+```python
+# 每1000步记录Q值统计
+if step % 1000 == 0:
+    with torch.no_grad():
+        # SB3 的 DQN 暴露 replay_buffer 与 q_net,可直接取样本计算Q值
+        sample_states = model.replay_buffer.sample(100).observations
+        q_values = model.q_net(sample_states)
+        print(f"Q值范围:[{q_values.min():.2f}, {q_values.max():.2f}]")
+```
+
+---
+
+## 总结
+
+### 训练流程核心要点
+
+1. **环境模拟**:基于简化的数学模型,不是真实物理数据
+2. **状态表示**:4维向量,信息相对简单
+3. **动作空间**:185个离散动作(37×5网格)
+4. **奖励设计**:多目标加权+非线性变换
+5. **算法选择**:DQN(经典RL算法)
+6. **训练策略**:ε-贪心探索,经验回放,软更新目标网络
+
+### 关键超参数
+
+| 参数 | 值 | 影响 |
+|------|-----|------|
+| total_timesteps | 50000 | 训练总步数 |
+| buffer_size | 10000 | 经验池大小 |
+| learning_rate | 1e-4 | 学习速度 |
+| gamma | 0.95 | 长期规划能力 |
+| exploration | 1.0→0.02 | 探索能力 |
+
+### 预期训练效果
+
+**良好训练的标志**:
+- Episode奖励从负值上升到0.5+
+- 约束违反率从80%降到5%以下
+- 回收率稳定在0.96+
+- Q值逐渐收敛(不再剧烈波动)
+
+**训练失败的标志**:
+- 奖励曲线持续震荡
+- 约束违反率居高不下
+- Q值爆炸或消失
+- Episode长度急剧下降
+

+ 405 - 0
models/uf-rl/超滤训练源码/UF_decide.py

@@ -0,0 +1,405 @@
+# UF_decide.py
+from dataclasses import dataclass
+import numpy as np
+
+@dataclass
+class UFParams:
+    # —— 膜与运行参数 ——
+    q_UF: float = 360.0           # 过滤进水流量(m^3/h)
+    TMP0: float = 0.03            # 初始TMP(MPa)
+    TMP_max: float = 0.06         # TMP硬上限(MPa)
+
+    # —— 膜污染动力学 ——
+    alpha: float = 1e-6           # TMP增长系数
+    belta: float = 1.1            # 幂指数
+
+    # —— 反洗参数(固定) ——
+    q_bw_m3ph: float = 1000.0     # 物理反洗流量(m^3/h)
+
+    # —— CEB参数(固定) ——
+    T_ceb_interval_h: float = 48.0  # 固定每 k 小时做一次CEB
+    v_ceb_m3: float = 30.0        # CEB用水体积(m^3)
+    t_ceb_s: float = 40 * 60.0    # CEB时长(s)
+    phi_ceb: float = 1.0          # CEB去除比例(简化:完全恢复到TMP0)
+
+    # —— 约束与收敛 ——
+    dTMP: float = 0.0005          # 单次产水结束时,相对TMP0最大升幅(MPa)
+
+    # —— 搜索范围(秒) ——
+    L_min_s: float = 3600.0       # 过滤时长下限(s)
+    L_max_s: float = 4200.0       # 过滤时长上限(s)
+    t_bw_min_s: float = 40.0      # 物洗时长下限(s)
+    t_bw_max_s: float = 60.0      # 物洗时长上限(s)
+
+    # —— 物理反洗恢复函数参数 ——
+    phi_bw_min: float = 0.7       # 物洗去除比例最小值
+    phi_bw_max: float = 1.0       # 物洗去除比例最大值
+    L_ref_s: float = 4000.0       # 过滤时长影响时间尺度
+    tau_bw_s: float = 30.0        # 物洗时长影响时间尺度
+    gamma_t: float = 1.0          # 物洗时长作用指数
+    
+    # —— 网格 ——
+    L_step_s: float = 60.0        # 过滤时长步长(s)
+    t_bw_step_s: float = 5.0      # 物洗时长步长(s)
+
+    # 多目标加权及高TMP惩罚
+    w_rec: float = 0.8            # 回收率权重
+    w_rate: float = 0.2           # 净供水率权重
+    w_headroom: float = 0.3       # 贴边惩罚权重
+    r_headroom: float = 2.0       # 贴边惩罚幂次
+    headroom_hardcap: float = 0.98 # 超过此比例直接视为不可取
+
+def _delta_tmp(p: UFParams, L_h: float) -> float:
+    # 过滤时段TMP上升量
+    return float(p.alpha * (p.q_UF ** p.belta) * L_h)
+
+def _v_bw_m3(p: UFParams, t_bw_s: float) -> float:
+    # 物理反洗水耗
+    return float(p.q_bw_m3ph * (float(t_bw_s) / 3600.0))
+
+def phi_bw_of(p: UFParams, L_s: float, t_bw_s: float) -> float:
+    # 物洗去除比例:随过滤时长增长上界收缩,随物洗时长增长趋饱和
+    L = max(float(L_s), 1.0)
+    t = max(float(t_bw_s), 1e-6)
+    upper_L = p.phi_bw_min + (p.phi_bw_max - p.phi_bw_min) * np.exp(- L / p.L_ref_s)
+    time_gain = 1.0 - np.exp(- (t / p.tau_bw_s) ** p.gamma_t)
+    phi = upper_L * time_gain
+    return float(np.clip(phi, 0.0, 0.999))
+
+def simulate_one_supercycle(p: UFParams, L_s: float, t_bw_s: float):
+    """
+    返回 (是否可行, 指标字典)
+    - 支持动态CEB次数:48h固定间隔
+    - 增加日均产水时间和吨水电耗
+    """
+    L_h = float(L_s) / 3600.0  # 小周期过滤时间(h)
+
+    tmp = p.TMP0
+    max_tmp_during_filtration = tmp
+    max_residual_increase = 0.0
+
+    # 小周期总时长(h)
+    t_small_cycle_h = (L_s + t_bw_s) / 3600.0
+
+    # 计算超级周期内CEB次数
+    k_bw_per_ceb = int(np.floor(p.T_ceb_interval_h / t_small_cycle_h))
+    if k_bw_per_ceb < 1:
+        k_bw_per_ceb = 1  # 至少一个小周期
+
+    # 吨水电耗查表
+    energy_lookup = {
+        3600: 0.1034, 3660: 0.1031, 3720: 0.1029, 3780: 0.1026,
+        3840: 0.1023, 3900: 0.1021, 3960: 0.1019, 4020: 0.1017,
+        4080: 0.1015, 4140: 0.1012, 4200: 0.1011
+    }
+
+    for _ in range(k_bw_per_ceb):
+        tmp_run_start = tmp
+
+        # 过滤阶段TMP增长
+        dtmp = _delta_tmp(p, L_h)
+        tmp_peak = tmp_run_start + dtmp
+
+        # 约束1:峰值不得超过硬上限
+        if tmp_peak > p.TMP_max + 1e-12:
+            return False, {"reason": "TMP_max violated during filtration", "TMP_peak": tmp_peak}
+
+        if tmp_peak > max_tmp_during_filtration:
+            max_tmp_during_filtration = tmp_peak
+
+        # 物理反洗
+        phi = phi_bw_of(p, L_s, t_bw_s)
+        tmp_after_bw = tmp_peak - phi * (tmp_peak - tmp_run_start)
+
+        # 约束2:单次残余增量控制
+        residual_inc = tmp_after_bw - tmp_run_start
+        if residual_inc > p.dTMP + 1e-12:
+            return False, {
+                "reason": "residual TMP increase after BW exceeded dTMP",
+                "residual_increase": residual_inc,
+                "limit_dTMP": p.dTMP
+            }
+        if residual_inc > max_residual_increase:
+            max_residual_increase = residual_inc
+
+        tmp = tmp_after_bw
+
+    # CEB
+    tmp_after_ceb = p.TMP0
+
+    # 体积与回收率
+    V_feed_super = k_bw_per_ceb * p.q_UF * L_h
+    V_loss_super = k_bw_per_ceb * _v_bw_m3(p, t_bw_s) + p.v_ceb_m3
+    V_net = max(0.0, V_feed_super - V_loss_super)
+    recovery = max(0.0, V_net / max(V_feed_super, 1e-12))
+
+    # 时间与净供水率
+    T_super_h = k_bw_per_ceb * (L_s + t_bw_s) / 3600.0 + p.t_ceb_s / 3600.0
+    net_delivery_rate_m3ph = V_net / max(T_super_h, 1e-12)
+
+    # 贴边比例与硬限
+    headroom_ratio = max_tmp_during_filtration / max(p.TMP_max, 1e-12)
+    if headroom_ratio > p.headroom_hardcap + 1e-12:
+        return False, {"reason": "headroom hardcap exceeded", "headroom_ratio": headroom_ratio}
+
+    # —— 新增指标 1:日均产水时间(h/d) ——
+    daily_prod_time_h = k_bw_per_ceb * L_h / T_super_h * 24.0
+
+    # —— 新增指标 2:吨水电耗(kWh/m³) ——
+    closest_L = min(energy_lookup.keys(), key=lambda x: abs(x - L_s))
+    ton_water_energy = energy_lookup[closest_L]
+
+    info = {
+        "recovery": recovery,
+        "V_feed_super_m3": V_feed_super,
+        "V_loss_super_m3": V_loss_super,
+        "V_net_super_m3": V_net,
+        "supercycle_time_h": T_super_h,
+        "net_delivery_rate_m3ph": net_delivery_rate_m3ph,
+        "max_TMP_during_filtration": max_tmp_during_filtration,
+        "max_residual_increase_per_run": max_residual_increase,
+        "phi_bw_effective": phi,
+        "TMP_after_ceb": tmp_after_ceb,
+        "headroom_ratio": headroom_ratio,
+        "daily_prod_time_h": daily_prod_time_h,
+        "ton_water_energy_kWh_per_m3": ton_water_energy,
+        "k_bw_per_ceb": k_bw_per_ceb
+    }
+
+    return True, info
+
+def _score(p: UFParams, rec: dict) -> float:
+    """综合评分:越大越好。不同TMP0会改变max_TMP→改变惩罚→得到不同解。"""
+    # 无量纲化净供水率
+    rate_norm = rec["net_delivery_rate_m3ph"] / max(p.q_UF, 1e-12)
+    headroom_penalty = (rec["max_TMP_during_filtration"] / max(p.TMP_max, 1e-12)) ** p.r_headroom
+    return (p.w_rec * rec["recovery"]
+            + p.w_rate * rate_norm
+            - p.w_headroom * headroom_penalty)
+
+def optimize_2d(p: UFParams,
+                L_min_s=None, L_max_s=None, L_step_s=None,
+                t_bw_min_s=None, t_bw_max_s=None, t_bw_step_s=None):
+    # 网格生成
+    L_lo = p.L_min_s if L_min_s is None else float(L_min_s)
+    L_hi = p.L_max_s if L_max_s is None else float(L_max_s)
+    L_st = p.L_step_s if L_step_s is None else float(L_step_s)
+
+    t_lo = p.t_bw_min_s if t_bw_min_s is None else float(t_bw_min_s)
+    t_hi = p.t_bw_max_s if t_bw_max_s is None else float(t_bw_max_s)
+    t_st = p.t_bw_step_s if t_bw_step_s is None else float(t_bw_step_s)
+
+    L_vals = np.arange(L_lo, L_hi + 1e-9, L_st)
+    t_vals = np.arange(t_lo, t_hi + 1e-9, t_st)
+
+    best = None
+    best_score = -np.inf
+
+    for L_s in L_vals:
+        for t_bw_s in t_vals:
+            feasible, info = simulate_one_supercycle(p, L_s, t_bw_s)
+            if not feasible:
+                continue
+
+            rec = {"L_s": float(L_s), "t_bw_s": float(t_bw_s)}
+            rec.update(info)
+
+            score = _score(p, rec)
+
+            if score > best_score + 1e-14:
+                best_score = score
+                best = rec.copy()
+                best["score"] = float(score)
+            # 若分数相同,偏好回收率更高,再偏好净供水率更高
+            elif abs(score - best_score) <= 1e-14:
+                if (rec["recovery"] > best["recovery"] + 1e-12) or (
+                    abs(rec["recovery"] - best["recovery"]) <= 1e-12 and
+                    rec["net_delivery_rate_m3ph"] > best["net_delivery_rate_m3ph"] + 1e-12
+                ):
+                    best = rec.copy()
+                    best["score"] = float(score)
+
+    if best is None:
+        return {"status": "no-feasible-solution"}
+    best["status"] = "feasible"
+    return best
+
+def run_uf_decision(TMP0: float = None) -> dict:
+    if TMP0 is None:
+        rng = np.random.default_rng()
+        TMP0 = rng.uniform(0.03, 0.04)  # 初始TMP随机
+
+    params = UFParams(
+        q_UF=360.0,
+        TMP_max=0.05,
+        alpha=1.2e-6,
+        belta=1.0,
+        q_bw_m3ph=1000.0,
+        T_ceb_interval_h=48,
+        v_ceb_m3=30.0,
+        t_ceb_s=40*60.0,
+        phi_ceb=1.0,
+        dTMP=0.001,
+
+        L_min_s=3600.0, L_max_s=4200.0, L_step_s=30.0,
+        t_bw_min_s=90.0, t_bw_max_s=100.0, t_bw_step_s=2.0,
+
+        phi_bw_min=0.70, phi_bw_max=1.00,
+        L_ref_s=500.0, tau_bw_s=40.0, gamma_t=1.0,
+
+        TMP0=TMP0,
+
+        w_rec=0.7, w_rate=0.3, w_headroom=0.3, r_headroom=2.0, headroom_hardcap=0.9
+    )
+
+    result = optimize_2d(params)
+    if result.get("status") == "feasible":
+        return {
+            "L_s": result["L_s"],
+            "t_bw_s": result["t_bw_s"],
+            "recovery": result["recovery"],
+            "k_bw_per_ceb": result["k_bw_per_ceb"],
+            "daily_prod_time_h": result["daily_prod_time_h"],
+            "ton_water_energy_kWh_per_m3": result["ton_water_energy_kWh_per_m3"]
+        }
+
+    # 若没有可行解,返回最小过滤时间和默认值
+    return {
+        "L_s": params.L_min_s,
+        "t_bw_s": params.t_bw_min_s,
+        "recovery": 0.0,
+        "k_bw_per_ceb": 1,
+        "daily_prod_time_h": 0.0,
+        "ton_water_energy_kWh_per_m3": 0.0
+    }
+
+
+def generate_plc_instructions(current_L_s, current_t_bw_s, model_prev_L_s, model_prev_t_bw_s, model_L_s, model_t_bw_s):
+    """
+    根据工厂当前值、模型上一轮决策值和模型当前轮决策值,生成PLC指令。
+
+    新增功能:
+    1. 处理None值情况:如果模型上一轮值为None,则使用工厂当前值;
+       如果工厂当前值也为None,则返回None并提示错误。
+    """
+    # 参数配置保持不变
+    params = UFParams(
+        L_min_s=3600.0, L_max_s=6000.0, L_step_s=60.0,
+        t_bw_min_s=40.0, t_bw_max_s=60.0, t_bw_step_s=5.0,
+    )
+
+    # 参数解包
+    L_step_s = params.L_step_s
+    t_bw_step_s = params.t_bw_step_s
+    L_min_s = params.L_min_s
+    L_max_s = params.L_max_s
+    t_bw_min_s = params.t_bw_min_s
+    t_bw_max_s = params.t_bw_max_s
+    adjustment_threshold = 1.0
+
+    # 处理None值情况
+    if model_prev_L_s is None:
+        if current_L_s is None:
+            print("错误: 过滤时长的工厂当前值和模型上一轮值均为None")
+            return None, None
+        else:
+            # 使用工厂当前值作为基准
+            effective_current_L = current_L_s
+            source_L = "工厂当前值(模型上一轮值为None)"
+    else:
+        # 模型上一轮值不为None,继续检查工厂当前值
+        if current_L_s is None:
+            effective_current_L = model_prev_L_s
+            source_L = "模型上一轮值(工厂当前值为None)"
+        else:
+            # 两个值都不为None,比较哪个更接近模型当前建议值
+            current_to_model_diff = abs(current_L_s - model_L_s)
+            prev_to_model_diff = abs(model_prev_L_s - model_L_s)
+
+            if current_to_model_diff <= prev_to_model_diff:
+                effective_current_L = current_L_s
+                source_L = "工厂当前值"
+            else:
+                effective_current_L = model_prev_L_s
+                source_L = "模型上一轮值"
+
+    # 对反洗时长进行同样的处理
+    if model_prev_t_bw_s is None:
+        if current_t_bw_s is None:
+            print("错误: 反洗时长的工厂当前值和模型上一轮值均为None")
+            return None, None
+        else:
+            effective_current_t_bw = current_t_bw_s
+            source_t_bw = "工厂当前值(模型上一轮值为None)"
+    else:
+        if current_t_bw_s is None:
+            effective_current_t_bw = model_prev_t_bw_s
+            source_t_bw = "模型上一轮值(工厂当前值为None)"
+        else:
+            current_to_model_t_bw_diff = abs(current_t_bw_s - model_t_bw_s)
+            prev_to_model_t_bw_diff = abs(model_prev_t_bw_s - model_t_bw_s)
+
+            if current_to_model_t_bw_diff <= prev_to_model_t_bw_diff:
+                effective_current_t_bw = current_t_bw_s
+                source_t_bw = "工厂当前值"
+            else:
+                effective_current_t_bw = model_prev_t_bw_s
+                source_t_bw = "模型上一轮值"
+
+    # 检测所有输入值是否在规定范围内(只对非None值进行检查)
+    # 工厂当前值检查(警告)
+    if current_L_s is not None and not (L_min_s <= current_L_s <= L_max_s):
+        print(f"警告: 当前过滤时长 {current_L_s} 秒不在允许范围内 [{L_min_s}, {L_max_s}]")
+    if current_t_bw_s is not None and not (t_bw_min_s <= current_t_bw_s <= t_bw_max_s):
+        print(f"警告: 当前反洗时长 {current_t_bw_s} 秒不在允许范围内 [{t_bw_min_s}, {t_bw_max_s}]")
+
+    # 模型上一轮决策值检查(警告)
+    if model_prev_L_s is not None and not (L_min_s <= model_prev_L_s <= L_max_s):
+        print(f"警告: 模型上一轮过滤时长 {model_prev_L_s} 秒不在允许范围内 [{L_min_s}, {L_max_s}]")
+    if model_prev_t_bw_s is not None and not (t_bw_min_s <= model_prev_t_bw_s <= t_bw_max_s):
+        print(f"警告: 模型上一轮反洗时长 {model_prev_t_bw_s} 秒不在允许范围内 [{t_bw_min_s}, {t_bw_max_s}]")
+
+    # 模型当前轮决策值检查(错误)
+    if model_L_s is None:
+        raise ValueError("错误: 决策模型建议的过滤时长不能为None")
+    elif not (L_min_s <= model_L_s <= L_max_s):
+        raise ValueError(f"错误: 决策模型建议的过滤时长 {model_L_s} 秒不在允许范围内 [{L_min_s}, {L_max_s}]")
+
+    if model_t_bw_s is None:
+        raise ValueError("错误: 决策模型建议的反洗时长不能为None")
+    elif not (t_bw_min_s <= model_t_bw_s <= t_bw_max_s):
+        raise ValueError(f"错误: 决策模型建议的反洗时长 {model_t_bw_s} 秒不在允许范围内 [{t_bw_min_s}, {t_bw_max_s}]")
+
+    print(f"过滤时长基准: {source_L}, 值: {effective_current_L}")
+    print(f"反洗时长基准: {source_t_bw}, 值: {effective_current_t_bw}")
+
+    # 使用选定的基准值进行计算调整
+    L_diff = model_L_s - effective_current_L
+    L_adjustment = 0
+    if abs(L_diff) > adjustment_threshold * L_step_s:
+        if L_diff > 0:
+            L_adjustment = L_step_s
+        else:
+            L_adjustment = -L_step_s
+    next_L_s = effective_current_L + L_adjustment
+
+    t_bw_diff = model_t_bw_s - effective_current_t_bw
+    t_bw_adjustment = 0
+    if abs(t_bw_diff) > adjustment_threshold * t_bw_step_s:
+        if t_bw_diff > 0:
+            t_bw_adjustment = t_bw_step_s
+        else:
+            t_bw_adjustment = -t_bw_step_s
+    next_t_bw_s = effective_current_t_bw + t_bw_adjustment
+
+    return next_L_s, next_t_bw_s
+
+
+if __name__ == "__main__":
+    # 简单自测:模型上一轮值为None时,以工厂当前值为基准
+    # 注意:model_t_bw_s=96 超出范围 [40, 60],会触发 ValueError,可用于验证越界校验
+    current_L_s = 3920
+    current_t_bw_s = 98
+    model_prev_L_s = None
+    model_prev_t_bw_s = None
+    model_L_s = 4160
+    model_t_bw_s = 96
+    next_L_s, next_t_bw_s = generate_plc_instructions(current_L_s, current_t_bw_s, model_prev_L_s, model_prev_t_bw_s, model_L_s, model_t_bw_s)
+    print(f"next_L_s={next_L_s}, next_t_bw_s={next_t_bw_s}")

+ 33 - 0
models/uf-rl/超滤训练源码/UF_models.py

@@ -0,0 +1,33 @@
+import torch
+import numpy as np
+
+# TMP 上升量模型
+class TMPIncreaseModel(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+    def forward(self, p, L_h):
+        return float(p.alpha * (p.q_UF ** p.belta) * L_h)
+
+# 反洗 TMP 去除模型
+class TMPDecreaseModel(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+    def forward(self, p, L_s, t_bw_s):
+        L = max(float(L_s), 1.0)
+        t = max(float(t_bw_s), 1e-6)
+        upper_L = p.phi_bw_min + (p.phi_bw_max - p.phi_bw_min) * np.exp(- L / p.L_ref_s)
+        time_gain = 1.0 - np.exp(- (t / p.tau_bw_s) ** p.gamma_t)
+        phi = upper_L * time_gain
+        return float(np.clip(phi, 0.0, 0.999))
+
+
+if __name__ == "__main__":
+    model_fp = TMPIncreaseModel()
+    model_bw = TMPDecreaseModel()
+
+
+    torch.save(model_fp.state_dict(), "uf_fp.pth")
+    torch.save(model_bw.state_dict(), "uf_bw.pth")
+
+
+    print("模型已安全保存为 uf_fp.pth、uf_bw.pth")

BIN
models/uf-rl/超滤训练源码/resistance_model_bw.pth


BIN
models/uf-rl/超滤训练源码/resistance_model_fp.pth


BIN
models/uf-rl/超滤训练源码/uf_bw.pth


BIN
models/uf-rl/超滤训练源码/uf_fp.pth


+ 61 - 0
models/uf-rl/超滤训练源码/uf_resistance_models.py

@@ -0,0 +1,61 @@
+import torch
+import numpy as np
+
+# ===== 膜阻力上升模型 =====
+class ResistanceIncreaseModel(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, p, L_s):
+        """
+        计算膜阻力上升量 ΔR
+        """
+        A = 128 * 40.0
+        J = p.q_UF / A / 3600
+        # 膜阻力上升模型(已缩放)
+        dR = p.nuK * J * L_s
+        return float(dR)
+
+
+# ===== 膜阻力下降模型 =====
+class ResistanceDecreaseModel(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, p, R0, R_end, L_h_start, L_h_next_start, t_bw_s):
+        """
+        计算物理反冲洗污染去除比例(受反洗时间影响),最大可去除的可逆膜阻力(受过滤时间影响)
+        """
+
+        # 计算单次不可逆膜阻力(线性依赖于进水时间)
+        # 周期起点和下次起点的理论阻力
+        R_start = R0 + p.slope * (L_h_start ** p.power)
+        R_next_start = R0 + p.slope * (L_h_next_start ** p.power)
+
+        # 不可逆污染(反洗后残余增加量)
+        irreversible_R = max(R_next_start - R_start, 0.0)
+
+        # 本周期的总污染增长量
+        total_increase = max(R_end - R_start, 0.0)
+
+        # 可逆污染量 = 本周期总增长 - 不可逆残留
+        reversible_R = max(total_increase - irreversible_R, 0.0)
+
+        # 时间因子:反洗时间越长,效果越充分
+        time_gain = 1.0 - np.exp(- (t_bw_s / p.tau_bw_s))
+
+        # 实际去除的膜阻力(可逆污染量乘以时间因子,上限为可逆污染量)
+        dR_bw = reversible_R * time_gain
+
+        return float(np.clip(dR_bw, 0.0, reversible_R))
+
+
+# ===== 主程序 =====
+if __name__ == "__main__":
+    model_fp = ResistanceIncreaseModel()
+    model_bw = ResistanceDecreaseModel()
+
+    torch.save(model_fp.state_dict(), "resistance_model_fp.pth")
+    torch.save(model_bw.state_dict(), "resistance_model_bw.pth")
+
+    print("模型已安全保存为 resistance_model_fp.pth、resistance_model_bw.pth")
