@@ -42,25 +42,25 @@ class UFParams:

     # —— Search ranges (seconds) ——
     L_min_s: float = 3800.0        # filtration duration lower bound (s)
-    L_max_s: float = 4200.0        # filtration duration upper bound (s)
-    t_bw_min_s: float = 90.0       # backwash duration lower bound (s)
-    t_bw_max_s: float = 100.0      # backwash duration upper bound (s)
+    L_max_s: float = 6000.0        # filtration duration upper bound (s)
+    t_bw_min_s: float = 40.0       # backwash duration lower bound (s)
+    t_bw_max_s: float = 60.0       # backwash duration upper bound (s)

     # —— Physical backwash recovery function parameters ——
     phi_bw_min: float = 0.7        # minimum backwash removal fraction
     phi_bw_max: float = 1.0        # maximum backwash removal fraction
     L_ref_s: float = 4000.0        # time scale of the filtration-duration effect
-    tau_bw_s: float = 30.0         # time scale of the backwash-duration effect
+    tau_bw_s: float = 20.0         # time scale of the backwash-duration effect
     gamma_t: float = 1.0           # backwash-duration exponent

     # —— Grid ——
     L_step_s: float = 60.0         # filtration duration step (s)
-    t_bw_step_s: float = 2.0       # backwash duration step (s)
+    t_bw_step_s: float = 5.0       # backwash duration step (s)

     # Multi-objective weights and high-TMP penalty
     w_rec: float = 0.8             # recovery weight
     w_rate: float = 0.2            # net delivery rate weight
-    w_headroom: float = 0.3        # headroom (TMP-margin) penalty weight
+    w_headroom: float = 0.2        # headroom (TMP-margin) penalty weight
     r_headroom: float = 2.0        # headroom penalty exponent
     headroom_hardcap: float = 0.98 # ratios above this are treated as infeasible outright
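For orientation, the weights above combine roughly as in the sketch below, using made-up recovery, rate and TMP figures; the power-law penalty mirrors r_headroom in the pre-change _score shown further down, and headroom_hardcap is the hard rejection threshold.

# Illustration only: recovery, rate_norm and tmp_ratio are hypothetical values.
recovery, rate_norm, tmp_ratio = 0.95, 0.9, 0.90
headroom_penalty = tmp_ratio ** 2.0                      # r_headroom = 2.0 -> 0.81
score = 0.8 * recovery + 0.2 * rate_norm - 0.2 * headroom_penalty
# = 0.76 + 0.18 - 0.162 = 0.778; any peak above 0.98 * TMP_max is rejected outright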

@@ -75,28 +75,28 @@ class DQNParams:
     learning_rate: float = 1e-4

     # Replay buffer size (in steps)
-    buffer_size: int = 2000
+    buffer_size: int = 10000

     # Steps to collect before learning starts
     learning_starts: int = 200

     # Number of samples drawn from the replay buffer per update
-    batch_size: int = 16
+    batch_size: int = 32

     # Discount factor; closer to 1 puts more weight on long-term reward
     gamma: float = 0.95

     # Train once every this many environment steps
-    train_freq: int = 1
+    train_freq: int = 4

     # Target network update interval
-    target_update_interval: int = 1000
+    target_update_interval: int = 2000

     # Initial exploration rate ε
     exploration_initial_eps: float = 1.0

     # Fraction of training over which ε decays from its initial to its final value
-    exploration_fraction: float = 0.6
+    exploration_fraction: float = 0.3

     # Final exploration rate ε
     exploration_final_eps: float = 0.02
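With these settings, stable-baselines3 applies a linear ε schedule over the first exploration_fraction of training. A sketch of the resulting schedule, assuming total_timesteps = 50000 as set at the bottom of this diff:

def epsilon(step, total=50000, frac=0.3, eps0=1.0, eps1=0.02):
    # Linear decay over the first frac * total steps, then held at eps1.
    progress = min(step / (frac * total), 1.0)
    return eps0 + progress * (eps1 - eps0)

# epsilon(0) = 1.0, epsilon(7500) = 0.51, epsilon(15000) and beyond = 0.02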

@@ -147,17 +147,19 @@ def simulate_one_supercycle(p: UFParams, L_s: float, t_bw_s: float):
     Returns (feasible, metrics dict)
     - Supports a dynamic CEB count: fixed 48 h interval
     - Adds daily production time and specific energy consumption
+    - Adds minimum-TMP tracking
     """
     L_h = float(L_s) / 3600.0  # filtration time per small cycle (h)

     tmp = p.TMP0
     max_tmp_during_filtration = tmp
+    min_tmp_during_filtration = tmp  # new: initialize the minimum TMP
     max_residual_increase = 0.0

-    # Total small-cycle duration (h): filtration duration + backwash duration
+    # Total small-cycle duration (h)
     t_small_cycle_h = (L_s + t_bw_s) / 3600.0

-    # Number of small (backwash) cycles per CEB interval = 48 h / small-cycle duration
+    # Number of small (backwash) cycles per CEB interval
     k_bw_per_ceb = int(np.floor(p.T_ceb_interval_h / t_small_cycle_h))
     if k_bw_per_ceb < 1:
         k_bw_per_ceb = 1  # at least one small cycle
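A worked example of the cycle count above, taking the docstring's fixed 48 h CEB interval and otherwise hypothetical durations:

import numpy as np

L_s, t_bw_s, T_ceb_interval_h = 4000.0, 50.0, 48.0   # illustrative values only
t_small_cycle_h = (L_s + t_bw_s) / 3600.0            # 1.125 h per small cycle
k_bw_per_ceb = int(np.floor(T_ceb_interval_h / t_small_cycle_h))
# -> 42 backwash cycles between chemical cleans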

@@ -172,192 +174,209 @@ def simulate_one_supercycle(p: UFParams, L_s: float, t_bw_s: float):
     for _ in range(k_bw_per_ceb):
         tmp_run_start = tmp

-        # TMP rise during the filtration phase
+        # TMP rise during the filtration phase
         dtmp = _delta_tmp(p, L_h)
-        tmp_peak = tmp_run_start + dtmp  # peak TMP = TMP at run start + rise during filtration
+        tmp_peak = tmp_run_start + dtmp

         # Constraint 1: the peak must not exceed the hard upper limit
-        if tmp_peak > p.TMP_max + 1e-12:
+        if tmp_peak > p.TMP_max + 1e-12:
             return False, {"reason": "TMP_max violated during filtration", "TMP_peak": tmp_peak}

-        if tmp_peak > max_tmp_during_filtration:  # if the peak exceeds the current maximum
-            max_tmp_during_filtration = tmp_peak  # update the maximum
+        # Update the maximum and minimum TMP
+        if tmp_peak > max_tmp_during_filtration:
+            max_tmp_during_filtration = tmp_peak
+        if tmp_run_start < min_tmp_during_filtration:  # new: track the minimum TMP at run start
+            min_tmp_during_filtration = tmp_run_start

         # Physical backwash
-        phi = phi_bw_of(p, L_s, t_bw_s)  # backwash removal fraction
-        tmp_after_bw = tmp_peak - phi * (tmp_peak - tmp_run_start)  # TMP after backwash = peak - removal fraction * (peak - run start)
+        phi = phi_bw_of(p, L_s, t_bw_s)
+        tmp_after_bw = tmp_peak - phi * (tmp_peak - tmp_run_start)

         # Constraint 2: limit the residual increase per run
-        residual_inc = tmp_after_bw - tmp_run_start  # residual increase = TMP after backwash - TMP at run start
-        if residual_inc > p.dTMP + 1e-12:  # if the residual increase exceeds its per-run limit
+        residual_inc = tmp_after_bw - tmp_run_start
+        if residual_inc > p.dTMP + 1e-12:
             return False, {
-                "reason": "residual TMP increase after BW exceeded dTMP",  # infeasible
-                "residual_increase": residual_inc,  # residual increase per run
+                "reason": "residual TMP increase after BW exceeded dTMP",
+                "residual_increase": residual_inc,
                 "limit_dTMP": p.dTMP
             }
-        if residual_inc > max_residual_increase:  # if it exceeds the current maximum
-            max_residual_increase = residual_inc  # update the maximum
+        if residual_inc > max_residual_increase:
+            max_residual_increase = residual_inc

-        tmp = tmp_after_bw  # update TMP
+        tmp = tmp_after_bw

     # CEB
-    tmp_after_ceb = p.TMP0  # TMP after chemical backwash (CEB)
+    tmp_after_ceb = p.TMP0

     # Volumes and recovery
-    V_feed_super = k_bw_per_ceb * p.q_UF * L_h  # feed volume = cycles per CEB * filtration flow * filtration duration
-    V_loss_super = k_bw_per_ceb * _v_bw_m3(p, t_bw_s) + p.v_ceb_m3  # loss volume = backwash volume + CEB water volume
-    V_net = max(0.0, V_feed_super - V_loss_super)  # net product volume = feed volume - loss volume
-    recovery = max(0.0, V_net / max(V_feed_super, 1e-12))  # recovery = net product volume / feed volume
+    V_feed_super = k_bw_per_ceb * p.q_UF * L_h
+    V_loss_super = k_bw_per_ceb * _v_bw_m3(p, t_bw_s) + p.v_ceb_m3
+    V_net = max(0.0, V_feed_super - V_loss_super)
+    recovery = max(0.0, V_net / max(V_feed_super, 1e-12))
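# Editor's note (illustration only): rough numbers for the volume bookkeeping above.
# q_UF, v_bw and v_ceb are hypothetical; only the formulas follow the code.
#   k_bw_per_ceb = 42, L_h = 4000/3600 h, q_UF = 360 m3/h, v_bw = 15 m3, v_ceb = 30 m3
#   V_feed = 42 * 360 * 1.111  ≈ 16800 m3
#   V_loss = 42 * 15 + 30      = 660 m3
#   recovery = (16800 - 660) / 16800 ≈ 0.961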

     # Time and net delivery rate
-    T_super_h = k_bw_per_ceb * (L_s + t_bw_s) / 3600.0 + p.t_ceb_s / 3600.0  # supercycle time = cycles per CEB * (filtration + backwash) / 3600 + CEB duration / 3600
-    net_delivery_rate_m3ph = V_net / max(T_super_h, 1e-12)  # net delivery rate = net product volume / supercycle time
+    T_super_h = k_bw_per_ceb * (L_s + t_bw_s) / 3600.0 + p.t_ceb_s / 3600.0
+    net_delivery_rate_m3ph = V_net / max(T_super_h, 1e-12)

     # Headroom ratio and hard cap
-    headroom_ratio = max_tmp_during_filtration / max(p.TMP_max, 1e-12)  # headroom ratio = peak filtration TMP / hard limit
-    if headroom_ratio > p.headroom_hardcap + 1e-12:  # if the ratio exceeds the hard cap
-        return False, {"reason": "headroom hardcap exceeded", "headroom_ratio": headroom_ratio}  # infeasible
+    headroom_ratio = max_tmp_during_filtration / max(p.TMP_max, 1e-12)
+    if headroom_ratio > p.headroom_hardcap + 1e-12:
+        return False, {"reason": "headroom hardcap exceeded", "headroom_ratio": headroom_ratio}

     # —— New metric 1: daily production time (h/d) ——
-    daily_prod_time_h = k_bw_per_ceb * L_h / T_super_h * 24.0  # daily production time = cycles per CEB * filtration duration / supercycle time * 24
+    daily_prod_time_h = k_bw_per_ceb * L_h / T_super_h * 24.0

     # —— New metric 2: specific energy consumption (kWh/m³) ——
-    closest_L = min(energy_lookup.keys(), key=lambda x: abs(x - L_s))  # closest filtration duration in the lookup
-    ton_water_energy = energy_lookup[closest_L]  # energy for the closest filtration duration
+    closest_L = min(energy_lookup.keys(), key=lambda x: abs(x - L_s))
+    ton_water_energy = energy_lookup[closest_L]
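# Editor's note: energy_lookup is used here but defined elsewhere in the file;
# it presumably maps candidate filtration durations (s) to a measured specific
# energy (kWh/m3). A purely hypothetical shape, to show the nearest-key lookup:
#   energy_lookup = {3800.0: 0.052, 4400.0: 0.050, 5000.0: 0.049, 5600.0: 0.048}
#   L_s = 4750.0  ->  closest_L = 5000.0  ->  ton_water_energy = 0.049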

     info = {
-        "recovery": recovery,  # recovery
-        "V_feed_super_m3": V_feed_super,  # feed volume
-        "V_loss_super_m3": V_loss_super,  # loss volume
-        "V_net_super_m3": V_net,  # net product volume
-        "supercycle_time_h": T_super_h,  # supercycle time
-        "net_delivery_rate_m3ph": net_delivery_rate_m3ph,  # net delivery rate
-        "max_TMP_during_filtration": max_tmp_during_filtration,  # peak TMP during filtration
-        "max_residual_increase_per_run": max_residual_increase,  # largest residual increase per run
-        "phi_bw_effective": phi,  # effective backwash removal fraction
-        "TMP_after_ceb": tmp_after_ceb,  # TMP after CEB
-        "headroom_ratio": headroom_ratio,  # headroom ratio
-        "daily_prod_time_h": daily_prod_time_h,  # daily production time
-        "ton_water_energy_kWh_per_m3": ton_water_energy,  # specific energy consumption
-        "k_bw_per_ceb": k_bw_per_ceb  # small cycles per CEB interval
+        "recovery": recovery,
+        "V_feed_super_m3": V_feed_super,
+        "V_loss_super_m3": V_loss_super,
+        "V_net_super_m3": V_net,
+        "supercycle_time_h": T_super_h,
+        "net_delivery_rate_m3ph": net_delivery_rate_m3ph,
+        "max_TMP_during_filtration": max_tmp_during_filtration,
+        "min_TMP_during_filtration": min_tmp_during_filtration,  # new: minimum TMP
+        "max_residual_increase_per_run": max_residual_increase,
+        "phi_bw_effective": phi,
+        "TMP_after_ceb": tmp_after_ceb,
+        "headroom_ratio": headroom_ratio,
+        "daily_prod_time_h": daily_prod_time_h,
+        "ton_water_energy_kWh_per_m3": ton_water_energy,
+        "k_bw_per_ceb": k_bw_per_ceb
     }

     return True, info

 def _score(p: UFParams, rec: dict) -> float:
-    """Composite score: higher is better. A different TMP0 changes max_TMP, hence the penalty, hence the chosen solution."""
-    # Dimensionless net delivery rate
-    rate_norm = rec["net_delivery_rate_m3ph"] / max(p.q_UF, 1e-12)  # net delivery rate / filtration flow, e.g. 1000 m3/h / 360 m3/h = 2.7778
-    headroom_penalty = (rec["max_TMP_during_filtration"] / max(p.TMP_max, 1e-12)) ** p.r_headroom  # headroom penalty
-    reward = (p.w_rec * rec["recovery"] + p.w_rate * rate_norm - p.w_headroom * headroom_penalty)  # reward
-    return reward
+    """Composite score: higher is better. Nonlinear amplification of reward differences sharpens the distinction between good and bad actions."""
+
+    # —— Dimensionless net delivery rate ——
+    rate_norm = rec["net_delivery_rate_m3ph"] / max(p.q_UF, 1e-12)
+
+    # —— TMP soft penalty (sigmoid) ——
+    tmp_ratio = rec["max_TMP_during_filtration"] / max(p.TMP_max, 1e-12)
+    k = 10.0
+    headroom_penalty = 1.0 / (1.0 + np.exp(-k * (tmp_ratio - 1.0)))
+
+    # —— Base reward (roughly 0.6–0.9) ——
+    base_reward = (
+        p.w_rec * rec["recovery"]
+        + p.w_rate * rate_norm
+        - p.w_headroom * headroom_penalty
+    )
+
+    # —— Nonlinear amplification: squared mapping + scaling ——
+    # Amplifies the gap between good and bad actions while capping the magnitude to keep TD errors moderate
+    amplified_reward = (base_reward - 0.5) ** 2 * 5.0
+
+    # —— Optional: keep the sign to distinguish sub-threshold rewards
+    if base_reward < 0.5:
+        amplified_reward = -amplified_reward
+
+    return amplified_reward
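# Editor's note (rough numbers only): how the new shaping spreads scores apart.
# The sigmoid penalty is about 0.27 at tmp_ratio = 0.9 and exactly 0.5 at the TMP
# limit, so it stays small until the peak approaches TMP_max. The squared
# amplification then maps base rewards of 0.62 / 0.75 / 0.88 to roughly
# 0.07 / 0.31 / 0.72:
#   amplified = (base - 0.5) ** 2 * 5.0        (negated when base < 0.5)
#   (0.62 - 0.5)**2 * 5 = 0.072
#   (0.75 - 0.5)**2 * 5 = 0.3125
#   (0.88 - 0.5)**2 * 5 = 0.722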

 def set_global_seed(seed: int):
     """Fix all global random seeds so training is reproducible."""
-    random.seed(seed)  # random seed
-    np.random.seed(seed)  # random seed
-    torch.manual_seed(seed)  # random seed
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
     torch.cuda.manual_seed_all(seed)  # if using a GPU
-    torch.backends.cudnn.deterministic = True  # deterministic
-    torch.backends.cudnn.benchmark = False  # disable cuDNN auto-tuning
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False


 class UFSuperCycleEnv(gym.Env):
     """Ultrafiltration system environment (decisions at the supercycle level)."""

     metadata = {"render_modes": ["human"]}

-    def __init__(self, base_params, max_episode_steps: int = 10):
-        super(UFSuperCycleEnv, self).__init__()  # initialize the environment
+    def __init__(self, base_params, max_episode_steps: int = 20):
+        super(UFSuperCycleEnv, self).__init__()

-        self.base_params = base_params  # UFParams instance
-        self.current_params = copy.deepcopy(base_params)  # UFParams instance
-        self.max_episode_steps = max_episode_steps  # maximum number of steps
-        self.current_step = 0  # current step count
+        self.base_params = base_params
+        self.current_params = copy.deepcopy(base_params)
+        self.max_episode_steps = max_episode_steps
+        self.current_step = 0

         # Build the discrete action space
         self.L_values = np.arange(
-            self.base_params.L_min_s,  # filtration duration lower bound
-            self.base_params.L_max_s + self.base_params.L_step_s,  # filtration duration upper bound
-            self.base_params.L_step_s  # filtration duration step
+            self.base_params.L_min_s,
+            self.base_params.L_max_s + self.base_params.L_step_s,
+            self.base_params.L_step_s
         )
         self.t_bw_values = np.arange(
-            self.base_params.t_bw_min_s,  # backwash duration lower bound
-            self.base_params.t_bw_max_s + self.base_params.t_bw_step_s,  # backwash duration upper bound
-            self.base_params.t_bw_step_s  # backwash duration step
+            self.base_params.t_bw_min_s,
+            self.base_params.t_bw_max_s + self.base_params.t_bw_step_s,
+            self.base_params.t_bw_step_s
        )

-        self.num_L = len(self.L_values)  # number of filtration-duration values
-        self.num_bw = len(self.t_bw_values)  # number of backwash-duration values
+        self.num_L = len(self.L_values)
+        self.num_bw = len(self.t_bw_values)

-        # Single discrete action space; spaces.Discrete(n) defines a space of n discrete actions or observations, the integers 0 to n-1
-        self.action_space = spaces.Discrete(self.num_L * self.num_bw)  # discrete action space
+        # Single discrete action space
+        self.action_space = spaces.Discrete(self.num_L * self.num_bw)

-        # State space: normalized [TMP0]; Box defines a continuous space, typically used for continuous-valued observations or actions
+        # State now includes TMP0, the previous action (L_s, t_bw_s), and the peak TMP of the current cycle
+        # All state normalization is handled inside _get_obs
         self.observation_space = spaces.Box(
-            low=np.array([0.0], dtype=np.float32),  # single dimension, TMP0 only
-            high=np.array([1.0], dtype=np.float32),  # single dimension, TMP0 only
-            dtype=np.float32,
-            shape=(1,)  # explicitly specify a 1-D shape
+            low=np.zeros(4, dtype=np.float32),
+            high=np.ones(4, dtype=np.float32),
+            dtype=np.float32
        )

         # Initialize the state
-        self.reset(seed=None)  # reset the environment
+        self.last_action = (self.base_params.L_min_s, self.base_params.t_bw_min_s)
+        self.max_TMP_during_filtration = self.current_params.TMP0
+        self.reset(seed=None)

     def _get_obs(self):
-        # Raw state
-        TMP0 = self.current_params.TMP0
-        # Normalize the state
-        TMP0_norm = (TMP0 - 0.01) / (0.05 - 0.01)
+        TMP0 = self.current_params.TMP0
+        TMP0_norm = (TMP0 - 0.01) / (0.05 - 0.01)
+
+        L_s, t_bw_s = self.last_action
+        L_norm = (L_s - self.base_params.L_min_s) / (self.base_params.L_max_s - self.base_params.L_min_s)
+        t_bw_norm = (t_bw_s - self.base_params.t_bw_min_s) / (self.base_params.t_bw_max_s - self.base_params.t_bw_min_s)

-        return np.array([TMP0_norm], dtype=np.float32)  # state
+        max_TMP_norm = (self.max_TMP_during_filtration - 0.01) / (0.05 - 0.01)
+
+        return np.array([TMP0_norm, L_norm, t_bw_norm, max_TMP_norm], dtype=np.float32)
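# Editor's note: a quick check of the 4-dimensional observation with hypothetical
# values; the 0.01–0.05 normalization window is taken from the code above, and
# values outside it would fall outside the declared [0, 1] Box.
#   TMP0 = 0.02, L_s = 5000 s, t_bw_s = 50 s, max_TMP = 0.045
#   obs = [(0.02-0.01)/0.04, (5000-3800)/2200, (50-40)/20, (0.045-0.01)/0.04]
#       = [0.25, 0.545..., 0.5, 0.875]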

     def _get_action_values(self, action):
-        """Decode the discrete action."""
-        L_idx = action // self.num_bw  # filtration duration index
-        t_bw_idx = action % self.num_bw  # backwash duration index
-        return self.L_values[L_idx], self.t_bw_values[t_bw_idx]  # decoded action
+        L_idx = action // self.num_bw
+        t_bw_idx = action % self.num_bw
+        return self.L_values[L_idx], self.t_bw_values[t_bw_idx]
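# Editor's note: with the updated grids, np.arange yields 5 backwash values
# (40–60 s in 5 s steps), so a flat action index decodes as, for example:
#   action = 17  ->  L_idx = 17 // 5 = 3, t_bw_idx = 17 % 5 = 2
#   i.e. L_s = 3800 + 3 * 60 = 3980 s paired with t_bw_s = 50 s
# The L grid's stop value (L_max_s + L_step_s) lets the top entry slightly
# overshoot L_max_s, which is why step() clips the decoded durations.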

     def reset(self, seed=None, options=None):
-        """Reset the environment."""
         super().reset(seed=seed)
-
-        # Randomly initialize TMP0
-        self.current_params.TMP0 = np.random.uniform(0.01, 0.05)
-        # Initialize the step counter
+        self.current_params.TMP0 = np.random.uniform(0.01, 0.03)
         self.current_step = 0
-
-        return self._get_obs(), {}  # Gymnasium requires returning (obs, info)
+        self.last_action = (self.base_params.L_min_s, self.base_params.t_bw_min_s)
+        self.max_TMP_during_filtration = self.current_params.TMP0
+        return self._get_obs(), {}

     def step(self, action):
-        """Run one supercycle."""
         self.current_step += 1
-
-        # Decode the action into filtration and backwash durations
-        L_s, t_bw_s = self._get_action_values(action)
-
-        # Keep both durations within bounds (np.clip clamps to the range)
-        L_s = np.clip(L_s, self.base_params.L_min_s, self.base_params.L_max_s)
-        t_bw_s = np.clip(t_bw_s, self.base_params.t_bw_min_s, self.base_params.t_bw_max_s)
-
-        # Record the current (normalized) state
-        current_obs = self._get_obs()
+        L_s, t_bw_s = self._get_action_values(action)
+        L_s = np.clip(L_s, self.base_params.L_min_s, self.base_params.L_max_s)
+        t_bw_s = np.clip(t_bw_s, self.base_params.t_bw_min_s, self.base_params.t_bw_max_s)

         # Simulate the supercycle
         feasible, info = simulate_one_supercycle(self.current_params, L_s, t_bw_s)

-        # Compute the reward
         if feasible:
-            reward = _score(self.current_params, info)
+            reward = _score(self.current_params, info)
             self.current_params.TMP0 = info["TMP_after_ceb"]
+            self.max_TMP_during_filtration = info["max_TMP_during_filtration"]
             terminated = False
         else:
             reward = -20
             terminated = True

-        # Check whether the maximum number of steps has been reached
         truncated = self.current_step >= self.max_episode_steps
-
-        # Get the new state
+        self.last_action = (L_s, t_bw_s)
         next_obs = self._get_obs()

         info["feasible"] = feasible
@@ -370,39 +389,39 @@ class UFEpisodeRecorder:
     """Records the decisions and outcomes within an episode."""

     def __init__(self):
-        self.episode_data = []  # per-episode decisions and outcomes
+        self.episode_data = []
         self.current_episode = []

     def record_step(self, obs, action, reward, done, info):
         """Record one step."""
         step_data = {
-            "obs": obs.copy(),  # new observation
-            "action": action.copy(),  # action
-            "reward": reward,  # reward
-            "done": done,  # done flag
-            "info": info.copy() if info else {}  # info
+            "obs": obs.copy(),
+            "action": action.copy(),
+            "reward": reward,
+            "done": done,
+            "info": info.copy() if info else {}
         }
-        self.current_episode.append(step_data)  # append to the current episode
+        self.current_episode.append(step_data)

         if done:
-            self.episode_data.append(self.current_episode)  # archive the finished episode
-            self.current_episode = []
+            self.episode_data.append(self.current_episode)
+            self.current_episode = []

     def get_episode_stats(self, episode_idx=-1):
         """Get statistics for one episode."""
         if not self.episode_data:
             return {}

-        episode = self.episode_data[episode_idx]  # selected episode
-        total_reward = sum(step["reward"] for step in episode)  # total reward
-        avg_recovery = np.mean([step["info"].get("recovery", 0) for step in episode if "recovery" in step["info"]])  # average recovery
-        feasible_steps = sum(1 for step in episode if step["info"].get("feasible", False))  # number of feasible steps
+        episode = self.episode_data[episode_idx]
+        total_reward = sum(step["reward"] for step in episode)
+        avg_recovery = np.mean([step["info"].get("recovery", 0) for step in episode if "recovery" in step["info"]])
+        feasible_steps = sum(1 for step in episode if step["info"].get("feasible", False))

         return {
-            "total_reward": total_reward,  # total reward
-            "avg_recovery": avg_recovery,  # average recovery
-            "feasible_steps": feasible_steps,  # number of feasible steps
-            "total_steps": len(episode)  # total number of steps
+            "total_reward": total_reward,
+            "avg_recovery": avg_recovery,
+            "feasible_steps": feasible_steps,
+            "total_steps": len(episode)
         }
@@ -416,23 +435,23 @@ class UFTrainingCallback(BaseCallback):
     """

     def __init__(self, recorder, verbose=0):
-        super(UFTrainingCallback, self).__init__(verbose)
-        self.recorder = recorder
+        super(UFTrainingCallback, self).__init__(verbose)
+        self.recorder = recorder

     def _on_step(self) -> bool:
         try:
-            new_obs = self.locals.get("new_obs")  # new observation
-            actions = self.locals.get("actions")  # action
-            rewards = self.locals.get("rewards")  # reward
-            dones = self.locals.get("dones")  # done flag
-            infos = self.locals.get("infos")  # info
+            new_obs = self.locals.get("new_obs")
+            actions = self.locals.get("actions")
+            rewards = self.locals.get("rewards")
+            dones = self.locals.get("dones")
+            infos = self.locals.get("infos")

             if len(new_obs) > 0:
-                step_obs = new_obs[0]  # new observation
-                step_action = actions[0] if actions is not None else None  # action
-                step_reward = rewards[0] if rewards is not None else 0.0  # reward
-                step_done = dones[0] if dones is not None else False  # done flag
-                step_info = infos[0] if infos is not None else {}  # info
+                step_obs = new_obs[0]
+                step_action = actions[0] if actions is not None else None
+                step_reward = rewards[0] if rewards is not None else 0.0
+                step_done = dones[0] if dones is not None else False
+                step_info = infos[0] if infos is not None else {}

                 # Print info for the current step
                 if self.verbose:
@@ -440,11 +459,11 @@ class UFTrainingCallback(BaseCallback):

                 # Record the data
                 self.recorder.record_step(
-                    obs=step_obs,  # new observation
-                    action=step_action,  # action
-                    reward=step_reward,  # reward
-                    done=step_done,  # done flag
-                    info=step_info,  # info
+                    obs=step_obs,
+                    action=step_action,
+                    reward=step_reward,
+                    done=step_done,
+                    info=step_info,
                 )

         except Exception as e:
@@ -454,112 +473,82 @@ class UFTrainingCallback(BaseCallback):
         return True


+
+
 class DQNTrainer:
     def __init__(self, env, params, callback=None):
-        """
-        Initialize the DQN trainer
-        :param env: RL environment
-        :param params: DQNParams instance
-        :param callback: optional training callback
-        """
-        self.env = env  # environment
-        self.params = params  # DQNParams instance
-        self.callback = callback  # training callback
-        self.log_dir = self._create_log_dir()  # log directory
-        self.model = self._create_model()  # model
+        self.env = env
+        self.params = params
+        self.callback = callback
+        self.log_dir = self._create_log_dir()
+        self.model = self._create_model()

     def _create_log_dir(self):
-        """
-        Auto-generate the log directory name: core hyperparameters + timestamp
-        """
-        timestamp = time.strftime("%Y%m%d-%H%M%S")  # timestamp
+        timestamp = time.strftime("%Y%m%d-%H%M%S")
         log_name = (
-            f"DQN_lr{self.params.learning_rate}_buf{self.params.buffer_size}_bs{self.params.batch_size}"  # log directory name
-            f"_gamma{self.params.gamma}_exp{self.params.exploration_fraction}"  # log directory name
-            f"_{self.params.remark}_{timestamp}"  # log directory name
+            f"DQN_lr{self.params.learning_rate}_buf{self.params.buffer_size}_bs{self.params.batch_size}"
+            f"_gamma{self.params.gamma}_exp{self.params.exploration_fraction}"
+            f"_{self.params.remark}_{timestamp}"
         )
-        log_dir = os.path.join("./uf_dqn_tensorboard", log_name)  # log directory
-        os.makedirs(log_dir, exist_ok=True)  # create the log directory
+        log_dir = os.path.join("./uf_dqn_tensorboard", log_name)
+        os.makedirs(log_dir, exist_ok=True)
         return log_dir

     def _create_model(self):
-        """
-        Create the DQN model from the parameters
-        """
         return DQN(
-            policy="MlpPolicy",  # policy network
-            env=self.env,  # environment
-            learning_rate=self.params.learning_rate,  # learning rate
-            buffer_size=self.params.buffer_size,  # replay buffer size
-            learning_starts=self.params.learning_starts,  # steps collected before learning starts
-            batch_size=self.params.batch_size,  # samples drawn per update
-            gamma=self.params.gamma,  # discount factor; closer to 1 weights long-term reward more
-            train_freq=self.params.train_freq,  # train every this many steps
-            target_update_interval=self.params.target_update_interval,  # target network update interval
-            exploration_initial_eps=self.params.exploration_initial_eps,  # initial exploration rate ε
-            exploration_fraction=self.params.exploration_fraction,  # fraction of training over which ε decays
-            exploration_final_eps=self.params.exploration_final_eps,  # final exploration rate ε
+            policy="MlpPolicy",
+            env=self.env,
+            learning_rate=self.params.learning_rate,
+            buffer_size=self.params.buffer_size,  # a large buffer keeps the experience diverse
+            learning_starts=self.params.learning_starts,
+            batch_size=self.params.batch_size,
+            gamma=self.params.gamma,
+            train_freq=self.params.train_freq,
+            target_update_interval=1,
+            tau=0.005,
+            exploration_initial_eps=self.params.exploration_initial_eps,
+            exploration_fraction=self.params.exploration_fraction,
+            exploration_final_eps=self.params.exploration_final_eps,
             verbose=1,
             tensorboard_log=self.log_dir
+            # replay_buffer_class is no longer specified; the default ReplayBuffer is used
         )
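# Editor's note: hardcoding target_update_interval=1 together with tau=0.005
# overrides DQNParams.target_update_interval (2000) and switches the target net
# from periodic hard copies to a soft (Polyak) update on every training step,
# roughly:
#   theta_target <- tau * theta_online + (1 - tau) * theta_target
# In stable-baselines3 this is what utils.polyak_update applies when tau < 1.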

     def train(self, total_timesteps: int):
-        """
-        Train the DQN model, with optional custom callback
-        """
         if self.callback:
-            self.model.learn(total_timesteps=total_timesteps, callback=self.callback)  # with the custom callback
+            self.model.learn(total_timesteps=total_timesteps, callback=self.callback)
         else:
-            self.model.learn(total_timesteps=total_timesteps)  # without a callback
+            self.model.learn(total_timesteps=total_timesteps)
         print(f"Training finished; logs saved to: {self.log_dir}")

     def save(self, path=None):
-        """
-        Save the model to the given path
-        """
         if path is None:
-            path = os.path.join(self.log_dir, "dqn_model.zip")  # model file name
+            path = os.path.join(self.log_dir, "dqn_model.zip")
         self.model.save(path)
         print(f"Model saved to: {path}")

     def load(self, path):
-        """
-        Load a model from the given path
-        """
-        self.model = DQN.load(path, env=self.env)  # load the model
+        self.model = DQN.load(path, env=self.env)
         print(f"Model loaded from {path}")


-def train_uf_rl_agent(params: UFParams, total_timesteps: int = 10000, seed: int = 2025):
-    """Train the UF RL agent (with a fixed random seed)."""
-
-    # === 1. Fix the global random seed ===
+def train_uf_rl_agent(params: UFParams, total_timesteps: int = 10000, seed: int = 2025):
     set_global_seed(seed)
+    recorder = UFEpisodeRecorder()
+    callback = UFTrainingCallback(recorder, verbose=1)

-    # === 2. Create the callback ===
-    recorder = UFEpisodeRecorder()  # records per-step data
-    callback = UFTrainingCallback(recorder, verbose=1)  # training callback
-
-    # === 3. Create the environment and fix its seed ===
     def make_env():
-        env = UFSuperCycleEnv(params)  # create the environment
-        env = Monitor(env)  # wrap with Monitor
+        env = UFSuperCycleEnv(params)
+        env = Monitor(env)
         return env

-    env = DummyVecEnv([make_env])  # create the vectorized environment
+    env = DummyVecEnv([make_env])

-    # === 4. Define the DQN parameters ===
     dqn_params = DQNParams()
-
-    # === 5. Create the trainer ===
-    trainer = DQNTrainer(env, dqn_params, callback=callback)
-
-    # === 6. Train the model ===
+    trainer = DQNTrainer(env, dqn_params, callback=callback)
     trainer.train(total_timesteps)
-
-    # === 7. Save the model ===
     trainer.save()

-    # === 8. Print training statistics ===
     stats = callback.recorder.get_episode_stats()
     print(f"Training complete - total reward: {stats.get('total_reward', 0):.2f}, average recovery: {stats.get('avg_recovery', 0):.3f}")

@@ -573,5 +562,5 @@ if __name__ == "__main__":

     # Train the RL agent
     print("Starting RL agent training...")
-    train_uf_rl_agent(params, total_timesteps=8000)
+    train_uf_rl_agent(params, total_timesteps=50000)
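After training, the saved policy can be sanity-checked with a short greedy rollout. A minimal sketch, assuming UFParams and UFSuperCycleEnv are importable from this module, that the model was saved as dqn_model.zip, and that step() returns the standard five-value Gymnasium tuple; model.predict is the stock stable-baselines3 inference call, and none of this appears in the diff itself.

from stable_baselines3 import DQN

params = UFParams()
env = UFSuperCycleEnv(params)
model = DQN.load("dqn_model.zip", env=env)               # path is illustrative

obs, _ = env.reset(seed=0)
for _ in range(env.max_episode_steps):
    action, _ = model.predict(obs, deterministic=True)   # greedy action
    obs, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        break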