"""
超滤强化学习环境模块
========================
本模块定义了超滤系统的强化学习环境，包括：
1. UFParams: 超滤系统参数配置类
2. 膜阻力与跨膜压差转换函数
3. simulate_one_supercycle: 超级周期模拟函数
4. calculate_reward: 奖励函数
5. is_dead_cycle: 失败判定函数
6. UFSuperCycleEnv: Gymnasium环境类

模块设计说明：
- 基于 Gymnasium (原OpenAI Gym) 标准接口
- 模拟超滤膜的"超级周期"运行（多次物理反洗 + 一次化学反洗）
- 强化学习智能体通过优化过滤时长和反洗时长来最大化回收率并控制污染累积
"""

import numpy as np
import gymnasium as gym
from gymnasium import spaces
from env.env_params import UFState, UFStateBounds, UFRewardParams, UFActionSpec
from env.uf_physics import UFPhysicsModel
from env.env_reset import ResetSampler
import copy


class UFSuperCycleEnv(gym.Env):
    """Gymnasium environment for ultrafiltration (UF) super-cycle scheduling.

    Every call to ``step`` simulates one super cycle through
    ``UFPhysicsModel.simulate_one_supercycle`` using the filtration duration and
    backwash duration chosen by the agent.

    Observation (8 values, float32, each clipped to [0, 1]), in this order:
        1. TMP: trans-membrane pressure of the state
        2. q_UF: filtration flow
        3. temp: water temperature
        4. R: membrane resistance
        5. nuK: short-term fouling coefficient
        6. slope: long-term fouling slope
        7. power: long-term fouling exponent
        8. ceb_removal: CEB removal capability

    Action (``spaces.Discrete``):
        A flat index into the grid ``L_values x t_bw_values`` where
        ``index = L_idx * num_bw + t_bw_idx``.
        - ``L_values``: np.arange(L_min_s, L_max_s, L_step_s)
        - ``t_bw_values``: np.arange(t_bw_min_s, t_bw_max_s, t_bw_step_s)
        NOTE(review): ``np.arange`` is half-open, so ``L_max_s`` and
        ``t_bw_max_s`` themselves are never selectable. Confirm this is
        intended before changing it; it alters the size of the action space.

    Reward:
        Sum of an economic term, a residual-fouling term and a TMP penalty;
        see ``_calculate_reward``.

    Episode end:
        - terminated: always ``False`` in the current implementation. The
          feasibility verdict of the physics model is only reported through
          ``info["feasible"]``.
        - truncated: ``True`` once ``current_step >= max_episode_steps``.
    """

    metadata = {"render_modes": ["human"]}

    # Empirical normalisation range for the membrane resistance; the state
    # bounds object does not define limits for it.
    _R_NORM_MIN = 100.0
    _R_NORM_MAX = 800.0

    def __init__(
            self,
            physics: UFPhysicsModel,
            reward_params: UFRewardParams,
            action_spec: UFActionSpec,
            statebounds: UFStateBounds,
            real_state_pool,
            max_episode_steps: int = 45,
            RANDOM_SEED=1024
    ):
        """Build the action/observation spaces and the initial-state sampler.

        Args:
            physics: UF physics model used for simulation and feasibility checks.
            reward_params: Weights, limits and prices used by the reward function.
            action_spec: Minimum, maximum and step of both action dimensions
                (``L_*_s`` for filtration, ``t_bw_*_s`` for backwash).
            statebounds: Bounds used to normalise observations; also passed to
                the reset sampler.
            real_state_pool: Passed through unchanged to ``ResetSampler``.
            max_episode_steps: Number of steps (super cycles) after which the
                episode is truncated. Per the original author's note one step
                is roughly 2-3 days, so the default 45 is about three months.
            RANDOM_SEED: Seed of the ``np.random.RandomState`` handed to the
                reset sampler.

        Raises:
            ValueError: If ``max_episode_steps`` is not positive (it is used as
                a divisor by the progress and reward computations).
        """
        super().__init__()

        if max_episode_steps <= 0:
            raise ValueError(
                f"max_episode_steps must be positive, got {max_episode_steps!r}"
            )

        self.RANDOM_SEED = RANDOM_SEED
        self.physics = physics
        self.reward_params = reward_params
        self.max_episode_steps = max_episode_steps
        self.current_step = 0

        # Episode-scoped attributes. reset() assigns the real values; they are
        # declared here so that the object is fully formed after construction.
        self.state = None
        self.tmp_over_limit_flag = False
        self.last_action = None
        self.max_TMP_during_filtration = None

        # -------- action space --------
        self.action_spec = action_spec
        self.L_values = np.arange(
            action_spec.L_min_s,
            action_spec.L_max_s,
            action_spec.L_step_s,
        )
        self.t_bw_values = np.arange(
            action_spec.t_bw_min_s,
            action_spec.t_bw_max_s,
            action_spec.t_bw_step_s,
        )
        self.num_L = len(self.L_values)
        self.num_bw = len(self.t_bw_values)
        self.action_space = spaces.Discrete(self.num_L * self.num_bw)

        # -------- observation space --------
        self.observation_space = spaces.Box(
            low=0.0,
            high=1.0,
            shape=(8,),
            dtype=np.float32,
        )

        self.state_bounds = statebounds
        self.real_state_pool = real_state_pool

        self.reset_sampler = ResetSampler(
            bounds=self.state_bounds,
            physics=physics,
            real_state_pool=self.real_state_pool,
            max_resample_attempts=50,
            random_state=np.random.RandomState(RANDOM_SEED),
        )

    def _generate_initial_state(self) -> UFState | None:
        """Draw one random initial state inside ``self.state_bounds``.

        Not called by ``reset`` in this class (which uses ``self.reset_sampler``).

        NOTE(review): sampling uses the global ``np.random`` stream, so it is
        affected neither by ``reset(seed=...)`` nor by ``RANDOM_SEED``.

        Returns:
            A ``UFState``, or ``None`` when the sampled slope/power/q_UF would
            require a ``nuK`` above ``nuK_max`` (the caller is expected to retry).
        """
        bounds = self.state_bounds
        membrane_area = 128 * 40.0  # effective membrane area (original author's value)

        # ---- 1. operating conditions ----
        TMP0 = np.random.uniform(bounds.TMP0_min, bounds.TMP0_max)
        q_UF = np.random.uniform(bounds.q_UF_min, bounds.q_UF_max)
        temp = np.random.uniform(bounds.temp_min, bounds.temp_max)

        # ---- 2. long-term fouling growth parameters ----
        slope = np.random.uniform(bounds.slope_min, bounds.slope_max)
        power = np.random.uniform(bounds.power_min, bounds.power_max)

        # ---- 3. constraint: the fouling growth rate must be achievable ----
        t_max = 60 if power >= 1 else 1
        nuK_floor = slope * power * (t_max ** (power - 1)) * (membrane_area / q_UF)
        if nuK_floor > bounds.nuK_max:
            # No admissible nuK exists for this draw.
            return None
        nuK = np.random.uniform(max(nuK_floor, bounds.nuK_min), bounds.nuK_max)

        # ---- 4. CEB removal ----
        ceb_removal = np.random.uniform(
            bounds.ceb_removal_min,
            bounds.ceb_removal_max,
        )

        # ---- 5. initial membrane resistance from the physics model ----
        R0 = self.physics.calculate_initial_resistance(
            TMP=TMP0,
            q_UF=q_UF,
            temp=temp,
        )

        return UFState(
            TMP=TMP0,
            q_UF=q_UF,
            temp=temp,
            R=R0,
            slope=slope,
            power=power,
            nuK=nuK,
            ceb_removal=ceb_removal,
        )

    def _get_training_progress(self) -> float:
        """Return ``current_step / max_episode_steps`` capped at 1.0.

        The value is handed to ``reset_sampler.sample`` by ``reset``.

        NOTE(review): despite the name this is the progress within the episode
        that just ended, not progress over the whole training run.
        """
        return min(1.0, self.current_step / self.max_episode_steps)

    def reset(self, seed=None, options=None, max_attempts: int = 10000):
        """Start a new episode from a sampled initial state.

        Candidates come from ``self.reset_sampler`` and are kept only when
        ``physics.check_dead_initial_state`` returns a truthy value for a run of
        ``max_episode_steps`` steps with ``L_min_s`` and ``t_bw_max_s``.

        NOTE(review): ``seed`` is forwarded to ``gym.Env.reset`` only; the reset
        sampler keeps using its own ``RandomState`` created in ``__init__``.

        Args:
            seed: Forwarded to ``gym.Env.reset``.
            options: Unused.
            max_attempts: Maximum number of candidates to try.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.

        Raises:
            RuntimeError: If no acceptable state is found in ``max_attempts`` tries.
        """
        super().reset(seed=seed)

        # Evaluated before current_step is zeroed below, so it reflects the
        # step count of the previous episode (0 on the very first reset).
        progress = self._get_training_progress()

        for _ in range(max_attempts):
            candidate = self.reset_sampler.sample(progress)
            if candidate is None:
                continue

            accepted = self.physics.check_dead_initial_state(
                init_state=candidate,
                max_steps=self.max_episode_steps,
                L_s=self.action_spec.L_min_s,
                t_bw_s=self.action_spec.t_bw_max_s,
            )
            if accepted:
                self.state = candidate
                break
        else:
            raise RuntimeError("无法生成可行初始状态")

        self.current_step = 0
        self.tmp_over_limit_flag = False
        self.last_action = None
        self.max_TMP_during_filtration = self.state.TMP

        return self.get_obs(self.state), {}

    def _get_state_copy(self):
        """Return a deep copy of the current state."""
        return copy.deepcopy(self.state)

    @staticmethod
    def _normalize(value, low, high):
        """Min-max scale ``value`` from ``[low, high]`` into ``[0, 1]`` with clipping.

        A zero-width range yields 0.0 instead of the NaN a plain division
        would produce (NaN is not removed by ``np.clip``).
        """
        span = high - low
        if span == 0:
            return 0.0
        return np.clip((value - low) / span, 0, 1)

    def get_obs(self, state):
        """Build the normalised observation vector for ``state``.

        Args:
            state: Object exposing ``TMP``, ``q_UF``, ``temp``, ``R``, ``nuK``,
                ``slope``, ``power`` and ``ceb_removal``.

        Returns:
            ``np.ndarray`` of shape (8,) and dtype float32 with every entry in
            [0, 1], ordered as documented on the observation space.
        """
        bounds = self.state_bounds

        # (value, lower bound, upper bound) per feature, in observation order.
        # TMP is scaled up to the hard TMP limit rather than TMP0_max.
        features = (
            (state.TMP, bounds.TMP0_min, bounds.global_TMP_hard_limit),
            (state.q_UF, bounds.q_UF_min, bounds.q_UF_max),
            (state.temp, bounds.temp_min, bounds.temp_max),
            (state.R, self._R_NORM_MIN, self._R_NORM_MAX),
            (state.nuK, bounds.nuK_min, bounds.nuK_max),
            (state.slope, bounds.slope_min, bounds.slope_max),
            (state.power, bounds.power_min, bounds.power_max),
            (state.ceb_removal, bounds.ceb_removal_min, bounds.ceb_removal_max),
        )

        return np.array(
            [self._normalize(value, low, high) for value, low, high in features],
            dtype=np.float32,
        )

    def get_action_values(self, action):
        """Decode a flat action index into ``(filtration duration, backwash duration)``.

        The index is interpreted as ``L_idx * num_bw + t_bw_idx``.
        """
        L_idx, t_bw_idx = divmod(action, self.num_bw)
        return self.L_values[L_idx], self.t_bw_values[t_bw_idx]

    def step(self, action):
        """Simulate one super cycle with the chosen action.

        Args:
            action: Flat index into the discrete action grid.

        Returns:
            ``(next_obs, reward, terminated, truncated, info)``. ``info`` is the
            dict returned by the physics model, extended with ``tmp_penalty``,
            ``econ_reward``, ``res_penalty``, ``feasible``, ``step``, ``L_s``
            and ``t_bw_s``.

        Raises:
            RuntimeError: If called before ``reset``.
        """
        if self.state is None:
            raise RuntimeError("step() called before reset(); call reset() first")

        self.current_step += 1

        L_s, t_bw_s = self.get_action_values(action)
        L_s = np.clip(L_s, self.action_spec.L_min_s, self.action_spec.L_max_s)
        t_bw_s = np.clip(t_bw_s, self.action_spec.t_bw_min_s, self.action_spec.t_bw_max_s)

        # Simulate the super cycle.
        info, next_state = self.physics.simulate_one_supercycle(
            state=self.state, L_s=L_s, t_bw_s=t_bw_s
        )
        # NOTE(review): per the original author's comment, True means the cycle
        # succeeded and False means it failed, despite the method name.
        feasible = self.physics.is_dead_cycle(info)

        peak_tmp = info["max_TMP_during_filtration"]

        # Sticky for the rest of the episode; only reset() clears it.
        if peak_tmp >= self.reward_params.global_TMP_hard_limit:
            self.tmp_over_limit_flag = True

        # Above the soft limit, look one cycle ahead with the same action so the
        # reward can penalise a rising TMP. The look-ahead state is discarded.
        info_next = None
        if peak_tmp > self.reward_params.global_TMP_soft_limit:
            info_next, _ = self.physics.simulate_one_supercycle(
                state=next_state, L_s=L_s, t_bw_s=t_bw_s
            )

        reward, tmp_penalty, econ_reward, res_penalty = self._calculate_reward(info, info_next)
        info["tmp_penalty"] = tmp_penalty
        info["econ_reward"] = econ_reward
        info["res_penalty"] = res_penalty

        self.state = next_state

        # The episode never terminates early; it only ends by truncation.
        terminated = False
        truncated = self.current_step >= self.max_episode_steps

        self.last_action = (L_s, t_bw_s)
        next_obs = self.get_obs(next_state)

        info["feasible"] = feasible
        info["step"] = self.current_step
        info["L_s"] = L_s.copy()
        info["t_bw_s"] = t_bw_s.copy()

        # NOTE: experiment with a terminal "stability" reward (currently disabled).
        # Idea: when the episode ends by truncation, add
        #     gamma * (exp(-alpha * |TMP_final - TMP_initial| / TMP_initial) - 1)
        # with alpha = 4.0 (sensitivity) and gamma = 5.0 (magnitude), i.e. a value
        # between 0 and -5, and mark the episode as terminated.
        # Result: the reward still converged, but the loss kept oscillating
        # between 2 and 3 and would not decrease (see reward_test & loss_test).
        # The hope was that the agent would learn to shrink the drift before a
        # large offset occurred, but in training the penalty fired repeatedly.
        # Presumably a large terminal reward/penalty does not propagate back
        # effectively to earlier steps; since the state contains no predictive
        # quantity the agent may perceive it as uncontrollable.
        # TODO: revisit once this can be improved.

        return next_obs, reward, terminated, truncated, info

    def _calculate_reward(self, info: dict, info_next=None):
        """Compute the step reward (economics + system stability).

        ``total = econ_reward + res_penalty + tmp_penalty`` where

        - ``tmp_penalty`` = TMP state penalty + TMP trend penalty.
          State penalty: ``-w_tmp_hard`` once the hard limit has been reached in
          this episode (``tmp_over_limit_flag``) or when the current peak TMP is
          not below the hard limit; 0 at or below the soft limit; otherwise
          ``-w_tmp * x ** p`` with ``x`` the relative position between the soft
          and hard limits.
          Trend penalty: ``-w_trend * max(next_peak - peak, 0)`` when
          ``info_next`` is given.
        - ``res_penalty`` = ``-tanh(k_res * (residual_ratio * max_episode_steps - 1))``.
        - ``econ_reward`` = ``-tanh(k_cost * (cost_norm - 0.5))`` with the cost
          built from energy and chemical consumption per ton of water.

        Args:
            info: Result dict of the simulated cycle; reads
                ``max_TMP_during_filtration``, ``residual_ratio``,
                ``ton_water_energy`` and ``ton_water_chem``.
            info_next: Optional result dict of the look-ahead cycle; reads
                ``max_TMP_during_filtration``.

        Returns:
            ``(total_reward, tmp_penalty, econ_reward, res_penalty)``.
        """
        params = self.reward_params

        # ---------------- TMP state penalty ----------------
        tmp = info["max_TMP_during_filtration"]
        tmp_soft = params.global_TMP_soft_limit
        tmp_hard = params.global_TMP_hard_limit

        if self.tmp_over_limit_flag:
            tmp_state_penalty = -params.w_tmp_hard
        elif tmp <= tmp_soft:
            tmp_state_penalty = 0.0
        elif tmp < tmp_hard:
            x = (tmp - tmp_soft) / (tmp_hard - tmp_soft)
            tmp_state_penalty = -params.w_tmp * (x ** params.p)
        else:
            tmp_state_penalty = -params.w_tmp_hard

        # ---------------- TMP trend penalty ----------------
        tmp_trend_penalty = 0.0
        if info_next is not None:
            # Only a rising TMP is penalised.
            delta_tmp = max(info_next["max_TMP_during_filtration"] - tmp, 0)
            tmp_trend_penalty = -params.w_trend * delta_tmp

        tmp_penalty = tmp_state_penalty + tmp_trend_penalty

        # ---------------- residual fouling penalty ----------------
        residual_ratio = info["residual_ratio"]
        # Reference level: an equal share of the episode per step.
        ref_residual = 1 / self.max_episode_steps
        res_penalty = -np.tanh(params.k_res * (residual_ratio / ref_residual - 1))

        # ---------------- economic cost (energy + chemicals) ----------------
        energy = info["ton_water_energy"]
        chemical = info["ton_water_chem"]
        # NOTE(review): the factor 100 on the chemical term is not explained in
        # this file; presumably a unit conversion. Confirm.
        cost = energy * params.energy_price + chemical * params.chemical_price * 100

        cost_norm = (cost - params.cost_low) / (params.cost_high - params.cost_low)
        econ_reward = -np.tanh(params.k_cost * (cost_norm - 0.5))

        # ---------------- total ----------------
        total_reward = econ_reward + res_penalty + tmp_penalty

        return total_reward, tmp_penalty, econ_reward, res_penalty