فهرست منبع

删除 'models/pressure-predictor/20分钟TMP预测模型源码/predict.py'

zhanghao 5 ماه پیش
والد
کامیت
523cf7eb4a
1فایلهای تغییر یافته به همراه0 افزوده شده و 314 حذف شده
  1. 0 314
      models/pressure-predictor/20分钟TMP预测模型源码/predict.py

+ 0 - 314
models/pressure-predictor/20分钟TMP预测模型源码/predict.py

@@ -1,314 +0,0 @@
-import os
-import torch
-import pandas as pd
-import numpy as np
-import joblib
-import pywt
-from datetime import datetime, timedelta
-from torch.utils.data import DataLoader, TensorDataset
-from gat_lstm import GAT_LSTM    # 导入自定义的GAT-LSTM模型
-from tqdm import tqdm
-
-def set_seed(seed):
-    """设置随机种子,保证实验可重复性"""
-    import random
-    random.seed(seed)
-    os.environ['PYTHONHASHSEED'] = str(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    torch.cuda.manual_seed_all(seed)
-    torch.backends.cudnn.deterministic = True
-    torch.backends.cudnn.benchmark = False
-
-class Predictor:
-    """预测器类,用于加载数据、模型并执行预测流程"""
-    def __init__(self):
-        self.seq_len = 10    # 输入序列长度(历史时间步)
-        self.output_size = 5    # 预测步长(未来预测的时间步数)
-        self.labels_num = 16    # 预测目标数量(16个待预测的指标)
-        self.feature_num = 79   # 输入特征总维度
-        self.step_size = 5      # 数据采样步长(每隔step_size取一个样本)
-        self.dropout = 0        # dropout概率(防止过拟合)
-        self.lr = 0.01          # 学习率(训练时使用,预测时仅作参数记录)
-        self.num_heads = 8      # 注意力头数(模型结构参数)
-        self.hidden_size = 32   # 隐藏层维度
-        self.batch_size = 512   # 批处理大小
-        self.num_layers = 1     # LSTM层数
-        self.resolution = 60    # 数据分辨率(原始数据每隔60条取一条,下采样)
-        self.test_start_date = '2025-07-01'  # 测试集起始日期(初始值,会动态更新)
-        self.wavelet = 'db4'    # 小波变换类型(预留,未实际使用)
-        self.level = 3          # 小波分解层数(预留)
-        self.level_after = 4    # 后续小波处理层数(预留)
-        self.mode = 'soft'      # 小波阈值模式(预留)
-        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")  # 计算设备(GPU优先)
-        self.model_path = 'model.pth'   # 模型权重保存路径
-        self.output_csv_path = 'predictions.csv'   # 预测结果保存路径
-        self.random_seed = 1314    # 随机种子
-        self.uf_threshold = 0.001   # UF指标阈值(预留)
-        self.ro_threshold = 0.01    # RO指标阈值(预留)
-        self.flow_threshold = 1.0   # 流量阈值(预留)
-
-        set_seed(self.random_seed)    # 初始化随机种子
-        self.scaler = joblib.load('scaler.pkl')    # 加载数据归一化器(训练时保存)
-        self.model = None         # 模型实例(后续加载)
-        self.edge_index = None    # 图结构边索引(图模型用)
-        self.test_loader = None   # 测试数据加载器(后续创建)
-        
-    def reorder_columns(self, df):
-        """
-        调整数据列顺序,确保与训练时的特征顺序一致
-        避免因列顺序不一致导致模型输入特征错位
-        """
-        desired_order = [
-            'index',
-            'C.M.FT_ZGJJY1@out','C.M.RO1_FT_JS@out','C.M.RO2_FT_JS@out','C.M.RO3_FT_JS@out',
-            'C.M.RO4_FT_JS@out','C.M.UF1_FT_JS@out','C.M.UF2_FT_JS@out','C.M.UF3_FT_JS@out',
-            'C.M.UF4_FT_JS@out','C.M.UF_FT_ZCS@out','C.M.FT_ZGJJY2@out','C.M.FT_ZGJJY3@out',
-            'C.M.FT_ZGJJY4@out','C.M.RO1_PT_JS@out','C.M.RO2_PT_JS@out','C.M.RO3_PT_JS@out',
-            'C.M.UF1_PT_JS@out','C.M.UF2_PT_JS@out','C.M.UF3_PT_JS@out','C.M.UF4_PT_JS@out',
-            'C.M.LT_JSC@out','C.M.RO1_PT_CS@out','C.M.RO1_PT_DJ2@out','C.M.RO2_PT_CS@out',
-            'C.M.RO2_PT_DJ2@out','C.M.RO3_PT_CS@out','C.M.RO3_PT_DJ2@out','C.M.RO4_PT_CS@out',
-            'C.M.RO4_PT_DJ2@out','C.M.RO4_PT_JS@out','C.M.LT_HCl@out','C.M.LT_NaClO@out',
-            'C.M.LT_PAC@out','C.M.LT_QSC@out','C.M.RO_Cond_ZCS@out','C.M.RO_TT_ZJS@out',
-            'C.M.UF1_JSF_kd@out','C.M.UF2_JSF_kd@out','C.M.UF_GSB4_fre@out','C.M.UF_ORP_ZCS@out',
-            'C.M.JYB2_ZGJ1_fre@out','C.M.JYB2_ZGJ2_fre@out','C.M.JYB2_ZGJ3_fre@out','C.M.JYB2_ZGJ4_fre@out',
-            'C.M.RO1_GYB_fre@out','C.M.RO2_GYB_fre@out','C.M.RO3_GYB_fre@out','C.M.RO4_GYB_fre@out',
-            'C.M.UF3_JSF_kd@out','C.M.UF4_JSF_kd@out','C.M.UF_FXB2_fre@out','C.M.RO1_DJB_fre@out',
-            'C.M.RO1_GYBF_kd@out','C.M.RO2_DJB_fre@out','C.M.RO2_GYBF_kd@out','C.M.RO3_DJB_fre@out',
-            'C.M.RO3_GYBF_kd@out','C.M.RO4_DJB_fre@out','C.M.RO4_GYBF_kd@out',
-            'C.M.UF1_DB@press_PV','C.M.UF2_DB@press_PV','C.M.UF3_DB@press_PV','C.M.UF4_DB@press_PV',
-            'C.M.RO1_DB@DPT_1','C.M.RO2_DB@DPT_1','C.M.RO3_DB@DPT_1','C.M.RO4_DB@DPT_1',
-            'C.M.RO1_DB@DPT_2','C.M.RO2_DB@DPT_2','C.M.RO3_DB@DPT_2','C.M.RO4_DB@DPT_2',
-            'RO1_CSFlow','RO2_CSFlow','RO3_CSFlow','RO4_CSFlow'
-        ]
-        return df.loc[:, desired_order]
-
-    def process_date(self, data):
-        """
-        处理日期列,生成周期性时间特征(捕捉时间周期性模式)
-        包括:分钟级正弦/余弦特征(每日周期)、年中日正弦/余弦特征(年度周期)
-        """
-        if 'index' in data.columns:
-            data = data.rename(columns={'index': 'date'})
-        data['date'] = pd.to_datetime(data['date'])
-        data['minute_of_day'] = data['date'].dt.hour * 60 + data['date'].dt.minute
-        data['day_of_year'] = data['date'].dt.dayofyear
-        
-        # 周期性编码(将时间转换为正弦/余弦值,确保周期性连续)
-        data['minute_sin'] = np.sin(2 * np.pi * data['minute_of_day'] / 1440)  # 分钟正弦特征
-        data['minute_cos'] = np.cos(2 * np.pi * data['minute_of_day'] / 1440)  # 分钟余弦特征
-        data['day_year_sin'] = np.sin(2 * np.pi * data['day_of_year'] / 366)   # 年中日正弦特征
-        data['day_year_cos'] = np.cos(2 * np.pi * data['day_of_year'] / 366)   # 年中日余弦特征
-        # 移除原始时间列(仅保留编码后的特征)
-        data.drop(columns=['minute_of_day', 'day_of_year'], inplace=True)
-        
-        # 调整列顺序:日期 + 时间特征 + 其他特征
-        time_features = ['minute_sin', 'minute_cos', 'day_year_sin', 'day_year_cos']
-        other_columns = [col for col in data.columns if col not in ['date'] + time_features]
-        return data[['date'] + time_features + other_columns]
-
-    def scaler_data(self, data):
-        """
-        对数据进行归一化(使用训练时保存的scaler)
-        保持与训练数据的归一化方式一致(0-1缩放)
-        """
-        date_col = data[['date']]
-        data_to_scale = data.drop(columns=['date'])
-        scaled = self.scaler.transform(data_to_scale)
-        scaled_df = pd.DataFrame(scaled, columns=data_to_scale.columns)
-        # 拼接日期列和归一化后的特征列
-        return pd.concat([date_col.reset_index(drop=True), scaled_df], axis=1)
-    
-    def remove_outliers(self, predictions):
-        """
-        用四分位法处理预测结果中的异常值
-        异常值定义:小于Q1-1.5*IQR或大于Q3+1.5*IQR的值
-        异常值替换为正常值的平均值(避免极端值影响)
-        """
-        cleaned = predictions.copy()
-        # 遍历每个特征列(16个标签)
-        for col in range(cleaned.shape[1]):
-            values = cleaned[:, col]
-            # 计算四分位数
-            q1 = np.percentile(values, 25)
-            q3 = np.percentile(values, 75)
-            iqr = q3 - q1
-            # 异常值边界
-            lower_bound = q1 - 1.5 * iqr
-            upper_bound = q3 + 1.5 * iqr
-            # 筛选正常值
-            normal_values = values[(values >= lower_bound) & (values <= upper_bound)]
-            # 用正常值的平均值替换异常值
-            if len(normal_values) > 0:
-                mean_normal = np.mean(normal_values)
-                cleaned[(values < lower_bound) | (values > upper_bound), col] = mean_normal
-        return cleaned
-    
-    def smooth_predictions(self, predictions):
-        """
-        对预测结果进行加权平滑处理,减少预测波动
-        采用滑动窗口加权平均:中间值权重为2,前后邻居权重为1(边缘值特殊处理)
-        """
-        smoothed = predictions.copy()
-        n_timesteps = predictions.shape[0]
-        if n_timesteps <= 1:
-            return smoothed
-        
-        # 遍历每个特征列
-        for col in range(predictions.shape[1]):
-            values = predictions[:, col]
-            # 第一个值:加权前两个值(避免边缘过度平滑)
-            smoothed[0, col] = (2 * values[0] + values[1]) / 3
-            # 中间值:加权前后邻居(核心平滑)
-            for i in range(1, n_timesteps - 1):
-                smoothed[i, col] = (values[i-1] + 2 * values[i] + values[i+1]) / 4
-            # 最后一个值:加权最后两个值(避免边缘过度平滑)
-            smoothed[-1, col] = (values[-2] + 2 * values[-1]) / 3
-        return smoothed
-
-    def create_test_loader(self, df):
-        """
-        构建测试数据加载器(将原始数据转换为模型输入格式)
-        输入:预处理后的DataFrame
-        输出:PyTorch DataLoader(批量加载模型输入)
-        """
-        df['date'] = pd.to_datetime(df['date'])
-        # 计算时间间隔(根据分辨率,单位:分钟)
-        time_interval = pd.Timedelta(minutes=(4 * self.resolution / 60))
-        # 计算窗口时间跨度(确保能覆盖输入序列长度+预测步长)
-        window_time_span = time_interval * (self.seq_len + 2)
-        # 调整测试集起始时间(确保有足够的历史数据构建输入序列)
-        adjusted_test_start = pd.to_datetime(self.test_start_date) - window_time_span
-        # 筛选所需的历史数据
-        test_df = df[df['date'] >= adjusted_test_start].reset_index(drop=True)
-
-        test_df = test_df.drop(columns=['date'])
-
-        # 构建监督学习数据集(输入序列+目标序列的占位)
-        feature_columns = test_df.columns.tolist()
-        cols = []
-        
-        # 构建输入序列(历史seq_len个时间步的特征)
-        for col in feature_columns:
-            for i in range(self.seq_len - 1, -1, -1):
-                cols.append(test_df[[col]].shift(i))   # 滞后i步的特征(t-0到t-(seq_len-1))
-                
-        # 构建目标序列占位(未来output_size个时间步的标签,预测时不使用真实值)
-        for i in range(1, self.output_size + 1):
-            for col in feature_columns[-self.labels_num:]:
-                cols.append(test_df[[col]].shift(-i))    # 超前i步的标签(t+1到t+output_size)
-                
-        # 合并列并按步长采样,最后取最后一行作为预测输入(最新的历史数据)
-        dataset = pd.concat(cols, axis=1).iloc[::self.step_size]
-        dataset = dataset.iloc[[-1]]
-    
-        # 提取输入特征(前n_features_total列)
-        n_features_total = self.feature_num * self.seq_len
-        supervised_data = dataset.iloc[:, :n_features_total]
-
-        # 转换为模型输入格式:[样本数, 序列长度, 特征数]
-        X = supervised_data.values.reshape(-1, self.seq_len, self.feature_num)
-        X = torch.tensor(X, dtype=torch.float32).to(self.device)
-        tensor_dataset = TensorDataset(X)
-        loader = DataLoader(tensor_dataset, batch_size=self.batch_size, shuffle=False)
-        return loader
-
-    def load_data(self, df):
-        """
-        数据加载主流程:重排列、下采样、日期处理、归一化、创建测试加载器
-        确保输入数据格式与训练时一致
-        """
-        df = self.reorder_columns(df)
-        df = df.iloc[::self.resolution, :].reset_index(drop=True)
-        df = self.process_date(df)
-        df = self.scaler_data(df)
-        self.test_loader = self.create_test_loader(df)
-        self.edge_index = torch.load('edge_index.pt', weights_only=True)
-
-    def load_model(self):
-        """加载模型结构和预训练权重,并设置为评估模式"""
-        self.model = GAT_LSTM(self).to(self.device)
-        if self.edge_index is not None:
-            self.model.set_edge_index(self.edge_index.to(self.device))   # 设置图边索引
-        self.model.load_state_dict(torch.load(self.model_path, map_location=self.device, weights_only=True))
-        self.model.eval()
-
-    def predict(self, df):
-        """
-        执行预测主流程:更新测试起始时间、加载数据、加载模型、执行预测、反归一化
-        输入:原始数据DataFrame
-        输出:反归一化后的预测结果(numpy数组)
-        """
-        
-        # 更新测试起始时间为输入数据最新时间+4分钟(预测起始点)
-        self.test_start_date = (pd.to_datetime(df['index']).max() + timedelta(minutes=4)).strftime("%Y-%m-%d %H:%M:%S")
-        self.load_data(df)
-        self.load_model()
-
-        all_predictions = []
-        with torch.no_grad():
-            for batch in self.test_loader:
-                inputs = batch[0].to(self.device)
-                outputs = self.model(inputs)
-                all_predictions.append(outputs.cpu().numpy())
-        
-        # 拼接所有批次的预测结果,并重塑为[时间步, 标签数]
-        predictions = np.concatenate(all_predictions, axis=0).reshape(-1, self.labels_num)
-        
-        # 反归一化(仅对标签列,使用训练时的scaler参数)
-        from sklearn.preprocessing import MinMaxScaler
-        inverse_scaler = MinMaxScaler()
-        inverse_scaler.min_ = self.scaler.min_[-self.labels_num:]
-        inverse_scaler.scale_ = self.scaler.scale_[-self.labels_num:]
-        predictions = inverse_scaler.inverse_transform(predictions)
-        
-        # 可选:异常值处理和平滑(当前注释掉,可根据需求启用)
-        # predictions = self.remove_outliers(predictions)  # 处理异常值
-        # predictions = self.smooth_predictions(predictions)  # 平滑处理
-        return predictions
-
-    def save_predictions(self, predictions):
-        """
-        将预测结果保存为CSV文件,包含时间戳和各指标的预测值
-        输入:反归一化后的预测结果(numpy数组)
-        """
-        start_time = datetime.strptime(self.test_start_date, "%Y-%m-%d %H:%M:%S")
-        time_interval = timedelta(minutes=(4 * self.resolution / 60))
-        timestamps = [start_time + i * time_interval for i in range(len(predictions))]
-
-        # 定义16个预测目标的原始列名
-        base_columns = [
-            'C.M.UF1_DB@press_PV', 'C.M.UF2_DB@press_PV', 'C.M.UF3_DB@press_PV', 'C.M.UF4_DB@press_PV',
-            'C.M.RO1_DB@DPT_1', 'C.M.RO2_DB@DPT_1', 'C.M.RO3_DB@DPT_1', 'C.M.RO4_DB@DPT_1',
-            'C.M.RO1_DB@DPT_2', 'C.M.RO2_DB@DPT_2', 'C.M.RO3_DB@DPT_2', 'C.M.RO4_DB@DPT_2',
-            'RO1_CSFlow', 'RO2_CSFlow', 'RO3_CSFlow', 'RO4_CSFlow'
-        ]
-        pred_columns = [f'{col}_pred' for col in base_columns]
-        df_result = pd.DataFrame(predictions, columns=pred_columns)
-        df_result.insert(0, 'date', timestamps)
-        df_result.to_csv(self.output_csv_path, index=False)
-        print(f"预测结果保存至:{self.output_csv_path}")
-
-if __name__ == '__main__':
-    """主函数:初始化预测器、加载数据、执行预测并保存结果"""
-    predictor = Predictor()
-    
-    base_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
-    data_dir = os.path.join(base_dir, 'datasets_xishan')
-    file_pattern = 'data_process_{}.csv'
-    file_indices = range(46, 50)
-
-    dfs = []
-    for i in file_indices:
-        file_path = os.path.join(data_dir, file_pattern.format(i))
-        if not os.path.exists(file_path):
-            raise FileNotFoundError(f"未找到文件: {file_path}")
-        print(f"读取文件:{file_path}")
-        df = pd.read_csv(file_path)
-        dfs.append(df)
-
-    df = pd.concat(dfs, ignore_index=True)
-    
-    predictions = predictor.predict(df)
-    predictor.save_predictions(predictions)