import os import torch import pandas as pd import numpy as np import joblib import pywt from datetime import datetime, timedelta from torch.utils.data import DataLoader, TensorDataset from gat_lstm import GAT_LSTM # 导入自定义的GAT-LSTM模型 from tqdm import tqdm def set_seed(seed): """设置随机种子,保证实验可重复性""" import random random.seed(seed) os.environ['PYTHONHASHSEED'] = str(seed) np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed(seed) torch.cuda.manual_seed_all(seed) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False class Predictor: """预测器类,用于加载数据、模型并执行预测流程""" def __init__(self): self.seq_len = 10 # 输入序列长度(历史时间步) self.output_size = 5 # 预测步长(未来预测的时间步数) self.labels_num = 16 # 预测目标数量(16个待预测的指标) self.feature_num = 79 # 输入特征总维度 self.step_size = 5 # 数据采样步长(每隔step_size取一个样本) self.dropout = 0 # dropout概率(防止过拟合) self.lr = 0.01 # 学习率(训练时使用,预测时仅作参数记录) self.num_heads = 8 # 注意力头数(模型结构参数) self.hidden_size = 64 # 隐藏层维度 self.batch_size = 512 # 批处理大小 self.num_layers = 1 # LSTM层数 self.resolution = 60 # 数据分辨率(原始数据每隔60条取一条,下采样) self.test_start_date = '2025-07-01' # 测试集起始日期(初始值,会动态更新) self.wavelet = 'db4' # 小波变换类型(预留,未实际使用) self.level = 3 # 小波分解层数(预留) self.level_after = 4 # 后续小波处理层数(预留) self.mode = 'soft' # 小波阈值模式(预留) self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # 计算设备(GPU优先) self.model_path = '20min_model.pth' # 模型权重保存路径 self.output_csv_path = '20min_predictions.csv' # 预测结果保存路径 self.random_seed = 1314 # 随机种子 self.min_rows = 600 # 定义最小数据行数要求(600行) self.uf_threshold = 0.001 # UF指标阈值(预留) self.ro_threshold = 0.01 # RO指标阈值(预留) self.flow_threshold = 1.0 # 流量阈值(预留) # 定义16个预测目标的原始列名 self.target_columns = [ 'C.M.UF1_DB@press_PV', 'C.M.UF2_DB@press_PV', 'C.M.UF3_DB@press_PV', 'C.M.UF4_DB@press_PV', 'UF1Per','UF2Per','UF3Per','UF4Per', 'C.M.RO1_DB@DPT_1', 'C.M.RO2_DB@DPT_1', 'C.M.RO3_DB@DPT_1', 'C.M.RO4_DB@DPT_1', 'C.M.RO1_DB@DPT_2', 'C.M.RO2_DB@DPT_2', 'C.M.RO3_DB@DPT_2', 'C.M.RO4_DB@DPT_2', ] self.raw_input_data = None set_seed(self.random_seed) # 初始化随机种子 self.scaler = joblib.load('20min_scaler.pkl') # 加载数据归一化器(训练时保存) self.model = None # 模型实例(后续加载) self.edge_index = None # 图结构边索引(图模型用) self.test_loader = None # 测试数据加载器(后续创建) def ensure_min_rows(self, df): """ 确保数据至少有600行,不足则进行前后补充 向前补充:使用最早的数据向前扩展 向后补充:使用最新的数据向后扩展 """ current_rows = len(df) if current_rows >= self.min_rows: return df # 计算需要补充的行数 need_rows = self.min_rows - current_rows print(f"数据行数不足{self.min_rows}行(当前{current_rows}行),需要补充{need_rows}行") # 计算时间间隔(假设数据是均匀采样的) time_col = 'index' df[time_col] = pd.to_datetime(df[time_col]) time_diff = (df[time_col].iloc[1] - df[time_col].iloc[0]).total_seconds() # 向前补充(使用最早的数据) forward_rows = need_rows // 2 if forward_rows > 0: earliest_data = df.iloc[0:1].copy() forward_data = [] for i in range(1, forward_rows + 1): new_row = earliest_data.copy() new_row[time_col] = earliest_data[time_col] - timedelta(seconds=time_diff * i) forward_data.append(new_row) forward_df = pd.concat(forward_data, ignore_index=True) df = pd.concat([forward_df, df], ignore_index=True) # 检查是否还需要向后补充 current_rows = len(df) if current_rows < self.min_rows: backward_rows = self.min_rows - current_rows latest_data = df.iloc[-1:].copy() backward_data = [] for i in range(1, backward_rows + 1): new_row = latest_data.copy() new_row[time_col] = latest_data[time_col] + timedelta(seconds=time_diff * i) backward_data.append(new_row) backward_df = pd.concat(backward_data, ignore_index=True) df = pd.concat([df, backward_df], ignore_index=True) print(f"数据补充完成,当前行数:{len(df)}行") return df def reorder_columns(self, df): """ 调整数据列顺序,确保与训练时的特征顺序一致 避免因列顺序不一致导致模型输入特征错位 """ desired_order = [ 'index', 'C.M.FT_ZGJJY1@out','C.M.RO1_FT_JS@out','C.M.RO2_FT_JS@out','C.M.RO3_FT_JS@out', 'C.M.RO4_FT_JS@out','C.M.UF1_FT_JS@out','C.M.UF2_FT_JS@out','C.M.UF3_FT_JS@out', 'C.M.UF4_FT_JS@out','C.M.UF_FT_ZCS@out','C.M.FT_ZGJJY2@out','C.M.FT_ZGJJY3@out', 'C.M.FT_ZGJJY4@out','C.M.RO1_PT_JS@out','C.M.RO2_PT_JS@out','C.M.RO3_PT_JS@out', 'C.M.UF1_PT_JS@out','C.M.UF2_PT_JS@out','C.M.UF3_PT_JS@out','C.M.UF4_PT_JS@out', 'C.M.LT_JSC@out','C.M.RO1_PT_CS@out','C.M.RO1_PT_DJ2@out','C.M.RO2_PT_CS@out', 'C.M.RO2_PT_DJ2@out','C.M.RO3_PT_CS@out','C.M.RO3_PT_DJ2@out','C.M.RO4_PT_CS@out', 'C.M.RO4_PT_DJ2@out','C.M.RO4_PT_JS@out','C.M.LT_HCl@out','C.M.LT_NaClO@out', 'C.M.LT_PAC@out','C.M.LT_QSC@out','C.M.RO_Cond_ZCS@out','C.M.RO_TT_ZJS@out', 'C.M.UF1_JSF_kd@out','C.M.UF2_JSF_kd@out','C.M.UF_GSB4_fre@out','C.M.UF_ORP_ZCS@out', 'C.M.JYB2_ZGJ1_fre@out','C.M.JYB2_ZGJ2_fre@out','C.M.JYB2_ZGJ3_fre@out','C.M.JYB2_ZGJ4_fre@out', 'C.M.RO1_GYB_fre@out','C.M.RO2_GYB_fre@out','C.M.RO3_GYB_fre@out','C.M.RO4_GYB_fre@out', 'C.M.UF3_JSF_kd@out','C.M.UF4_JSF_kd@out','C.M.UF_FXB2_fre@out','C.M.RO1_DJB_fre@out', 'C.M.RO1_GYBF_kd@out','C.M.RO2_DJB_fre@out','C.M.RO2_GYBF_kd@out','C.M.RO3_DJB_fre@out', 'C.M.RO3_GYBF_kd@out','C.M.RO4_DJB_fre@out','C.M.RO4_GYBF_kd@out', 'C.M.UF1_DB@press_PV','C.M.UF2_DB@press_PV','C.M.UF3_DB@press_PV','C.M.UF4_DB@press_PV', 'UF1Per','UF2Per','UF3Per','UF4Per', 'C.M.RO1_DB@DPT_1','C.M.RO2_DB@DPT_1','C.M.RO3_DB@DPT_1','C.M.RO4_DB@DPT_1', 'C.M.RO1_DB@DPT_2','C.M.RO2_DB@DPT_2','C.M.RO3_DB@DPT_2','C.M.RO4_DB@DPT_2', ] return df.loc[:, desired_order] def process_date(self, data): """ 处理日期列,生成周期性时间特征(捕捉时间周期性模式) 包括:分钟级正弦/余弦特征(每日周期)、年中日正弦/余弦特征(年度周期) """ if 'index' in data.columns: data = data.rename(columns={'index': 'date'}) data['date'] = pd.to_datetime(data['date']) data['minute_of_day'] = data['date'].dt.hour * 60 + data['date'].dt.minute data['day_of_year'] = data['date'].dt.dayofyear # 周期性编码(将时间转换为正弦/余弦值,确保周期性连续) data['minute_sin'] = np.sin(2 * np.pi * data['minute_of_day'] / 1440) # 分钟正弦特征 data['minute_cos'] = np.cos(2 * np.pi * data['minute_of_day'] / 1440) # 分钟余弦特征 data['day_year_sin'] = np.sin(2 * np.pi * data['day_of_year'] / 366) # 年中日正弦特征 data['day_year_cos'] = np.cos(2 * np.pi * data['day_of_year'] / 366) # 年中日余弦特征 # 移除原始时间列(仅保留编码后的特征) data.drop(columns=['minute_of_day', 'day_of_year'], inplace=True) # 调整列顺序:日期 + 时间特征 + 其他特征 time_features = ['minute_sin', 'minute_cos', 'day_year_sin', 'day_year_cos'] other_columns = [col for col in data.columns if col not in ['date'] + time_features] return data[['date'] + time_features + other_columns] def scaler_data(self, data): """ 对数据进行归一化(使用训练时保存的scaler) 保持与训练数据的归一化方式一致(0-1缩放) """ date_col = data[['date']] data_to_scale = data.drop(columns=['date']) scaled = self.scaler.transform(data_to_scale) scaled_df = pd.DataFrame(scaled, columns=data_to_scale.columns) # 拼接日期列和归一化后的特征列 return pd.concat([date_col.reset_index(drop=True), scaled_df], axis=1) def remove_outliers(self, predictions): """ 用四分位法处理预测结果中的异常值 异常值定义:小于Q1-1.5*IQR或大于Q3+1.5*IQR的值 异常值替换为正常值的平均值(避免极端值影响) """ cleaned = predictions.copy() # 遍历每个特征列(16个标签) for col in range(cleaned.shape[1]): values = cleaned[:, col] # 计算四分位数 q1 = np.percentile(values, 25) q3 = np.percentile(values, 75) iqr = q3 - q1 # 异常值边界 lower_bound = q1 - 1.5 * iqr upper_bound = q3 + 1.5 * iqr # 筛选正常值 normal_values = values[(values >= lower_bound) & (values <= upper_bound)] # 用正常值的平均值替换异常值 if len(normal_values) > 0: mean_normal = np.mean(normal_values) cleaned[(values < lower_bound) | (values > upper_bound), col] = mean_normal return cleaned def smooth_predictions(self, predictions): """ 对预测结果进行加权平滑处理,减少预测波动 采用滑动窗口加权平均:中间值权重为2,前后邻居权重为1(边缘值特殊处理) """ smoothed = predictions.copy() n_timesteps = predictions.shape[0] if n_timesteps <= 1: return smoothed # 遍历每个特征列 for col in range(predictions.shape[1]): values = predictions[:, col] # 第一个值:加权前两个值(避免边缘过度平滑) smoothed[0, col] = (2 * values[0] + values[1]) / 3 # 中间值:加权前后邻居(核心平滑) for i in range(1, n_timesteps - 1): smoothed[i, col] = (values[i-1] + 2 * values[i] + values[i+1]) / 4 # 最后一个值:加权最后两个值(避免边缘过度平滑) smoothed[-1, col] = (values[-2] + 2 * values[-1]) / 3 return smoothed def create_test_loader(self, df): """ 构建测试数据加载器(将原始数据转换为模型输入格式) 输入:预处理后的DataFrame 输出:PyTorch DataLoader(批量加载模型输入) """ df['date'] = pd.to_datetime(df['date']) # 计算时间间隔(根据分辨率,单位:分钟) time_interval = pd.Timedelta(minutes=(4 * self.resolution / 60)) # 计算窗口时间跨度(确保能覆盖输入序列长度+预测步长) window_time_span = time_interval * (self.seq_len + 20) # 调整测试集起始时间(确保有足够的历史数据构建输入序列) adjusted_test_start = pd.to_datetime(self.test_start_date) - window_time_span # 筛选所需的历史数据 test_df = df[df['date'] >= adjusted_test_start].reset_index(drop=True) test_df = test_df.drop(columns=['date']) # 构建监督学习数据集(输入序列+目标序列的占位) feature_columns = test_df.columns.tolist() cols = [] # 构建输入序列(历史seq_len个时间步的特征) for col in feature_columns: for i in range(self.seq_len - 1, -1, -1): cols.append(test_df[[col]].shift(i)) # 滞后i步的特征(t-0到t-(seq_len-1)) # 构建目标序列占位(未来output_size个时间步的标签,预测时不使用真实值) for i in range(1, self.output_size + 1): for col in feature_columns[-self.labels_num:]: cols.append(test_df[[col]].shift(-i)) # 超前i步的标签(t+1到t+output_size) # 合并列并按步长采样,最后取最后一行作为预测输入(最新的历史数据) dataset = pd.concat(cols, axis=1).iloc[::self.step_size] dataset = dataset.iloc[[-1]] # 提取输入特征(前n_features_total列) n_features_total = self.feature_num * self.seq_len supervised_data = dataset.iloc[:, :n_features_total] # 转换为模型输入格式:[样本数, 序列长度, 特征数] X = supervised_data.values.reshape(-1, self.seq_len, self.feature_num) X = torch.tensor(X, dtype=torch.float32).to(self.device) tensor_dataset = TensorDataset(X) loader = DataLoader(tensor_dataset, batch_size=self.batch_size, shuffle=False) return loader def load_data(self, df): """ 数据加载主流程:重排列、下采样、日期处理、归一化、创建测试加载器 确保输入数据格式与训练时一致 """ df = self.reorder_columns(df) df = df.iloc[::self.resolution, :].reset_index(drop=True) df = self.process_date(df) df = self.scaler_data(df) self.test_loader = self.create_test_loader(df) self.edge_index = torch.load('edge_index.pt', map_location=self.device, weights_only=True) def load_model(self): """加载模型结构和预训练权重,并设置为评估模式""" self.model = GAT_LSTM(self).to(self.device) if self.edge_index is not None: self.model.set_edge_index(self.edge_index.to(self.device)) # 设置图边索引 self.model.load_state_dict(torch.load(self.model_path, map_location=self.device, weights_only=True)) self.model.eval() def get_recent_values_as_fallback(self): """从原始输入数据中获取最近的output_size条记录作为备用输出""" # 确保原始数据已保存 if self.raw_input_data is None: raise ValueError("原始输入数据未保存,无法获取备用值") # 按时间排序并取最近的output_size条 recent_data = self.raw_input_data.sort_values('index').tail(self.output_size) # 若数据不足,用最后一条补充 if len(recent_data) < self.output_size: last_row = recent_data.iloc[-1:] if not recent_data.empty else pd.DataFrame( {col: [0.0] for col in self.target_columns}, index=[0]) while len(recent_data) < self.output_size: recent_data = pd.concat([recent_data, last_row], ignore_index=True) # 提取目标列值并返回 fallback_values = recent_data[self.target_columns].values return fallback_values def predict(self, df): """ 执行预测主流程:更新测试起始时间、加载数据、加载模型、执行预测、反归一化 输入:原始数据DataFrame 输出:反归一化后的预测结果(numpy数组) """ # 保存原始输入数据用于可能的降级策略 self.raw_input_data = df.copy() # 确保数据行数不少于600行 df = self.ensure_min_rows(df) # 更新测试起始时间为输入数据最新时间+4分钟(预测起始点) self.test_start_date = (pd.to_datetime(df['index']).max() + timedelta(minutes=4)).strftime("%Y-%m-%d %H:%M:%S") self.load_data(df) self.load_model() all_predictions = [] with torch.no_grad(): for batch in self.test_loader: inputs = batch[0].to(self.device) outputs = self.model(inputs) all_predictions.append(outputs.cpu().numpy()) # 拼接所有批次的预测结果,并重塑为[时间步, 标签数] predictions = np.concatenate(all_predictions, axis=0).reshape(-1, self.labels_num) # 反归一化(仅对标签列,使用训练时的scaler参数) from sklearn.preprocessing import MinMaxScaler inverse_scaler = MinMaxScaler() inverse_scaler.min_ = self.scaler.min_[-self.labels_num:] inverse_scaler.scale_ = self.scaler.scale_[-self.labels_num:] predictions = inverse_scaler.inverse_transform(predictions) # 可选:异常值处理和平滑(当前注释掉,可根据需求启用) # predictions = self.remove_outliers(predictions) # 处理异常值 # predictions = self.smooth_predictions(predictions) # 平滑处理 if np.isnan(predictions).any(): # 用备用值替换 predictions = self.get_recent_values_as_fallback() return predictions def save_predictions(self, predictions): """ 将预测结果保存为CSV文件,包含时间戳和各指标的预测值 输入:反归一化后的预测结果(numpy数组) """ start_time = datetime.strptime(self.test_start_date, "%Y-%m-%d %H:%M:%S") time_interval = timedelta(minutes=(4 * self.resolution / 60)) timestamps = [start_time + i * time_interval for i in range(len(predictions))] pred_columns = [f'{col}_pred' for col in self.target_columns] df_result = pd.DataFrame(predictions, columns=pred_columns) df_result.insert(0, 'date', timestamps) df_result.to_csv(self.output_csv_path, index=False) print(f"预测结果保存至:{self.output_csv_path}") if __name__ == '__main__': """主函数:初始化预测器、加载数据、执行预测并保存结果""" import json # 用于解析JSON结构 import os import pandas as pd from datetime import timedelta predictor = Predictor() # 读取JSON文件作为输入数据 json_file_path = 'pp.json' # pp.json文件路径,可根据实际位置修改 if not os.path.exists(json_file_path): raise FileNotFoundError(f"未找到文件: {json_file_path}") print(f"读取文件:{json_file_path}") # 解析JSON并提取data字段(不使用try,直接判断) with open(json_file_path, 'r', encoding='utf-8') as f: json_data = json.load(f) # 检查data字段存在性及格式 if 'data' not in json_data: raise ValueError("JSON文件中未找到'data'字段,请检查结构") data_list = json_data['data'] if not isinstance(data_list, list) or len(data_list) == 0: raise ValueError("'data'字段必须是非空列表") # 转换为DataFrame df = pd.DataFrame(data_list) # 检查并处理datetime列 if 'datetime' not in df.columns: raise ValueError("数据中未找到'datetime'字段,请检查键名") df = df.rename(columns={'datetime': 'index'}) # 转换index列为datetime格式 df['index'] = pd.to_datetime(df['index']) # 若格式错误会直接抛出异常 # 执行预测并保存结果 predictions = predictor.predict(df) # predictor.save_predictions(predictions)