regression.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244
  1. import sys
  2. sys.path.append("..")
  3. from Analysis.pearsonr import DFMat, PearsonrMat
  4. from Database.database_ import DatabaseParam
  5. import config
  6. import os
  7. import json
  8. import pandas as pd
  9. from sklearn.preprocessing import StandardScaler
  10. from sklearn.linear_model import Lasso, LassoCV, LinearRegression
  11. from sklearn.model_selection import TimeSeriesSplit
  12. import numpy as np
  13. import matplotlib.pyplot as plt
  14. from sklearn.metrics import r2_score
  15. import scipy.stats as stats
  16. from utils.tools import set_chinese_font
  17. from sklearn.metrics import mean_squared_error, mean_absolute_error
  18. from sklearn.model_selection import cross_val_score
  19. from statsmodels.stats.outliers_influence import OLSInfluence
  20. import statsmodels.api as sm
  21. class RegressionBox(PearsonrMat):
  22. """Lasso回归模型+OLS最小回归"""
  23. def __init__(self, keys_file_dir: str, min_records:int, db_param: DatabaseParam, transfer_file_dir:str, is_from_local:bool=True):
  24. super().__init__(keys_file_dir=keys_file_dir, min_records=min_records, db_param=db_param, transfer_file_dir=transfer_file_dir, is_from_local=is_from_local)
  25. self.lasso_info = {"help":"x,自变量名;y,因变量名;alpha,最佳参数;coef,自变量权重;intercept,截距;n_iter,迭代次数;dual_gap,对偶间隙;tol,对偶容忍"}
  26. self.ols_info = {"help":"x,自变量名;y,因变量名;最佳参数;coef,自变量权重;intercept,截距;n_iter,迭代次数;score,R2决定系数;"}
  27. self.ols_model = None # 最终的线性OLS回归模型
  28. def read_features_file(self):
  29. """加载特征文件,确定因变量Y和自变量X的标签"""
  30. path = config.LASSO_FEATURE_FILE_PATH
  31. if not os.path.exists(path):
  32. raise FileNotFoundError('文件未发现:', path)
  33. with open(path, "r", encoding="utf-8") as f:
  34. json_data = json.load(f)
  35. return json_data.get('targets'), json_data.get('features')
  36. def load_features(self)->tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
  37. y_label_name, x_label_name = self.read_features_file()
  38. # name转换为code
  39. y_label_code = [self.name_2code_dict.get(i) for i in y_label_name if self.name_2code_dict.get(i) in self.df_merge.columns.tolist()]
  40. x_label_code = [self.name_2code_dict.get(i) for i in x_label_name if self.name_2code_dict.get(i) in self.df_merge.columns.tolist()]
  41. if len(y_label_code) ==0 or len(x_label_code) == 0:
  42. raise ValueError('需要拟合的特征为空,请检查建模字段是否存在', (y_label_code, x_label_code))
  43. targets = self.df_merge.loc[:, y_label_code].copy()
  44. features = self.df_merge.loc[:, x_label_code].copy()
  45. time = self.df_merge.loc[:, ['time']].copy()
  46. return targets, features, time
  47. def select_features(self):
  48. pass
  49. def lasso_(self, y_value:np.ndarray, scaler_x_value:np.ndarray,n_splits:int=5, max_iter:int=10000):
  50. """实现Lasso回归分析,选择字段"""
  51. tscv = TimeSeriesSplit(n_splits=n_splits)
  52. # 寻找最优alphas
  53. lasso_model = LassoCV(alphas=100,
  54. cv=tscv,
  55. max_iter=max_iter,
  56. random_state=42,
  57. n_jobs=-1)
  58. lasso_model.fit(scaler_x_value, y_value)
  59. # 记录最优alphas
  60. self.lasso_info['alpha'] = lasso_model.alpha_
  61. # 记录截距
  62. self.lasso_info['intercept'] = lasso_model.intercept_
  63. # 记录迭代次数
  64. self.lasso_info['n_iter'] = lasso_model.n_iter_
  65. # 记录对偶间隙
  66. self.lasso_info['dual_gap'] = lasso_model.dual_gap_
  67. # 记录对偶容忍
  68. self.lasso_info['tol'] = lasso_model.tol
  69. # 记录权重
  70. self.lasso_info['coef'] = lasso_model.coef_
  71. def ols_(self, y_value:np.ndarray, scaler_x_value:np.ndarray)->LinearRegression:
  72. """OLS回归"""
  73. model = LinearRegression()
  74. model.fit(scaler_x_value, y_value)
  75. # 记录截距
  76. self.ols_info['intercept'] = model.intercept_
  77. # 记录权重
  78. self.ols_info['coef'] = model.coef_
  79. # 记录R²
  80. self.ols_info['score'] = model.score(scaler_x_value, y_value)
  81. return model
  82. def any_regression_full(self, target_name:str):
  83. """对任意输入字段进行全字段回归建模"""
  84. pass
  85. def any_regression_r_rank(self, target_name:str):
  86. """基于皮尔逊系数排序对字段进行回归建模"""
  87. # 所有需要建模的字段
  88. y_label_name = target_name
  89. x_label_name = self.query_r_rank_n(y_label_name) # 根据皮尔逊排序挑选相关性字段
  90. # 剔除自身字段
  91. if y_label_name in x_label_name:
  92. x_label_name.remove(y_label_name)
  93. # 拿到数据
  94. y_label_code = self.name_2code_dict[y_label_name]
  95. x_label_code = [self.name_2code_dict.get(name) for name in x_label_name]
  96. y = self.df_merge.loc[:, y_label_code].copy() # 真实值
  97. y = y.to_numpy()
  98. x = self.df_merge.loc[:, x_label_code].copy() # 预测值
  99. t = self.df_merge.loc[:, 'time'].copy() # 时间序列
  100. # 标准化
  101. scaler = StandardScaler()
  102. x = scaler.fit_transform(x)
  103. # Lasso回归,选择字段
  104. self.lasso_(y_value=y, scaler_x_value=x)
  105. self.lasso_info['x'] = x_label_name
  106. self.lasso_info['y'] = y_label_name
  107. # Lasso模型诊断与可视化
  108. print('\n===========Lasso训练结果==================')
  109. print(f'最优lambda:{self.lasso_info.get('alpha')}')
  110. print(f'Y:{self.lasso_info.get('y')}')
  111. print(f"Lasso系数:")
  112. for feat, coef in zip(x_label_name, self.lasso_info.get('coef')):
  113. print(f" {feat}: {coef}")
  114. print(f'截距:{self.lasso_info.get('intercept')}')
  115. print(f'迭代次数:{self.lasso_info.get('n_iter')}')
  116. print(f'对偶间隙:{self.lasso_info.get('dual_gap')}')
  117. print(f'对偶间隙容忍:{self.lasso_info.get('tol')}')
  118. # OLS回归,筛选系数不为零的向量
  119. mask = self.lasso_info.get('coef') != 0
  120. x_label_name = list(np.array(x_label_name)[mask])
  121. x_label_code = list(np.array(x_label_code)[mask])
  122. x = self.df_merge.loc[:, x_label_code] # 没进行归一化/标准化
  123. self.ols_model = self.ols_(y_value=y, scaler_x_value=x)
  124. self.ols_info['x'] = x_label_name
  125. self.ols_info['y'] = y_label_name
  126. # OLS模型诊断
  127. print('\n===========OLS训练结果==================')
  128. print(f"OLS 截距: {self.ols_info.get('intercept')}")
  129. print(f"OLS 系数:")
  130. for feat, coef in zip(x_label_name, self.ols_info.get('coef')):
  131. print(f" {feat}: {coef}")
  132. print(f"OLS R² (训练集): {self.ols_info.get('score'):.4f}")
  133. # 基本指标评价
  134. y_pred = self.ols_model.predict(x)
  135. residuals = y - y_pred
  136. mse = mean_squared_error(y, y_pred)
  137. rmse = np.sqrt(mse)
  138. mae = mean_absolute_error(y, y_pred)
  139. r2 = r2_score(y, y_pred)
  140. # 调整R²
  141. n = len(y)
  142. p = x.shape[1]
  143. adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
  144. print("\n===========模型性能指标==================:")
  145. print(f"均方误差 (MSE): {mse:.4f}")
  146. print(f"均方根误差 (RMSE): {rmse:.4f}")
  147. print(f"平均绝对误差 (MAE): {mae:.4f}")
  148. print(f"决定系数 (R²): {r2:.4f}")
  149. print(f"调整R²: {adj_r2:.4f}")
  150. # 创建诊断图
  151. set_chinese_font()
  152. fig, axes = plt.subplots(2, 3, figsize=(15, 10))
  153. # 1. 残差 vs 拟合值图(检查同方差性和线性关系)
  154. axes[0, 0].scatter(y_pred, residuals, alpha=0.6)
  155. axes[0, 0].axhline(y=0, color='red', linestyle='--')
  156. axes[0, 0].set_xlabel('拟合值')
  157. axes[0, 0].set_ylabel('残差')
  158. axes[0, 0].set_title('残差 vs 拟合值')
  159. # 2. 正态Q-Q图(检查残差正态性)
  160. stats.probplot(residuals, dist="norm", plot=axes[0, 1])
  161. axes[0, 1].set_title('Q-Q图(检查正态性)')
  162. # 3. 残差直方图
  163. axes[0, 2].hist(residuals, bins=30, density=True, alpha=0.7)
  164. axes[0, 2].set_xlabel('残差')
  165. axes[0, 2].set_ylabel('密度')
  166. axes[0, 2].set_title('残差分布')
  167. # 4. 观测值 vs 拟合值
  168. axes[1, 0].scatter(y, y_pred, alpha=0.6)
  169. min_val = min(y.min(), y_pred.min())
  170. max_val = max(y.max(), y_pred.max())
  171. axes[1, 0].plot([min_val, max_val], [min_val, max_val], 'red', linestyle='--')
  172. axes[1, 0].set_xlabel('实际值')
  173. axes[1, 0].set_ylabel('预测值')
  174. axes[1, 0].set_title('实际值 vs 预测值')
  175. r2 = r2_score(y, y_pred)
  176. axes[1, 0].text(0.05, 0.95, f'R² = {r2:.3f}', transform=axes[1, 0].transAxes)
  177. # 5. 残差的时间序列图(如果是时间序列数据)
  178. axes[1, 1].plot(residuals)
  179. axes[1, 1].axhline(y=0, color='red', linestyle='--')
  180. axes[1, 1].set_xlabel('时间/观测序号')
  181. axes[1, 1].set_ylabel('残差')
  182. axes[1, 1].set_title('残差时间序列')
  183. # 6. 尺度-位置图(检查同方差性)
  184. standardized_residuals = residuals / np.std(residuals)
  185. axes[1, 2].scatter(y_pred, np.sqrt(np.abs(standardized_residuals)), alpha=0.6)
  186. axes[1, 2].set_xlabel('拟合值')
  187. axes[1, 2].set_ylabel('√|标准化残差|')
  188. axes[1, 2].set_title('尺度-位置图')
  189. plt.tight_layout()
  190. plt.show()
  191. pass
  192. def any_regression_custom(self, target_name:str, path:str):
  193. """基于自定义字段进行回归建模,从文件读入建模字段"""
  194. def auto_fit(self, x_label_name:str, y_label_name:str, is_use_lasso:bool=True):
  195. """回归分析"""
  196. if __name__ == '__main__':
  197. # 数据库参数
  198. db_param = DatabaseParam(
  199. db_host=config.DB_HOST,
  200. db_user=config.DB_USER,
  201. db_password=config.DB_PASSWORD,
  202. db_name=config.DB_NAME,
  203. db_port=config.DB_PORT)
  204. my_box = RegressionBox(
  205. keys_file_dir=os.path.join(config.STATISTICS_FILE_DIR, config.STATISTICS_FILE_NAME),
  206. min_records = config.MIN_RECORDS, db_param = db_param,
  207. transfer_file_dir = os.path.join(config.ALL_ITEMS_FILE_DIR, config.TRANSFER_JSON_NAME))
  208. # 计算皮尔逊
  209. my_box.calculate_pearsonr_mat()
  210. # 进行回归分析
  211. my_box.any_regression_r_rank("RO1脱盐率")