detection.py 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255
  1. import os
  2. import pandas as pd
  3. import numpy as np
  4. import joblib
  5. from sklearn.preprocessing import MinMaxScaler
  6. from sklearn.ensemble import IsolationForest
  7. from sklearn.svm import OneClassSVM
  8. # 设置中文字体显示(用于本地可视化时中文不乱码)
  9. import matplotlib.pyplot as plt
  10. plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
  11. # 数据文件夹路径(批量 CSV 存放目录)
  12. data_folder = "datasets_export_xishan"
  13. # 定义要读取的文件模板与对应列名(逐批次 data_exportX_{i}.csv, i=1..26)
  14. # 关键字段含义(业务约定):
  15. # - UF1Per:UF1膜渗透率(UF1Per)
  16. # - C.M.RO1_DB@DPT_1:RO1一段压差
  17. # - C.M.RO1_DB@DPT_2:RO1二段压差
  18. # - RO1_CSFlow:RO1产水流量
  19. file_info = {
  20. "data_export5_{}.csv": ["UF1Per", "UF2Per", "UF3Per", "UF4Per"],
  21. "data_export8_{}.csv": ["C.M.RO1_DB@DPT_1", "C.M.RO1_DB@DPT_2",
  22. "C.M.RO2_DB@DPT_1", "C.M.RO2_DB@DPT_2"],
  23. "data_export9_{}.csv": ["C.M.RO3_DB@DPT_1", "C.M.RO3_DB@DPT_2",
  24. "C.M.RO4_DB@DPT_1", "C.M.RO4_DB@DPT_2"],
  25. "data_export11_{}.csv": ["RO1_CSFlow", "RO2_CSFlow", "RO3_CSFlow", "RO4_CSFlow"]
  26. }
  27. def load_and_merge_data():
  28. """加载并合并所有数据文件
  29. - 按 file_info 中的模板逐列读取各批 CSV(i=1..26),仅选取关心的指标列
  30. - 将成功读取的数据 DataFrame 纵向拼接(ignore_index=True)
  31. - 返回合并后的 DataFrame;若无任何成功数据则报错
  32. """
  33. all_data = []
  34. # 循环读取每个文件模板和对应的编号1-26
  35. for file_template, columns in file_info.items():
  36. for i in range(1, 27):
  37. # 构建完整的文件路径
  38. filename = file_template.format(i)
  39. file_path = os.path.join(data_folder, filename)
  40. try:
  41. # 读取CSV文件的指定列(仅保留关心指标,减小内存开销)
  42. df = pd.read_csv(file_path, usecols=columns)
  43. all_data.append(df)
  44. print(f"成功读取: {filename}")
  45. except Exception as e:
  46. print(f"读取文件 {filename} 时出错: {e}")
  47. # 合并所有数据(纵向堆叠)
  48. if not all_data:
  49. raise ValueError("没有成功读取任何数据文件")
  50. merged_df = pd.concat(all_data, ignore_index=True)
  51. print(f"数据合并完成,总样本数: {len(merged_df)}")
  52. return merged_df
  53. def normalize_data(df):
  54. """对数据进行归一化处理(逐列 Min-Max 到 [0,1])
  55. - 拟合并转换每一列;保存 scaler 至 `scaler.pkl`
  56. - 返回归一化后的 DataFrame 以及 scaler(便于线上/后续反归一化)
  57. """
  58. scaler = MinMaxScaler()
  59. scaled_data = scaler.fit_transform(df)
  60. scaled_df = pd.DataFrame(scaled_data, columns=df.columns)
  61. # 保存归一化器
  62. joblib.dump(scaler, "scaler.pkl")
  63. print("归一化器已保存为 scaler.pkl")
  64. return scaled_df, scaler
  65. class IsolationForestModel:
  66. """孤立森林异常检测模型(逐列一维检测)
  67. - 特点:无需标注,适合检测孤立点;contamination='auto' 自动估计比例
  68. - 输出:predict → -1 表示异常,1 表示正常
  69. """
  70. def __init__(self):
  71. self.models = {} # 存储每列的模型
  72. def fit(self, df):
  73. """逐列训练孤立森林模型
  74. 参数:
  75. - df: 归一化后的 DataFrame(每列为一个监测指标)
  76. """
  77. for column in df.columns:
  78. # 准备数据(sklearn 需要二维输入)
  79. X = df[column].values.reshape(-1, 1)
  80. # 训练模型
  81. model = IsolationForest(n_estimators=100, contamination='auto', random_state=42)
  82. model.fit(X)
  83. self.models[column] = model
  84. print(f"已训练 {column} 的孤立森林模型")
  85. return self
  86. def predict(self, df):
  87. """预测异常值,-1 表示异常,1 表示正常(逐列)"""
  88. results = pd.DataFrame()
  89. for column in df.columns:
  90. if column not in self.models:
  91. raise ValueError(f"没有 {column} 的模型,请先训练")
  92. X = df[column].values.reshape(-1, 1)
  93. results[column] = self.models[column].predict(X)
  94. return results
  95. def save(self, filename="isolation_forest_models.pkl"):
  96. """保存模型"""
  97. joblib.dump(self, filename)
  98. print(f"孤立森林模型已保存为 {filename}")
  99. class ThreeSigmaModel:
  100. """3σ 异常检测模型(逐列基于均值±nσ 的阈值法)
  101. - 特点:简单、可解释;可调 n_sigma(默认 3)
  102. - 输出:-1 表示异常,1 表示正常
  103. """
  104. def __init__(self):
  105. self.stats = {} # 存储每列的均值和标准差
  106. def fit(self, df):
  107. """计算每列的均值和标准差,并缓存统计量"""
  108. for column in df.columns:
  109. mean = df[column].mean()
  110. std = df[column].std()
  111. self.stats[column] = (mean, std)
  112. print(f"已计算 {column} 的3σ统计量")
  113. return self
  114. def predict(self, df, n_sigma=3):
  115. """预测异常值(-1=异常,1=正常)
  116. 参数:
  117. - df: 归一化后的 DataFrame
  118. - n_sigma: 阈值宽度因子,默认 3,即 mean ± 3*std
  119. """
  120. results = pd.DataFrame()
  121. for column in df.columns:
  122. if column not in self.stats:
  123. raise ValueError(f"没有 {column} 的统计量,请先训练")
  124. mean, std = self.stats[column]
  125. # 计算上下限
  126. lower_bound = mean - n_sigma * std
  127. upper_bound = mean + n_sigma * std
  128. # 判断异常值
  129. is_outlier = (df[column] < lower_bound) | (df[column] > upper_bound)
  130. # 转换为-1(异常)和1(正常)
  131. results[column] = np.where(is_outlier, -1, 1)
  132. return results
  133. def save(self, filename="three_sigma_model.pkl"):
  134. """保存模型"""
  135. joblib.dump(self, filename)
  136. print(f"3σ模型已保存为 {filename}")
# Optional One-Class SVM detector, currently disabled: the whole class is
# wrapped in a bare triple-quoted string, which Python evaluates as a no-op
# expression statement, so ``OneClassSVMModel`` is NOT defined at runtime.
# To enable it, remove the surrounding ''' markers (and the matching
# commented-out calls in main()). The text inside the string is left as-is.
'''
class OneClassSVMModel:
    """One-Class SVM 异常检测模型(可选)
    - 对复杂边界更有表达力,但对参数与尺度敏感
    - 启用前请确保样本量与特征尺度合适
    """
    def __init__(self):
        self.models = {} # 存储每列的模型
    def fit(self, df):
        """逐列训练 One-Class SVM 模型"""
        for column in df.columns:
            # 准备数据(需要二维数组)
            X = df[column].values.reshape(-1, 1)
            # 训练模型
            model = OneClassSVM(nu=0.05, kernel='rbf', gamma='scale')
            model.fit(X)
            self.models[column] = model
            print(f"已训练 {column} 的One-Class SVM模型")
        return self
    def predict(self, df):
        """预测异常值,-1 表示异常,1 表示正常"""
        results = pd.DataFrame()
        for column in df.columns:
            if column not in self.models:
                raise ValueError(f"没有 {column} 的模型,请先训练")
            X = df[column].values.reshape(-1, 1)
            results[column] = self.models[column].predict(X)
        return results
    def save(self, filename="one_class_svm_models.pkl"):
        """保存模型"""
        joblib.dump(self, filename)
        print(f"One-Class SVM模型已保存为 {filename}")
'''
  170. def main():
  171. # 1. 加载并合并数据(仅读取关心指标列)
  172. print("开始加载数据...")
  173. merged_data = load_and_merge_data()
  174. # 2. 数据归一化(逐列 Min-Max 到 [0,1] 并保存 scaler)
  175. print("\n开始数据归一化...")
  176. normalized_data, scaler = normalize_data(merged_data)
  177. # 3. 训练并保存孤立森林模型(逐列训练)
  178. print("\n开始训练孤立森林模型...")
  179. if_model = IsolationForestModel()
  180. if_model.fit(normalized_data)
  181. if_model.save()
  182. # 4. 训练并保存3σ模型(逐列计算统计量)
  183. print("\n开始训练3σ模型...")
  184. ts_model = ThreeSigmaModel()
  185. ts_model.fit(normalized_data)
  186. ts_model.save()
  187. # 5. 训练并保存 One-Class SVM 模型(可选,默认已注释)
  188. # 如需启用,请取消下方注释和类定义(第166-203行)的注释
  189. # print("\n开始训练One-Class SVM模型...")
  190. # ocsvm_model = OneClassSVMModel()
  191. # ocsvm_model.fit(normalized_data)
  192. # ocsvm_model.save()
  193. print("\n所有模型训练和保存完成!")
  194. # 使用模型进行预测(示例:随机抽样 100 条)
  195. sample_data = normalized_data.sample(min(100, len(normalized_data))) # 随机取100个样本或全部样本(如果不足100个)
  196. # 孤立森林预测
  197. if_predictions = if_model.predict(sample_data)
  198. print("\n孤立森林预测结果(-1表示异常,1表示正常):")
  199. print(if_predictions)
  200. # 3σ预测
  201. ts_predictions = ts_model.predict(sample_data)
  202. print("\n3σ预测结果(-1表示异常,1表示正常):")
  203. print(ts_predictions)
  204. # One-Class SVM预测(可选,默认已注释)
  205. # 如需启用,请取消下方注释
  206. # ocsvm_predictions = ocsvm_model.predict(sample_data)
  207. # print("\nOne-Class SVM预测结果(-1表示异常,1表示正常):")
  208. # print(ocsvm_predictions)
# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()