# detection.py (7.7 KB)
  1. import os
  2. import pandas as pd
  3. import numpy as np
  4. import joblib
  5. from sklearn.preprocessing import MinMaxScaler
  6. from sklearn.ensemble import IsolationForest
  7. from sklearn.svm import OneClassSVM
  8. # 设置中文字体显示
  9. import matplotlib.pyplot as plt
  10. plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
  11. # 数据文件夹路径
  12. data_folder = "datasets_export_xishan"
  13. # 定义要读取的文件和对应的列
  14. file_info = {
  15. "data_export5_{}.csv": ["UF1Per", "UF2Per", "UF3Per", "UF4Per"],
  16. "data_export8_{}.csv": ["C.M.RO1_DB@DPT_1", "C.M.RO1_DB@DPT_2",
  17. "C.M.RO2_DB@DPT_1", "C.M.RO2_DB@DPT_2"],
  18. "data_export9_{}.csv": ["C.M.RO3_DB@DPT_1", "C.M.RO3_DB@DPT_2",
  19. "C.M.RO4_DB@DPT_1", "C.M.RO4_DB@DPT_2"],
  20. "data_export11_{}.csv": ["RO1_CSFlow", "RO2_CSFlow", "RO3_CSFlow", "RO4_CSFlow"]
  21. }
  22. def load_and_merge_data():
  23. """加载并合并所有数据文件"""
  24. all_data = []
  25. # 循环读取每个文件模板和对应的编号1-26
  26. for file_template, columns in file_info.items():
  27. for i in range(1, 27):
  28. # 构建完整的文件路径
  29. filename = file_template.format(i)
  30. file_path = os.path.join(data_folder, filename)
  31. try:
  32. # 读取CSV文件的指定列
  33. df = pd.read_csv(file_path, usecols=columns)
  34. all_data.append(df)
  35. print(f"成功读取: {filename}")
  36. except Exception as e:
  37. print(f"读取文件 {filename} 时出错: {e}")
  38. # 合并所有数据
  39. if not all_data:
  40. raise ValueError("没有成功读取任何数据文件")
  41. merged_df = pd.concat(all_data, ignore_index=True)
  42. print(f"数据合并完成,总样本数: {len(merged_df)}")
  43. return merged_df
  44. def normalize_data(df):
  45. """对数据进行归一化处理"""
  46. scaler = MinMaxScaler()
  47. scaled_data = scaler.fit_transform(df)
  48. scaled_df = pd.DataFrame(scaled_data, columns=df.columns)
  49. # 保存归一化器
  50. joblib.dump(scaler, "scaler.pkl")
  51. print("归一化器已保存为 scaler.pkl")
  52. return scaled_df, scaler
  53. class IsolationForestModel:
  54. """孤立森林异常检测模型"""
  55. def __init__(self):
  56. self.models = {} # 存储每列的模型
  57. def fit(self, df):
  58. """逐列训练孤立森林模型"""
  59. for column in df.columns:
  60. # 准备数据(需要二维数组)
  61. X = df[column].values.reshape(-1, 1)
  62. # 训练模型
  63. model = IsolationForest(n_estimators=100, contamination='auto', random_state=42)
  64. model.fit(X)
  65. self.models[column] = model
  66. print(f"已训练 {column} 的孤立森林模型")
  67. return self
  68. def predict(self, df):
  69. """预测异常值,-1表示异常,1表示正常"""
  70. results = pd.DataFrame()
  71. for column in df.columns:
  72. if column not in self.models:
  73. raise ValueError(f"没有 {column} 的模型,请先训练")
  74. X = df[column].values.reshape(-1, 1)
  75. results[column] = self.models[column].predict(X)
  76. return results
  77. def save(self, filename="isolation_forest_models.pkl"):
  78. """保存模型"""
  79. joblib.dump(self, filename)
  80. print(f"孤立森林模型已保存为 {filename}")
  81. class ThreeSigmaModel:
  82. """3σ异常检测模型"""
  83. def __init__(self):
  84. self.stats = {} # 存储每列的均值和标准差
  85. def fit(self, df):
  86. """计算每列的均值和标准差"""
  87. for column in df.columns:
  88. mean = df[column].mean()
  89. std = df[column].std()
  90. self.stats[column] = (mean, std)
  91. print(f"已计算 {column} 的3σ统计量")
  92. return self
  93. def predict(self, df, n_sigma=3):
  94. """预测异常值,-1表示异常,1表示正常"""
  95. results = pd.DataFrame()
  96. for column in df.columns:
  97. if column not in self.stats:
  98. raise ValueError(f"没有 {column} 的统计量,请先训练")
  99. mean, std = self.stats[column]
  100. # 计算上下限
  101. lower_bound = mean - n_sigma * std
  102. upper_bound = mean + n_sigma * std
  103. # 判断异常值
  104. is_outlier = (df[column] < lower_bound) | (df[column] > upper_bound)
  105. # 转换为-1(异常)和1(正常)
  106. results[column] = np.where(is_outlier, -1, 1)
  107. return results
  108. def save(self, filename="three_sigma_model.pkl"):
  109. """保存模型"""
  110. joblib.dump(self, filename)
  111. print(f"3σ模型已保存为 {filename}")
  112. '''
  113. class OneClassSVMModel:
  114. """One-Class SVM异常检测模型"""
  115. def __init__(self):
  116. self.models = {} # 存储每列的模型
  117. def fit(self, df):
  118. """逐列训练One-Class SVM模型"""
  119. for column in df.columns:
  120. # 准备数据(需要二维数组)
  121. X = df[column].values.reshape(-1, 1)
  122. # 训练模型
  123. model = OneClassSVM(nu=0.05, kernel='rbf', gamma='scale')
  124. model.fit(X)
  125. self.models[column] = model
  126. print(f"已训练 {column} 的One-Class SVM模型")
  127. return self
  128. def predict(self, df):
  129. """预测异常值,-1表示异常,1表示正常"""
  130. results = pd.DataFrame()
  131. for column in df.columns:
  132. if column not in self.models:
  133. raise ValueError(f"没有 {column} 的模型,请先训练")
  134. X = df[column].values.reshape(-1, 1)
  135. results[column] = self.models[column].predict(X)
  136. return results
  137. def save(self, filename="one_class_svm_models.pkl"):
  138. """保存模型"""
  139. joblib.dump(self, filename)
  140. print(f"One-Class SVM模型已保存为 {filename}")
  141. '''
  142. def main():
  143. # 1. 加载并合并数据
  144. print("开始加载数据...")
  145. merged_data = load_and_merge_data()
  146. # 2. 数据归一化
  147. print("\n开始数据归一化...")
  148. normalized_data, scaler = normalize_data(merged_data)
  149. # 3. 训练并保存孤立森林模型
  150. print("\n开始训练孤立森林模型...")
  151. if_model = IsolationForestModel()
  152. if_model.fit(normalized_data)
  153. if_model.save()
  154. # 4. 训练并保存3σ模型
  155. print("\n开始训练3σ模型...")
  156. ts_model = ThreeSigmaModel()
  157. ts_model.fit(normalized_data)
  158. ts_model.save()
  159. # 5. 训练并保存One-Class SVM模型
  160. print("\n开始训练One-Class SVM模型...")
  161. ocsvm_model = OneClassSVMModel()
  162. ocsvm_model.fit(normalized_data)
  163. ocsvm_model.save()
  164. print("\n所有模型训练和保存完成!")
  165. # 使用模型进行预测
  166. sample_data = normalized_data.sample(min(100, len(normalized_data))) # 随机取100个样本或全部样本(如果不足10个)
  167. # 孤立森林预测
  168. if_predictions = if_model.predict(sample_data)
  169. print("\n孤立森林预测结果(-1表示异常,1表示正常):")
  170. print(if_predictions)
  171. # 3σ预测
  172. ts_predictions = ts_model.predict(sample_data)
  173. print("\n3σ预测结果(-1表示异常,1表示正常):")
  174. print(ts_predictions)
  175. # One-Class SVM预测
  176. ocsvm_predictions = ocsvm_model.predict(sample_data)
  177. print("\nOne-Class SVM预测结果(-1表示异常,1表示正常):")
  178. print(ocsvm_predictions)
  179. if __name__ == "__main__":
  180. main()