zhanghao 5 месяцев назад
Родитель
Коммит
ceb030f300
Изменён 1 файл: 217 строк добавлено и 217 строк удалено
  1. 217 217
      models/anomaly_detection/detection.py

+ 217 - 217
models/anomaly_detection/detection.py

@@ -1,218 +1,218 @@
-import os
-import pandas as pd
-import numpy as np
-import joblib
-from sklearn.preprocessing import MinMaxScaler
-from sklearn.ensemble import IsolationForest
-from sklearn.svm import OneClassSVM
-
-# 设置中文字体显示
-import matplotlib.pyplot as plt
-plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
-
-# 数据文件夹路径
-data_folder = "datasets_export_xishan"
-
-# 定义要读取的文件和对应的列
-file_info = {
-    "data_export5_{}.csv": ["UF1Per", "UF2Per", "UF3Per", "UF4Per"],
-    "data_export8_{}.csv": ["C.M.RO1_DB@DPT_1", "C.M.RO1_DB@DPT_2", 
-                           "C.M.RO2_DB@DPT_1", "C.M.RO2_DB@DPT_2"],
-    "data_export9_{}.csv": ["C.M.RO3_DB@DPT_1", "C.M.RO3_DB@DPT_2", 
-                           "C.M.RO4_DB@DPT_1", "C.M.RO4_DB@DPT_2"],
-    "data_export11_{}.csv": ["RO1_CSFlow", "RO2_CSFlow", "RO3_CSFlow", "RO4_CSFlow"]
-}
-
-def load_and_merge_data():
-    """加载并合并所有数据文件"""
-    all_data = []
-    
-    # 循环读取每个文件模板和对应的编号1-26
-    for file_template, columns in file_info.items():
-        for i in range(1, 27):
-            # 构建完整的文件路径
-            filename = file_template.format(i)
-            file_path = os.path.join(data_folder, filename)
-            
-            try:
-                # 读取CSV文件的指定列
-                df = pd.read_csv(file_path, usecols=columns)
-                all_data.append(df)
-                print(f"成功读取: {filename}")
-            except Exception as e:
-                print(f"读取文件 {filename} 时出错: {e}")
-    
-    # 合并所有数据
-    if not all_data:
-        raise ValueError("没有成功读取任何数据文件")
-    
-    merged_df = pd.concat(all_data, ignore_index=True)
-    print(f"数据合并完成,总样本数: {len(merged_df)}")
-    return merged_df
-
-def normalize_data(df):
-    """对数据进行归一化处理"""
-    scaler = MinMaxScaler()
-    scaled_data = scaler.fit_transform(df)
-    scaled_df = pd.DataFrame(scaled_data, columns=df.columns)
-    
-    # 保存归一化器
-    joblib.dump(scaler, "scaler.pkl")
-    print("归一化器已保存为 scaler.pkl")
-    
-    return scaled_df, scaler
-
-class IsolationForestModel:
-    """孤立森林异常检测模型"""
-    def __init__(self):
-        self.models = {}  # 存储每列的模型
-    
-    def fit(self, df):
-        """逐列训练孤立森林模型"""
-        for column in df.columns:
-            # 准备数据(需要二维数组)
-            X = df[column].values.reshape(-1, 1)
-            # 训练模型
-            model = IsolationForest(n_estimators=100, contamination='auto', random_state=42)
-            model.fit(X)
-            self.models[column] = model
-            print(f"已训练 {column} 的孤立森林模型")
-        return self
-    
-    def predict(self, df):
-        """预测异常值,-1表示异常,1表示正常"""
-        results = pd.DataFrame()
-        for column in df.columns:
-            if column not in self.models:
-                raise ValueError(f"没有 {column} 的模型,请先训练")
-            
-            X = df[column].values.reshape(-1, 1)
-            results[column] = self.models[column].predict(X)
-        return results
-    
-    def save(self, filename="isolation_forest_models.pkl"):
-        """保存模型"""
-        joblib.dump(self, filename)
-        print(f"孤立森林模型已保存为 {filename}")
-
-class ThreeSigmaModel:
-    """3σ异常检测模型"""
-    def __init__(self):
-        self.stats = {}  # 存储每列的均值和标准差
-    
-    def fit(self, df):
-        """计算每列的均值和标准差"""
-        for column in df.columns:
-            mean = df[column].mean()
-            std = df[column].std()
-            self.stats[column] = (mean, std)
-            print(f"已计算 {column} 的3σ统计量")
-        return self
-    
-    def predict(self, df, n_sigma=3):
-        """预测异常值,-1表示异常,1表示正常"""
-        results = pd.DataFrame()
-        for column in df.columns:
-            if column not in self.stats:
-                raise ValueError(f"没有 {column} 的统计量,请先训练")
-            
-            mean, std = self.stats[column]
-            # 计算上下限
-            lower_bound = mean - n_sigma * std
-            upper_bound = mean + n_sigma * std
-            
-            # 判断异常值
-            is_outlier = (df[column] < lower_bound) | (df[column] > upper_bound)
-            # 转换为-1(异常)和1(正常)
-            results[column] = np.where(is_outlier, -1, 1)
-        return results
-    
-    def save(self, filename="three_sigma_model.pkl"):
-        """保存模型"""
-        joblib.dump(self, filename)
-        print(f"3σ模型已保存为 {filename}")
-
-'''
-class OneClassSVMModel:
-    """One-Class SVM异常检测模型"""
-    def __init__(self):
-        self.models = {}  # 存储每列的模型
-    
-    def fit(self, df):
-        """逐列训练One-Class SVM模型"""
-        for column in df.columns:
-            # 准备数据(需要二维数组)
-            X = df[column].values.reshape(-1, 1)
-            # 训练模型
-            model = OneClassSVM(nu=0.05, kernel='rbf', gamma='scale')
-            model.fit(X)
-            self.models[column] = model
-            print(f"已训练 {column} 的One-Class SVM模型")
-        return self
-    
-    def predict(self, df):
-        """预测异常值,-1表示异常,1表示正常"""
-        results = pd.DataFrame()
-        for column in df.columns:
-            if column not in self.models:
-                raise ValueError(f"没有 {column} 的模型,请先训练")
-            
-            X = df[column].values.reshape(-1, 1)
-            results[column] = self.models[column].predict(X)
-        return results
-    
-    def save(self, filename="one_class_svm_models.pkl"):
-        """保存模型"""
-        joblib.dump(self, filename)
-        print(f"One-Class SVM模型已保存为 {filename}")
-'''
-
-def main():
-    # 1. 加载并合并数据
-    print("开始加载数据...")
-    merged_data = load_and_merge_data()
-    
-    # 2. 数据归一化
-    print("\n开始数据归一化...")
-    normalized_data, scaler = normalize_data(merged_data)
-    
-    # 3. 训练并保存孤立森林模型
-    print("\n开始训练孤立森林模型...")
-    if_model = IsolationForestModel()
-    if_model.fit(normalized_data)
-    if_model.save()
-    
-    # 4. 训练并保存3σ模型
-    print("\n开始训练3σ模型...")
-    ts_model = ThreeSigmaModel()
-    ts_model.fit(normalized_data)
-    ts_model.save()
-    
-    # 5. 训练并保存One-Class SVM模型
-    print("\n开始训练One-Class SVM模型...")
-    ocsvm_model = OneClassSVMModel()
-    ocsvm_model.fit(normalized_data)
-    ocsvm_model.save()
-    
-    print("\n所有模型训练和保存完成!")
-    
-    # 使用模型进行预测
-    sample_data = normalized_data.sample(min(100, len(normalized_data)))  # 随机取100个样本或全部样本(如果不足10个)
-    
-    # 孤立森林预测
-    if_predictions = if_model.predict(sample_data)
-    print("\n孤立森林预测结果(-1表示异常,1表示正常):")
-    print(if_predictions)
-    
-    # 3σ预测
-    ts_predictions = ts_model.predict(sample_data)
-    print("\n3σ预测结果(-1表示异常,1表示正常):")
-    print(ts_predictions)
-    
-    # One-Class SVM预测
-    ocsvm_predictions = ocsvm_model.predict(sample_data)
-    print("\nOne-Class SVM预测结果(-1表示异常,1表示正常):")
-    print(ocsvm_predictions)
-
-if __name__ == "__main__":
+import os
+import pandas as pd
+import numpy as np
+import joblib
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.ensemble import IsolationForest
+from sklearn.svm import OneClassSVM
+
+# Configure matplotlib fonts so that Chinese text can be displayed
+import matplotlib.pyplot as plt
+plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
+
+# Path of the folder that holds the data files
+data_folder = "datasets_export_xishan"
+
+# File-name templates to read, mapped to the columns taken from each file;
+# the {} placeholder is filled with the file number by load_and_merge_data()
+file_info = {
+    "data_export5_{}.csv": ["UF1Per", "UF2Per", "UF3Per", "UF4Per"],
+    "data_export8_{}.csv": ["C.M.RO1_DB@DPT_1", "C.M.RO1_DB@DPT_2", 
+                           "C.M.RO2_DB@DPT_1", "C.M.RO2_DB@DPT_2"],
+    "data_export9_{}.csv": ["C.M.RO3_DB@DPT_1", "C.M.RO3_DB@DPT_2", 
+                           "C.M.RO4_DB@DPT_1", "C.M.RO4_DB@DPT_2"],
+    "data_export11_{}.csv": ["RO1_CSFlow", "RO2_CSFlow", "RO3_CSFlow", "RO4_CSFlow"]
+}
+
+def load_and_merge_data():
+    """Load every CSV file described by file_info and merge them into one DataFrame.
+
+    For each file-name template in file_info, the files numbered 1 to 26 are
+    read from data_folder, keeping only the columns listed for that template.
+    A file that cannot be read is reported with print() and skipped.
+
+    Returns:
+        pandas.DataFrame: every frame that was read, concatenated under a
+        fresh integer index.
+
+    Raises:
+        ValueError: if not a single file could be read.
+    """
+    all_data = []
+    
+    # Loop over every file template and the file numbers 1-26
+    for file_template, columns in file_info.items():
+        for i in range(1, 27):
+            # Build the full file path
+            filename = file_template.format(i)
+            file_path = os.path.join(data_folder, filename)
+            
+            try:
+                # Read only the listed columns from the CSV file
+                df = pd.read_csv(file_path, usecols=columns)
+                all_data.append(df)
+                print(f"成功读取: {filename}")
+            except Exception as e:
+                # Best effort: report the failure and move on to the next file
+                print(f"读取文件 {filename} 时出错: {e}")
+    
+    # Merge all the data; fail loudly when nothing at all was read
+    if not all_data:
+        raise ValueError("没有成功读取任何数据文件")
+    
+    # NOTE(review): the templates list different columns and pd.concat stacks
+    # the frames row-wise, so each row is NaN in the columns its own file did
+    # not provide -- confirm the downstream models are meant to see those NaNs.
+    merged_df = pd.concat(all_data, ignore_index=True)
+    print(f"数据合并完成,总样本数: {len(merged_df)}")
+    return merged_df
+
+def normalize_data(df):
+    """Scale df with a MinMaxScaler and save the fitted scaler to disk.
+
+    Args:
+        df: DataFrame to scale; its column names are reused for the result.
+
+    Returns:
+        tuple: (scaled_df, scaler) -- the scaled values as a new DataFrame and
+        the fitted MinMaxScaler, which is also written to "scaler.pkl".
+    """
+    scaler = MinMaxScaler()
+    scaled_data = scaler.fit_transform(df)
+    scaled_df = pd.DataFrame(scaled_data, columns=df.columns)
+    
+    # Persist the fitted scaler
+    joblib.dump(scaler, "scaler.pkl")
+    print("归一化器已保存为 scaler.pkl")
+    
+    return scaled_df, scaler
+
+class IsolationForestModel:
+    """Isolation Forest anomaly detector that trains one model per column."""
+    def __init__(self):
+        self.models = {}  # fitted IsolationForest stored under each column name
+    
+    def fit(self, df):
+        """Train a separate Isolation Forest on each column of df.
+
+        Args:
+            df: DataFrame whose columns are each fitted independently.
+
+        Returns:
+            self, so that calls can be chained.
+        """
+        for column in df.columns:
+            # Prepare the data (a 2-D array is required)
+            # NOTE(review): NaN values in the column are passed through
+            # unchanged -- confirm the estimator accepts them.
+            X = df[column].values.reshape(-1, 1)
+            # Train the model
+            model = IsolationForest(n_estimators=100, contamination='auto', random_state=42)
+            model.fit(X)
+            self.models[column] = model
+            print(f"已训练 {column} 的孤立森林模型")
+        return self
+    
+    def predict(self, df):
+        """Predict anomalies column by column; -1 means anomaly, 1 means normal.
+
+        Args:
+            df: DataFrame in which every column has a trained model.
+
+        Returns:
+            pandas.DataFrame: one column of predictions per column of df.
+
+        Raises:
+            ValueError: if df contains a column that has no trained model.
+        """
+        results = pd.DataFrame()
+        for column in df.columns:
+            if column not in self.models:
+                raise ValueError(f"没有 {column} 的模型,请先训练")
+            
+            X = df[column].values.reshape(-1, 1)
+            results[column] = self.models[column].predict(X)
+        return results
+    
+    def save(self, filename="isolation_forest_models.pkl"):
+        """Save this whole object to filename with joblib."""
+        joblib.dump(self, filename)
+        print(f"孤立森林模型已保存为 {filename}")
+
+class ThreeSigmaModel:
+    """3-sigma anomaly detector based on each column's mean and standard deviation."""
+    def __init__(self):
+        self.stats = {}  # (mean, std) tuple stored under each column name
+    
+    def fit(self, df):
+        """Compute and store the mean and standard deviation of each column.
+
+        Args:
+            df: DataFrame whose columns are summarised independently.
+
+        Returns:
+            self, so that calls can be chained.
+        """
+        for column in df.columns:
+            mean = df[column].mean()
+            std = df[column].std()
+            self.stats[column] = (mean, std)
+            print(f"已计算 {column} 的3σ统计量")
+        return self
+    
+    def predict(self, df, n_sigma=3):
+        """Predict anomalies column by column; -1 means anomaly, 1 means normal.
+
+        Args:
+            df: DataFrame in which every column has stored statistics.
+            n_sigma: number of standard deviations allowed on either side of
+                the mean before a value is flagged.
+
+        Returns:
+            pandas.DataFrame: one column of -1/1 labels per column of df.
+
+        Raises:
+            ValueError: if df contains a column that has no stored statistics.
+        """
+        results = pd.DataFrame()
+        for column in df.columns:
+            if column not in self.stats:
+                raise ValueError(f"没有 {column} 的统计量,请先训练")
+            
+            mean, std = self.stats[column]
+            # Compute the lower and upper bounds
+            lower_bound = mean - n_sigma * std
+            upper_bound = mean + n_sigma * std
+            
+            # Flag the values that fall outside the bounds
+            is_outlier = (df[column] < lower_bound) | (df[column] > upper_bound)
+            # Convert to -1 (anomaly) and 1 (normal)
+            results[column] = np.where(is_outlier, -1, 1)
+        return results
+    
+    def save(self, filename="three_sigma_model.pkl"):
+        """Save this whole object to filename with joblib."""
+        joblib.dump(self, filename)
+        print(f"3σ模型已保存为 {filename}")
+
+# NOTE(review): the OneClassSVMModel class below is disabled -- it sits inside
+# a bare string literal, so the name is never defined when the module runs,
+# while main() still instantiates it.
+'''
+class OneClassSVMModel:
+    """One-Class SVM异常检测模型"""
+    def __init__(self):
+        self.models = {}  # 存储每列的模型
+    
+    def fit(self, df):
+        """逐列训练One-Class SVM模型"""
+        for column in df.columns:
+            # 准备数据(需要二维数组)
+            X = df[column].values.reshape(-1, 1)
+            # 训练模型
+            model = OneClassSVM(nu=0.05, kernel='rbf', gamma='scale')
+            model.fit(X)
+            self.models[column] = model
+            print(f"已训练 {column} 的One-Class SVM模型")
+        return self
+    
+    def predict(self, df):
+        """预测异常值,-1表示异常,1表示正常"""
+        results = pd.DataFrame()
+        for column in df.columns:
+            if column not in self.models:
+                raise ValueError(f"没有 {column} 的模型,请先训练")
+            
+            X = df[column].values.reshape(-1, 1)
+            results[column] = self.models[column].predict(X)
+        return results
+    
+    def save(self, filename="one_class_svm_models.pkl"):
+        """保存模型"""
+        joblib.dump(self, filename)
+        print(f"One-Class SVM模型已保存为 {filename}")
+'''
+
+def main():
+    """Load and normalise the data, train and save each model, then print sample predictions."""
+    # 1. Load and merge the data
+    print("开始加载数据...")
+    merged_data = load_and_merge_data()
+    
+    # 2. Normalise the data
+    print("\n开始数据归一化...")
+    normalized_data, scaler = normalize_data(merged_data)
+    
+    # 3. Train and save the Isolation Forest model
+    print("\n开始训练孤立森林模型...")
+    if_model = IsolationForestModel()
+    if_model.fit(normalized_data)
+    if_model.save()
+    
+    # 4. Train and save the 3-sigma model
+    print("\n开始训练3σ模型...")
+    ts_model = ThreeSigmaModel()
+    ts_model.fit(normalized_data)
+    ts_model.save()
+    
+    # 5. Train and save the One-Class SVM model
+    print("\n开始训练One-Class SVM模型...")
+    # NOTE(review): in the code shown here OneClassSVMModel exists only inside
+    # a string literal, so the next statement raises NameError and nothing
+    # below it runs.
+    ocsvm_model = OneClassSVMModel()
+    ocsvm_model.fit(normalized_data)
+    ocsvm_model.save()
+    
+    print("\n所有模型训练和保存完成!")
+    
+    # Use the models for prediction
+    sample_data = normalized_data.sample(min(100, len(normalized_data)))  # randomly take 100 samples, or all of them when fewer than 100 exist
+    
+    # Isolation Forest predictions
+    if_predictions = if_model.predict(sample_data)
+    print("\n孤立森林预测结果(-1表示异常,1表示正常):")
+    print(if_predictions)
+    
+    # 3-sigma predictions
+    ts_predictions = ts_model.predict(sample_data)
+    print("\n3σ预测结果(-1表示异常,1表示正常):")
+    print(ts_predictions)
+    
+    # One-Class SVM predictions
+    ocsvm_predictions = ocsvm_model.predict(sample_data)
+    print("\nOne-Class SVM预测结果(-1表示异常,1表示正常):")
+    print(ocsvm_predictions)
+
+if __name__ == "__main__":
     main()