#!encoding:utf8
import pandas as pd
from decimal import Decimal, InvalidOperation
from numpy import float128


class Base():
    """Stateless pandas helpers shared by the data-cleaning classes."""

    @staticmethod
    def describe(series):
        """Return the summary statistics pandas computes for ``series``."""
        return series.describe(include='all')

    @staticmethod
    def to_dataframe(data, target_item, dtype):
        """Build a two-column DataFrame from ``(value, htime)`` records.

        Args:
            data: iterable of indexable records; ``record[0]`` is the value
                and ``record[1]`` is stored in the ``'htime'`` column.
            target_item: name of the value column in the result.
            dtype: dtype passed to ``pd.Series`` for the value column. When
                it is ``str`` the values are stored untouched; otherwise each
                value is parsed with ``Decimal`` first.

        Returns:
            A DataFrame with the columns ``target_item`` and ``'htime'``.
        """
        values = []
        htimes = []
        for record in data:
            raw = record[0]
            if dtype == str:
                value = raw
            else:
                try:
                    value = Decimal(raw)
                except (InvalidOperation, TypeError, ValueError):
                    # Unparsable or missing input becomes a missing value
                    # instead of aborting the whole conversion.
                    value = None
            values.append(value)
            htimes.append(record[1])
        value_series = pd.Series(values, dtype=dtype)
        htime_series = pd.Series(htimes)
        return pd.DataFrame({target_item: value_series, 'htime': htime_series})

    @staticmethod
    def outlier_for_std(dataframe, source_item, bit=2) -> pd.DataFrame:
        """Find outliers using the standard deviation.

        A row is an outlier when its ``source_item`` value lies more than
        ``bit`` standard deviations away from the column mean.

        Returns:
            The rows of ``dataframe`` that are outliers (original index kept).
        """
        column = dataframe[source_item]
        mean, std = column.mean(), column.std()
        upb, lob = mean + std * bit, mean - std * bit
        return dataframe[(column < lob) | (column > upb)]

    @staticmethod
    def outlier_for_iqr(dataframe, source_item, bit=1.5) -> pd.DataFrame:
        """Find outliers using the interquartile range (IQR).

        A row is an outlier when its ``source_item`` value is below
        ``Q1 - bit * IQR`` or above ``Q3 + bit * IQR``.

        Returns:
            The rows of ``dataframe`` that are outliers (original index kept).
        """
        column = dataframe[source_item]
        q1, q3 = column.quantile(0.25), column.quantile(0.75)
        iqr = q3 - q1
        upb, lob = q3 + iqr * bit, q1 - iqr * bit
        return dataframe[(column < lob) | (column > upb)]

    @staticmethod
    def fill_value_by_na(dataframe, source_item, fill_way):
        """Compute the value used to fill missing entries of a column.

        Args:
            dataframe: frame holding the column.
            source_item: name of the column to summarise.
            fill_way: one of ``'mean'``, ``'median'``, ``'mode'``, ``'min'``
                or ``'max'``.

        Returns:
            The requested statistic, or ``None`` when ``fill_way`` is not
            recognised or the column has no mode.
        """
        column = dataframe[source_item]
        if fill_way == 'mode':
            modes = column.mode()
            # mode() yields an empty Series when there is nothing to count.
            return modes.iloc[0] if len(modes) >= 1 else None
        reducers = {
            'mean': column.mean,
            'median': column.median,
            'min': column.min,
            'max': column.max,
        }
        reducer = reducers.get(fill_way)
        return reducer() if reducer is not None else None


class DCTmp(Base):
    """Cleaning pipeline driven by a config object.

    Steps as implemented by ``clean``:

    1. (placeholder) load ``range_na`` and set the range of NA values.
    2. If ``config.dropna`` is true, drop the rows whose source column is
       missing.
    3. Otherwise, if ``config.fillna`` is true, compute the fill value
       selected by ``config.fillna_way`` and fill the missing entries of the
       source column with it.
    4. If ``config.drop_solitude`` is true, detect outliers with the
       standard-deviation rule and drop them; otherwise leave them in place.
    """

    def clean(self, config, describe, dataframe) -> None:
        """Clean ``dataframe`` in place according to ``config``.

        Args:
            config: object exposing ``dropna``, ``fillna``, ``fillna_way``,
                ``source_item`` and ``drop_solitude``.
            describe: not used by this implementation.
            dataframe: frame to clean; it is mutated in place.
        """
        column = config.source_item

        if config.dropna:
            # Drop rows whose source column is missing.
            dataframe.dropna(subset=[column], inplace=True)
        elif config.fillna:
            # Compute the fill value and apply it to the source column only.
            fill_value = Base.fill_value_by_na(
                dataframe, column, config.fillna_way)
            if fill_value is not None:
                dataframe.fillna(value={column: fill_value}, inplace=True)

        if config.drop_solitude:
            # Identify the outliers, then remove them by index.
            outliers = Base.outlier_for_std(dataframe, column)
            if len(outliers) > 0:
                dataframe.drop(index=outliers.index, inplace=True)