123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124 |
- #!encoding:utf8
- import pandas as pd
- from decimal import Decimal,InvalidOperation
- from numpy import float128
class Base():
    '''
    Stateless pandas helpers shared by the data-cleaning classes.

    Every method is a ``@staticmethod`` operating on the Series/DataFrame
    passed in; nothing is stored on the class.
    '''

    @staticmethod
    def describe(series):
        '''Return ``series.describe(include='all')``.'''
        return series.describe(include='all')

    @staticmethod
    def to_dataframe(data, target_item, dtype):
        '''
        Build a two-column DataFrame from an iterable of indexable rows.

        ``row[0]`` becomes the value stored under ``target_item`` and
        ``row[1]`` is stored under the ``'htime'`` column.

        When ``dtype`` is ``str`` the raw value is kept unchanged.  Otherwise
        the value is parsed with ``Decimal``; a value that cannot be parsed
        (``InvalidOperation``, or ``TypeError``/``ValueError`` for
        unsupported inputs such as ``None``) is stored as ``None`` so a
        single bad row neither aborts the load nor reuses the previous row's
        value.  The value column is finally built with
        ``pd.Series(values, dtype=dtype)``.
        '''
        keep_raw = dtype == str
        values = []
        htimes = []
        for row in data:
            raw = row[0]
            if keep_raw:
                value = raw
            else:
                try:
                    value = Decimal(raw)
                except (InvalidOperation, TypeError, ValueError):
                    value = None
            values.append(value)
            htimes.append(row[1])
        return pd.DataFrame({
            target_item: pd.Series(values, dtype=dtype),
            'htime': pd.Series(htimes),
        })

    @staticmethod
    def outlier_for_std(dataframe, source_item, bit=2) -> pd.DataFrame:
        '''
        Find outliers using the standard deviation.

        Returns the rows of ``dataframe`` whose ``source_item`` value lies
        more than ``bit`` standard deviations below or above the column mean.
        '''
        column = dataframe[source_item]
        mean, std = column.mean(), column.std()
        upper, lower = mean + std * bit, mean - std * bit
        return dataframe[(column < lower) | (column > upper)]

    @staticmethod
    def outlier_for_iqr(dataframe, source_item, bit=1.5) -> pd.DataFrame:
        '''
        Find outliers using the interquartile range (IQR).

        Returns the rows of ``dataframe`` whose ``source_item`` value is
        below ``q1 - bit * iqr`` or above ``q3 + bit * iqr``, where ``q1`` and
        ``q3`` are the 0.25 and 0.75 quantiles and ``iqr = q3 - q1``.
        '''
        column = dataframe[source_item]
        q1, q3 = column.quantile(0.25), column.quantile(0.75)
        iqr = q3 - q1
        # The fences are anchored on the quartiles themselves, not the mean.
        upper, lower = q3 + iqr * bit, q1 - iqr * bit
        return dataframe[(column < lower) | (column > upper)]

    @staticmethod
    def fill_value_by_na(dataframe, source_item, fill_way):
        '''
        Compute the value used to replace missing entries of a column.

        ``fill_way`` selects the statistic taken over
        ``dataframe[source_item]``: ``'mean'``, ``'median'``, ``'mode'``
        (first mode), ``'min'`` or ``'max'``.  Returns ``None`` for any other
        ``fill_way`` and when ``'mode'`` is requested but the column has no
        mode.
        '''
        column = dataframe[source_item]
        if fill_way == 'mode':
            modes = column.mode()
            return None if modes.empty else modes.iloc[0]
        reducers = {
            'mean': column.mean,
            'median': column.median,
            'min': column.min,
            'max': column.max,
        }
        reducer = reducers.get(fill_way)
        return None if reducer is None else reducer()
class DCTmp(Base):
    '''
    In-place cleaning of one DataFrame column.

    Intended flow, as described by the original author:

    1. Load ``range_na`` and set the value range treated as NA.
    2. Is ``dropna`` true?
       2.1 True: drop the rows.
       2.2 False: continue with step 3.
    3. Should outliers be removed?
       3.1 True: outliers must be removed.
           3.1.1 Identify the outliers.
           3.1.2 Drop the outlier rows (end).
       3.2 False: outliers are kept.
    4. Should missing data be filled?
       4.1 False: skip filling and return the data as is (end).
       4.2 True: data must be filled.
           4.2.1 Read the fill method and compute the fill value.
           4.2.2 Fill with ``fillna`` (end).

    NOTE(review): ``clean`` does not implement step 1, and it runs step 4
    (fill) before step 3 (outlier removal) -- confirm which order is
    intended.
    '''

    def clean(self, config, describe, dataframe) -> None:
        '''
        Clean ``dataframe`` in place according to ``config``.

        Reads ``config.dropna``, ``config.fillna``, ``config.fillna_way``,
        ``config.drop_solitude`` and ``config.source_item`` (the column to
        clean).  ``describe`` is accepted but not used by this method.

        Rows with a missing ``source_item`` are dropped when
        ``config.dropna`` is truthy; otherwise, when ``config.fillna`` is
        truthy, missing values are replaced by the statistic named by
        ``config.fillna_way`` (nothing is filled if no value can be
        computed).  Afterwards, when ``config.drop_solitude`` is truthy, rows
        reported by ``Base.outlier_for_std`` are dropped.
        '''
        column = config.source_item

        # step 2 / step 4: dropping and filling are mutually exclusive.
        if config.dropna:
            # step 2.1 -- the label is wrapped in a list because older
            # pandas versions only accept a sequence for ``subset``.
            dataframe.dropna(subset=[column], inplace=True)
        elif config.fillna:
            # step 4.2.1
            fill_value = Base.fill_value_by_na(
                dataframe, column, config.fillna_way)
            if fill_value is not None:
                # step 4.2.2
                dataframe.fillna(value={column: fill_value}, inplace=True)

        # step 3
        if config.drop_solitude:
            # step 3.1.1
            outliers = Base.outlier_for_std(dataframe, column)
            if len(outliers) > 0:
                # step 3.1.2
                dataframe.drop(index=outliers.index, inplace=True)
|