items.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124
  1. #!encoding:utf8
  2. import pandas as pd
  3. from decimal import Decimal,InvalidOperation
  4. from numpy import float128
  5. class Base():
  6. @staticmethod
  7. def describe(series):
  8. return series.describe(include='all')
  9. @staticmethod
  10. def to_dataframe(data, target_item, dtype):
  11. s1v = []
  12. htimev = []
  13. for one in data:
  14. if dtype == str:
  15. s1v.append(one[0])
  16. htimev.append(one[1])
  17. else:
  18. try:
  19. v = Decimal(one[0])
  20. except InvalidOperation:
  21. v = None
  22. finally:
  23. s1v.append(v)
  24. htimev.append(one[1])
  25. s1 = pd.Series(s1v, dtype=dtype)
  26. s2 = pd.Series(htimev)
  27. return pd.DataFrame({target_item: s1, 'htime': s2})
  28. @staticmethod
  29. def outlier_for_std(dataframe, source_item, bit=2) -> pd.DataFrame:
  30. '''
  31. 通过标准方差来计算离群点
  32. '''
  33. mean, std = dataframe[source_item].mean(), dataframe[source_item].std()
  34. upb, lob = mean + std * bit, mean - std * bit
  35. outliers = df[(df[source_item] < lob) | (df[source_item] > upb)]
  36. return outliers
  37. @staticmethod
  38. def outlier_for_iqr(dataframe, source_item, bit=1.5) -> pd.DataFrame:
  39. '''
  40. 通过四分位极差法来计算离群点
  41. '''
  42. q1, q3 = dataframe[source_item].quantile(0.25), dataframe[source_item].quantile(0.75)
  43. iqr = q3 - q1
  44. upb, lob = mean + iqr * bit, mean - iqr * bit
  45. outliers = df[(df[source_item] < lob) | (df[source_item] > upb)]
  46. return outliers
  47. @staticmethod
  48. def fill_value_by_na(dataframe, source_item, fill_way):
  49. if fill_way == 'mean':
  50. return dataframe[source_item].mean()
  51. if fill_way == 'median':
  52. return dataframe[source_item].median()
  53. if fill_way == 'mode':
  54. if len(dataframe[source_item].mode()) >= 1:
  55. return dataframe[source_item].mode()[0]
  56. if fill_way == 'median':
  57. return dataframe[source_item].median()
  58. if fill_way == 'min':
  59. return dataframe[source_item].min()
  60. if fill_way == 'max':
  61. return dataframe[source_item].max()
  62. return None
  63. class DCTmp(Base):
  64. '''
  65. 1. range_na 加载,并设置 na的取值范围
  66. 2. dropna 是否为 True,
  67. 2.1 True 则进行删除操作
  68. 2.2 False 则执行 3
  69. 3. 是否删除离群点
  70. 3.1 True时表示需要删除离群点
  71. 3.1.1 识别离群点
  72. 3.1.2 删除离群点数据 end
  73. 3.2 False 表示不删除离群点
  74. 4. 是否进行数据填充
  75. 4.1 False 放弃填充,原样返回 end
  76. 4.2 True 需要填充数据
  77. 4.2.1 读取填充方式,并计算出待填充的值
  78. 4.2.2 使用fillna进行填充 end
  79. '''
  80. def clean(self, config, describe, dataframe) -> None:
  81. # step 1
  82. # step 2
  83. if config.dropna:
  84. # step 2.1
  85. dataframe.dropna(subset=config.source_item, inplace=True)
  86. # step 4
  87. elif config.fillna:
  88. # step 4.2.1
  89. fill_value = Base.fill_value_by_na(dataframe, config.fillna_way)
  90. if fill_value is not None:
  91. # step 4.2.2
  92. dataframe.fillna(value={config.source_item:fill_value}, inplace=True)
  93. # step 3
  94. if config.drop_solitude:
  95. # step 3.1.1
  96. outliers = Base.outlier_for_std(dataframe, config.source_item)
  97. if len(outliers) > 0:
  98. # step 3.1.2
  99. dataframe.drop(index=outliers.index, inplace=True)
  100. else:
  101. # 3.2
  102. pass