1)删除异常值
#删除异常值
def outliers_proc(data, col_name, scale=3):
"""
用于清洗异常值,默认用 box_plot(scale=3)进行清洗
:param data: 接收 pandas 数据格式
:param col_name: pandas 列名
:param scale: 尺度
:return:
"""
#data_ser = Train_data
#col_name = 'power'
#scale = 3
#box_scale = 3
#data = Train_data
def box_plot_outliers(data_ser, box_scale):
    """Flag values lying outside the box-plot whiskers.

    The whiskers sit ``box_scale`` times the interquartile range (Q3 - Q1)
    below the first quartile and above the third quartile.

    :param data_ser: data to inspect; must provide ``quantile`` and
        element-wise comparison (a pandas Series is the expected input)
    :param box_scale: multiplier applied to the interquartile range
    :return: ``((rule_low, rule_up), (val_low, val_up))`` where
        ``rule_low`` / ``rule_up`` are the element-wise results of
        ``data_ser < val_low`` / ``data_ser > val_up`` and
        ``val_low`` / ``val_up`` are the lower / upper bounds
    """
    # Compute each quartile once and reuse it for both the spread and the bounds.
    q1 = data_ser.quantile(0.25)
    q3 = data_ser.quantile(0.75)

    # Whisker length: the interquartile range scaled by box_scale
    # (the factor is whatever the caller passes, not a fixed 1.5).
    whisker = box_scale * (q3 - q1)

    # Bounds of the range regarded as normal.
    val_low = q1 - whisker
    val_up = q3 + whisker

    # Masks marking the values that fall outside the normal range.
    rule_low = (data_ser < val_low)
    rule_up = (data_ser > val_up)
    return (rule_low, rule_up), (val_low, val_up)
data_n = data.copy()
data_series = data_n[col_name]
rule, value = box_plot_outliers(data_series, box_scale=scale)
#取出异常值样本的编号
index = np.arange(data_series.shape[0])[rule[0] | rule[1]]
print(