# Python数据处理之异常值删除(3σ准则)
# 导入库
import numpy as np
import pandas as pd
from matplotlib import pyplot
from pandas.core.frame import DataFrame
import os
from pandas.plotting import scatter_matrix
import random
from pylab import *
# Select SimHei as the sans-serif font for matplotlib figures
# (a CJK font, presumably so that Chinese labels render correctly).
mpl.rcParams.update({'font.sans-serif': ['SimHei']})

# Query the current working directory; the value is discarded, so this only
# has a visible effect in an interactive session.
os.getcwd()

# Switch to the directory that holds the workbook. Relative paths used later
# in the script are resolved against this directory.
os.chdir(r'F:\数据\数据分析')

# Load worksheet 'Sheet0' of the workbook into a DataFrame.
data = pd.read_excel('mydata.xlsx', sheet_name='Sheet0')

# Quick look at the first ten rows and the (rows, columns) shape; both results
# are discarded when the file is run as a plain script.
data.head(10)
data.shape
# Remove outlier rows using the 3-sigma rule: a row is discarded when any of
# its columns lies more than three standard deviations away from that
# column's mean.
# NOTE(review): assumes every column of `data` is numeric - confirm against
# the workbook.
mean = data.mean()
std = data.std()
range_low = mean - 3 * std
range_high = mean + 3 * std

# Compare every cell with the bounds of its own column (the bounds are aligned
# on the column labels) and flag rows that have at least one value out of
# range. This works for any number of rows and columns instead of relying on
# hard-coded sizes. Comparisons against NaN evaluate to False, so missing
# values never cause a row to be flagged.
outlier_mask = ((data < range_low) | (data > range_high)).any(axis=1)

# Report the position of each removed row together with the running count.
num = 0
for i in np.flatnonzero(outlier_mask.values):
    num += 1
    print('i', i)
    print('num:', num)

# Keep only the rows inside the bounds. Filtering with the boolean mask does
# not depend on the index labels, and the surviving rows keep their original
# labels.
new_data = data.loc[~outlier_mask]
data = new_data
# --- Inspect the cleaned data ---

# Summary statistics; the result is discarded when run as a plain script.
data.describe()

# One box plot per column, arranged on a 4 x 6 grid with independent axes.
data.plot.box(
    subplots=True,
    layout=(4, 6),
    sharex=False,
    sharey=False,
)
pyplot.show()

# Histogram of every column.
data.hist()
pyplot.show()

# Scatter-matrix of all column pairs.
scatter_matrix(data)
pyplot.show()

# --- Save the cleaned data ---

# Write the result to worksheet 'Sheet0' of a new workbook, without the index
# column.
data.to_excel(
    'qinxidata.xlsx',
    sheet_name='Sheet0',
    index=False,
)