通过可视化（散点图、直方图与核密度曲线、箱线图）观察数据中是否存在异常值
import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
import seaborn as sns

# Sample data: 1000 draws from a standard normal distribution, shifted by +10,
# stored as a single-column DataFrame named "value".
data = np.random.randn(1000) + 10
s = pd.DataFrame(data=data, columns=["value"])

# One figure with three stacked panels, each giving a different view of the
# same column so that unusual values can be spotted visually.
fig = plt.figure(figsize=(10, 6))

# Panel 1: scatter of each value against its row index.
ax1 = fig.add_subplot(3, 1, 1)
ax1.scatter(s.index, s.values)
plt.grid()

# Panel 2: histogram with a kernel density estimate on a secondary y axis.
ax2 = fig.add_subplot(3, 1, 2)
s.hist(bins=30, alpha=0.5, ax=ax2)
s.plot(kind="kde", secondary_y=True, ax=ax2)
# Deliberately left as a pyplot call: it acts on whatever pyplot considers the
# current axes after the two plotting calls above.
plt.grid()

# Panel 3: horizontal box plot. The axes get their own name (instead of
# rebinding ax2) and are passed explicitly rather than relying on pyplot's
# implicit "current axes".
ax3 = fig.add_subplot(3, 1, 3)
sns.boxplot(data=s.values, orient="h", ax=ax3)
<matplotlib.axes._subplots.AxesSubplot at 0x1ac493d0370>
定义一个基于箱线图（四分位距 IQR）规则、以量化方式查找并剔除异常值的函数
def outliers(data, col_name, scale=3):
    """Remove box-plot (IQR rule) outliers of one column and report on them.

    A value is treated as an outlier when it is strictly below
    ``Q1 - scale * (Q3 - Q1)`` or strictly above ``Q3 + scale * (Q3 - Q1)``,
    where Q1 and Q3 are the 0.25 and 0.75 quantiles of the column.

    The function prints how many rows were removed, how many remain, and a
    ``describe()`` summary of the low-side and high-side outliers. It then
    draws two box plots side by side: the original column (left) and the
    column after removal (right).

    Args:
        data: DataFrame to clean. It is not modified; a copy is filtered.
        col_name: Name of the column to examine.
        scale: Multiplier applied to the interquartile range to set the
            distance of the fences from the quartiles.

    Returns:
        A copy of ``data`` without the outlier rows, with its index reset to
        0..n-1.
    """

    def box_plot_outliers(data_ser, box_scale):
        """Return ((low_mask, high_mask), (low_fence, high_fence))."""
        q1 = data_ser.quantile(0.25)
        q3 = data_ser.quantile(0.75)
        margin = box_scale * (q3 - q1)
        val_low = q1 - margin
        val_up = q3 + margin
        return (data_ser < val_low, data_ser > val_up), (val_low, val_up)

    data_n = data.copy()
    data_series = data_n[col_name]
    (rule_low, rule_up), _ = box_plot_outliers(data_series, box_scale=scale)

    # Positional boolean mask of every row outside either fence.
    is_outlier = (rule_low | rule_up).to_numpy()
    print("删除了: {} 个数据".format(int(is_outlier.sum())))

    # Actually drop the flagged rows. Filtering with a positional mask (rather
    # than dropping by label) keeps this correct for any index labels.
    data_n = data_n.loc[~is_outlier].reset_index(drop=True)
    print("剩余: {} 个数据".format(data_n.shape[0]))

    low_outliers = data_series[rule_low]
    print("小于下边缘线的数据详细:")
    print(low_outliers.describe())

    high_outliers = data_series[rule_up]
    print("大于上边缘线的数据详细:")
    print(high_outliers.describe())

    # Before / after comparison of the column's distribution.
    _, axes = plt.subplots(1, 2, figsize=(10, 7))
    sns.boxplot(y=data[col_name], data=data, ax=axes[0])
    sns.boxplot(y=data_n[col_name], data=data_n, ax=axes[1])
    return data_n
# Clean the "value" column of the sample frame, with fences at 3x the IQR.
data_new = outliers(data=s, col_name="value", scale=3)
删除了: 0 个数据
剩余: 1000 个数据
小于下边缘线的数据详细:
count 0.0
mean NaN
std NaN
min NaN
25% NaN
50% NaN
75% NaN
max NaN
Name: value, dtype: float64
大于上边缘线的数据详细:
count 0.0
mean NaN
std NaN
min NaN
25% NaN
50% NaN
75% NaN
max NaN
Name: value, dtype: float64