第三章 特征增强:清洗数据
import os
# Show the entries of the current working directory.
os.listdir('.')
['.config', 'sample_data']
!git clone https://github.com/********/Feature-Engineering-Made-Easy.git
Cloning into 'Feature-Engineering-Made-Easy'...
remote: Enumerating objects: 63, done.[K
remote: Total 63 (delta 0), reused 0 (delta 0), pack-reused 63[K
Unpacking objects: 100% (63/63), done.
Checking out files: 100% (62/62), done.
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
# Switch matplotlib to the 'fivethirtyeight' style sheet.
plt.style.use('fivethirtyeight')
/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
import pandas.util.testing as tm
# First attempt at loading the data: no column names are supplied, so
# pandas takes the first row of the file as the header.
pima = pd.read_csv('/content/Feature-Engineering-Made-Easy/data/pima.data')
pima.head()
6 148 72 35 0 33.6 0.627 50 1 0 1 85 66 29 0 26.6 0.351 31 0 1 8 183 64 0 0 23.3 0.672 32 1 2 1 89 66 23 94 28.1 0.167 21 0 3 0 137 40 35 168 43.1 2.288 33 1 4 5 116 74 0 0 25.6 0.201 30 0
# Labels for the nine columns of the Pima data file; the last one,
# 'onset_diabetes', is the outcome column.
pima_column_names = [
    'times_pregnant',
    'plasma_glucose_concentration',
    'diastolic_blood_pressure',
    'triceps_thickness',
    'serum_insulin',
    'bmi',
    'pedigree_function',
    'age',
    'onset_diabetes',
]
# Reload the file, this time passing explicit column names so that the
# first row is kept as data instead of being used as the header.
pima = pd.read_csv(
    '/content/Feature-Engineering-Made-Easy/data/pima.data',
    names=pima_column_names,
)
pima.head()
times_pregnant plasma_glucose_concentration diastolic_blood_pressure triceps_thickness serum_insulin bmi pedigree_function age onset_diabetes 0 6 148 72 35 0 33.6 0.627 50 1 1 1 85 66 29 0 26.6 0.351 31 0 2 8 183 64 0 0 23.3 0.672 32 1 3 1 89 66 23 94 28.1 0.167 21 0 4 0 137 40 35 168 43.1 2.288 33 1
# Share of rows in each outcome class (normalize=True returns fractions
# rather than raw counts).
pima['onset_diabetes'].value_counts(normalize=True)
0 0.651042
1 0.348958
Name: onset_diabetes, dtype: float64
# Overlay the distribution of one feature for the two outcome classes:
# rows with onset_diabetes == 0 versus rows with onset_diabetes == 1.
col = 'plasma_glucose_concentration'

plt.hist(pima.loc[pima['onset_diabetes'] == 0, col], alpha=0.5, label='non_diabetes')
plt.hist(pima.loc[pima['onset_diabetes'] == 1, col], alpha=0.5, label='diabetes')

plt.xlabel(col)
plt.ylabel('Frequency')
plt.title('Histogram of {}'.format(col))
plt.legend(loc='upper right')
plt.show()
# Draw one figure per feature, overlaying the histogram of the rows with
# onset_diabetes == 0 and the rows with onset_diabetes == 1 (10 bins each).
# The loop body must be indented: as flat top-level statements the `for`
# is a syntax error and the plotting calls would not run once per feature.
for col in [
    'times_pregnant',
    'plasma_glucose_concentration',
    'diastolic_blood_pressure',
    'triceps_thickness',
    'serum_insulin',
    'bmi',
    'pedigree_function',
    'age',
]:
    plt.hist(pima[pima['onset_diabetes'] == 0][col], 10, alpha=0.5, label='non_diabetes')
    plt.hist(pima[pima['onset_diabetes'] == 1][col], 10, alpha=0.5, label='diabetes')
    plt.legend(loc='upper right')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.title('Histogram of {}'.format(col))
    # Show inside the loop so that every feature ends up in its own figure.
    plt.show()
import seaborn as sns
# Heat map of the pairwise correlations between the columns of pima.
sns.heatmap(pima.corr())
<matplotlib.axes._subplots.AxesSubplot at 0x7f5e2606fc50>
# Number of null entries in each column.
pima.isnull().sum()
times_pregnant 0
plasma_glucose_concentration 0
diastolic_blood_pressure 0
triceps_thickness 0
serum_insulin 0
bmi 0
pedigree_function 0
age 0
onset_diabetes 0
dtype: int64
# (rows, columns) of the data set.
pima.shape
(768, 9)
# Fraction of rows per outcome class.
pima['onset_diabetes'].value_counts(normalize=True)
0 0.651042
1 0.348958
Name: onset_diabetes, dtype: float64
# Summary statistics (count, mean, std, min, quartiles, max) per column.
pima.describe()
times_pregnant plasma_glucose_concentration diastolic_blood_pressure triceps_thickness serum_insulin bmi pedigree_function age onset_diabetes count 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578 0.471876 33.240885 0.348958 std 3.369578 31.972618 19.355807 15.952218 115.244002 7.884160 0.331329 11.760232 0.476951 min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.078000 21.000000 0.000000 25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000 0.243750 24.000000 0.000000 50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000 0.372500 29.000000 0.000000 75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000 0.626250 41.000000 1.000000 max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000 2.420000 81.000000 1.000000
数据中的缺失值被记录为0,下面把这些0替换为空值。
# Columns in which this cell treats a recorded 0 as a missing value.
# Each column is listed once; naming 'serum_insulin' twice only repeated
# the same replacement.
columns = [
    'serum_insulin',
    'bmi',
    'plasma_glucose_concentration',
    'diastolic_blood_pressure',
    'triceps_thickness',
]

for col in columns:
    # Replace every 0 with NaN so that pandas counts the entry as null.
    pima[col] = pima[col].replace(0, np.nan)

# Null count per column after the replacement.
pima.isnull().sum()
times_pregnant 0
plasma_glucose_concentration 5
diastolic_blood_pressure 35
triceps_thickness 227
serum_insulin 374
bmi 11
pedigree_function 0
age 0
onset_diabetes 0
dtype: int64
# Peek at the first rows after the replacement.
pima.head()
times_pregnant plasma_glucose_concentration diastolic_blood_pressure triceps_thickness serum_insulin bmi pedigree_function age onset_diabetes 0 6 148.0 72.0 35.0 NaN 33.6 0.627 50 1 1 1 85.0 66.0 29.0 NaN 26.6 0.351 31 0 2 8 183.0 64.0 NaN NaN 23.3 0.672 32 1 3 1 89.0 66.0 23.0 94.0 28.1 0.167 21 0 4 0 137.0 40.0 35.0 168.0 43.1 2.288 33 1
# Summary statistics again, now that the data contains NaN entries.
pima.describe()
times_pregnant plasma_glucose_concentration diastolic_blood_pressure triceps_thickness serum_insulin bmi pedigree_function age onset_diabetes count 768.000000 763.000000 733.000000 541.000000 394.000000 757.000000 768.000000 768.000000 768.000000 mean 3.845052 121.686763 72.405184 29.153420 155.548223 32.457464 0.471876 33.240885 0.348958 std 3.369578 30.535641 12.382158 10.476982 118.775855 6.924988 0.331329 11.760232 0.476951 min 0.000000 44.000000 24.000000 7.000000 14.000000 18.200000 0.078000 21.000000 0.000000 25% 1.000000 99.000000 64.000000 22.000000 76.250000 27.500000 0.243750 24.000000 0.000000 50% 3.000000 117.000000 72.000000 29.000000 125.000000 32.300000 0.372500 29.000000 0.000000 75% 6.000000 141.000000 80.000000 36.000000 190.000000 36.600000 0.626250 41.000000 1.000000 max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000 2.420000 81.000000 1.000000
填充缺失值
# Row labels of the records whose plasma glucose value is null.
empty_plasma_index = pima.loc[pima['plasma_glucose_concentration'].isna()].index
empty_plasma_index
Int64Index([75, 182, 342, 349, 502], dtype='int64')
# Look up the plasma glucose value of exactly those rows.
pima.loc[empty_plasma_index, 'plasma_glucose_concentration']
75 NaN
182 NaN
342 NaN
349 NaN
502 NaN
Name: plasma_glucose_concentration, dtype: float64
from sklearn. impute import SimpleImputer
# Mean imputation: learn the per-column means from pima, then fill the
# missing entries of pima with them.
imputer = SimpleImputer(strategy='mean')
pima_imputed = imputer.fit(pima).transform(pima)

# Inspect what kind of object the imputer handed back.
type(pima_imputed)
numpy.ndarray
# Wrap the imputed values in a DataFrame again, with the column names.
pima_imputed = pd.DataFrame(data=pima_imputed, columns=pima_column_names)
pima_imputed.head()
times_pregnant plasma_glucose_concentration diastolic_blood_pressure triceps_thickness serum_insulin bmi pedigree_function age onset_diabetes 0 6.0 148.0 72.0 35.00000 155.548223 33.6 0.627 50.0 1.0 1 1.0 85.0 66.0 29.00000 155.548223 26.6 0.351 31.0 0.0 2 8.0 183.0 64.0 29.15342 155.548223 23.3 0.672 32.0 1.0 3 1.0 89.0 66.0 23.00000 94.000000 28.1 0.167 21.0 0.0 4 0.0 137.0 40.0 35.00000 168.000000 43.1 2.288 33.0 1.0
# Null count per column of the imputed frame.
pima_imputed.isnull().sum()
times_pregnant 0
plasma_glucose_concentration 0
diastolic_blood_pressure 0
triceps_thickness 0
serum_insulin 0
bmi 0
pedigree_function 0
age 0
onset_diabetes 0
dtype: int64
在机器学习流水线上填充值
借
from sklearn. model_selection import train_test_split
from sklearn. neighbors import KNeighborsClassifier
from sklearn. model_selection import GridSearchCV
# Use a single feature (serum_insulin) and the outcome column as target.
# Both are copied, so later assignments to X and y leave pima untouched.
y = pima['onset_diabetes'].copy()
X = pima[['serum_insulin']].copy()

# Number of missing values in the feature.
X.isnull().sum()
serum_insulin 374
dtype: int64
# (rows, columns) of the feature frame.
X.shape
(768, 1)
# Fill the gaps with the mean of the *whole* feature column.
# NOTE: this runs before the train/test split further down, so rows that
# later land in the test set also contribute to the fill value.
entire_data_set_mean = X.mean()
print(entire_data_set_mean)
X = X.fillna(entire_data_set_mean)
serum_insulin 155.548223
dtype: float64
# Split into training and test sets with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=99)
(X_train.shape, y_train.shape)
((576, 1), (576,))
# Fit a k-nearest-neighbours classifier with default settings on the
# training split and score it on the test split.
knn = KNeighborsClassifier().fit(X_train, y_train)
knn.score(X_test, y_test)
0.65625
上例的做法是错误的:在划分训练集和测试集之前,就用整个数据集的均值填充了缺失值,测试集的信息因此参与了填充值的计算。
正确的做法是先划分数据集,再只用训练集的均值去填充训练集和测试集,如下例所示。
# Start over from the un-imputed feature and split first, before any
# missing value is filled.
X = pima[['serum_insulin']].copy()
y = pima['onset_diabetes'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=99)

# Missing values in the full feature column.
X.isnull().sum()
serum_insulin 374
dtype: int64
# Shapes of the four pieces of the split.
(X_test.shape, y_test.shape, X_train.shape, y_train.shape)
((192, 1), (192,), (576, 1), (576,))
# Compute the fill value from the training rows only, then apply that
# same value to both the training set and the test set.
training_mean = X_train.mean()
print(training_mean)
X_train = X_train.fillna(training_mean)
X_test = X_test.fillna(training_mean)
serum_insulin 158.546053
dtype: float64
# Shapes of the four pieces after filling.
(X_test.shape, y_test.shape, X_train.shape, y_train.shape)
((192, 1), (192,), (576, 1), (576,))
# Fit a default k-nearest-neighbours classifier on the training split
# and print its score on the test split.
knn = KNeighborsClassifier().fit(X_train, y_train)
print(knn.score(X_test, y_test))
0.4895833333333333
Pipeline
from sklearn. pipeline import Pipeline
# Candidate neighbour counts 1..7; the 'classify__' prefix addresses the
# pipeline step named 'classify'.
knn_params = {'classify__n_neighbors': list(range(1, 8))}

# Pipeline: mean imputation followed by k-nearest neighbours.
knn = KNeighborsClassifier()
mean_impute = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('classify', knn),
])

# All columns except the outcome are features.
X = pima.drop(columns='onset_diabetes')
y = pima['onset_diabetes']

# Cross-validated grid search over the whole pipeline.
grid = GridSearchCV(mean_impute, knn_params)
grid.fit(X, y)
print(grid.best_score_, grid.best_params_)
0.7305407011289364 {'classify__n_neighbors': 7}
from sklearn. pipeline import Pipeline
# Same search as with mean imputation, but filling with the column median.
knn_params = {'classify__n_neighbors': list(range(1, 8))}

knn = KNeighborsClassifier()
median_impute = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('classify', knn),
])

# All columns except the outcome are features.
X = pima.drop(columns='onset_diabetes')
y = pima['onset_diabetes']

grid = GridSearchCV(median_impute, knn_params)
grid.fit(X, y)
print(grid.best_score_, grid.best_params_)
0.7292589763177999 {'classify__n_neighbors': 7}
标准化与归一化
# Impute with SimpleImputer's default settings and rebuild a DataFrame.
impute = SimpleImputer()
pima_imputed_mean = pd.DataFrame(
    impute.fit_transform(pima),
    columns=pima_column_names,
)

# One histogram per column, first with independent x axes, then with a
# shared x axis. The trailing semicolons suppress the notebook's echo of
# the return value.
pima_imputed_mean.hist(figsize=(15, 15));
pima_imputed_mean.hist(figsize=(15, 15), sharex=True);
Z_score
from sklearn. preprocessing import StandardScaler
# Standardize every column of the imputed frame with StandardScaler and
# rebuild a DataFrame with the same column names.
scale = StandardScaler()
pima_imputed__mean_scaled = pd.DataFrame(
    scale.fit_transform(pima_imputed_mean),
    columns=pima_column_names,
)

# Histograms of the scaled columns on a shared x axis.
pima_imputed__mean_scaled.hist(figsize=(15, 15), sharex=True);
from sklearn. pipeline import Pipeline
from sklearn. preprocessing import StandardScaler
# 必须重新定义参数以符合流水线
# Search over both the imputation strategy and the neighbour count; each
# key is '<pipeline step name>__<parameter name>'.
knn_params = {
    'imputer__strategy': ['mean', 'median'],
    'classify__n_neighbors': list(range(1, 8)),
}

# Pipeline: impute -> standardize -> classify. `knn` is the classifier
# object created in an earlier cell.
mean_impute_standardize = Pipeline([
    ('imputer', SimpleImputer()),
    ('standardize', StandardScaler()),
    ('classify', knn),
])

# All columns except the outcome are features.
X = pima.drop(columns='onset_diabetes')
y = pima['onset_diabetes']

grid = GridSearchCV(mean_impute_standardize, knn_params)
grid.fit(X, y)
print(grid.best_score_, grid.best_params_)
0.7539173245055598 {'classify__n_neighbors': 7, 'imputer__strategy': 'mean'}
# Grid: imputation strategy (mean or median) x neighbour count (1..7).
neighbor_counts = list(range(1, 8))
knn_params = {'imputer__strategy': ['mean', 'median'], 'classify__n_neighbors': neighbor_counts}

# Three-step pipeline; `knn` is reused from an earlier cell.
steps = [
    ('imputer', SimpleImputer()),
    ('standardize', StandardScaler()),
    ('classify', knn),
]
mean_impute_standardize = Pipeline(steps)

# Features are all columns but the outcome; the outcome is the target.
y = pima['onset_diabetes']
X = pima.drop(columns='onset_diabetes')

grid = GridSearchCV(mean_impute_standardize, knn_params)
grid.fit(X, y)
print(grid.best_score_, grid.best_params_)
0.7539173245055598 {'classify__n_neighbors': 7, 'imputer__strategy': 'mean'}