特征工程2

第三章 特征增强:清洗数据

import os 
os.listdir()
['.config', 'sample_data']
!git clone https://github.com/********/Feature-Engineering-Made-Easy.git
Cloning into 'Feature-Engineering-Made-Easy'...
remote: Enumerating objects: 63, done.[K
remote: Total 63 (delta 0), reused 0 (delta 0), pack-reused 63[K
Unpacking objects: 100% (63/63), done.
Checking out files: 100% (62/62), done.
import numpy as np
import pandas as pd 
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline 
plt.style.use('fivethirtyeight')
/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm
# First attempt: read the file without supplying column names; the printed
# head shows the first data record being used as the header row.
pima = pd.read_csv('/content/Feature-Engineering-Made-Easy/data/pima.data')
pima.head()
61487235033.60.627501
01856629026.60.351310
18183640023.30.672321
218966239428.10.167210
30137403516843.12.288331
45116740025.60.201300
# Column labels for the Pima data file, listed in file order. The file has no
# header row of its own (a plain read_csv promotes the first record to header).
pima_column_names = [
    'times_pregnant',
    'plasma_glucose_concentration',
    'diastolic_blood_pressure',
    'triceps_thickness',
    'serum_insulin',
    'bmi',
    'pedigree_function',
    'age',
    'onset_diabetes',
]

# Re-read with explicit labels so that no data row is consumed as the header.
pima = pd.read_csv(
    '/content/Feature-Engineering-Made-Easy/data/pima.data',
    names=pima_column_names,
)

pima.head()

times_pregnantplasma_glucose_concentrationdiastolic_blood_pressuretriceps_thicknessserum_insulinbmipedigree_functionageonset_diabetes
061487235033.60.627501
11856629026.60.351310
28183640023.30.672321
318966239428.10.167210
40137403516843.12.288331
pima['onset_diabetes'].value_counts(normalize = True)
0    0.651042
1    0.348958
Name: onset_diabetes, dtype: float64
# Overlay the histograms of both outcome classes for a single feature.
col = 'plasma_glucose_concentration'
for outcome, label in ((0, 'non_diabetes'), (1, 'diabetes')):
    class_values = pima.loc[pima['onset_diabetes'] == outcome, col]
    plt.hist(class_values, alpha=0.5, label=label)
plt.legend(loc='upper right')
plt.xlabel(col)
plt.ylabel('Frequency')
plt.title('Histogram of {}'.format(col))
plt.show()

在这里插入图片描述

# Draw one overlaid two-class histogram (10 bins per class) for every feature.
feature_columns = [
    'times_pregnant',
    'plasma_glucose_concentration',
    'diastolic_blood_pressure',
    'triceps_thickness',
    'serum_insulin',
    'bmi',
    'pedigree_function',
    'age',
]
for col in feature_columns:
    for outcome, label in ((0, 'non_diabetes'), (1, 'diabetes')):
        class_values = pima.loc[pima['onset_diabetes'] == outcome, col]
        plt.hist(class_values, bins=10, alpha=0.5, label=label)
    plt.legend(loc='upper right')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.title('Histogram of {}'.format(col))
    plt.show()

在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述

在这里插入图片描述
在这里插入图片描述

import seaborn as sns

# Heat map of the pairwise correlations between the columns of pima.
sns.heatmap(pima.corr())
<matplotlib.axes._subplots.AxesSubplot at 0x7f5e2606fc50>

在这里插入图片描述

pima.isnull().sum()
times_pregnant                  0
plasma_glucose_concentration    0
diastolic_blood_pressure        0
triceps_thickness               0
serum_insulin                   0
bmi                             0
pedigree_function               0
age                             0
onset_diabetes                  0
dtype: int64
pima.shape
(768, 9)
pima['onset_diabetes'].value_counts(normalize = True)
# Null accuracy: the share of the majority class, i.e. the baseline score to beat
0    0.651042
1    0.348958
Name: onset_diabetes, dtype: float64
pima.describe()
times_pregnantplasma_glucose_concentrationdiastolic_blood_pressuretriceps_thicknessserum_insulinbmipedigree_functionageonset_diabetes
count768.000000768.000000768.000000768.000000768.000000768.000000768.000000768.000000768.000000
mean3.845052120.89453169.10546920.53645879.79947931.9925780.47187633.2408850.348958
std3.36957831.97261819.35580715.952218115.2440027.8841600.33132911.7602320.476951
min0.0000000.0000000.0000000.0000000.0000000.0000000.07800021.0000000.000000
25%1.00000099.00000062.0000000.0000000.00000027.3000000.24375024.0000000.000000
50%3.000000117.00000072.00000023.00000030.50000032.0000000.37250029.0000000.000000
75%6.000000140.25000080.00000032.000000127.25000036.6000000.62625041.0000001.000000
max17.000000199.000000122.00000099.000000846.00000067.1000002.42000081.0000001.000000
#注意到缺失值被填充为0

下面把这些列中的 0 视为缺失值，并替换为空值（NaN）。
# Columns in which a stored 0 marks a missing value rather than a real reading.
# Each column is listed once ('serum_insulin' used to appear twice).
columns = ['serum_insulin', 'bmi', 'plasma_glucose_concentration',
           'diastolic_blood_pressure', 'triceps_thickness']

# Swap every 0 for NaN so that pandas counts it as missing and skips it in
# statistics; replace() yields a float column with NaN regardless of how a
# particular pandas version would infer the dtype of a map() returning None.
for col in columns:
    pima[col] = pima[col].replace(0, np.nan)

pima.isnull().sum()
times_pregnant                    0
plasma_glucose_concentration      5
diastolic_blood_pressure         35
triceps_thickness               227
serum_insulin                   374
bmi                              11
pedigree_function                 0
age                               0
onset_diabetes                    0
dtype: int64
pima.head()
times_pregnantplasma_glucose_concentrationdiastolic_blood_pressuretriceps_thicknessserum_insulinbmipedigree_functionageonset_diabetes
06148.072.035.0NaN33.60.627501
1185.066.029.0NaN26.60.351310
28183.064.0NaNNaN23.30.672321
3189.066.023.094.028.10.167210
40137.040.035.0168.043.12.288331
pima.describe()
times_pregnantplasma_glucose_concentrationdiastolic_blood_pressuretriceps_thicknessserum_insulinbmipedigree_functionageonset_diabetes
count768.000000763.000000733.000000541.000000394.000000757.000000768.000000768.000000768.000000
mean3.845052121.68676372.40518429.153420155.54822332.4574640.47187633.2408850.348958
std3.36957830.53564112.38215810.476982118.7758556.9249880.33132911.7602320.476951
min0.00000044.00000024.0000007.00000014.00000018.2000000.07800021.0000000.000000
25%1.00000099.00000064.00000022.00000076.25000027.5000000.24375024.0000000.000000
50%3.000000117.00000072.00000029.000000125.00000032.3000000.37250029.0000000.000000
75%6.000000141.00000080.00000036.000000190.00000036.6000000.62625041.0000001.000000
max17.000000199.000000122.00000099.000000846.00000067.1000002.42000081.0000001.000000

填充缺失值

# Index labels of the rows whose plasma glucose value is missing.
glucose_is_missing = pima['plasma_glucose_concentration'].isnull()
empty_plasma_index = pima.index[glucose_is_missing]
empty_plasma_index
Int64Index([75, 182, 342, 349, 502], dtype='int64')
# Show the glucose value of those rows with one label-based lookup.
pima.loc[empty_plasma_index, 'plasma_glucose_concentration']
75    NaN
182   NaN
342   NaN
349   NaN
502   NaN
Name: plasma_glucose_concentration, dtype: float64
from sklearn.impute import SimpleImputer
# Imputer that fills each column's missing entries with that column's mean.
imputer = SimpleImputer(strategy='mean')

# Fit on the whole frame and fill it in a single step.
pima_imputed = imputer.fit_transform(pima)

# The result is a NumPy array, not a DataFrame, so the column labels are gone.
type(pima_imputed)
numpy.ndarray
# Wrap the imputed array in a DataFrame again, restoring the column labels.
pima_imputed = pd.DataFrame(pima_imputed,columns=pima_column_names)

pima_imputed.head()
times_pregnantplasma_glucose_concentrationdiastolic_blood_pressuretriceps_thicknessserum_insulinbmipedigree_functionageonset_diabetes
06.0148.072.035.00000155.54822333.60.62750.01.0
11.085.066.029.00000155.54822326.60.35131.00.0
28.0183.064.029.15342155.54822323.30.67232.01.0
31.089.066.023.0000094.00000028.10.16721.00.0
40.0137.040.035.00000168.00000043.12.28833.01.0
pima_imputed.isnull().sum()
times_pregnant                  0
plasma_glucose_concentration    0
diastolic_blood_pressure        0
triceps_thickness               0
serum_insulin                   0
bmi                             0
pedigree_function               0
age                             0
onset_diabetes                  0
dtype: int64

在机器学习流水线上填充值

借助 KNN 分类器，比较不同的缺失值填充方式对模型效果的影响。

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
X = pima[['serum_insulin']].copy()   # double brackets select a one-column DataFrame rather than a Series
y = pima['onset_diabetes'].copy()

# Count the missing entries in the single feature column.
X.isnull().sum()
serum_insulin    374
dtype: int64
X.shape
(768, 1)
# NOTE: the mean is taken over ALL rows, before the train/test split that
# follows, so the rows later held out for testing influence the fill value.
entire_data_set_mean = X.mean()
X = X.fillna(entire_data_set_mean)
print(entire_data_set_mean)
serum_insulin    155.548223
dtype: float64
# Split only after the fill above; random_state pins the split so it is repeatable.
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state = 99)
X_train.shape,y_train.shape
((576, 1), (576,))

# Fit a KNN classifier with its default settings and score it on the held-out rows.
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
knn.score(X_test,y_test)
0.65625

上例的做法是错误的：它先用整个数据集的均值填充缺失值，然后才划分训练集和测试集，因此测试集的信息泄露到了训练过程中。

# The appropriate approach: split first, impute afterwards
X = pima[['serum_insulin']].copy()
y = pima['onset_diabetes'].copy()

# Same random_state as before, applied to the still-unfilled data.
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state = 99)
X.isnull().sum()
serum_insulin    374
dtype: int64
X_test.shape,y_test.shape,X_train.shape,y_train.shape
((192, 1), (192,), (576, 1), (576,))
# Learn the fill value from the training rows only ...
training_mean = X_train.mean()
X_train = X_train.fillna(training_mean)
# ... and reuse that same training mean for the test rows.
X_test = X_test.fillna(training_mean)

print(training_mean)
serum_insulin    158.546053
dtype: float64
X_test.shape,y_test.shape,X_train.shape,y_train.shape
((192, 1), (192,), (576, 1), (576,))
# Refit and score KNN on the split that was filled with the training mean.
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
print(knn.score(X_test,y_test))
0.4895833333333333

Pipeline

from sklearn.pipeline import Pipeline

# Grid keys have to be redefined as '<step name>__<parameter>' so that they
# address the right step of the pipeline.
knn_params = {'classify__n_neighbors': list(range(1, 8))}

knn = KNeighborsClassifier()

# Mean imputation followed by the KNN classifier, chained as one estimator.
mean_impute = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('classify', knn),
])

# All columns except the target are used as features.
X = pima.drop(columns='onset_diabetes')
y = pima['onset_diabetes']

grid = GridSearchCV(mean_impute, knn_params)
grid.fit(X, y)

print(grid.best_score_, grid.best_params_)
0.7305407011289364 {'classify__n_neighbors': 7}
from sklearn.pipeline import Pipeline

# Grid keys have to be redefined as '<step name>__<parameter>' so that they
# address the right step of the pipeline.
knn_params = {'classify__n_neighbors': list(range(1, 8))}

knn = KNeighborsClassifier()

# Median imputation followed by the KNN classifier, chained as one estimator.
median_impute = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('classify', knn),
])

# All columns except the target are used as features.
X = pima.drop(columns='onset_diabetes')
y = pima['onset_diabetes']

grid = GridSearchCV(median_impute, knn_params)
grid.fit(X, y)

print(grid.best_score_, grid.best_params_)
0.7292589763177999 {'classify__n_neighbors': 7}

标准化与归一化

# Fill the missing values with SimpleImputer's default settings, then draw one
# histogram per column.
impute = SimpleImputer()
imputed_values = impute.fit_transform(pima)

pima_imputed_mean = pd.DataFrame(imputed_values, columns=pima_column_names)
pima_imputed_mean.hist(figsize=(15, 15));

在这里插入图片描述

# Same histograms drawn on a shared x-axis, so the columns' differing scales can be compared.
pima_imputed_mean.hist(figsize = (15,15),sharex=True);

在这里插入图片描述

Z_score

from sklearn.preprocessing import StandardScaler

# Z-score standardise every column, then plot the histograms on a shared x-axis.
scale = StandardScaler()
scaled_values = scale.fit_transform(pima_imputed_mean)

pima_imputed__mean_scaled = pd.DataFrame(scaled_values, columns=pima_column_names)
pima_imputed__mean_scaled.hist(figsize=(15, 15), sharex=True);

在这里插入图片描述

# Add z-score standardisation to the pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# The grid has to be redefined to match the pipeline: every key is written as
# '<step name>__<parameter>', e.g. 'imputer__strategy' for the 'imputer' step.
knn_params = {'imputer__strategy': ['mean', 'median'],
              'classify__n_neighbors': [1, 2, 3, 4, 5, 6, 7]}

# Impute -> standardise -> classify, chained as a single estimator.
# `knn` is the KNeighborsClassifier instance created in an earlier cell.
mean_impute_standardize = Pipeline([
    ('imputer', SimpleImputer()),
    ('standardize', StandardScaler()),
    ('classify', knn),
])

# All columns except the target are used as features.
X = pima.drop('onset_diabetes', axis=1)
y = pima['onset_diabetes']

# Search over both the imputation strategy and the number of neighbours.
grid = GridSearchCV(mean_impute_standardize, knn_params)
grid.fit(X, y)

print(grid.best_score_, grid.best_params_)
0.7539173245055598 {'classify__n_neighbors': 7, 'imputer__strategy': 'mean'}
# Search space: imputation strategy for the 'imputer' step and neighbour count
# for the 'classify' step.
knn_params = {
    'imputer__strategy': ['mean', 'median'],
    'classify__n_neighbors': list(range(1, 8)),
}

# Impute -> standardise -> classify; `knn` comes from an earlier cell.
mean_impute_standardize = Pipeline([
    ('imputer', SimpleImputer()),
    ('standardize', StandardScaler()),
    ('classify', knn),
])
X = pima.drop(columns='onset_diabetes')
y = pima['onset_diabetes']

grid = GridSearchCV(mean_impute_standardize, knn_params)
grid.fit(X, y)

print(grid.best_score_, grid.best_params_)
0.7539173245055598 {'classify__n_neighbors': 7, 'imputer__strategy': 'mean'}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值