特征工程2

最新推荐文章于 2021-02-16 13:49:53 发布

Up_梅子酒

最新推荐文章于 2021-02-16 13:49:53 发布

阅读量345

点赞数

分类专栏： Feature Engineering 文章标签： python

本文链接：https://blog.csdn.net/eerywh/article/details/107678233

版权

Feature Engineering 专栏收录该内容

8 篇文章 1 订阅

订阅专栏

第三章 特征增强：清洗数据

import os 
os.listdir()

['.config', 'sample_data']

!git clone https://github.com/********/Feature-Engineering-Made-Easy.git

Cloning into 'Feature-Engineering-Made-Easy'...
remote: Enumerating objects: 63, done.[K
remote: Total 63 (delta 0), reused 0 (delta 0), pack-reused 63[K
Unpacking objects: 100% (63/63), done.
Checking out files: 100% (62/62), done.

import numpy as np
import pandas as pd 
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline 
plt.style.use('fivethirtyeight')

/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm

# Read the file with pandas' default header handling. No column names are
# supplied here, and the echoed frame shows the first data row used as the header.
pima = pd.read_csv('/content/Feature-Engineering-Made-Easy/data/pima.data')
pima.head()

	6	148	72	35	0	33.6	0.627	50	1
0	1	85	66	29	0	26.6	0.351	31	0
1	8	183	64	0	0	23.3	0.672	32	1
2	1	89	66	23	94	28.1	0.167	21	0
3	0	137	40	35	168	43.1	2.288	33	1
4	5	116	74	0	0	25.6	0.201	30	0

# Labels for the nine columns of the Pima data file, in file order.
pima_column_names = [
    'times_pregnant',
    'plasma_glucose_concentration',
    'diastolic_blood_pressure',
    'triceps_thickness',
    'serum_insulin',
    'bmi',
    'pedigree_function',
    'age',
    'onset_diabetes',
]

# Re-read the file with explicit labels so that no data row is consumed as
# the header.
pima = pd.read_csv(
    '/content/Feature-Engineering-Made-Easy/data/pima.data',
    names=pima_column_names,
)

pima.head()

	times_pregnant	plasma_glucose_concentration	diastolic_blood_pressure	triceps_thickness	serum_insulin	bmi	pedigree_function	age	onset_diabetes
0	6	148	72	35	0	33.6	0.627	50	1
1	1	85	66	29	0	26.6	0.351	31	0
2	8	183	64	0	0	23.3	0.672	32	1
3	1	89	66	23	94	28.1	0.167	21	0
4	0	137	40	35	168	43.1	2.288	33	1

pima['onset_diabetes'].value_counts(normalize = True)

0    0.651042
1    0.348958
Name: onset_diabetes, dtype: float64

# Overlay the histograms of the two outcome classes for a single feature.
col = 'plasma_glucose_concentration'
for outcome, legend_label in ((0, 'non_diabetes'), (1, 'diabetes')):
    # Values of the feature for the rows belonging to this outcome class.
    class_values = pima.loc[pima['onset_diabetes'] == outcome, col]
    plt.hist(class_values, alpha=0.5, label=legend_label)
plt.legend(loc='upper right')
plt.xlabel(col)
plt.ylabel('Frequency')
plt.title('Histogram of {}'.format(col))
plt.show()

在这里插入图片描述

# Overlaid per-class histograms (10 bins each) for every feature column.
# The two class subsets do not depend on the column being plotted, so they
# are selected once here instead of being re-filtered twice per iteration.
non_diabetes_rows = pima[pima['onset_diabetes'] == 0]
diabetes_rows = pima[pima['onset_diabetes'] == 1]

for col in ['times_pregnant', 'plasma_glucose_concentration',
            'diastolic_blood_pressure', 'triceps_thickness',
            'serum_insulin', 'bmi', 'pedigree_function', 'age']:
    plt.hist(non_diabetes_rows[col], 10, alpha=0.5, label='non_diabetes')
    plt.hist(diabetes_rows[col], 10, alpha=0.5, label='diabetes')
    plt.legend(loc='upper right')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.title('Histogram of {}'.format(col))
    # One figure per feature.
    plt.show()

在这里插入图片描述

import seaborn as sns

sns.heatmap(pima.corr())

<matplotlib.axes._subplots.AxesSubplot at 0x7f5e2606fc50>

在这里插入图片描述

pima.isnull().sum()

times_pregnant                  0
plasma_glucose_concentration    0
diastolic_blood_pressure        0
triceps_thickness               0
serum_insulin                   0
bmi                             0
pedigree_function               0
age                             0
onset_diabetes                  0
dtype: int64

pima.shape

(768, 9)

pima['onset_diabetes'].value_counts(normalize = True)
# Null accuracy baseline: the share of the majority class in the target.

0    0.651042
1    0.348958
Name: onset_diabetes, dtype: float64

pima.describe()

	times_pregnant	plasma_glucose_concentration	diastolic_blood_pressure	triceps_thickness	serum_insulin	bmi	pedigree_function	age	onset_diabetes
count	768.000000	768.000000	768.000000	768.000000	768.000000	768.000000	768.000000	768.000000	768.000000
mean	3.845052	120.894531	69.105469	20.536458	79.799479	31.992578	0.471876	33.240885	0.348958
std	3.369578	31.972618	19.355807	15.952218	115.244002	7.884160	0.331329	11.760232	0.476951
min	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.078000	21.000000	0.000000
25%	1.000000	99.000000	62.000000	0.000000	0.000000	27.300000	0.243750	24.000000	0.000000
50%	3.000000	117.000000	72.000000	23.000000	30.500000	32.000000	0.372500	29.000000	0.000000
75%	6.000000	140.250000	80.000000	32.000000	127.250000	36.600000	0.626250	41.000000	1.000000
max	17.000000	199.000000	122.000000	99.000000	846.000000	67.100000	2.420000	81.000000	1.000000

# Note: missing values in this dataset were recorded as 0.

# Columns in which a 0 is converted to a missing value (each listed once).
columns = ['serum_insulin', 'bmi', 'plasma_glucose_concentration',
           'diastolic_blood_pressure', 'triceps_thickness']

# Replace the 0 placeholders with NaN in one vectorized call instead of a
# per-element Python lambda; integer columns become float as a result.
pima[columns] = pima[columns].replace(0, np.nan)

pima.isnull().sum()

times_pregnant                    0
plasma_glucose_concentration      5
diastolic_blood_pressure         35
triceps_thickness               227
serum_insulin                   374
bmi                              11
pedigree_function                 0
age                               0
onset_diabetes                    0
dtype: int64

pima.head()

	times_pregnant	plasma_glucose_concentration	diastolic_blood_pressure	triceps_thickness	serum_insulin	bmi	pedigree_function	age	onset_diabetes
0	6	148.0	72.0	35.0	NaN	33.6	0.627	50	1
1	1	85.0	66.0	29.0	NaN	26.6	0.351	31	0
2	8	183.0	64.0	NaN	NaN	23.3	0.672	32	1
3	1	89.0	66.0	23.0	94.0	28.1	0.167	21	0
4	0	137.0	40.0	35.0	168.0	43.1	2.288	33	1

pima.describe()

	times_pregnant	plasma_glucose_concentration	diastolic_blood_pressure	triceps_thickness	serum_insulin	bmi	pedigree_function	age	onset_diabetes
count	768.000000	763.000000	733.000000	541.000000	394.000000	757.000000	768.000000	768.000000	768.000000
mean	3.845052	121.686763	72.405184	29.153420	155.548223	32.457464	0.471876	33.240885	0.348958
std	3.369578	30.535641	12.382158	10.476982	118.775855	6.924988	0.331329	11.760232	0.476951
min	0.000000	44.000000	24.000000	7.000000	14.000000	18.200000	0.078000	21.000000	0.000000
25%	1.000000	99.000000	64.000000	22.000000	76.250000	27.500000	0.243750	24.000000	0.000000
50%	3.000000	117.000000	72.000000	29.000000	125.000000	32.300000	0.372500	29.000000	0.000000
75%	6.000000	141.000000	80.000000	36.000000	190.000000	36.600000	0.626250	41.000000	1.000000
max	17.000000	199.000000	122.000000	99.000000	846.000000	67.100000	2.420000	81.000000	1.000000

填充缺失值

empty_plasma_index = pima[pima['plasma_glucose_concentration'].isnull()].index

empty_plasma_index

Int64Index([75, 182, 342, 349, 502], dtype='int64')

# Select the rows and the column in a single .loc call rather than chaining
# two indexing operations (which builds an intermediate frame first).
pima.loc[empty_plasma_index, 'plasma_glucose_concentration']

75    NaN
182   NaN
342   NaN
349   NaN
502   NaN
Name: plasma_glucose_concentration, dtype: float64

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')

pima_imputed = imputer.fit_transform(pima)

type(pima_imputed)

numpy.ndarray

pima_imputed = pd.DataFrame(pima_imputed,columns=pima_column_names)

pima_imputed.head()

	times_pregnant	plasma_glucose_concentration	diastolic_blood_pressure	triceps_thickness	serum_insulin	bmi	pedigree_function	age	onset_diabetes
0	6.0	148.0	72.0	35.00000	155.548223	33.6	0.627	50.0	1.0
1	1.0	85.0	66.0	29.00000	155.548223	26.6	0.351	31.0	0.0
2	8.0	183.0	64.0	29.15342	155.548223	23.3	0.672	32.0	1.0
3	1.0	89.0	66.0	23.00000	94.000000	28.1	0.167	21.0	0.0
4	0.0	137.0	40.0	35.00000	168.000000	43.1	2.288	33.0	1.0

pima_imputed.isnull().sum()

times_pregnant                  0
plasma_glucose_concentration    0
diastolic_blood_pressure        0
triceps_thickness               0
serum_insulin                   0
bmi                             0
pedigree_function               0
age                             0
onset_diabetes                  0
dtype: int64

在机器学习流水线上填充值

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

X = pima[['serum_insulin']].copy()   ## double brackets: select a one-column DataFrame
y = pima['onset_diabetes'].copy()

X.isnull().sum()

serum_insulin    374
dtype: int64

X.shape

(768, 1)

# NOTE: this mean is taken over every row of X, before the train/test split
# that follows, so rows that later land in the test set influence the fill value.
entire_data_set_mean = X.mean()
X = X.fillna(entire_data_set_mean)
print(entire_data_set_mean)

serum_insulin    155.548223
dtype: float64

X_train,X_test,y_train,y_test = train_test_split(X,y,random_state = 99)

X_train.shape,y_train.shape

((576, 1), (576,))

knn = KNeighborsClassifier()

knn.fit(X_train,y_train)
knn.score(X_test,y_test)

0.65625

上例的错误：在划分训练集和测试集之前，就用整个数据集的均值填充了缺失值（测试集的信息因此泄露到了训练过程中）

# The appropriate approach: split into train/test sets before any imputation.
X = pima[['serum_insulin']].copy()
y = pima['onset_diabetes'].copy()

X_train,X_test,y_train,y_test = train_test_split(X,y,random_state = 99)
X.isnull().sum()

serum_insulin    374
dtype: int64

X_test.shape,y_test.shape,X_train.shape,y_train.shape

((192, 1), (192,), (576, 1), (576,))

# The fill value is computed from the training rows only and then applied to
# both the training split and the test split.
training_mean = X_train.mean()
X_train, X_test = (
    split.fillna(training_mean) for split in (X_train, X_test)
)

print(training_mean)

serum_insulin    158.546053
dtype: float64

X_test.shape,y_test.shape,X_train.shape,y_train.shape

((192, 1), (192,), (576, 1), (576,))

knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
print(knn.score(X_test,y_test))

0.4895833333333333

Pipeline

from sklearn.pipeline import Pipeline

# Grid keys must be written as '<step name>__<parameter>' so that they
# address a parameter of a step inside the pipeline.
knn_params = {'classify__n_neighbors': list(range(1, 8))}

knn = KNeighborsClassifier()

# Mean imputation followed by the KNN classifier.
mean_impute = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('classify', knn),
])

X = pima.drop(columns='onset_diabetes')
y = pima['onset_diabetes']

grid = GridSearchCV(estimator=mean_impute, param_grid=knn_params)
grid.fit(X, y)

print(grid.best_score_, grid.best_params_)

0.7305407011289364 {'classify__n_neighbors': 7}

from sklearn.pipeline import Pipeline

# Grid keys must be written as '<step name>__<parameter>' so that they
# address a parameter of a step inside the pipeline.
knn_params = {'classify__n_neighbors': list(range(1, 8))}

knn = KNeighborsClassifier()

# Median imputation followed by the KNN classifier.
imputer_step = ('imputer', SimpleImputer(strategy='median'))
classifier_step = ('classify', knn)
median_impute = Pipeline([imputer_step, classifier_step])

X = pima.drop(columns='onset_diabetes')
y = pima['onset_diabetes']

grid = GridSearchCV(estimator=median_impute, param_grid=knn_params)
grid.fit(X, y)

print(grid.best_score_, grid.best_params_)

0.7292589763177999 {'classify__n_neighbors': 7}

标准化与归一化

impute = SimpleImputer()

pima_imputed_mean = pd.DataFrame(impute.fit_transform(pima),columns=pima_column_names)
pima_imputed_mean.hist(figsize=(15,15));

在这里插入图片描述

pima_imputed_mean.hist(figsize = (15,15),sharex=True);

在这里插入图片描述

Z_score

from sklearn.preprocessing import StandardScaler

scale = StandardScaler()

pima_imputed__mean_scaled = pd.DataFrame(scale.fit_transform(pima_imputed_mean),columns=pima_column_names)
pima_imputed__mean_scaled.hist(figsize=(15,15),sharex=True);

在这里插入图片描述

# Add z-score standardization to the Pipeline.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# The grid must be redefined to match the pipeline: every key is written as
# '<step name>__<parameter>'. The imputer strategy is searched together with
# the number of neighbours.
knn_params = {
    'imputer__strategy': ['mean', 'median'],
    'classify__n_neighbors': [1, 2, 3, 4, 5, 6, 7],
}

# Impute, then standardize, then classify; `knn` is the classifier created
# in an earlier cell.
mean_impute_standardize = Pipeline([
    ('imputer', SimpleImputer()),
    ('standardize', StandardScaler()),
    ('classify', knn),
])

X = pima.drop('onset_diabetes', axis=1)
y = pima['onset_diabetes']

grid = GridSearchCV(mean_impute_standardize, knn_params)
grid.fit(X, y)

print(grid.best_score_, grid.best_params_)

0.7539173245055598 {'classify__n_neighbors': 7, 'imputer__strategy': 'mean'}

# Search over the imputer strategy and the number of neighbours; keys use the
# '<step name>__<parameter>' form required for pipeline steps.
knn_params = {
    'imputer__strategy': ['mean', 'median'],
    'classify__n_neighbors': list(range(1, 8)),
}

# Impute -> standardize -> classify, reusing the existing `knn` estimator.
pipeline_steps = [
    ('imputer', SimpleImputer()),
    ('standardize', StandardScaler()),
    ('classify', knn),
]
mean_impute_standardize = Pipeline(pipeline_steps)
X = pima.drop(columns='onset_diabetes')
y = pima['onset_diabetes']

grid = GridSearchCV(estimator=mean_impute_standardize, param_grid=knn_params)
grid.fit(X, y)

print(grid.best_score_, grid.best_params_)

0.7539173245055598 {'classify__n_neighbors': 7, 'imputer__strategy': 'mean'}

Up_梅子酒

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

专栏目录

特征工程2

第三章 特征增强：清洗数据

填充缺失值

在机器学习流水线上填充值

上例中训练集和测试集填充缺失值错误采用了均值填充

Pipeline

标准化与归一化

Z_score

第三章特征增强：清洗数据