特征工程入门与实践2

第三章 特征增强:清洗数据

import os 
os.listdir()
['.config', 'sample_data']
!git clone https://github.com/********/Feature-Engineering-Made-Easy.git
Cloning into 'Feature-Engineering-Made-Easy'...
remote: Enumerating objects: 63, done.
remote: Total 63 (delta 0), reused 0 (delta 0), pack-reused 63
Unpacking objects: 100% (63/63), done.
Checking out files: 100% (62/62), done.
import numpy as np
import pandas as pd 
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline 
plt.style.use('fivethirtyeight')
/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm
# First attempt: load the file without supplying any column names.
# NOTE: the file has no header row, so the first data record ends up being
# used as the column header (visible in the output below).
pima = pd.read_csv('/content/Feature-Engineering-Made-Easy/data/pima.data')
pima.head()
6 148 72 35 0 33.6 0.627 50 1
0 1 85 66 29 0 26.6 0.351 31 0
1 8 183 64 0 0 23.3 0.672 32 1
2 1 89 66 23 94 28.1 0.167 21 0
3 0 137 40 35 168 43.1 2.288 33 1
4 5 116 74 0 0 25.6 0.201 30 0
# Column labels for the data file; they are applied positionally on load.
pima_column_names = [
    'times_pregnant',
    'plasma_glucose_concentration',
    'diastolic_blood_pressure',
    'triceps_thickness',
    'serum_insulin',
    'bmi',
    'pedigree_function',
    'age',
    'onset_diabetes',
]

# Reload the data, this time labelling the columns explicitly so that the
# first record is kept as data instead of being consumed as the header.
pima = pd.read_csv(
    '/content/Feature-Engineering-Made-Easy/data/pima.data',
    names=pima_column_names,
)

pima.head()

times_pregnant plasma_glucose_concentration diastolic_blood_pressure triceps_thickness serum_insulin bmi pedigree_function age onset_diabetes
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
pima['onset_diabetes'].value_counts(normalize = True)
0    0.651042
1    0.348958
Name: onset_diabetes, dtype: float64
# Overlay the histograms of one feature for the two outcome classes.
col = 'plasma_glucose_concentration'
for outcome, legend_name in ((0, 'non_diabetes'), (1, 'diabetes')):
    # Values of `col` restricted to the rows of the current class.
    class_values = pima.loc[pima['onset_diabetes'] == outcome, col]
    plt.hist(class_values, alpha=0.5, label=legend_name)
plt.legend(loc='upper right')
plt.xlabel(col)
plt.ylabel('Frequency')
plt.title('Histogram of {}'.format(col))
plt.show()

在这里插入图片描述

# Split the rows by outcome class once; the split does not depend on the
# feature being plotted, so there is no need to redo it on every iteration.
non_diabetic = pima[pima['onset_diabetes'] == 0]
diabetic = pima[pima['onset_diabetes'] == 1]

# Draw one figure per feature, overlaying the two classes with 10 bins each.
for col in ['times_pregnant', 'plasma_glucose_concentration',
            'diastolic_blood_pressure', 'triceps_thickness', 'serum_insulin',
            'bmi', 'pedigree_function', 'age']:
    plt.hist(non_diabetic[col], bins=10, alpha=0.5, label='non_diabetes')
    plt.hist(diabetic[col], bins=10, alpha=0.5, label='diabetes')
    plt.legend(loc='upper right')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.title('Histogram of {}'.format(col))
    plt.show()

在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述

在这里插入图片描述
在这里插入图片描述

import seaborn as sns

sns.heatmap(pima.corr())
<matplotlib.axes._subplots.AxesSubplot at 0x7f5e2606fc50>

在这里插入图片描述

pima.isnull().sum()
times_pregnant                  0
plasma_glucose_concentration    0
diastolic_blood_pressure        0
triceps_thickness               0
serum_insulin                   0
bmi                             0
pedigree_function               0
age                             0
onset_diabetes                  0
dtype: int64
pima.shape
(768, 9)
pima['onset_diabetes'].value_counts(normalize = True)
# 空准确率
0    0.651042
1    0.348958
Name: onset_diabetes, dtype: float64
pima.describe()
times_pregnant plasma_glucose_concentration diastolic_blood_pressure triceps_thickness serum_insulin bmi pedigree_function age onset_diabetes
count 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000
mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578 0.471876 33.240885 0.348958
std 3.369578 31.972618 19.355807 15.952218 115.244002 7.884160 0.331329 11.760232 0.476951
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.078000 21.000000 0.000000
25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000 0.243750 24.000000 0.000000
50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000 0.372500 29.000000 0.000000
75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000 0.626250 41.000000 1.000000
max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000 2.420000 81.000000 1.000000
# 注意:原始数据中的缺失值被填充为 0(见上表 min 行:血糖、血压、皮褶厚度、胰岛素、BMI 的最小值均为 0)
# Columns in which a recorded 0 is treated as a missing measurement.
# Each column is listed exactly once.
columns = ['serum_insulin', 'bmi', 'plasma_glucose_concentration',
           'diastolic_blood_pressure', 'triceps_thickness']

# Turn the 0 placeholders into NaN so that pandas counts them as missing.
# Series.replace is vectorized, unlike mapping a Python lambda per element.
for col in columns:
    pima[col] = pima[col].replace(0, np.nan)

pima.isnull().sum()
times_pregnant                    0
plasma_glucose_concentration      5
diastolic_blood_pressure         35
triceps_thickness               227
serum_insulin                   374
bmi                              11
pedigree_function                 0
age                               0
onset_diabetes                    0
dtype: int64
pima.head()
times_pregnant plasma_glucose_concentration diastolic_blood_pressure triceps_thickness serum_insulin bmi pedigree_function age onset_diabetes
0 6 148.0 72.0 35.0 NaN 33.6 0.627 50 1
1 1 85.0 66.0 29.0 NaN 26.6 0.351 31 0
2 8 183.0 64.0 NaN NaN 23.3 0.672 32 1
3 1 89.0 66.0 23.0 94.0 28.1 0.167 21 0
4 0 137.0 40.0 35.0 168.0 43.1 2.288 33 1
pima.describe()
times_pregnant plasma_glucose_concentration diastolic_blood_pressure triceps_thickness serum_insulin bmi pedigree_function age onset_diabetes
count 768.000000 763.000000 733.000000 541.000000 394.000000 757.000000 768.000000 768.000000 768.000000
mean 3.845052 121.686763 72.405184 29.153420 155.548223 32.457464 0.471876 33.240885 0.348958
std 3.369578 30.535641 12.382158 10.476982 118.775855 6.924988 0.331329 11.760232 0.476951
min 0.000000 44.000000 24.000000 7.000000 14.000000 18.200000 0.078000 21.000000 0.000000
25% 1.000000 99.000000 64.000000 22.000000 76.250000 27.500000 0.243750 24.000000 0.000000
50% 3.000000 117.000000 72.000000 29.000000 125.000000 32.300000 0.372500 29.000000 0.000000
75% 6.000000 141.000000 80.000000 36.000000 190.000000 36.600000 0.626250 41.000000 1.000000
max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000 2.420000 81.000000 1.000000

填充缺失值

empty_plasma_index = pima[pima['plasma_glucose_concentration'].isnull()].index
empty_plasma_index
Int64Index([75, 182, 342, 349, 502], dtype='int64')
# Show the glucose values at the rows flagged as missing. Rows and column are
# selected in a single .loc call rather than by chaining two indexing steps.
pima.loc[empty_plasma_index, 'plasma_glucose_concentration']
75    NaN
182   NaN
342   NaN
349   NaN
502   NaN
Name: plasma_glucose_concentration, dtype: float64
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')

pima_imputed = imputer.fit_transform(pima)

type(pima_imputed)
numpy.ndarray
pima_imputed = pd.DataFrame(pima_imputed,columns=pima_column_names)

pima_imputed.head()
times_pregnant plasma_glucose_concentration diastolic_blood_pressure triceps_thickness serum_insulin bmi pedigree_function age onset_diabetes
0 6.0 148.0 72.0 35.00000 155.548223 33.6 0.627 50.0 1.0
1 1.0 85.0 66.0 29.00000 155.548223 26.6 0.351 31.0 0.0
2 8.0 183.0 64.0 29.15342 155.548223 23.3 0.672 32.0 1.0
3 1.0 89.0 66.0 23.00000 94.000000 28.1 0.167 21.0 0.0
4 0.0 137.0 40.0 35.00000 168.000000 43.1 2.288 33.0 1.0
pima_imputed.isnull().sum()
times_pregnant                  0
plasma_glucose_concentration    0
diastolic_blood_pressure        0
triceps_thickness               0
serum_insulin                   0
bmi                             0
pedigree_function               0
age                             0
onset_diabetes                  0
dtype: int64

在机器学习流水线上填充值

借

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
X = pima[['serum_insulin']].copy()   # double brackets: select a one-column DataFrame, not a Series
y = pima['onset_diabetes'].copy()

X.isnull().sum()
serum_insulin    374
dtype: int64
X.shape
(768, 1)
# NOTE: this is the flawed approach that this section demonstrates. The fill
# value is the mean over the entire dataset, computed before the train/test
# split below, so the rows that end up in the test set contribute to it.
entire_data_set_mean = X.mean()
X = X.fillna(entire_data_set_mean)
print(entire_data_set_mean)
serum_insulin    155.548223
dtype: float64
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state = 99)
X_train.shape,y_train.shape
((576, 1), (576,))

knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
knn.score(X_test,y_test)
0.65625

上例的做法是错误的:在划分训练集和测试集之前,就用整个数据集的均值填充了缺失值,测试集的信息因此泄露到了训练过程中。

# 合适的方法
X = pima[['serum_insulin']].copy()
y = pima['onset_diabetes'].copy()

X_train,X_test,y_train,y_test = train_test_split(X,y,random_state = 99)
X.isnull().sum()
serum_insulin    374
dtype: int64
X_test.shape,y_test.shape,X_train.shape,y_train.shape
((192, 1), (192,), (576, 1), (576,))
# Learn the fill value from the training rows only, then apply that same
# value to both splits.
training_mean = X_train.mean()
X_train, X_test = (part.fillna(training_mean) for part in (X_train, X_test))

print(training_mean)
serum_insulin    158.546053
dtype: float64
X_test.shape,y_test.shape,X_train.shape,y_train.shape
((192, 1), (192,), (576, 1), (576,))
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
print(knn.score(X_test,y_test))
0.4895833333333333

Pipeline

from sklearn.pipeline import Pipeline

# Grid keys are prefixed with the pipeline step name ("<step>__<param>"),
# so the KNN parameter is addressed through the 'classify' step.
knn_params = {'classify__n_neighbors': list(range(1, 8))}

knn = KNeighborsClassifier()

# Mean imputation followed by the KNN classifier.
mean_impute_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('classify', knn),
]
mean_impute = Pipeline(mean_impute_steps)

# Features are every column except the target.
y = pima['onset_diabetes']
X = pima.drop(columns='onset_diabetes')

grid = GridSearchCV(mean_impute, knn_params)
grid.fit(X, y)

print(grid.best_score_, grid.best_params_)
0.7305407011289364 {'classify__n_neighbors': 7}
from sklearn.pipeline import Pipeline

# Same search space as for the mean-imputation pipeline; keys carry the
# 'classify' step prefix so GridSearchCV can route them to the KNN step.
neighbor_counts = [1, 2, 3, 4, 5, 6, 7]
knn_params = {'classify__n_neighbors': neighbor_counts}

knn = KNeighborsClassifier()

# Median imputation followed by the KNN classifier.
median_impute = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('classify', knn),
])

# Split the frame into the feature matrix and the target column.
X = pima.drop(columns=['onset_diabetes'])
y = pima['onset_diabetes']

grid = GridSearchCV(median_impute, knn_params)
grid.fit(X, y)

print(grid.best_score_, grid.best_params_)
0.7292589763177999 {'classify__n_neighbors': 7}

标准化与归一化

impute = SimpleImputer()

pima_imputed_mean = pd.DataFrame(impute.fit_transform(pima),columns=pima_column_names)
pima_imputed_mean.hist(figsize=(15,15));

在这里插入图片描述

pima_imputed_mean.hist(figsize = (15,15),sharex=True);

在这里插入图片描述

Z_score

from sklearn.preprocessing import StandardScaler

scale = StandardScaler()
pima_imputed__mean_scaled = pd.DataFrame(scale.fit_transform(pima_imputed_mean),columns=pima_column_names)
pima_imputed__mean_scaled.hist(figsize=(15,15),sharex=True);

在这里插入图片描述

# Add z-score standardization to the Pipeline.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# The parameter grid has to be redefined to fit this pipeline: every key is
# "<step name>__<parameter name>". Both the imputation strategy and the
# number of neighbours are searched.
knn_params = {
    'imputer__strategy': ['mean', 'median'],
    'classify__n_neighbors': [1, 2, 3, 4, 5, 6, 7],
}

# Impute, then standardize, then classify. `knn` is the KNeighborsClassifier
# instance created in an earlier cell.
mean_impute_standardize = Pipeline([
    ('imputer', SimpleImputer()),
    ('standardize', StandardScaler()),
    ('classify', knn),
])

# Features are every column except the target.
X = pima.drop('onset_diabetes', axis=1)
y = pima['onset_diabetes']

grid = GridSearchCV(mean_impute_standardize, knn_params)
grid.fit(X, y)

print(grid.best_score_, grid.best_params_)
0.7539173245055598 {'classify__n_neighbors': 7, 'imputer__strategy': 'mean'}
# Search space: imputation strategy and neighbour count, each key prefixed
# with the name of the pipeline step it belongs to.
knn_params = {
    'imputer__strategy': ['mean', 'median'],
    'classify__n_neighbors': list(range(1, 8)),
}

# Impute -> standardize -> classify, reusing the existing `knn` estimator.
standardize_steps = [
    ('imputer', SimpleImputer()),
    ('standardize', StandardScaler()),
    ('classify', knn),
]
mean_impute_standardize = Pipeline(standardize_steps)

# Target column and the remaining feature columns.
y = pima['onset_diabetes']
X = pima.drop(columns='onset_diabetes')

grid = GridSearchCV(mean_impute_standardize, knn_params)
grid.fit(X, y)

print(grid.best_score_, grid.best_params_)
0.7539173245055598 {'classify__n_neighbors': 7, 'imputer__strategy': 'mean'}

基于深度学习的计算机视觉:原理与实践(上部)

02-19
本课程适合具有一定深度学习基础,希望发展为深度学习之计算机视觉方向的算法工程师和研发人员的同学们。 基于深度学习的计算机视觉是目前人工智能最活跃的领域,应用非常广泛,如人脸识别和无人驾驶中的机器视觉等。该领域的发展日新月异,网络模型和算法层出不穷。如何快速入门并达到可以从事研发的高度对新手和中级水平的学生而言面临不少的挑战。精心准备的本课程希望帮助大家尽快掌握基于深度学习的计算机视觉的基本原理、核心算法和当前的领先技术,从而有望成为深度学习之计算机视觉方向的算法工程师和研发人员。 本课程系统全面地讲述基于深度学习的计算机视觉技术的原理并进行项目实践。课程涵盖计算机视觉的七大任务,包括图像分类、目标检测、图像分割(语义分割、实例分割、全景分割)、人脸识别、图像描述、图像检索、图像生成(利用生成对抗网络)。本课程注重原理和实践相结合,逐篇深入解读经典和前沿论文70余篇,图文并茂破译算法难点, 使用思维导图梳理技术要点。项目实践使用Keras框架(后端为Tensorflow),学员可快速上手。 通过本课程的学习,学员可把握基于深度学习的计算机视觉的技术发展脉络,掌握相关技术原理和算法,有助于开展该领域的研究与开发实战工作。另外,深度学习之计算机视觉方向的知识结构及学习建议请参见本人CSDN博客。 本课程提供课程资料的课件PPT(pdf格式)和项目实践代码,方便学员学习和复习。 本课程分为上下两部分,其中上部包含课程的前五章(课程介绍、深度学习基础、图像分类、目标检测、图像分割),下部包含课程的后四章(人脸识别、图像描述、图像检索、图像生成)。

智能家居与物联网 从入门到精通 HomeAssistant实战视频操作手册

06-05
©️2020 CSDN 皮肤主题: 大白 设计师: CSDN官方博客 返回首页
实付0元
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、C币套餐、付费专栏及课程。

余额充值