2021-05-26

最新推荐文章于 2024-08-14 00:01:40 发布

要做一个小太阳

最新推荐文章于 2024-08-14 00:01:40 发布

阅读量178

点赞数 1

分类专栏：算法学习文章标签：机器学习

本文链接：https://blog.csdn.net/weixin_44544263/article/details/117304613

版权

算法学习专栏收录该内容

2 篇文章 0 订阅

订阅专栏

机器学习中调参的两种方法：学习曲线，网格搜索

%matplotlib inline

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.datasets import load_wine


# Load the wine dataset that ships with scikit-learn.
wine = load_wine()

# Notebook echo: feature matrix.
wine.data

# Notebook echo: class labels (the pasted output shows the classes 0, 1 and 2).
wine.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2])

from sklearn.model_selection import train_test_split

# Hold out 30% of the wine samples for evaluation.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    wine.data, wine.target, test_size=0.3
)

# Build and fit each model in a single chained expression; the original
# reassigned the result of fit() to the same name, so this is equivalent.
clf = DecisionTreeClassifier(random_state=0).fit(Xtrain, Ytrain)
rfc = RandomForestClassifier(random_state=0).fit(Xtrain, Ytrain)

# Accuracy of each model on the held-out split.
score_c = clf.score(Xtest, Ytest)
score_r = rfc.score(Xtest, Ytest)

print(
    "Single Tree:{}".format(score_c),
    "Random Forest:{}".format(score_r),
)

Single Tree:0.9444444444444444 Random Forest:0.9814814814814815

from sklearn.model_selection import cross_val_score

import matplotlib.pyplot as plt

# Compare a 25-tree random forest with a single decision tree using one round
# of 10-fold cross-validation; each *_s array holds one score per fold.
rfc = RandomForestClassifier(n_estimators=25)

rfc_s = cross_val_score(rfc,wine.data,wine.target,cv=10)

clf = DecisionTreeClassifier()

clf_s = cross_val_score(clf,wine.data,wine.target,cv=10)

# x axis = fold number (1..10), y axis = score on that fold.
plt.plot(range(1,11),rfc_s,label = "RandomForest")

plt.plot(range(1,11),clf_s,label = "Decision Tree")

plt.legend()

plt.show()

在这里插入图片描述

# Repeat 10-fold cross-validation ten times and record the mean score of each
# run, once for a 25-tree random forest and once for a single decision tree.
rfc_l = []
clf_l = []

for i in range(10):
    # The loop body must be indented; in the pasted source the indentation was
    # lost, which makes the cell fail with an IndentationError.
    rfc = RandomForestClassifier(n_estimators=25)
    rfc_s = cross_val_score(rfc, wine.data, wine.target, cv=10).mean()
    rfc_l.append(rfc_s)

    clf = DecisionTreeClassifier()
    clf_s = cross_val_score(clf, wine.data, wine.target, cv=10).mean()
    clf_l.append(clf_s)

# x axis = run number (1..10), y axis = mean cross-validated score of that run.
plt.plot(range(1, 11), rfc_l, label="Random Forest")
plt.plot(range(1, 11), clf_l, label="Decision Tree")
plt.legend()
plt.show()

# Author's observation: the single tree's curve tends to move together with
# the random forest's curve.
# The author reads this as support for the earlier point that the more accurate
# the individual trees are, the more accurate the forest is.

在这里插入图片描述

# Learning curve over n_estimators: mean 10-fold CV score for forests with
# 1..200 trees. superpa[i] holds the score for n_estimators = i + 1.
superpa = []

for i in range(200):

    rfc =  RandomForestClassifier(n_estimators=i+1,n_jobs = -1)

    rfc_s = cross_val_score(rfc,wine.data,wine.target,cv=10).mean()

    superpa.append(rfc_s)

# Prints the best score and its list index. Because superpa[i] was built with
# n_estimators = i + 1, the matching n_estimators is the printed index plus one.
print(max(superpa),superpa.index(max(superpa)))

plt.figure(figsize=[20,5])

# x axis = number of trees (1..200).
plt.plot(range(1,201),superpa)

plt.show()

0.9888888888888889 40

在这里插入图片描述

from sklearn.datasets import load_boston

from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestRegressor

from sklearn import metrics

# Cross-validate a random forest regressor on the Boston housing data.
boston = load_boston()

regressor = RandomForestRegressor(n_estimators=100, random_state=0)

# 10-fold CV scored with negated mean squared error (the scorer name says
# "neg", so values closer to zero are better).
cross_val_score(regressor, boston.data, boston.target, cv=10,
                scoring="neg_mean_squared_error")

# List the available scorer names. Only the `metrics` submodule was imported
# (`from sklearn import metrics`), so the bare name `sklearn` is undefined and
# `sklearn.metrics...` raised NameError; go through `metrics` instead.
# NOTE(review): newer scikit-learn releases expose metrics.get_scorer_names()
# for this purpose — confirm against the installed version.
sorted(metrics.SCORERS.keys())

NameError Traceback (most recent call last)
in
7 cross_val_score(regressor, boston.data, boston.target, cv=10
8 ,scoring = “neg_mean_squared_error”)
----> 9sorted(sklearn.metrics.SCORERS.keys())

NameError: name ‘sklearn’ is not defined

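待解决 → 原因：前面只执行了 `from sklearn import metrics`，并没有导入名为 `sklearn` 的顶层模块，所以 `sklearn.metrics` 会报 NameError。改成 `sorted(metrics.SCORERS.keys())`（或者先执行 `import sklearn`）即可。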

import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

from sklearn.datasets import load_boston

from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import cross_val_score
# Load the Boston housing data and echo the shape of its feature matrix.
dataset = load_boston()
dataset.data.shape

(506, 13)

# 506 * 13 = 6578 values in total.

X_full, y_full = dataset.data, dataset.target
n_samples = X_full.shape[0]       # 506
n_features = X_full.shape[1]       # 13

# Decide what fraction of the values to blank out. Here it is 50%, i.e.
# 3289 missing values in total.

rng = np.random.RandomState(0)
missing_rate = 0.5
n_missing_samples = int(np.floor(n_samples * n_features * missing_rate))

# np.floor rounds down and returns a float (x.0), hence the int() cast above.
n_missing_samples

3289

# The missing entries should be scattered at random over all rows and columns,
# and each missing entry needs one row index and one column index. So we draw
# 3289 row indices and 3289 column indices and use the (row, column) pairs to
# blank out positions in the data. Afterwards the gaps are filled with 0, with
# the column mean, and with a random forest, and the regression results are
# compared.

missing_features = rng.randint(0,n_features,n_missing_samples)
missing_samples = rng.randint(0,n_samples,n_missing_samples)
# randint(low, high, n) draws n integers from low (inclusive) up to high
# (exclusive), so the column indices are 0..12 and the row indices 0..505.
missing_features

array([12, 7, 8, …, 4, 11, 10])

len(missing_features)

3289

missing_samples

array([202, 481, 86, …, 293, 84, 63])

len(missing_samples)

3289

# Alternative draw, left disabled:
#missing_samples = rng.choice(dataset.data.shape[0] ,n_missing_samples,replace=False)

# We draw 3289 indices, far more than the 506 samples, so randint (which
# allows repeats) is used. If fewer indices than samples were needed, choice
# with replace=False could be used instead: it draws without repetition, which
# spreads the blanks out so they do not pile up in a few rows.

# Work on copies so the complete data stays available for comparison.
X_missing = X_full.copy()
y_missing = y_full.copy()
X_missing

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, …, 1.5300e+01, 3.9690e+02,
4.9800e+00],
[2.7310e-02, 0.0000e+00, 7.0700e+00, …, 1.7800e+01, 3.9690e+02,
9.1400e+00],
[2.7290e-02, 0.0000e+00, 7.0700e+00, …, 1.7800e+01, 3.9283e+02,
4.0300e+00],
…,
[6.0760e-02, 0.0000e+00, 1.1930e+01, …, 2.1000e+01, 3.9690e+02,
5.6400e+00],
[1.0959e-01, 0.0000e+00, 1.1930e+01, …, 2.1000e+01, 3.9345e+02,
6.4800e+00],
[4.7410e-02, 0.0000e+00, 1.1930e+01, …, 2.1000e+01, 3.9690e+02,
7.8800e+00]]

# Blank out the drawn (row, column) positions. The index pairs were drawn with
# repeats allowed, so some positions can be hit more than once.
X_missing[missing_samples,missing_features] = np.nan

X_missing = pd.DataFrame(X_missing)

# Converting to a DataFrame makes the later steps easier: numpy is very fast at
# matrix arithmetic, but pandas is more convenient for indexing and the like.

#使用均值进行填补

0 1 2 3 4 5 6 7 8 9 10 11 12
0 NaN NaN NaN NaN 0.538 6.575 65.2 4.0900 NaN NaN 15.3 396.90 4.98
1 0.02731 0.0 NaN 0.0 NaN NaN 78.9 4.9671 NaN 242.0 17.8 NaN NaN
2 0.02729 NaN NaN 0.0 0.469 NaN 61.1 4.9671 NaN NaN NaN 392.83 NaN
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 NaN NaN 222.0 18.7 NaN 2.94
4 NaN NaN NaN 0.0 0.458 NaN 54.2 NaN NaN NaN NaN 396.90 NaN
… … … … … … … … … … … … … …
501 NaN 0.0 NaN 0.0 NaN 6.593 69.1 NaN 1.0 273.0 NaN NaN 9.67
502 0.04527 0.0 NaN NaN 0.573 6.120 NaN NaN 1.0 NaN NaN 396.90 NaN
503 0.06076 NaN 11.93 0.0 0.573 6.976 91.0 2.1675 1.0 273.0 NaN 396.90 5.64
504 0.10959 0.0 NaN NaN 0.573 NaN 89.3 2.3889 NaN 273.0 21.0 393.45 6.48
505 0.04741 0.0 11.93 0.0 0.573 NaN 80.8 2.5050 1.0 273.0 21.0 396.90 7.88
506 rows × 13 columns

from sklearn.impute import SimpleImputer      
imp_mean = SimpleImputer(missing_values=np.nan , strategy='mean')   # instantiate a mean imputer
# fit and transform in a single step via fit_transform
X_missing_mean = imp_mean.fit_transform(X_missing)

使用Shift+Tab来显示小框框
判断现在导出的数据中是否还有缺失值

isnull()只能用于在DataFrame中判断

array([[3.51659772e+00, 1.07171053e+01, 1.10618627e+01, …,
1.53000000e+01, 3.96900000e+02, 4.98000000e+00],
[2.73100000e-02, 0.00000000e+00, 1.10618627e+01, …,
1.78000000e+01, 3.57910063e+02, 1.31145847e+01],
[2.72900000e-02, 1.07171053e+01, 1.10618627e+01, …,
1.84161290e+01, 3.92830000e+02, 1.31145847e+01],
…,
[6.07600000e-02, 1.07171053e+01, 1.19300000e+01, …,
1.84161290e+01, 3.96900000e+02, 5.64000000e+00],
[1.09590000e-01, 0.00000000e+00, 1.10618627e+01, …,
2.10000000e+01, 3.93450000e+02, 6.48000000e+00],
[4.74100000e-02, 0.00000000e+00, 1.19300000e+01, …,
2.10000000e+01, 3.96900000e+02, 7.88000000e+00]])

# isnull() is called on a DataFrame here, so wrap the imputed result first.
pd.DataFrame(X_missing_mean).isnull()
# Count the remaining NaNs per column; the pasted output shows 0 for every column.
pd.DataFrame(X_missing_mean).isnull().sum()

0 0
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 0
10 0
11 0
12 0
dtype: int64

# Fill the missing values with the constant 0.

imp_0 = SimpleImputer(missing_values=np.nan , strategy="constant",fill_value=0)
X_missing_0 = imp_0.fit_transform(X_missing)
pd.DataFrame(X_missing_0)

0 1 2 3 4 5 6 7 8 9 10 11 12
0 0.00000 0.0 0.00 0.0 0.538 6.575 65.2 4.0900 0.0 0.0 15.3 396.90 4.98
1 0.02731 0.0 0.00 0.0 0.000 0.000 78.9 4.9671 0.0 242.0 17.8 0.00 0.00
2 0.02729 0.0 0.00 0.0 0.469 0.000 61.1 4.9671 0.0 0.0 0.0 392.83 0.00
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 0.0000 0.0 222.0 18.7 0.00 2.94
4 0.00000 0.0 0.00 0.0 0.458 0.000 54.2 0.0000 0.0 0.0 0.0 396.90 0.00
… … … … … … … … … … … … … …
501 0.00000 0.0 0.00 0.0 0.000 6.593 69.1 0.0000 1.0 273.0 0.0 0.00 9.67
502 0.04527 0.0 0.00 0.0 0.573 6.120 0.0 0.0000 1.0 0.0 0.0 396.90 0.00
503 0.06076 0.0 11.93 0.0 0.573 6.976 91.0 2.1675 1.0 273.0 0.0 396.90 5.64
504 0.10959 0.0 0.00 0.0 0.573 0.000 89.3 2.3889 0.0 273.0 21.0 393.45 6.48
505 0.04741 0.0 11.93 0.0 0.573 0.000 80.8 2.5050 1.0 273.0 21.0 396.90 7.88
506 rows × 13 columns

array([[0.0000e+00, 0.0000e+00, 0.0000e+00, …, 1.5300e+01, 3.9690e+02,
4.9800e+00],
[2.7310e-02, 0.0000e+00, 0.0000e+00, …, 1.7800e+01, 0.0000e+00,
0.0000e+00],
[2.7290e-02, 0.0000e+00, 0.0000e+00, …, 0.0000e+00, 3.9283e+02,
0.0000e+00],
…,
[6.0760e-02, 0.0000e+00, 1.1930e+01, …, 0.0000e+00, 3.9690e+02,
5.6400e+00],
[1.0959e-01, 0.0000e+00, 0.0000e+00, …, 2.1000e+01, 3.9345e+02,
6.4800e+00],
[4.7410e-02, 0.0000e+00, 1.1930e+01, …, 2.1000e+01, 3.9690e+02,
7.8800e+00]])

使用随机森林填补缺失值

先分析填补某一列的过程，先填补缺失值少的列，再填补缺失值多的列

# Find the order of the features from fewest to most missing values; what we
# actually want is their column indices.

X_missing_reg = X_missing.copy()
# isnull().sum(axis=0) counts the NaNs per column; argsort returns the column
# positions ordered from fewest to most missing.
sortindex = np.argsort(X_missing_reg.isnull().sum(axis=0)).values
X_missing_reg.isnull()
X_missing_reg.isnull().sum(axis=0)

0 199
1 202
2 200
3 192
4 186
5 203
6 182
7 201
8 199
9 207
10 196
11 190
12 205
dtype: int64

np.sort(X_missing_reg.isnull().sum(axis=0))

#不包含索引

array([182, 186, 190, 192, 196, 199, 199, 200, 201, 202, 203, 205, 207],
dtype=int64)

np.argsort(X_missing_reg.isnull().sum(axis=0))

#返回按缺失值数量从小到大排序后对应的列索引：索引为6的列缺失值最少，其次是索引为4的列…

0 6
1 4
2 11
3 3
4 10
5 0
6 8
7 2
8 7
9 1
10 5
11 12
12 9
dtype: int64

下面展示一些 内联代码片。

# Start again from a fresh copy of the data with missing values.
X_missing_reg = X_missing.copy()

# Column positions ordered from fewest to most missing values.

sortindex = np.argsort(X_missing_reg.isnull().sum(axis=0)).values

# Notebook echo of the same ordering.
np.argsort(X_missing_reg.isnull().sum(axis=0)).values

array([ 6, 4, 11, 3, 10, 0, 8, 2, 7, 1, 5, 12, 9], dtype=int64)

# Build the new feature matrix (the columns not selected for filling) and the
# new label (the column being filled).
df = X_missing_reg
df

# New label. In general this is fillc = df.iloc[:,i]; column 6 is used here
# because it comes first in sortindex, i.e. it has the fewest missing values.
fillc = df.iloc[:,6]
fillc

0 65.2
1 78.9
2 61.1
3 45.8
4 54.2
…
501 69.1
502 NaN
503 91.0
504 89.3
505 80.8
Name: 6, Length: 506, dtype: float64

#新特征矩阵
df.iloc[:,df.columns != 6]
0 1 2 3 4 5 7 8 9 10 11 12 0 0
0 NaN NaN NaN NaN 0.538 6.575 4.0900 NaN NaN 15.3 396.90 4.98 24.0 24.0
1 0.02731 0.0 NaN 0.0 NaN NaN 4.9671 NaN 242.0 17.8 NaN NaN 21.6 21.6
2 0.02729 NaN NaN 0.0 0.469 NaN 4.9671 NaN NaN NaN 392.83 NaN 34.7 34.7
3 0.03237 0.0 2.18 0.0 0.458 6.998 NaN NaN 222.0 18.7 NaN 2.94 33.4 33.4
4 NaN NaN NaN 0.0 0.458 NaN NaN NaN NaN NaN 396.90 NaN 36.2 36.2
… … … … … … … … … … … … … … …
501 NaN 0.0 NaN 0.0 NaN 6.593 NaN 1.0 273.0 NaN NaN 9.67 22.4 22.4
502 0.04527 0.0 NaN NaN 0.573 6.120 NaN 1.0 NaN NaN 396.90 NaN 20.6 20.6
503 0.06076 NaN 11.93 0.0 0.573 6.976 2.1675 1.0 273.0 NaN 396.90 5.64 23.9 23.9
504 0.10959 0.0 NaN NaN 0.573 NaN 2.3889 NaN 273.0 21.0 393.45 6.48 22.0 22.0
505 0.04741 0.0 11.93 0.0 0.573 NaN 2.5050 1.0 273.0 21.0 396.90 7.88 11.9 11.9
506 rows × 14 columns

pd.DataFrame(y_full)
0
0 24.0
1 21.6
2 34.7
3 33.4
4 36.2
… …
501 22.4
502 20.6
503 23.9
504 22.0
505 11.9
506 rows × 1 columns
#新特征矩阵 (结果的显示有点不太懂)

# Feature matrix for imputing column 6: every other feature column plus the
# original regression target as one extra column.
# It is built from X_missing_reg rather than from df itself, so the cell is
# idempotent: the original `df = pd.concat([df..., y_full])` appended another
# copy of y_full each time the cell was re-run.
df = pd.concat(
    [X_missing_reg.iloc[:, X_missing_reg.columns != 6], pd.DataFrame(y_full)],
    axis=1,
)
df

( 0 1 2 3 4 5 7 8 9 10 11 \

0 NaN NaN NaN NaN 0.538 6.575 4.0900 NaN NaN 15.3 396.90
1 0.02731 0.0 NaN 0.0 NaN NaN 4.9671 NaN 242.0 17.8 NaN
2 0.02729 NaN NaN 0.0 0.469 NaN 4.9671 NaN NaN NaN 392.83
3 0.03237 0.0 2.18 0.0 0.458 6.998 NaN NaN 222.0 18.7 NaN
4 NaN NaN NaN 0.0 0.458 NaN NaN NaN NaN NaN 396.90
… … … … … … … … … … … …
501 NaN 0.0 NaN 0.0 NaN 6.593 NaN 1.0 273.0 NaN NaN
502 0.04527 0.0 NaN NaN 0.573 6.120 NaN 1.0 NaN NaN 396.90
503 0.06076 NaN 11.93 0.0 0.573 6.976 2.1675 1.0 273.0 NaN 396.90
504 0.10959 0.0 NaN NaN 0.573 NaN 2.3889 NaN 273.0 21.0 393.45
505 0.04741 0.0 11.93 0.0 0.573 NaN 2.5050 1.0 273.0 21.0 396.90

0 4.98
1 NaN
2 NaN
3 2.94
4 NaN
… …
501 9.67
502 NaN
503 5.64
504 6.48
505 7.88

[506 rows x 12 columns],
0
0 24.0
1 21.6
2 34.7
3 33.4
4 36.2
… …
501 22.4
502 20.6
503 23.9
504 22.0
505 11.9

[506 rows x 1 columns])

# Feature matrix for imputing column 6: every other feature column plus the
# original regression target. Building it from X_missing_reg (not from df)
# keeps the statement idempotent, so re-running it cannot stack additional
# copies of y_full onto df.
df = pd.concat(
    [X_missing_reg.iloc[:, X_missing_reg.columns != 6], pd.DataFrame(y_full)],
    axis=1,
)

由于我把上面这个单元格运行了多次，所以下面的结果里重复追加了好几列 y_full（共 17 列）。

0 1 2 3 4 5 7 8 9 10 11 12 0 0 0 0 0
0 NaN NaN NaN NaN 0.538 6.575 4.0900 NaN NaN 15.3 396.90 4.98 24.0 24.0 24.0 24.0 24.0
1 0.02731 0.0 NaN 0.0 NaN NaN 4.9671 NaN 242.0 17.8 NaN NaN 21.6 21.6 21.6 21.6 21.6
2 0.02729 NaN NaN 0.0 0.469 NaN 4.9671 NaN NaN NaN 392.83 NaN 34.7 34.7 34.7 34.7 34.7
3 0.03237 0.0 2.18 0.0 0.458 6.998 NaN NaN 222.0 18.7 NaN 2.94 33.4 33.4 33.4 33.4 33.4
4 NaN NaN NaN 0.0 0.458 NaN NaN NaN NaN NaN 396.90 NaN 36.2 36.2 36.2 36.2 36.2
… … … … … … … … … … … … … … … … … …
501 NaN 0.0 NaN 0.0 NaN 6.593 NaN 1.0 273.0 NaN NaN 9.67 22.4 22.4 22.4 22.4 22.4
502 0.04527 0.0 NaN NaN 0.573 6.120 NaN 1.0 NaN NaN 396.90 NaN 20.6 20.6 20.6 20.6 20.6
503 0.06076 NaN 11.93 0.0 0.573 6.976 2.1675 1.0 273.0 NaN 396.90 5.64 23.9 23.9 23.9 23.9 23.9
504 0.10959 0.0 NaN NaN 0.573 NaN 2.3889 NaN 273.0 21.0 393.45 6.48 22.0 22.0 22.0 22.0 22.0
505 0.04741 0.0 11.93 0.0 0.573 NaN 2.5050 1.0 273.0 21.0 396.90 7.88 11.9 11.9 11.9 11.9 11.9
506 rows × 17 columns

# In the new feature matrix, fill the columns that still contain missing
# values with 0. The strategy string must use plain ASCII quotes; the
# typographic quotes in the pasted source are a SyntaxError.
df_0 = SimpleImputer(
    missing_values=np.nan, strategy='constant', fill_value=0
).fit_transform(df)

# Split into training and test parts.

# Training labels: the values that are present (non-null) in the feature being
# filled, which now plays the role of the label.

Ytrain = fillc[fillc.notnull()]

fillc[fillc.notnull()] 取出该列中所有非空的值（保留原来的行索引）

# Test "labels": the entries that are missing (null) in the feature being
# filled, which now plays the role of the label.

# What we need from Ytest is not its values (all NaN) but its index.

Ytest = fillc[fillc.isnull()]

fillc[fillc.isnull()]

5 NaN
7 NaN
9 NaN
12 NaN
13 NaN
…
485 NaN
486 NaN
494 NaN
499 NaN
502 NaN
Name: 6, Length: 182, dtype: float64

# Rows of the zero-filled feature matrix where the feature being filled has a
# value; these are the training records.

Xtrain = df_0[Ytrain.index,:]
Xtrain

array([[0.0000e+00, 0.0000e+00, 0.0000e+00, …, 2.4000e+01, 2.4000e+01,
2.4000e+01],
[2.7310e-02, 0.0000e+00, 0.0000e+00, …, 2.1600e+01, 2.1600e+01,
2.1600e+01],
[2.7290e-02, 0.0000e+00, 0.0000e+00, …, 3.4700e+01, 3.4700e+01,
3.4700e+01],
…,
[6.0760e-02, 0.0000e+00, 1.1930e+01, …, 2.3900e+01, 2.3900e+01,
2.3900e+01],
[1.0959e-01, 0.0000e+00, 0.0000e+00, …, 2.2000e+01, 2.2000e+01,
2.2000e+01],
[4.7410e-02, 0.0000e+00, 1.1930e+01, …, 1.1900e+01, 1.1900e+01,
1.1900e+01]])

# Rows of the zero-filled feature matrix where the feature being filled is
# missing; these are the records to predict.

Xtest = df_0[Ytest.index,:]
Xtest

array([[ 0.02985, 0. , 0. , …, 28.7 , 28.7 , 28.7 ],
[ 0. , 12.5 , 7.87 , …, 27.1 , 27.1 , 27.1 ],
[ 0.17004, 0. , 0. , …, 18.9 , 18.9 , 18.9 ],
…,
[ 0.27957, 0. , 0. , …, 24.5 , 24.5 , 24.5 ],
[ 0.17783, 0. , 9.69 , …, 17.5 , 17.5 , 17.5 ],
[ 0.04527, 0. , 0. , …, 20.6 , 20.6 , 20.6 ]])

# Fill the missing values with a random forest regressor.
# Instantiate (the variable is named rfc but it holds a regressor).

rfc = RandomForestRegressor(n_estimators=100)

# Train on the rows where the feature being filled is present.

rfc = rfc.fit(Xtrain, Ytrain)

# Predict for Xtest; these regression outputs are the values used to fill the
# gaps.

Ypredict = rfc.predict(Xtest)

# Notebook echo of the predictions before they are written back into the
# original feature matrix.

Ypredict

array([25.816, 65.734, 67.704, 59.335, 67.757, 81.509, 59.68 , 52.392,
82.556, 66.381, 94.135, 89.856, 85.564, 65.785, 57.662, 27.282,
45.332, 40.672, 52.756, 60.738, 75.71 , 92.172, 67.2 , 53.854,
25.74 , 68.38 , 36.538, 28.302, 32.727, 43.181, 55.09 , 42.196,
49.888, 53.926, 54.878, 75.356, 37.211, 73.915, 49.905, 80.261,
80.507, 65.152, 80.431, 86.287, 78.683, 90.905, 92.648, 96.239,
91.625, 94.563, 91.709, 92.391, 96.791, 87.561, 89.883, 91.073,
94.49 , 86.094, 90.565, 90.765, 90.636, 90.477, 74.751, 55.795,
46.413, 59.239, 77.423, 76.826, 50.514, 62.129, 72.62 , 32.398,
28.316, 31.745, 38.722, 24.945, 40.147, 57.422, 80.461, 59.151,
64.973, 64.145, 74.994, 69.307, 68.001, 76.978, 71.915, 71.784,
68.131, 66.585, 37.956, 30.472, 43.277, 87.923, 91.283, 80.45 ,
77.974, 79.786, 57.801, 43.598, 48.014, 35.684, 36.018, 43.535,
29.454, 32.434, 32.788, 31.564, 47.261, 67.499, 85.409, 77.284,
63.935, 86.057, 92.491, 65.98 , 54.078, 39.873, 45.659, 78.834,
43.365, 61.017, 49.056, 73.053, 49.356, 64.595, 68.946, 78.956,
29.908, 75.519, 27.552, 58.749, 48.141, 84.523, 70.333, 89.979,
83.158, 94.936, 97.04 , 94.679, 96.834, 96.613, 91.614, 86.302,
98.624, 83.868, 88.707, 97.302, 81.694, 81.515, 83.338, 95.609,
89.443, 90.194, 92.352, 90.102, 81.558, 88.271, 86.195, 89.367,
86.594, 94.072, 92.759, 88.925, 89.937, 78.482, 83.861, 73.924,
75.69 , 89.502, 86.522, 95.819, 87.822, 67.971, 69.088, 57.488,
71.875, 73.227, 80.525, 75.557, 79.239, 66.24 ]

len(Ypredict)

182

#将填补好的特征返回到我们的原始特征矩阵中

X_missing_reg.iloc[:,6]

0 65.2
1 78.9
2 61.1
3 45.8
4 54.2
…
501 69.1
502 NaN
503 91.0
504 89.3
505 80.8
Name: 6, Length: 506, dtype: float64

# Write the predictions into the rows of column 6 that are still null.
X_missing_reg.loc[X_missing_reg.iloc[:,6].isnull(),6] = Ypredict

X_missing_reg

# Per-column NaN count; the pasted output shows column 6 at 0 after the fill.
X_missing_reg.isnull().sum()

0 199
1 202
2 200
3 192
4 186
5 203
6 0
7 201
8 199
9 207
10 196
11 190
12 205
dtype: int64

一次性替换所有的缺失值

# Fill every column's missing values with a random forest regressor, one
# column at a time, starting with the column that has the fewest gaps.
X_missing_reg = X_missing.copy()

# Column positions ordered from fewest to most missing values.
sortindex = np.argsort(X_missing_reg.isnull().sum(axis=0)).values

for i in sortindex:
    # New label: the column being filled. New feature matrix: every other
    # column plus the original regression target.
    df = X_missing_reg
    fillc = df.iloc[:, i]
    df = pd.concat([df.iloc[:, df.columns != i], pd.DataFrame(y_full)], axis=1)

    # Other columns may still contain NaN; fill those with 0 so the regressor
    # can be trained.
    df_0 = SimpleImputer(
        missing_values=np.nan, strategy='constant', fill_value=0
    ).fit_transform(df)

    # Training labels are the present values; for the missing ones only the
    # index matters, because it selects the rows to predict.
    Ytrain = fillc[fillc.notnull()]
    Ytest = fillc[fillc.isnull()]
    if Ytest.empty:
        # Nothing to fill in this column, so there are no rows to predict.
        continue

    Xtrain = df_0[Ytrain.index, :]
    Xtest = df_0[Ytest.index, :]

    # Fit and predict inside the loop. In the pasted source these statements
    # had lost their indentation and sat after the loop, so they would have
    # run only once, for the last column.
    rfc = RandomForestRegressor(n_estimators=100)
    rfc = rfc.fit(Xtrain, Ytrain)
    Ypredict = rfc.predict(Xtest)

    # Write the predictions into the rows of column i that are still null.
    X_missing_reg.loc[X_missing_reg.iloc[:, i].isnull(), i] = Ypredict

X_missing_reg

# Every column should now report 0 missing values.
X_missing_reg.isnull().sum()

0 0
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 0
10 0
11 0
12 0
dtype: int64

# Model each version of the data and collect its MSE (lower is better).
# Order: complete data, mean-imputed, zero-imputed, random-forest-imputed.
X = [X_full, X_missing_mean, X_missing_0, X_missing_reg]

mse = []
std = []  # never filled by this cell; kept because it is a module-level name

for x in X:
    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
    # The original passed the undefined name `estima`, which raises NameError.
    scores = cross_val_score(
        estimator, x, y_full, scoring='neg_mean_squared_error', cv=5
    ).mean()
    # The scorer returns negated MSE, so flip the sign back.
    mse.append(scores * -1)

mse

[21.571667100368845, 42.70030853331392, 50.764441255367885, 18.108193514463217]

# Pair each dataset name with its MSE. Unpack the zip inside the list
# ([*zip(...)]); a leading star outside the list is not a valid expression,
# and the names need plain ASCII quotes.
[*zip(["X_full", "X_missing_mean", "X_missing_0", "X_missing_reg"], mse)]

[(‘X_full’, 21.571667100368845),
(‘X_missing_mean’, 42.70030853331392),
(‘X_missing_0’, 50.764441255367885),
(‘X_missing_reg’, 18.108193514463217)]

下面展示一些 内联代码片。

# Horizontal bar chart of the MSE obtained with each imputation strategy.
# The labels must follow the order in which mse was filled: complete data,
# mean imputation, zero imputation, random-forest imputation. The original
# listed "Zero" before "Mean", which swapped the names on those two bars.
x_labels = ['Full data', 'Mean Imputation', 'Zero Imputation', ' Regressor Imputation']
colors = ['r', 'g', 'b', 'orange']

plt.figure(figsize=(12, 6))
ax = plt.subplot(111)

for i, value in enumerate(mse):
    ax.barh(i, value, color=colors[i], alpha=0.6, align='center')

ax.set_title('Imputation Techniques with Boston Data')
# Leave a 10% margin on both sides of the plotted range.
ax.set_xlim(left=np.min(mse) * 0.9, right=np.max(mse) * 1.1)
ax.set_yticks(np.arange(len(mse)))
ax.set_xlabel('MSE')
ax.set_yticklabels(x_labels)

plt.show()

在这里插入图片描述

要做一个小太阳

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
2021-05-26

机器学习中调参的两种方法：学习曲线，网格搜索%matplotlib inlinefrom sklearn.tree import DecisionTreeClassifierfrom sklearn.ensemble import RandomForestClassifierfrom sklearn.datasets import load_winewine = load_wine()wine.datawine.targetarray([0, 0, 0, 0, 0, 0,
复制链接

扫一扫