%matplotlib inline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_wine
# Load the bundled wine dataset (a sklearn Bunch: .data, .target, etc.).
wine = load_wine()
# Bare expression: displays the full Bunch as notebook cell output.
wine
[{"metadata":{"trusted":false,"scrolled":false},"id":"e7e71df6","cell_type":"code","source":"wine","execution_count":88,"outputs":[{"data":{"text/plain":"{'data': array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,\n 1.065e+03],\n [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,\n 1.050e+03],\n [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,\n 1.185e+03],\n ...,\n [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,\n 8.350e+02],\n [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,\n 8.400e+02],\n [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,\n 5.600e+02]]),\n 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,\n 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,\n 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n 2, 2]),\n 'frame': None,\n 'target_names': array(['class_0', 'class_1', 'class_2'], dtype='
# Shape of the feature matrix: (n_samples, n_features).
wine.data.shape
# notebook output: (178, 13)
# The class labels (integer-encoded: 0, 1, 2).
wine.target
# notebook output: array([0, 0, ..., 1, 1, ..., 2, 2]) — 178 labels in three classes
# Workflow: instantiate the estimator, fit it on the training split with
# .fit(), then evaluate on the held-out split via .score() (or predict on
# X_test and compare against Y_test).
from sklearn.model_selection import train_test_split

# Split features and labels into train/test sets (30% held out for testing).
Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine.data, wine.target, test_size=0.3)

# random_state pins the estimators' internal randomness for reproducibility.
clf = DecisionTreeClassifier(random_state=0)
rlf = RandomForestClassifier(random_state=0)

# Fit both models on the same training data.
clf = clf.fit(Xtrain, Ytrain)
rlf = rlf.fit(Xtrain, Ytrain)

# .score() returns mean accuracy on the test set.
score_c = clf.score(Xtest, Ytest)
score_r = rlf.score(Xtest, Ytest)

print("Single Tree:{}".format(score_c), "Random Forest:{}".format(score_r))
# notebook output: Single Tree:0.9814814814814815 Random Forest:1.0
# Cross-validation: one train/test split can bias the comparison, so we check
# model stability over several different splits.
# cross_val_score handles the splitting itself — it takes the full feature
# matrix and full label vector; cv is the number of folds.
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

rfc = RandomForestClassifier(n_estimators=25)
forest_scores = cross_val_score(rfc, wine.data, wine.target, cv=10)

clf = DecisionTreeClassifier()
tree_scores = cross_val_score(clf, wine.data, wine.target, cv=10)

# One accuracy per fold; plot both models over the 10 folds.
fold_ids = range(1, 11)
plt.plot(fold_ids, forest_scores, label="RandomForest")
plt.plot(fold_ids, tree_scores, label="DecisionTree")
plt.legend()  # display the legend
plt.show()
# Observation: in every fold the random forest's accuracy is >= the decision tree's.
# Sweep n_estimators from 1 to 200, recording the mean 10-fold CV accuracy for
# each setting, to find a good forest size.
# FIX: the loop body had lost its indentation in the pasted source; restored.
superpa = []
for i in range(200):
    # n_jobs=-1 uses all available CPU cores for fitting.
    rfc = RandomForestClassifier(n_estimators=i + 1, n_jobs=-1)
    rfc_s = cross_val_score(rfc, wine.data, wine.target, cv=10).mean()
    superpa.append(rfc_s)
# Highest mean accuracy, and the n_estimators value that achieved it
# (list.index gives the 0-based position; +1 converts back to n_estimators).
print(max(superpa), superpa.index(max(superpa)) + 1)
plt.figure(figsize=[20, 5])
plt.plot(range(1, 201), superpa)
plt.show()
# NOTE(review): the original comment claimed the best index was 63, but the
# recorded output below says 62 — the result varies run to run anyway since
# neither the forest nor the CV splitter is seeded here.
# notebook output: 0.9944444444444445 62
# 10 modelling rounds, each evaluated with 10-fold CV (100 CV fits in total),
# to compare the two models' average performance across repeated runs.
# FIXES: (1) the plot call referenced `r、fc_l` — a stray full-width comma
# inside the identifier (SyntaxError) — corrected to `rfc_l`;
# (2) the loop body had lost its indentation in the pasted source; restored.
rfc_l = []  # mean CV accuracy per round, random forest
clf_l = []  # mean CV accuracy per round, decision tree
for i in range(10):
    rfc = RandomForestClassifier(n_estimators=25)
    rfc_s = cross_val_score(rfc, wine.data, wine.target, cv=10).mean()
    rfc_l.append(rfc_s)
    clf = DecisionTreeClassifier()
    clf_s = cross_val_score(clf, wine.data, wine.target, cv=10).mean()
    clf_l.append(clf_s)
plt.plot(range(1, 11), rfc_l, label="RandomForest")
plt.plot(range(1, 11), clf_l, label="DecisionTree")
plt.legend()
plt.show()
# Observation: over repeated rounds the random forest consistently outperforms
# the single decision tree.