%matplotlib inline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_wine # 导入红酒数据集
import matplotlib.pyplot as plt
随机森林分类器
# Load the wine dataset and inspect sample/label dimensions.
wine = load_wine()
print(f"{wine.data.shape} {wine.target.shape}")
wine  # notebook cell: display the Bunch object
一、train_test_split划分下训练
- 训练集带入实例化后的模型去训练,使用的接口是fit
- 使用其他接口将测试集导入我们训练好的模型,去获取我们希望的结果(score,Y_est)
# Hold-out evaluation: 70% train / 30% test, then compare a single
# decision tree against a random forest on the same split.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    wine.data, wine.target, test_size=0.3
)
clf = DecisionTreeClassifier(random_state=0).fit(Xtrain, Ytrain)
rfc = RandomForestClassifier(random_state=0).fit(Xtrain, Ytrain)  # n_estimators defaults to 100
tree_score = clf.score(Xtest, Ytest)    # accuracy on the test split
forest_score = rfc.score(Xtest, Ytest)
print(tree_score, forest_score)
二、交叉验证下的训练
2.1、一次交叉验证 (10次划分)
# One round of 10-fold cross-validation for both models; plot the
# per-fold scores side by side.
rfc = RandomForestClassifier(n_estimators=25)
rfc_s = cross_val_score(rfc, wine.data, wine.target, cv=10)  # one score per fold
clf = DecisionTreeClassifier()
clf_s = cross_val_score(clf, wine.data, wine.target, cv=10)
plt.plot(range(1, 11), rfc_s, label="RandomForestClassifier")
plt.plot(range(1, 11), clf_s, label="DecisionTreeClassifier")  # fixed label typo ("Dcision")
plt.legend()
plt.show()
# ======================= Alternative version =========================== #
# NOTE: dead reference code kept as a triple-quoted string. The loop fits
# both models, prints each mean CV score, and plots the per-fold scores,
# reassigning `label` at the end of the first iteration so the second
# model is labelled "DecisionTree".
'''
label = "RandomForest"
for model in [RandomForestClassifier(n_estimators=25),DecisionTreeClassifier()]:
score = cross_val_score(model,wine.data,wine.target,cv=10)
print("{}:".format(label))
print(score.mean())
plt.plot(range(1,11),score,label=label)
plt.legend()
label = "DecisionTree"
'''
2.2、10次交叉验证 (10*10 100次划分)
# Ten repetitions of 10-fold cross-validation (100 fits per model);
# each repetition contributes the mean of its 10 fold scores.
# Loop-body indentation was lost in the notebook export — restored here.
rfc_1 = []
clf_1 = []
for i in range(10):
    rfc = RandomForestClassifier(n_estimators=25)
    rfc_s = cross_val_score(rfc, wine.data, wine.target, cv=10).mean()  # mean over the 10 folds
    rfc_1.append(rfc_s)
    clf = DecisionTreeClassifier()
    clf_s = cross_val_score(clf, wine.data, wine.target, cv=10).mean()
    clf_1.append(clf_s)
plt.plot(range(1, 11), rfc_1, label="Random Forest")
plt.plot(range(1, 11), clf_1, label="Decision Tree")  # fixed label typo ("Decisin")
plt.legend()
plt.show()
2.3、n_estimators的学习曲线
# Learning curve over n_estimators = 1..50: mean 10-fold CV score per size.
# Loop-body indentation was lost in the notebook export — restored here.
superpa = []
for i in range(50):
    rfc = RandomForestClassifier(n_estimators=i + 1, n_jobs=-1)  # n_jobs=-1: use all cores
    rfc_s = cross_val_score(rfc, wine.data, wine.target, cv=10).mean()
    superpa.append(rfc_s)
# Best mean score and the n_estimators (index + 1) that produced it.
print(max(superpa), superpa.index(max(superpa)) + 1)
plt.figure(figsize=[20, 5])
plt.plot(range(1, 51), superpa)
plt.show()
2.4、随机森林的重要属性、参数
- estimators_ 查看森林中树的状况
- bootstrap 有放回抽样技术 默认为True 63.2%的数据被使用
- oob_score 带外数据36.8% out of bag data 用带外数据测试,实例化时将oob_score调整为True
- oob_score_ 查看在带外数据上的测试结果
# Inspect the fitted forest's trees, then evaluate with out-of-bag samples.
rfc = RandomForestClassifier(n_estimators=25)
rfc = rfc.fit(Xtrain,Ytrain) # inspect attributes after fitting
rfc.estimators_  # list of the individual fitted decision trees
rfc.estimators_[0] # e.g. rfc.estimators_[0].random_state (fixed typo: random_sate)
rfc = RandomForestClassifier(n_estimators=25,oob_score=True)  # keep OOB samples for scoring
rfc = rfc.fit(wine.data,wine.target)
rfc.oob_score_  # score estimated on the out-of-bag samples
重要属性
- score
- feature_importances_
- apply
- predict
- predict_proba
# Re-fit a 25-tree forest on the hold-out split and score it on the test set.
rfc = RandomForestClassifier(n_estimators=25).fit(Xtrain, Ytrain)
rfc.score(Xtest, Ytest)  # notebook cell: display test accuracy
随机森林回归器
# Random-forest regression demo: 10-fold CV with negative MSE as the metric.
from sklearn.datasets import load_boston
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
# NOTE(review): load_boston was removed in scikit-learn 1.2 — this cell only
# runs on older versions; consider fetch_california_housing as a replacement.
boston = load_boston()
regressor = RandomForestRegressor(n_estimators=100,random_state=10)
# Result displayed by the notebook; scoring overrides the regressor default (R^2).
cross_val_score(regressor,boston.data,boston.target,cv=10
,scoring="neg_mean_squared_error") # scoring metric; default would be R^2