使用随机森林模型来训练和交叉验证数据集。
- 数据矩阵:allmatrix
- 标签:target
随机森林训练(Random Forest training)
初步训练(preliminary training)
>>> from sklearn.ensemble import RandomForestClassifier
>>> from sklearn.model_selection import train_test_split
>>> from sklearn.model_selection import cross_val_score
>>> from sklearn.metrics import confusion_matrix as CM
>>> from sklearn.metrics import accuracy_score as ACCS
>>> Xtrain,Xtest,Ytrain,Ytest = train_test_split(allmatrix,target,test_size=0.3,random_state=420)
>>> rfc = RandomForestClassifier(n_estimators=100,random_state=90,n_jobs = -1)
>>> rfc = rfc.fit(Xtrain,Ytrain)
>>> pred_rfc = rfc.predict(Xtest)
>>> score=ACCS(Ytest, pred_rfc)
>>> print("RFC 1st training test score: {}".format(score))
RFC 1st training test score: 0.8417841962046704
>>> print("Training data score: {}".format(rfc.score(Xtrain,Ytrain)))
Training data score: 0.999946644308091
>>> cm_rfc = CM(Ytest, pred_rfc)
>>> cm_rfc
array([[23340, 4507],
[ 4389, 23991]])
5 折交叉验证(cross_val_score,cv=5)
>>> score_val = cross_val_score(rfc,allmatrix,target,cv=5)
>>> print("RFC n_estimator=100 cv=5 corssvalidation score: {}".format(score_val))
RFC n_estimator=100 cv=5 corssvalidation score: [0.82998986 0.84630776 0.86231459 0.81141287 0.82240423]
>>> score_val.mean()
0.8344858607420459
将 n_estimators 增加到 200(注:创建并训练 rfc200 的步骤未记录在此处)
>>> score=rfc200.score(Xtest,Ytest)
>>> print("RFC n_estimator=200 training test score: {}".format(score))
RFC n_estimator=200 training test score: 0.8479022533658207
使用 criterion='entropy'
>>> rfc = RandomForestClassifier(n_estimators=100,criterion='entropy',random_state=90,n_jobs = -1)
>>> rfc = rfc.fit(Xtrain,Ytrain)
>>> rfc.score(Xtest,Ytest)
0.8427445888985718
>>> entr_val=cross_val_score(rfc,allmatrix,target,cv=5)
>>> entr_val
array([0.83004322, 0.84561413, 0.8643688 , 0.81197311, 0.81944296])
>>> entr_val.mean()
0.834288442607874
5 折交叉验证的平均准确率(0.8343)略低于使用默认 criterion 的模型(0.8345);单次测试集准确率(0.8427)略高于默认模型(0.8418),总体差异可以忽略
调整 max_depth
>>> scorel = []
>>> for i in range(2,22,2):
... rfc = RandomForestClassifier(n_estimators=130,
... n_jobs=-1,
... max_depth=i
... ,random_state=90)
... score = cross_val_score(rfc,allmatrix,target,cv=5).mean()
... scorel.append(score)
...
>>> print("max-depth optimization and its value")
max-depth optimization and its value
>>> print(max(scorel),((scorel.index(max(scorel))+1)*2))
0.834336482169649 20
- 影响不大:最佳 max_depth=20 时的交叉验证平均准确率约为 0.8343,未超过默认参数时的 0.8345
调整 max_features
now()
score2 = []
features = range(35, 71, 5)
>>> for i in features:
... rfc = RandomForestClassifier(n_estimators=100,n_jobs=-1,max_features=i,random_state=90)
... score = cross_val_score(rfc,allmatrix,target,cv=5).mean()
... score2.append(score)
>>> print(max(score2),(features[score2.index(max(score2))]))
0.8365133731312191 65
使用标准化后的数据
- 模型效果无明显提升:5 折交叉验证均值为 0.8346,与使用原始数据时的 0.8345 基本一致(基于决策树的模型对特征的单调缩放不敏感)
>>> rfc = RandomForestClassifier(n_estimators=100,random_state=90,n_jobs = -1)
>>> rfc = RandomForestClassifier(n_estimators=100,random_state=90,n_jobs = -1)
>>> Xstd_val=cross_val_score(rfc,X_std,target,cv=5)
>>> Xstd_val
array([0.83001654, 0.84582755, 0.86226123, 0.81221321, 0.8228044 ])
>>> Xstd_val.mean()
0.834624586313739
使用归一化的数据
- 模型效果无明显提升:5 折交叉验证均值为 0.8346,与使用原始数据时的 0.8345 基本一致
>>> from sklearn.preprocessing import MinMaxScaler
>>> scaler = MinMaxScaler()
>>> X_mm=scaler.fit_transform(allmatrix)
/homes/xiaohuizou/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py:595: DataConversionWarning: Data with input dtype int16 was converted to float64 by MinMaxScaler.
warnings.warn(msg, DataConversionWarning)
>>> Xstd_mm=cross_val_score(rfc,X_mm,target,cv=5)
>>> Xstd_mm
array([0.82974977, 0.84718813, 0.86228791, 0.81135951, 0.82240423])
>>> Xstd_mm.mean()
0.8345979111111592
结论
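在尝试了调整 n_estimators、criterion、max_depth、max_features 以及对数据做标准化和归一化之后,随机森林模型的 5 折交叉验证平均准确率始终在 83.4%–83.7% 之间(单次划分的测试集准确率约为 84%–85%),无法进一步提升。如果想要更好的表现,需要尝试其他模型。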