使用鸢尾花数据集和随机森林模型进行网格搜索调参实验
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets #自带数据集
from sklearn.model_selection import train_test_split,cross_val_score #划分数据 交叉验证
from sklearn.neighbors import KNeighborsClassifier #一个简单的模型,只有K一个参数,类似K-means
import matplotlib.pyplot as plt
# Grid-search the number of trees of a random forest on the iris dataset,
# then report the cross-validation scores and the held-out test performance.
iris = datasets.load_iris()  # load the iris dataset bundled with scikit-learn
train = iris.data  # the samples
target = iris.target  # the label belonging to each sample

# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    train, target, test_size=0.5, random_state=0)

# Base model to tune. It is kept under its own name so that every pass of the
# loop below wraps a plain forest rather than the previous GridSearchCV.
# NOTE: no random_state is set on the forest, so scores can vary between runs.
base_estimator = RandomForestClassifier(n_jobs=-1)

# Set the parameters by cross-validation
tuned_parameters = {
    'n_estimators': [50, 100, 200],
    # Further candidates that can be enabled:
    # 'criterion': ['gini', 'entropy'],
    # 'max_depth': [2, 5],
    # 'max_features': ['log2', 'sqrt', 'int'],
    # 'bootstrap': [True, False],
    # 'warm_start': [True, False],
}

scores = ['precision']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print("#########################################")

    # 5-fold cross-validated grid search, scored with the macro-averaged metric.
    clf = GridSearchCV(base_estimator, tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print("#########################################")
    print(clf.best_params_)
    print("#########################################")
    print("Grid scores on development set:")
    print()

    # cv_results_ is a dict. For every parameter combination it holds the mean
    # and the standard deviation of the validation score over the five folds
    # (the score is '<score>_macro' as configured above, not accuracy).
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        # Report the spread as +/- two standard deviations.
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    # Interpolate the dict into the message; passing it as a second print()
    # argument left a literal "%s" in the output.
    print("字典 %s" % (clf.cv_results_,))

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()

    y_true, y_pred = y_test, clf.predict(X_test)
    # Per-class precision / recall / F1 / support (not a confusion matrix).
    print(classification_report(y_true, y_pred))
    print()
网格搜索clf.cv_results_解读:
字典 %s {'mean_fit_time': array([0.31090269, 0.15191479, 0.25862093]), 'std_fit_time': array([0.47454571, 0.01327594, 0.0244417 ]), 'mean_score_time': array([0.11265926, 0.11408353, 0.11209092]), 'std_score_time': array([0.00379422, 0.0049297 , 0.00397596]), 'param_n_estimators': masked_array(data=[50, 100, 200],
mask=[False, False, False],
fill_value='?',
dtype=object), 'params': [{'n_estimators': 50}, {'n_estimators': 100}, {'n_estimators': 200}], 'split0_test_score': array([1., 1., 1.]), 'split1_test_score': array([1., 1., 1.]), 'split2_test_score': array([0.93333333, 0.93333333, 0.93333333]), 'split3_test_score': array([0.85, 0.85, 0.85]), 'split4_test_score': array([0.95238095, 0.95238095, 0.95238095]), 'mean_test_score': array([0.94714286, 0.94714286, 0.94714286]), 'std_test_score': array([0.05521346, 0.05521346, 0.05521346]), 'rank_test_score': array([1, 1, 1])}
clf.cv_results_为字典结构。其中
means = clf.cv_results_['mean_test_score'] # clf.cv_results_是一个字典,mean_test_score 反映每组参数在五折交叉验证中验证集得分的平均值(本例的评分指标为 precision_macro,即宏平均精确率,而不是准确率)。比如代码中调节的参数为'n_estimators': [50, 100, 200]
共三个值,所以返回的means也有三个值,其中每个值是对应参数进行五折交叉验证后验证集得分的平均值(也就是说,对每个参数取值都进行一次五折交叉验证,按照这个流程往下)。
stds = clf.cv_results_['std_test_score'] # 反映每组参数在五折交叉验证中得分的波动情况
即验证集得分(precision_macro)的标准差,反映各折交叉验证评分的波动情况。
代码运行结果
# Tuning hyper-parameters for precision
#########################################
Best parameters set found on development set:
#########################################
{'n_estimators': 50}
#########################################
Grid scores on development set:
0.947 (+/-0.110) for {'n_estimators': 50}
0.947 (+/-0.110) for {'n_estimators': 100}
0.947 (+/-0.110) for {'n_estimators': 200}
字典 %s {'mean_fit_time': array([0.31090269, 0.15191479, 0.25862093]), 'std_fit_time': array([0.47454571, 0.01327594, 0.0244417 ]), 'mean_score_time': array([0.11265926, 0.11408353, 0.11209092]), 'std_score_time': array([0.00379422, 0.0049297 , 0.00397596]), 'param_n_estimators': masked_array(data=[50, 100, 200],
mask=[False, False, False],
fill_value='?',
dtype=object), 'params': [{'n_estimators': 50}, {'n_estimators': 100}, {'n_estimators': 200}], 'split0_test_score': array([1., 1., 1.]), 'split1_test_score': array([1., 1., 1.]), 'split2_test_score': array([0.93333333, 0.93333333, 0.93333333]), 'split3_test_score': array([0.85, 0.85, 0.85]), 'split4_test_score': array([0.95238095, 0.95238095, 0.95238095]), 'mean_test_score': array([0.94714286, 0.94714286, 0.94714286]), 'std_test_score': array([0.05521346, 0.05521346, 0.05521346]), 'rank_test_score': array([1, 1, 1])}
Detailed classification report:
The model is trained on the full development set.
The scores are computed on the full evaluation set.
precision recall f1-score support
0 1.00 1.00 1.00 21
1 0.88 0.97 0.92 30
2 0.95 0.83 0.89 24
accuracy 0.93 75
macro avg 0.94 0.93 0.94 75
weighted avg 0.94 0.93 0.93 75