实验内容:
实验代码:
from sklearn import datasets
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import numpy as np
def find_best_C(data_train, data_test):
    """Pick the best SVC penalty parameter C by inner 5-fold cross-validation.

    Parameters
    ----------
    data_train : array of shape (n_samples, n_features)
        Training feature matrix.
    data_test : array of shape (n_samples,)
        Labels for ``data_train``.  NOTE(review): despite the name, the
        caller passes Y_train here, not a held-out test set — the index
        arrays produced by ``kf.split(data_train)`` are applied to both.

    Returns
    -------
    float
        The value from [0.01, 0.1, 1, 10, 100] with the highest mean
        validation accuracy.
    """
    kf = KFold(n_splits=5)
    candidates = [1e-02, 1e-01, 1e00, 1e01, 1e02]
    mean_scores = []
    for c in candidates:
        fold_scores = []
        for fit_idx, val_idx in kf.split(data_train):
            X_fit, Y_fit = data_train[fit_idx], data_test[fit_idx]
            X_val, Y_val = data_train[val_idx], data_test[val_idx]
            model = SVC(C=c, kernel='rbf', gamma='auto')
            model.fit(X_fit, Y_fit)
            fold_scores.append(model.score(X_val, Y_val))
        mean_scores.append(sum(fold_scores) / len(fold_scores))
    return candidates[np.argmax(mean_scores)]
def find_best_N(data_train, data_test):
    """Pick the best random-forest size by inner 5-fold cross-validation.

    Parameters
    ----------
    data_train : array of shape (n_samples, n_features)
        Training feature matrix.
    data_test : array of shape (n_samples,)
        Labels for ``data_train``.  NOTE(review): despite the name, the
        caller passes Y_train here, not a held-out test set.

    Returns
    -------
    int
        The ``n_estimators`` value from [10, 50, 100, 500, 1000] with the
        highest mean validation accuracy.
    """
    kf = KFold(n_splits=5)
    candidates = [10, 50, 100, 500, 1000]
    mean_scores = []
    for n in candidates:
        fold_scores = []
        for fit_idx, val_idx in kf.split(data_train):
            X_fit, Y_fit = data_train[fit_idx], data_test[fit_idx]
            X_val, Y_val = data_train[val_idx], data_test[val_idx]
            model = RandomForestClassifier(n_estimators=n)
            model.fit(X_fit, Y_fit)
            fold_scores.append(model.score(X_val, Y_val))
        mean_scores.append(sum(fold_scores) / len(fold_scores))
    return candidates[np.argmax(mean_scores)]
# Dataset: 1000 samples, 10 features, two classes.
# make_classification returns a (X, y) tuple; data[0] is X, data[1] is y.
data = datasets.make_classification(n_samples=1000, n_features=10, n_informative=2,
                                    n_redundant=2, n_repeated=0, n_classes=2)
# Outer 10-fold split used to score all three classifiers.
kf = KFold(n_splits=10, shuffle=True)
G_accuracy = []
G_F1score = []
G_AUCROC = []
SVC_accuracy = []
SVC_F1score = []
SVC_AUCROC = []
RanFor_accuracy = []
RanFor_F1score = []
RanFor_AUCROC = []
for train_index, test_index in kf.split(data[0]):
    X_train, Y_train = data[0][train_index], data[1][train_index]
    X_test, Y_test = data[0][test_index], data[1][test_index]
    # Naive Bayes: GaussianNB has no hyper-parameter to tune.
    clf_G = GaussianNB()
    clf_G.fit(X_train, Y_train)
    pred_G = clf_G.predict(X_test)
    G_accuracy.append(metrics.accuracy_score(Y_test, pred_G))
    G_F1score.append(metrics.f1_score(Y_test, pred_G))
    G_AUCROC.append(metrics.roc_auc_score(Y_test, pred_G))
    # SVC: candidate C values are [1e-02, 1e-01, 1e00, 1e01, 1e02],
    # selected by inner 5-fold CV on the training part only.
    best_C = find_best_C(X_train, Y_train)
    clf_SVC = SVC(C=best_C, kernel='rbf', gamma='auto')
    clf_SVC.fit(X_train, Y_train)
    pred_SVC = clf_SVC.predict(X_test)
    SVC_accuracy.append(metrics.accuracy_score(Y_test, pred_SVC))
    SVC_F1score.append(metrics.f1_score(Y_test, pred_SVC))
    SVC_AUCROC.append(metrics.roc_auc_score(Y_test, pred_SVC))
    # Random forest: candidate n_estimators are [10, 50, 100, 500, 1000],
    # selected the same way.
    best_N = find_best_N(X_train, Y_train)
    clf_RanFor = RandomForestClassifier(n_estimators=best_N)
    clf_RanFor.fit(X_train, Y_train)
    pred_RanFor = clf_RanFor.predict(X_test)
    RanFor_accuracy.append(metrics.accuracy_score(Y_test, pred_RanFor))
    RanFor_F1score.append(metrics.f1_score(Y_test, pred_RanFor))
    RanFor_AUCROC.append(metrics.roc_auc_score(Y_test, pred_RanFor))
# NOTE(review): best_C and best_N below hold only the values chosen in the
# LAST outer fold; earlier folds may have picked different parameters.
print("GaussianNB:")
print("For Accuracy:")
print("per-folder:", G_accuracy)
print("averaged:", np.mean(G_accuracy))
print("For F1-score:")
print("per-folder:", G_F1score)
print("average:", np.mean(G_F1score))
print("For AUC ROC:")
print("per-folder:", G_AUCROC)
print("average:", np.mean(G_AUCROC))
print("Total average", (np.mean(G_accuracy) + np.mean(G_F1score) + np.mean(G_AUCROC)) / 3, '\n')
print("SVC(with best parameter", best_C, "):")
print("For Accuracy:")
print("per-folder:", SVC_accuracy)
print("averaged:", np.mean(SVC_accuracy))
print("For F1-score:")
print("per-folder:", SVC_F1score)
print("average:", np.mean(SVC_F1score))
print("For AUC ROC:")
print("per-folder:", SVC_AUCROC)
print("average:", np.mean(SVC_AUCROC))
print("Total average", (np.mean(SVC_accuracy) + np.mean(SVC_F1score) + np.mean(SVC_AUCROC)) / 3, '\n')
print("RandomForest(with best parameter", best_N, "):")
print("For Accuracy:")
print("per-folder:", RanFor_accuracy)
print("averaged:", np.mean(RanFor_accuracy))
print("For F1-score:")
# BUG FIX: original printed SVC_F1score here by copy-paste mistake,
# so the per-fold list did not match the average printed just below.
print("per-folder:", RanFor_F1score)
print("average:", np.mean(RanFor_F1score))
print("For AUC ROC:")
print("per-folder:", RanFor_AUCROC)
print("average:", np.mean(RanFor_AUCROC))
print("Total average", (np.mean(RanFor_accuracy) + np.mean(RanFor_F1score) + np.mean(RanFor_AUCROC)) / 3)
代码运行
MacBookdeMBP:pyhomework macbook$ python3 test.py
GaussianNB:
For Accuracy:
per-folder: [0.9, 0.9, 0.88, 0.88, 0.89, 0.87, 0.9, 0.88, 0.9, 0.93]
averaged: 0.893
For F1-score:
per-folder: [0.8913043478260869, 0.8913043478260869, 0.8909090909090909, 0.8846153846153846, 0.8932038834951457, 0.8712871287128714, 0.8979591836734694, 0.8775510204081634, 0.8979591836734694, 0.9278350515463919]
average: 0.892392862268616
For AUC ROC:
per-folder: [0.8993558776167472, 0.8987595038015207, 0.8787878787878789, 0.8798076923076923, 0.8897559023609444, 0.8699999999999999, 0.9, 0.8868686868686869, 0.9, 0.9309723889555822]
average: 0.8934307930699055
Total average 0.8929412184461739
SVC(with best parameter 0.1 ):
For Accuracy:
per-folder: [0.96, 0.9, 0.93, 0.91, 0.91, 0.84, 0.93, 0.93, 0.92, 0.96]
averaged: 0.9190000000000002
For F1-score:
per-folder: [0.9555555555555557, 0.8936170212765957, 0.9391304347826087, 0.9158878504672897, 0.9108910891089109, 0.836734693877551, 0.9306930693069307, 0.9263157894736842, 0.92, 0.96]
average: 0.9188825503849127
For AUC ROC:
per-folder: [0.958132045088567, 0.8991596638655462, 0.9242424242424243, 0.9086538461538461, 0.9101640656262505, 0.84, 0.9299999999999999, 0.9343434343434344, 0.9199999999999999, 0.9603841536614646]
average: 0.9185079632981534
Total average 0.9187968378943555
RandomForest(with best parameter 50 ):
For Accuracy:
per-folder: [0.98, 0.91, 0.95, 0.91, 0.91, 0.89, 0.94, 0.93, 0.92, 0.97]
averaged: 0.931
For F1-score:
per-folder: [0.9555555555555557, 0.8936170212765957, 0.9391304347826087, 0.9158878504672897, 0.9108910891089109, 0.836734693877551, 0.9306930693069307, 0.9263157894736842, 0.92, 0.96]
average: 0.9319791722272693
For AUC ROC:
per-folder: [0.9798711755233495, 0.909763905562225, 0.9444444444444444, 0.9094551282051283, 0.9093637454981992, 0.89, 0.94, 0.9363636363636364, 0.9199999999999999, 0.9701880752300921]
average: 0.9309450110827076
Total average 0.9313080611033256
运行结果的分析
1. 对于本次实验的数据,SVC算法的最佳参数为C=0.1,RandomForest算法的最佳参数为n_estimators=50
2. 分析每个算法的分数可以看出:RandomForest、SVC、Naive Bayes这三种算法的性能依次递减
3. 在每个算法中,使用不同的性能评估方式所得到的分数大致相同