# Generate a random binary-classification dataset; train NaiveBayes, SVC, and
# random-forest classifiers. For SVC and random forest, additionally search for
# the best hyperparameter value. Finally evaluate all three algorithms with
# accuracy, F1, and ROC AUC.
from numpy import argmax

from sklearn import cross_validation  # NOTE(review): removed in sklearn 0.20; model_selection is the replacement
from sklearn import datasets
from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection
from sklearn import naive_bayes
from sklearn import svm
# Per-fold scores for each algorithm, one list per (metric, model) pair.
acc_for_NB = []   # accuracy
acc_for_SVC = []
acc_for_RFC = []
f1_for_NB = []    # F1-score
f1_for_SVC = []
f1_for_RFC = []
auc_for_NB = []   # ROC AUC
auc_for_RFC = []
auc_for_SVC = []

# Synthetic binary-classification data: 1000 samples, 10 features.
X, Y = datasets.make_classification(n_samples = 1000, n_features = 10)

# 10-fold outer cross-validation. sklearn.cross_validation was removed in
# sklearn 0.20; model_selection.KFold is the supported API and sizes itself
# from the data via split(X), so the sample count (1000) is no longer
# duplicated here. split() yields (train_index, test_index) pairs, so the
# downstream `for train_index, test_index in kf:` loop works unchanged.
kf = model_selection.KFold(n_splits = 10, shuffle = True).split(X)
# Outer 10-fold cross-validation: each iteration trains and evaluates on one
# train/test split.
# NOTE(review): the loop body below appears to have lost its indentation in
# this file — every statement sits at column 0, which is not valid Python
# inside a for-loop. Statements are kept byte-identical; only comments were
# translated/expanded. The loop is also truncated here (the inner SVC
# evaluation and the RFC section continue past this excerpt).
for train_index, test_index in kf:
X_train, y_train = X[train_index], Y[train_index]
X_test, y_test = X[test_index], Y[test_index]
# Gaussian Naive Bayes: no hyperparameters to tune, so fit/predict directly.
NBclf = naive_bayes.GaussianNB()
NBclf.fit(X_train, y_train)
NBpred = NBclf.predict(X_test)
# Score the NB predictions with all three metrics.
acc_for_NB.append(metrics.accuracy_score(y_test, NBpred))
f1_for_NB.append(metrics.f1_score(y_test, NBpred))
# NOTE(review): roc_auc_score is given hard 0/1 predictions here; passing
# probability scores (predict_proba) would give a more meaningful AUC —
# confirm intent.
auc_for_NB.append(metrics.roc_auc_score(y_test, NBpred))
# SVC: first search for the best penalty parameter C using an inner
# (nested) cross-validation over the current training fold only.
nn = len(X_train)
Cvalues = [1e-02, 1e-01, 1e00, 1e01, 1e02]
Cscore = [] # mean inner-CV score recorded for each candidate C
for C in Cvalues:
# Split X_train into 5 inner folds.
ikf = cross_validation.KFold(nn, n_folds = 5, shuffle = True)
innerscore = []
# Evaluate this single candidate C on every inner fold.
for inner_train_index, inner_test_index in ikf:
inner_X_train, inner_X_test = X_train[inner_train_index], X_train[inner_test_index]