Steps
1 Create a classification dataset (n_samples = 1000, n_features = 10)
2 Split the dataset using 10-fold cross validation
3 Train the algorithms
- GaussianNB
- SVC (possible C values [1e-02, 1e-01, 1e00, 1e01, 1e02], RBF kernel)
- RandomForestClassifier (possible n_estimators values [10, 100, 1000])
4 Evaluate the cross-validated performance
- Accuracy
- F1-score
- ROC AUC
5 Write a short report summarizing the methodology and the results
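The hyperparameter searches in step 3 (C for the SVC, n_estimators for the random forest) and the evaluation in step 4 can also be written compactly with scikit-learn's GridSearchCV and cross_val_score. The sketch below is only an illustration of that alternative, assuming scikit-learn >= 0.18; the variable names (grid_svc, grid_rf) are ours, not part of the exercise.
# compact alternative for steps 3-4: GridSearchCV handles the inner tuning,
# cross_val_score handles the outer 10-fold evaluation (nested cross-validation)
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
X, y = make_classification(n_samples=1000, n_features=10)
grid_svc = GridSearchCV(SVC(kernel='rbf', gamma=0.1),
                        param_grid={'C': [1e-2, 1e-1, 1e0, 1e1, 1e2]},
                        scoring='f1', cv=5)
grid_rf = GridSearchCV(RandomForestClassifier(),
                       param_grid={'n_estimators': [10, 100, 1000]},
                       scoring='f1', cv=3)
for name, est in [('SVC', grid_svc), ('RandomForestClassifier', grid_rf)]:
    scores = cross_val_score(est, X, y, cv=10, scoring='accuracy')
    print(name, scores.mean())
The full listing below implements the same steps by hand, with an explicit inner cross-validation for the hyperparameter choice: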
from sklearn import datasets
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import numpy as np
def rbf_svm(x_train, y_train, x_test, c):
    # fit an RBF-kernel SVM with the given C and predict on x_test
    clf = SVC(C=c, kernel='rbf', gamma=0.1)
    clf.fit(x_train, y_train)
    return clf.predict(x_test)

def rbf_random(x_train, y_train, x_test, n):
    # fit a random forest with n trees and predict on x_test
    clf = RandomForestClassifier(n_estimators=n)
    clf.fit(x_train, y_train)
    return clf.predict(x_test)
# choose the best C by an inner 5-fold cross-validation (F1), then predict on the test fold
def svm(x_train, y_train, x_test):
    Cvalues = [1e-2, 1e-1, 1e0, 1e1, 1e2]
    innerscore = []
    for c in Cvalues:
        ikf = KFold(n_splits=5, shuffle=True, random_state=5678)
        innerf1 = []
        for t_index, v_index in ikf.split(x_train):
            x_t, x_v = x_train[t_index], x_train[v_index]
            y_t, y_v = y_train[t_index], y_train[v_index]
            ipred = rbf_svm(x_t, y_t, x_v, c)
            innerf1.append(metrics.f1_score(y_v, ipred))
        innerscore.append(sum(innerf1) / len(innerf1))
    bestC = Cvalues[np.argmax(innerscore)]
    return rbf_svm(x_train, y_train, x_test, bestC)
# choose the best n_estimators by an inner 3-fold cross-validation (F1), then predict on the test fold
def ran(x_train, y_train, x_test):
    N = [10, 100, 1000]
    innerscore = []
    for n in N:
        ikf = KFold(n_splits=3, shuffle=True, random_state=5678)
        innerf1 = []
        for t_index, v_index in ikf.split(x_train):
            x_t, x_v = x_train[t_index], x_train[v_index]
            y_t, y_v = y_train[t_index], y_train[v_index]
            ipred = rbf_random(x_t, y_t, x_v, n)
            innerf1.append(metrics.f1_score(y_v, ipred))
        innerscore.append(sum(innerf1) / len(innerf1))
    bestN = N[np.argmax(innerscore)]
    return rbf_random(x_train, y_train, x_test, bestN)
# step 1: create the classification dataset
X, y = datasets.make_classification(n_samples=1000, n_features=10,
                                    n_informative=2, n_redundant=2,
                                    n_repeated=0, n_classes=2)
# step 2: outer 10-fold cross-validation split
kf = KFold(n_splits=10, shuffle=True)
# per-fold metric lists: g_ = GaussianNB, s_ = SVC, r_ = random forest
g_accuracy = []
g_f1 = []
g_auc_roc = []
s_accuracy = []
s_f1 = []
s_auc_roc = []
r_accuracy = []
r_f1 = []
r_auc_roc = []
for train_index, test_index in kf.split(X):
    x_train, y_train = X[train_index], y[train_index]
    x_test, y_test = X[test_index], y[test_index]
    # GaussianNB has no hyperparameters to tune
    clf = GaussianNB()
    clf.fit(x_train, y_train)
    pred = clf.predict(x_test)
    g_accuracy.append(metrics.accuracy_score(y_test, pred))
    g_f1.append(metrics.f1_score(y_test, pred))
    # note: ROC AUC is computed from hard class predictions here
    g_auc_roc.append(metrics.roc_auc_score(y_test, pred))
    # SVC with the inner search over C
    pred = svm(x_train, y_train, x_test)
    s_accuracy.append(metrics.accuracy_score(y_test, pred))
    s_f1.append(metrics.f1_score(y_test, pred))
    s_auc_roc.append(metrics.roc_auc_score(y_test, pred))
    # random forest with the inner search over n_estimators
    pred = ran(x_train, y_train, x_test)
    r_accuracy.append(metrics.accuracy_score(y_test, pred))
    r_f1.append(metrics.f1_score(y_test, pred))
    r_auc_roc.append(metrics.roc_auc_score(y_test, pred))
print('GaussianNB:')
for i in range(len(g_f1)):
    print('for test ' + str(i) + ':')
    print('accuracy: ' + str(g_accuracy[i]))
    print('f1: ' + str(g_f1[i]))
    print('auc_roc: ' + str(g_auc_roc[i]))
    print('---------------------------')
print('SVC:')
for i in range(len(s_f1)):
    print('for test ' + str(i) + ':')
    print('accuracy: ' + str(s_accuracy[i]))
    print('f1: ' + str(s_f1[i]))
    print('auc_roc: ' + str(s_auc_roc[i]))
    print('---------------------------')
print('RandomForestClassifier:')
for i in range(len(r_f1)):
    print('for test ' + str(i) + ':')
    print('accuracy: ' + str(r_accuracy[i]))
    print('f1: ' + str(r_f1[i]))
    print('auc_roc: ' + str(r_auc_roc[i]))
    print('---------------------------')
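For the report in step 5, the per-fold lists can be reduced to a mean and standard deviation per classifier and metric; a minimal sketch reusing the lists and the numpy import from the listing above (the formatting is ours):
# step 5: aggregate the cross-validated results for the report
for name, acc, f1, auc in [('GaussianNB', g_accuracy, g_f1, g_auc_roc),
                           ('SVC', s_accuracy, s_f1, s_auc_roc),
                           ('RandomForestClassifier', r_accuracy, r_f1, r_auc_roc)]:
    print('%s: accuracy %.3f +/- %.3f, f1 %.3f +/- %.3f, auc_roc %.3f +/- %.3f'
          % (name, np.mean(acc), np.std(acc), np.mean(f1), np.std(f1),
             np.mean(auc), np.std(auc)))
In the methodology it is also worth noting that ROC AUC is computed here from hard class predictions; scores from predict_proba are the more common choice.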