sklearn
代码
from sklearn import datasets
from sklearn import cross_validation
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
######################################################
# cross_validation
######################################################
# Build a synthetic classification problem (1000 samples, 10 features)
# and a shuffled 10-fold splitter; the sections below iterate over `kf`
# to obtain (train_index, test_index) pairs.
#
# NOTE: sklearn.cross_validation was deprecated in scikit-learn 0.18 and
# removed in 0.20. On newer releases the equivalent of `kf` is
#     list(sklearn.model_selection.KFold(n_splits=10, shuffle=True).split(X))
dataset = datasets.make_classification(n_samples=1000, n_features=10)
X, Y = dataset

# Size the splitter from the data instead of repeating the sample count,
# so changing n_samples above cannot leave the two out of sync.
kf = cross_validation.KFold(len(X), n_folds=10, shuffle=True)

# Show the train/test partition produced for every fold.
for train_index, test_index in kf:
    x_train, y_train = X[train_index], Y[train_index]
    x_test, y_test = X[test_index], Y[test_index]
    print(x_test)
    print(x_train)
    print(y_test)
    print(y_train)
######################################################
# GaussianNB
######################################################
# Fit a Gaussian naive Bayes classifier on each fold and report
# accuracy, F1 and ROC AUC on that fold's held-out samples.
for train_index, test_index in kf:
    x_train, y_train = X[train_index], Y[train_index]
    x_test, y_test = X[test_index], Y[test_index]

    clf = GaussianNB()
    clf.fit(x_train, y_train)
    pred = clf.predict(x_test)
    print(pred)
    print(y_test)

    Accuracy = metrics.accuracy_score(y_test, pred)
    F1_score = metrics.f1_score(y_test, pred)
    # ROC AUC measures how well a continuous score ranks positives above
    # negatives; passing hard 0/1 predictions collapses the curve to a
    # single operating point. Use the positive-class probability instead.
    score = clf.predict_proba(x_test)[:, 1]
    AUC_ROC = metrics.roc_auc_score(y_test, score)
    print("GaussianNB", Accuracy, F1_score, AUC_ROC)
######################################################
# svc
######################################################
# Fit an RBF-kernel support vector classifier on each fold and report
# accuracy, F1 and ROC AUC on that fold's held-out samples.
for train_index, test_index in kf:
    x_train, y_train = X[train_index], Y[train_index]
    x_test, y_test = X[test_index], Y[test_index]

    clf = SVC(C=1e-01, kernel='rbf', gamma=0.1)
    clf.fit(x_train, y_train)
    pred = clf.predict(x_test)
    print(pred)
    print(y_test)

    Accuracy = metrics.accuracy_score(y_test, pred)
    F1_score = metrics.f1_score(y_test, pred)
    # ROC AUC needs a continuous ranking score, not thresholded labels.
    # SVC has no predict_proba unless probability=True, so use the signed
    # distance to the separating surface from decision_function.
    score = clf.decision_function(x_test)
    AUC_ROC = metrics.roc_auc_score(y_test, score)
    print("svc", Accuracy, F1_score, AUC_ROC)
######################################################
# RandomForestClassifier
######################################################
# Fit a 6-tree random forest on each fold and report accuracy, F1 and
# ROC AUC on that fold's held-out samples.
for train_index, test_index in kf:
    x_train, y_train = X[train_index], Y[train_index]
    x_test, y_test = X[test_index], Y[test_index]

    clf = RandomForestClassifier(n_estimators=6)
    clf.fit(x_train, y_train)
    pred = clf.predict(x_test)
    print(pred)
    print(y_test)

    Accuracy = metrics.accuracy_score(y_test, pred)
    F1_score = metrics.f1_score(y_test, pred)
    # ROC AUC needs a continuous ranking score, not thresholded labels;
    # use the forest's averaged positive-class probability.
    score = clf.predict_proba(x_test)[:, 1]
    AUC_ROC = metrics.roc_auc_score(y_test, score)
    print("RandomForestClassifier", Accuracy, F1_score, AUC_ROC)
输出
输出太长,部分就不放上来了
GaussianNB 0.94 0.9454545454545454 0.9380032206119162
GaussianNB 0.87 0.8737864077669902 0.8725961538461539
GaussianNB 0.95 0.9504950495049506 0.9505802320928372
GaussianNB 0.97 0.9739130434782608 0.9679722562219502
GaussianNB 0.93 0.9320388349514563 0.9297719087635055
GaussianNB 0.9 0.8979591836734694 0.9020473705339221
GaussianNB 0.91 0.8831168831168832 0.9241758241758242
GaussianNB 0.93 0.94017094017094 0.9265188834154352
GaussianNB 0.94 0.9375000000000001 0.9409875551987155
GaussianNB 0.96 0.9649122807017544 0.9555555555555555
svc 0.96 0.9622641509433962 0.961352657004831
svc 0.93 0.9292929292929293 0.9310897435897436
svc 0.93 0.9292929292929293 0.9301720688275311
svc 0.95 0.9565217391304347 0.9475724194206445
svc 0.95 0.9494949494949494 0.950580232092837
svc 0.95 0.9462365591397849 0.9492171818546769
svc 0.93 0.9066666666666667 0.9395604395604396
svc 0.93 0.94017094017094 0.9265188834154352
svc 0.96 0.9574468085106385 0.9598554797270173
svc 0.97 0.9724770642201834 0.9707070707070706
RandomForestClassifier 0.96 0.9615384615384615 0.962962962962963
RandomForestClassifier 0.94 0.9387755102040817 0.9407051282051283
RandomForestClassifier 0.95 0.9484536082474228 0.9497799119647861
RandomForestClassifier 0.95 0.9557522123893805 0.9504283965728273
RandomForestClassifier 0.93 0.9278350515463919 0.9309723889555822
RandomForestClassifier 0.95 0.9462365591397849 0.9492171818546769
RandomForestClassifier 0.97 0.9577464788732395 0.9703296703296703
RandomForestClassifier 0.94 0.9473684210526316 0.9417077175697864
RandomForestClassifier 0.97 0.967741935483871 0.9692894419911683
RandomForestClassifier 0.96 0.9629629629629629 0.9616161616161615
每行输出中,分类器名称之后从左到右分别是 Accuracy,F1_score,AUC_ROC
总体上来说随机森林算法和svc差不多,而GaussianNB相对来说较弱
总体来说这次作业相对比较简单,可以在老师的pdf上找到源码,需要我自己手动查找的内容也相对较少