sklearn练习
from sklearn import datasets
from sklearn import cross_validation
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics
def evaluate(y_test, pred):
    """Score a set of predictions against the true labels.

    Args:
        y_test: True labels of the held-out samples.
        pred: Labels predicted for those same samples.

    Returns:
        The result of ``metrics.accuracy_score(y_test, pred)``.
    """
    # NOTE(review): micro-averaged F1 and ROC AUC were also attempted here and
    # were left disabled because they did not run. Presumably ROC AUC needs
    # per-class scores rather than hard labels for this multi-class data --
    # confirm against the scikit-learn docs before re-enabling.
    return metrics.accuracy_score(y_test, pred)
# --- Gaussian Naive Bayes ----------------------------------------------------
print('Naive Bayes:')

# Synthetic 4-class data set: dataset[0] holds the feature rows and
# dataset[1] the matching labels. Both are reused by the sections below.
dataset = datasets.make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=8,
    n_redundant=2,
    n_repeated=0,
    n_classes=4,
)

# Shuffled 10-fold split over the sample indices; iterated again by the
# SVC and random-forest experiments further down.
kf = cross_validation.KFold(len(dataset[0]), n_folds=10, shuffle=True)

# One accuracy value per fold, averaged at the end.
scores = []
for train_index, test_index in kf:
    x_train, x_test = dataset[0][train_index], dataset[0][test_index]
    y_train, y_test = dataset[1][train_index], dataset[1][test_index]
    clf = GaussianNB()
    clf.fit(x_train, y_train)
    fold_accuracy = evaluate(y_test, clf.predict(x_test))
    scores.append(fold_accuracy)
print(sum(scores)/len(scores))
# --- RBF-kernel SVC ----------------------------------------------------------
print('SVC:')

# Values of the penalty parameter C that are tried on every fold.
params = [1e-2, 1e-1, 1e0, 1e1, 1e2]

# Collects one accuracy per (fold, C) pair; the printed figure is the mean
# over all of them, not the score of a single selected C.
scores = []
for train_index, test_index in kf:
    x_train, x_test = dataset[0][train_index], dataset[0][test_index]
    y_train, y_test = dataset[1][train_index], dataset[1][test_index]
    for c in params:
        clf = SVC(C=c, kernel='rbf', gamma=0.1)
        clf.fit(x_train, y_train)
        scores.append(evaluate(y_test, clf.predict(x_test)))
print(sum(scores)/len(scores))
# --- Random forest -----------------------------------------------------------
# Header spelling corrected ("Randm" -> "Random") so the printed label matches
# the other section headers and the recorded output.
print('Random Forest:')

# Forest sizes (number of trees) that are tried on every fold.
params = [10, 100, 1000]

# Collects one accuracy per (fold, n_estimators) pair; the printed figure is
# the mean over all of them, not the score of a single selected forest size.
scores = []
for train_index, test_index in kf:
    x_train, y_train = dataset[0][train_index], dataset[1][train_index]
    x_test, y_test = dataset[0][test_index], dataset[1][test_index]
    for estimate in params:
        clf = RandomForestClassifier(n_estimators=estimate)
        clf.fit(x_train, y_train)
        pred = clf.predict(x_test)
        scores.append(evaluate(y_test, pred))
print(sum(scores)/len(scores))
结果:
Naive Bayes:
0.654
SVC:
0.6838
Random Forest:
0.7853333333333332
Random Forest花的时间最长,评估结果也最高,Naive Bayes花的时间最短,评估成绩也最低。
备注:f1_score 和 roc_auc_score 在调用时会报错(属于运行时错误,而非编译错误),具体原因尚未查明,因此上面的代码中暂时将它们注释掉了。