Scikit-Learn Assignment
Assignment
from sklearn import datasets
from sklearn import cross_validation
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
def _build_classifiers(random_state=None):
    """Return fresh (name, estimator) pairs for the three models compared."""
    return [
        ("GaussianNB", GaussianNB()),
        ("SVC", SVC(C=1e-02, kernel="rbf", gamma=0.1)),
        ("RandomForestClassifier",
         RandomForestClassifier(n_estimators=100, random_state=random_state)),
    ]


def _report_scores(y_true, y_pred):
    """Print accuracy, weighted F1 and ROC AUC for one set of predictions."""
    print("\nAccuracy")
    print(metrics.accuracy_score(y_true, y_pred))
    print("\nF1-score")
    print(metrics.f1_score(y_true, y_pred, average="weighted"))
    print("\nAUC ROC")
    print(metrics.roc_auc_score(y_true, y_pred))


def DataAnalysis(n_samples=1000, n_features=10, n_folds=10, random_state=None):
    """Generate a binary classification dataset and compare three classifiers.

    The dataset comes from ``datasets.make_classification`` (two informative
    and two redundant features, two classes). It is split with shuffled
    k-fold cross validation; in every fold GaussianNB, SVC and
    RandomForestClassifier are fitted on the training part, and their
    predictions on the test part are printed together with accuracy,
    weighted F1 and ROC AUC.

    Args:
        n_samples: Number of samples to generate.
        n_features: Number of features to generate. Two informative and two
            redundant features are always requested, so this has to leave
            room for them.
        n_folds: Number of cross-validation folds.
        random_state: Optional seed passed to the dataset generator, the
            fold shuffling and the random forest. ``None`` keeps every run
            randomized.

    Returns:
        None. All results are written to standard output.
    """
    # Prefer the current splitter API; fall back to the legacy
    # ``cross_validation`` module (imported at file level) on installations
    # that predate ``sklearn.model_selection``.
    try:
        from sklearn.model_selection import KFold
    except ImportError:
        KFold = None

    # Create a classification dataset (n_samples >= 1000, n_features >= 10).
    # Two classes keep ``roc_auc_score`` applicable to the hard predictions.
    X, y = datasets.make_classification(
        n_samples=n_samples, n_features=n_features, n_informative=2,
        n_redundant=2, n_repeated=0, n_classes=2, random_state=random_state)

    print("dataset information")
    # dataset description
    print("synthetic classification dataset: %d samples, %d features"
          % X.shape)
    # data examples (features)
    print(X)
    # data target labels (classes)
    print(y)

    # Split the dataset using k-fold cross validation
    if KFold is not None:
        folds = KFold(n_splits=n_folds, shuffle=True,
                      random_state=random_state).split(X)
    else:
        folds = cross_validation.KFold(len(X), n_folds=n_folds, shuffle=True,
                                       random_state=random_state)

    for train_index, test_index in folds:
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        print("\nsplit the dataset")
        print(X_train)
        print(y_train)
        print(X_test)
        print(y_test)

        # Fresh estimators per fold; each one is scored right after it
        # predicts, so no model's results are overwritten by the next.
        for name, clf in _build_classifiers(random_state):
            clf.fit(X_train, y_train)
            pred = clf.predict(X_test)
            print("\n" + name)
            print(pred)
            print(y_test)
            # Performance evaluation
            _report_scores(y_test, pred)


DataAnalysis()
Result of the Assignment
Attention: only part of the results are displayed
dataset information
split the dataset
Algorithm and Evaluation