In [37]:
dt_scores = cross_val_score(decision_tree_classifier, all_inputs, all_classes, cv=10)
sb.boxplot(dt_scores)
sb.stripplot(dt_scores, jitter=True, color='white')
Out[37]:
In [40]:
from sklearn.ensemble import RandomForestClassifier
random_forest_classifier = RandomForestClassifier()
parameter_grid = {'n_estimators': [5, 10, 25, 50],
'criterion': ['gini', 'entropy'],
'max_features': [1, 2, 3, 4],
'warm_start': [True, False]}
cross_validation = StratifiedKFold(all_classes, n_folds=10)
grid_search = GridSearchCV(random_forest_classifier,
param_grid=parameter_grid,
cv=cross_validation)
grid_search.fit(all_inputs, all_classes)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))
grid_search.best_estimator_
Out[40]:
In [42]:
random_forest_classifier = grid_search.best_estimator_
rf_df = pd.DataFrame({'accuracy': cross_val_score(random_forest_classifier, all_inputs, all_classes, cv=10),
'classifier': ['Random Forest'] * 10})
dt_df = pd.DataFrame({'accuracy': cross_val_score(decision_tree_classifier, all_inputs, all_classes, cv=10),
'classifier': ['Decision Tree'] * 10})
both_df = rf_df.append(dt_df)
sb.boxplot(x='classifier', y='accuracy', data=both_df)
sb.stripplot(x='classifier', y='accuracy', data=both_df, jitter=True, color='white')
Out[42]:
In [43]:
%install_ext https://raw.githubusercontent.com/rasbt/watermark/master/watermark.py
In [44]:
%load_ext watermark
In [45]:
%watermark -a 'Randal S. Olson' -nmv --packages numpy,pandas,scikit-learn,matplotlib,Seaborn
In [46]:
%matplotlib inline
import pandas as pd
import seaborn as sb
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
# We can jump directly to working with the clean data because we saved our cleaned data set
iris_data_clean = pd.read_csv('iris-data-clean.csv')
# Testing our data: Our analysis will stop here if any of these assertions are wrong
# We know that we should only have three classes
assert len(iris_data_clean['class'].unique()) == 3
# We know that sepal lengths for 'Iris-versicolor' should never be below 2.5 cm
assert iris_data_clean.loc[iris_data_clean['class'] == 'Iris-versicolor', 'sepal_length_cm'].min() >= 2.5
# We know that our data set should have no missing measurements
assert len(iris_data_clean.loc[(iris_data_clean['sepal_length_cm'].isnull()) |
(iris_data_clean['sepal_width_cm'].isnull()) |
(iris_data_clean['petal_length_cm'].isnull()) |
(iris_data_clean['petal_width_cm'].isnull())]) == 0
all_inputs = iris_data_clean[['sepal_length_cm', 'sepal_width_cm',
'petal_length_cm', 'petal_width_cm']].values
all_classes = iris_data_clean['class'].values
# This is the classifier that came out of Grid Search
random_forest_classifier = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features=3, max_leaf_nodes=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=1,
oob_score=False, random_state=None, verbose=0, warm_start=True)
# All that's left to do now is plot the cross-validation scores
rf_classifier_scores = cross_val_score(random_forest_classifier, all_inputs, all_classes, cv=10)
sb.boxplot(rf_classifier_scores)
sb.stripplot(rf_classifier_scores, jitter=True, color='white')
# ...and show some of the predictions from the classifier
(training_inputs,
testing_inputs,
training_classes,
testing_classes) = train_test_split(all_inputs, all_classes, train_size=0.75)
random_forest_classifier.fit(training_inputs, training_classes)
for input_features, prediction, actual in zip(testing_inputs[:10],
random_forest_classifier.predict(testing_inputs[:10]),
testing_classes[:10]):
print('{}\t-->\t{}\t(Actual: {})'.format(input_features, prediction, actual))