1. 设定
from __future__ import division, print_function, unicode_literals
import numpy as np
import os
# Seed NumPy's global random generator so the random draws made later are repeatable.
np. random. seed( 42 )
# IPython/Jupyter line magic, not Python syntax: only valid when run inside a notebook kernel.
% matplotlib inline
import matplotlib as mpl
import matplotlib. pyplot as plt
# Label font sizes: 14 for the axes, 12 for the x and y ticks.
for rc_group, label_size in (('axes', 14), ('xtick', 12), ('ytick', 12)):
    mpl.rc(rc_group, labelsize=label_size)

# Path components that save_fig() joins into the output image location.
PROJECT_ROOT_DIR, CHAPTER_ID = ".", "classification"
def save_fig(fig_id, tight_layout=True):
    """Save the current matplotlib figure as a 300-dpi PNG.

    The file is written to ``PROJECT_ROOT_DIR/images/CHAPTER_ID/<fig_id>.png``.

    Args:
        fig_id: File name of the figure, without the ``.png`` extension.
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
    """
    image_dir = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
    # savefig does not create missing folders, so make sure the target exists.
    # (isdir + makedirs instead of exist_ok=True keeps Python 2 compatibility,
    # which the file's __future__ imports suggest is wanted.)
    if not os.path.isdir(image_dir):
        os.makedirs(image_dir)
    path = os.path.join(image_dir, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)
2.从MNIST获取训练图片
# Put the fetched data in target order (separately for the train and test parts).
def sort_by_target(mnist, n_train=60000):
    """Sort the train and test partitions of ``mnist`` by target, in place.

    The first ``n_train`` samples and the remaining samples are each reordered
    so that their targets are ascending; samples with equal targets keep their
    original relative order.

    Args:
        mnist: Object exposing array attributes ``data`` and ``target`` that
            support NumPy slice assignment and integer-array indexing.
        n_train: Number of leading samples that form the training partition.

    Returns:
        None. ``mnist.data`` and ``mnist.target`` are modified in place.
    """
    target = np.asarray(mnist.target)
    # A stable argsort is equivalent to sorting (target, index) pairs, but it
    # is vectorised and also copes with an empty partition.
    # Both orders are computed before anything is overwritten.
    reorder_train = np.argsort(target[:n_train], kind="mergesort")
    reorder_test = np.argsort(target[n_train:], kind="mergesort") + n_train
    mnist.data[:n_train] = mnist.data[reorder_train]
    mnist.target[:n_train] = mnist.target[reorder_train]
    mnist.data[n_train:] = mnist.data[reorder_test]
    mnist.target[n_train:] = mnist.target[reorder_test]
# Only the import is guarded: an ImportError raised while downloading or
# parsing the dataset should surface instead of triggering the legacy loader.
try:
    from sklearn.datasets import fetch_openml
except ImportError:
    # Fallback for scikit-learn releases that do not ship fetch_openml.
    from sklearn.datasets import fetch_mldata
    mnist = fetch_mldata('MNIST original')
else:
    mnist = fetch_openml('mnist_784', version=1, cache=True)
    # Depending on the scikit-learn version, fetch_openml may return pandas
    # objects; the code below relies on positional ndarray indexing, so
    # coerce both fields to plain arrays.
    mnist.data = np.asarray(mnist.data)
    mnist.target = np.asarray(mnist.target).astype(np.int8)
    sort_by_target(mnist)
mnist["data"], mnist["target"]
mnist.data.shape
# Feature matrix and label vector under the short names used from here on.
X = mnist["data"]
y = mnist["target"]
X.shape
y.shape
# Display one sample as a 28x28 image and save the figure.
some_digit = X[36000]
some_digit_image = some_digit.reshape((28, 28))
plt.imshow(some_digit_image, interpolation="nearest", cmap=mpl.cm.binary)
plt.axis("off")
save_fig("some_digit_plot")
plt.show()
def plot_digit(data):
    """Draw ``data`` as a 28x28 image with the axes hidden.

    Args:
        data: Array that can be reshaped to (28, 28).
    """
    pixels = data.reshape((28, 28))
    plt.imshow(pixels, interpolation="nearest", cmap=mpl.cm.binary)
    plt.axis("off")
def plot_digits(instances, images_per_row=10, **options):
    """Draw several 28x28 images as a single tiled grid.

    Args:
        instances: Sequence of arrays, each reshapeable to (28, 28).
        images_per_row: Maximum number of tiles per grid row.
        **options: Extra keyword arguments forwarded to ``plt.imshow``.
    """
    side = 28
    count = len(instances)
    per_row = min(count, images_per_row)
    n_rows = (count - 1) // per_row + 1
    tiles = [instance.reshape(side, side) for instance in instances]
    # One blank filler tile, as wide as all the missing cells of the last row
    # (zero-width when the grid is already full).
    tiles.append(np.zeros((side, side * (n_rows * per_row - count))))
    # Glue each row of tiles side by side, then stack the rows vertically.
    strips = [
        np.concatenate(tiles[start:start + per_row], axis=1)
        for start in range(0, n_rows * per_row, per_row)
    ]
    grid = np.concatenate(strips, axis=0)
    plt.imshow(grid, cmap=mpl.cm.binary, **options)
    plt.axis("off")
plt.figure(figsize=(9, 9))
# Pick evenly spaced samples from three ranges of X and tile them in one figure.
example_images = np.concatenate(
    [X[:12000:600], X[13000:30600:600], X[30600:60000:590]]
)
plot_digits(example_images, images_per_row=10)
save_fig("more_digits_plot")
plt.show()
y[36000]
# First 60000 samples are the training set, the rest the test set.
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]
# Shuffle the training set with one shared permutation so rows and labels stay aligned.
shuffle_index = np.random.permutation(60000)
X_train = X_train[shuffle_index]
y_train = y_train[shuffle_index]
3.构造二元分类器
# Binary targets: True where the label equals 5.
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)
from sklearn.linear_model import SGDClassifier
# tol=None turns off the tolerance-based stopping criterion, so training runs
# for max_iter epochs. It replaces tol=-np.infty: the np.infty alias was
# removed in NumPy 2.0, and newer scikit-learn releases are expected to reject
# a negative tol (confirm against the installed version).
sgd_clf = SGDClassifier(max_iter=5, tol=None, random_state=42)
sgd_clf.fit(X_train, y_train_5)
sgd_clf.predict([some_digit])
from sklearn.model_selection import cross_val_score
# 3-fold cross-validated accuracy of the binary classifier.
cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy")
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
# Hand-rolled cross-validation: fit a fresh clone on each training fold and
# print its accuracy on the held-out fold.
# random_state is omitted on purpose: it has no effect unless shuffle=True,
# and newer scikit-learn releases raise an error for that combination.
skfolds = StratifiedKFold(n_splits=3)
for train_index, test_index in skfolds.split(X_train, y_train_5):
    clone_clf = clone(sgd_clf)
    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train_5[test_index]
    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    # np.sum counts the matches in native code instead of a Python-level loop.
    n_correct = np.sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred))
from sklearn. base import BaseEstimator
class Never5Classifier(BaseEstimator):
    """Baseline estimator whose predictions are always False."""

    def fit(self, X, y=None):
        """Do nothing; there is no state to learn. Returns None."""

    def predict(self, X):
        """Return an all-False boolean array of shape ``(len(X), 1)``."""
        n_samples = len(X)
        return np.zeros((n_samples, 1), dtype=bool)
# Baseline: cross-validated accuracy of the classifier that always predicts False.
never_5_clf = Never5Classifier( )
cross_val_score( never_5_clf, X_train, y_train_5, cv= 3 , scoring= "accuracy" )
from sklearn. model_selection import cross_val_predict
# Cross-validated (3-fold) predictions of the SGD classifier on the training set.
y_train_pred = cross_val_predict( sgd_clf, X_train, y_train_5, cv= 3 )
from sklearn. metrics import confusion_matrix
confusion_matrix( y_train_5, y_train_pred)
# Reference case: the confusion matrix when the predictions equal the labels.
y_train_perfect_predictions = y_train_5
confusion_matrix( y_train_5, y_train_perfect_predictions)
from sklearn. metrics import precision_score, recall_score
precision_score( y_train_5, y_train_pred)
# NOTE(review): hard-coded counts, presumably copied from an earlier confusion-matrix
# output; they will not follow the data if the cells above are re-run.
4344 / ( 4344 + 1307 )
recall_score( y_train_5, y_train_pred)
from sklearn. metrics import f1_score
f1_score( y_train_5, y_train_pred)
# NOTE(review): same hard-coded counts, here combined in an F1-style formula.
4344 / ( 4344 + ( 1077 + 1307 ) / 2 )
# Decision score of the fitted classifier for one sample, thresholded by hand
# at two different cut-offs.
y_scores = sgd_clf. decision_function( [ some_digit] )
y_scores
threshold = 0
y_some_digit_pred = (y_scores > threshold)
y_some_digit_pred
threshold = 200000
y_some_digit_pred = ( y_scores > threshold)
y_some_digit_pred
# Rebinds y_scores: from here on it holds cross-validated decision scores for
# the whole training set rather than the single-sample score above.
y_scores = cross_val_predict( sgd_clf, X_train, y_train_5, cv= 3 ,
method= "decision_function" )
y_scores. shape
from sklearn. metrics import precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve( y_train_5, y_scores)
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall as two curves over the decision threshold.

    Args:
        precisions: Precision values; the last entry is not plotted.
        recalls: Recall values; the last entry is not plotted.
        thresholds: Threshold values used as the x axis.
    """
    # The last precision/recall entry is dropped before plotting against thresholds.
    curves = (
        (precisions[:-1], "b--", "Precision"),
        (recalls[:-1], "g-", "Recall"),
    )
    for values, line_format, name in curves:
        plt.plot(thresholds, values, line_format, label=name, linewidth=2)
    plt.xlabel("Threshold", fontsize=16)
    plt.legend(loc="upper left", fontsize=16)
    plt.ylim([0, 1])
# Plot precision and recall against the threshold, restricted to a fixed x range.
plt. figure( figsize= ( 8 , 4 ) )
plot_precision_recall_vs_threshold( precisions, recalls, thresholds)
plt. xlim( [ - 700000 , 700000 ] )
save_fig( "precision_recall_vs_threshold_plot" )
plt. show( )
# Check whether thresholding the cross-validated scores at 0 reproduces y_train_pred.
( y_train_pred == ( y_scores > 0 ) ) . all ( )
# Predictions at a higher, hard-coded cut-off.
# NOTE(review): the "_90" suffix presumably refers to a ~90% precision target; confirm.
y_train_pred_90 = ( y_scores > 70000 )
precision_score( y_train_5, y_train_pred_90)
recall_score( y_train_5, y_train_pred_90)
def plot_precision_vs_recall(precisions, recalls):
    """Plot precision (y axis) against recall (x axis) on the unit square.

    Args:
        precisions: Precision values.
        recalls: Recall values, paired element-wise with ``precisions``.
    """
    plt.plot(recalls, precisions, "b-", linewidth=2)
    for set_label, text in ((plt.xlabel, "Recall"), (plt.ylabel, "Precision")):
        set_label(text, fontsize=16)
    unit_square = [0, 1, 0, 1]
    plt.axis(unit_square)
# Draw and save the precision-versus-recall curve.
plt. figure( figsize= ( 8 , 6 ) )
plot_precision_vs_recall( precisions, recalls)
save_fig( "precision_vs_recall_plot" )
plt. show( )
from sklearn. metrics import roc_curve
# Rebinds `thresholds`: it previously held the precision/recall thresholds.
fpr, tpr, thresholds = roc_curve( y_train_5, y_scores)
def plot_roc_curve(fpr, tpr, label=None):
    """Plot a ROC curve together with the dashed diagonal from (0, 0) to (1, 1).

    Args:
        fpr: False positive rates (x values).
        tpr: True positive rates (y values).
        label: Optional legend label for the curve.
    """
    diagonal = [0, 1]
    plt.plot(fpr, tpr, label=label, linewidth=2)
    plt.plot(diagonal, diagonal, 'k--')
    plt.axis([0, 1, 0, 1])
    for set_label, text in (
        (plt.xlabel, 'False Positive Rate'),
        (plt.ylabel, 'True Positive Rate'),
    ):
        set_label(text, fontsize=16)
# Draw and save the ROC curve computed from the SGD decision scores.
plt. figure( figsize= ( 8 , 6 ) )
plot_roc_curve( fpr, tpr)
save_fig( "roc_curve_plot" )
plt. show( )
from sklearn.ensemble import RandomForestClassifier
# roc_auc_score is called below, so it has to be imported here.
from sklearn.metrics import roc_auc_score
forest_clf = RandomForestClassifier(n_estimators=10, random_state=42)
# Cross-validated class probabilities from the forest.
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5,
                                    cv=3, method="predict_proba")
# The second probability column is used as the score for the positive class.
y_scores_forest = y_probas_forest[:, 1]
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)
# Overlay the SGD and random-forest ROC curves in one figure.
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, "b:", linewidth=2, label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
plt.legend(loc="lower right", fontsize=16)
save_fig("roc_curve_comparison_plot")
plt.show()
roc_auc_score(y_train_5, y_scores_forest)
# Cross-validated class predictions, computed once and reused for both metrics.
y_train_pred_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3)
precision_score(y_train_5, y_train_pred_forest)
recall_score(y_train_5, y_train_pred_forest)
4. 构造多元分类器
# Refit the same SGD classifier on the full digit labels (y_train, not y_train_5).
sgd_clf. fit( X_train, y_train)
sgd_clf. predict( [ some_digit] )
# Decision scores for the sample; argmax gives the position of the largest score.
some_digit_scores = sgd_clf. decision_function( [ some_digit] )
some_digit_scores
np. argmax( some_digit_scores)
sgd_clf.classes_
# Look up the class stored at (hard-coded) position 5.
sgd_clf. classes_[ 5 ]
from sklearn.multiclass import OneVsOneClassifier
# tol=None turns off the tolerance-based stopping criterion, so each underlying
# classifier trains for max_iter epochs. It replaces tol=-np.infty, whose
# np.infty alias was removed in NumPy 2.0.
ovo_clf = OneVsOneClassifier(SGDClassifier(max_iter=5, tol=None, random_state=42))
ovo_clf.fit(X_train, y_train)
ovo_clf.predict([some_digit])
# Number of underlying estimators that were trained.
len(ovo_clf.estimators_)
# Refit the random forest on the full digit labels.
forest_clf. fit( X_train, y_train)
forest_clf. predict( [ some_digit] )
forest_clf. predict_proba( [ some_digit] )
# 3-fold accuracy of the SGD classifier on the raw features.
cross_val_score( sgd_clf, X_train, y_train, cv= 3 , scoring= "accuracy" )
from sklearn. preprocessing import StandardScaler
# Standardise the features (cast to float64 first) and repeat the evaluation.
scaler = StandardScaler( )
X_train_scaled = scaler. fit_transform( X_train. astype( np. float64) )
cross_val_score( sgd_clf, X_train_scaled, y_train, cv= 3 , scoring= "accuracy" )
# Rebinds y_train_pred: now multiclass cross-validated predictions on the scaled features.
y_train_pred = cross_val_predict( sgd_clf, X_train_scaled, y_train, cv= 3 )
conf_mx = confusion_matrix( y_train, y_train_pred)
conf_mx
def plot_confusion_matrix(matrix):
    """Show ``matrix`` with ``matshow`` on a new 8x8 figure, with a colour bar.

    Args:
        matrix: 2-D array-like accepted by ``Axes.matshow``.
    """
    figure = plt.figure(figsize=(8, 8))
    heatmap = figure.add_subplot(111).matshow(matrix)
    figure.colorbar(heatmap)
# Raw confusion matrix as a greyscale image.
plt.matshow(conf_mx, cmap=plt.cm.gray)
save_fig("confusion_matrix_plot", tight_layout=False)
plt.show()

# Divide each row by its total, then blank the diagonal so only the
# off-diagonal (error) cells remain visible.
row_sums = conf_mx.sum(axis=1, keepdims=True)
norm_conf_mx = conf_mx / row_sums
np.fill_diagonal(norm_conf_mx, 0)
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
save_fig("confusion_matrix_errors_plot", tight_layout=False)
plt.show()

# Split the samples of two classes by (true label, predicted label).
cl_a, cl_b = 3, 5
is_a, is_b = (y_train == cl_a), (y_train == cl_b)
pred_a, pred_b = (y_train_pred == cl_a), (y_train_pred == cl_b)
X_aa = X_train[is_a & pred_a]
X_ab = X_train[is_a & pred_b]
X_ba = X_train[is_b & pred_a]
X_bb = X_train[is_b & pred_b]

# 2x2 panel: up to 25 examples from each of the four groups.
plt.figure(figsize=(8, 8))
for position, digits in ((221, X_aa), (222, X_ab), (223, X_ba), (224, X_bb)):
    plt.subplot(position)
    plot_digits(digits[:25], images_per_row=5)
save_fig("error_analysis_digits_plot")
plt.show()
5.构造多标签分类器
from sklearn. neighbors import KNeighborsClassifier
# Two boolean labels per sample: "label is 7 or more" and "label is odd".
y_train_large = ( y_train >= 7 )
y_train_odd = ( y_train % 2 == 1 )
# Stack the two label vectors as the columns of one target array.
y_multilabel = np. c_[ y_train_large, y_train_odd]
knn_clf = KNeighborsClassifier( )
knn_clf. fit( X_train, y_multilabel)
knn_clf. predict( [ some_digit] )
# Cross-validated predictions for both labels, scored with the macro-averaged F1.
y_train_knn_pred = cross_val_predict( knn_clf, X_train, y_multilabel, cv= 3 , n_jobs= - 1 )
f1_score( y_multilabel, y_train_knn_pred, average= "macro" )
6.构造多输出分类器
# Add random integer noise (drawn from 0..99) to every pixel of the train and
# test images. The two randint calls consume the global RNG in this order.
noise = np. random. randint( 0 , 100 , ( len ( X_train) , 784 ) )
X_train_mod = X_train + noise
noise = np. random. randint( 0 , 100 , ( len ( X_test) , 784 ) )
X_test_mod = X_test + noise
# The targets are the original, noise-free images.
y_train_mod = X_train
y_test_mod = X_test
# Show one noisy test image next to its clean counterpart.
some_index = 5500
plt. subplot( 121 ) ; plot_digit( X_test_mod[ some_index] )
plt. subplot( 122 ) ; plot_digit( y_test_mod[ some_index] )
save_fig( "noisy_digit_example_plot" )
plt. show( )
# Refit the KNN classifier to map noisy images to clean ones, then plot its
# prediction for the same test image.
knn_clf. fit( X_train_mod, y_train_mod)
clean_digit = knn_clf. predict( [ X_test_mod[ some_index] ] )
plot_digit( clean_digit)
save_fig( "cleaned_digit_example_plot" )
7. 优化分类器,提高辨别精度
from sklearn. model_selection import GridSearchCV
from sklearn. neighbors import KNeighborsClassifier
# Search grid: two weighting schemes times three neighbour counts.
param_grid = [ { 'weights' : [ "uniform" , "distance" ] , 'n_neighbors' : [ 3 , 4 , 5 ] } ]
# Rebinds knn_clf to a fresh, unfitted classifier for the search.
knn_clf = KNeighborsClassifier( )
grid_search = GridSearchCV( knn_clf, param_grid, cv= 5 , verbose= 3 , n_jobs= - 1 )
grid_search. fit( X_train, y_train)
grid_search. best_params_
grid_search. best_score_
from sklearn. metrics import accuracy_score
# Accuracy of the fitted grid search on the test split.
y_pred = grid_search. predict( X_test)
accuracy_score( y_test, y_pred)
8. 增强数据,提高辨别精度
# Import from scipy.ndimage directly; the scipy.ndimage.interpolation
# namespace is deprecated.
from scipy.ndimage import shift
def shift_image(image, dx, dy):
    """Return a flattened copy of a 28x28 image shifted by whole or fractional pixels.

    Args:
        image: Array reshapeable to (28, 28).
        dx: Shift along the column axis (positive moves content right).
        dy: Shift along the row axis (positive moves content down).

    Returns:
        1-D array with the shifted image; pixels shifted in from outside the
        frame are filled with 0.
    """
    image = image.reshape((28, 28))
    # scipy expects the shift per axis in (row, column) order, hence [dy, dx].
    shifted_image = shift(image, [dy, dx], cval=0, mode="constant")
    return shifted_image.reshape([-1])
# Shift one training image down and left by 5 pixels and show all three versions.
image = X_train[1000]
shifted_image_down = shift_image(image, 0, 5)
shifted_image_left = shift_image(image, -5, 0)
plt.figure(figsize=(12, 3))
panels = (
    ("Original", image),
    ("Shifted down", shifted_image_down),
    ("Shifted left", shifted_image_left),
)
# Subplot codes 131, 132, 133: one row, three columns.
for position, (title, pixels) in enumerate(panels, start=131):
    plt.subplot(position)
    plt.title(title, fontsize=14)
    plt.imshow(pixels.reshape(28, 28), interpolation="nearest", cmap="Greys")
plt.show()
# Start from the original training set, then append one shifted copy of every
# image for each of the four one-pixel directions, with matching labels.
X_train_augmented = list(X_train)
y_train_augmented = list(y_train)
for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):
    X_train_augmented.extend(shift_image(image, dx, dy) for image in X_train)
    y_train_augmented.extend(y_train)
X_train_augmented = np.array(X_train_augmented)
y_train_augmented = np.array(y_train_augmented)

# Shuffle images and labels with one shared permutation.
shuffle_idx = np.random.permutation(len(X_train_augmented))
X_train_augmented, y_train_augmented = (
    X_train_augmented[shuffle_idx],
    y_train_augmented[shuffle_idx],
)

# Retrain KNN with the best grid-search parameters and score it on the test split.
knn_clf = KNeighborsClassifier(**grid_search.best_params_)
knn_clf.fit(X_train_augmented, y_train_augmented)
y_pred = knn_clf.predict(X_test)
accuracy_score(y_test, y_pred)