import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
import time
In [2]:
# Load the training and test tables from the CSV files in the local data
# directory.
path = '/Users/mhl/Documents/KaggleDM/digit'
train_csv = path + '/train.csv'
test_csv = path + '/test.csv'
train_data = pd.read_csv(train_csv)
test_data = pd.read_csv(test_csv)
In [3]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so wrapping the calls in print() only appended a stray line of Nones.
train_data.info()
test_data.info()
In [4]:
# Split the training table into features (every column except 'label') and
# the target column.
train_x_data = train_data.drop(['label'], axis=1)
train_y_data = train_data['label']
print(train_x_data.head(), train_y_data.head())
# info() prints its own report and returns None; print() around it would
# only add a trailing "None".
train_x_data.info()
In [5]:
import numpy as np

# Draw a grid of randomly chosen training images: `samples` images for each
# label in `classes`, each row reshaped to 28x28 for display.
classes = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
num_class = len(classes)
samples = 10
fig = plt.figure(figsize=[15, 10])
im_index = 1
# Compare against the raw label array once per class instead of building a
# Python list of booleans element by element.
label_values = train_y_data.values
for i in classes:
    # Positional indices of every training row whose label equals i.
    i_index = np.nonzero(label_values == i)
    #print(i_index)
    i_index_show = np.random.choice(i_index[0], samples, replace=False)
    print(i_index_show, time.time())
    for j in i_index_show:
        # Subplots are numbered row by row, so using `samples` as the column
        # count gives one row per class. The layout is identical to the
        # previous (samples, num_class) grid while both values are 10, and it
        # stays aligned if `samples` is changed.
        ax = plt.subplot(num_class, samples, im_index)
        # j is a position (from np.nonzero), hence iloc; Series.reshape has
        # been removed from pandas, so reshape the underlying NumPy array.
        ax.imshow(train_x_data.iloc[j].values.reshape((28, 28)))
        im_index = im_index + 1
        ax.axis('off')
In [6]:
# Disabled experiment kept as a bare string literal, so none of it executes:
# 5-fold cross-validation of the same KNN configuration that is fitted in the
# following cell.
# NOTE(review): `print knn_score` is Python 2 syntax; it has to become
# print(knn_score) if this snippet is ever re-enabled under Python 3.
'''
knn = KNeighborsClassifier(n_neighbors=5, weights='distance')
knn.fit(train_x_data, train_y_data)
knn_score = cross_val_score(knn, train_x_data, train_y_data, cv=5).mean()
print knn_score
'''
Out[6]:
In [7]:
# Build a 5-nearest-neighbour classifier with distance-weighted votes and fit
# it on the full training set.
knn = KNeighborsClassifier(weights='distance', n_neighbors=5)
knn.fit(X=train_x_data, y=train_y_data)
Out[7]:
In [8]:
# Print a timestamp before and after predicting one training row to see how
# long a single prediction takes.
print(time.time())
# Series.reshape no longer exists in pandas. Selecting with a list of labels
# returns a one-row DataFrame (one sample, all feature columns), which is the
# 2-D input predict() needs.
knn.predict(train_x_data.loc[[0]])
print(time.time())
In [9]:
# Create the submission table with a placeholder label, then predict the
# first half of the test set, printing timestamps around the work.
print(time.time())
submit = pd.DataFrame({'ImageId': range(len(test_data))})
initial_label = 0
submit['Label'] = initial_label
# Floor division keeps part_one an int on Python 3 as well; true division
# yields a float, which range() and positional slicing both reject.
part_one = len(test_data) // 2
submit.loc[range(part_one), 'Label'] = knn.predict(test_data[:part_one])
print(time.time())
In [10]:
# Predict the remaining test rows (from part_one to the end) and store them
# in the submission table, printing timestamps around the work.
print(time.time())
# part_one is defined in the preceding cell; int() keeps this cell working
# even when that value is a whole-number float, which range() and slicing
# would otherwise reject.
second_half_start = int(part_one)
submit.loc[range(second_half_start, len(test_data)), 'Label'] = knn.predict(test_data[second_half_start:])
print(time.time())
In [14]:
# Replace the 0-based ImageId values assigned when `submit` was created with
# the sequence 1..len(test_data).
n_test_rows = len(test_data)
row_labels = range(n_test_rows)
one_based_ids = range(1, n_test_rows + 1)
submit.loc[row_labels, 'ImageId'] = one_based_ids
In [15]:
# Preview the first five rows of the submission table.
submit.head(n=5)
Out[15]:
In [16]:
# Write the submission table to disk without the DataFrame index column.
submit.to_csv(path_or_buf="output.csv", index=False)