Kaggle-digit-v0

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
import time 
In [2]:
path = '/Users/mhl/Documents/KaggleDM/digit'
train_data = pd.read_csv(path+'/train.csv')
test_data = pd.read_csv(path+'/test.csv')
In [3]:
print (train_data.info(), test_data.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42000 entries, 0 to 41999
Columns: 785 entries, label to pixel783
dtypes: int64(785)
memory usage: 251.5 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28000 entries, 0 to 27999
Columns: 784 entries, pixel0 to pixel783
dtypes: int64(784)
memory usage: 167.5 MB
(None, None)
In [4]:
train_x_data = train_data.drop(['label'], axis=1)
train_y_data = train_data['label']
print(train_x_data.head(), train_y_data.head())
print(train_x_data.info())
(   pixel0  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  pixel8  \
0       0       0       0       0       0       0       0       0       0   
1       0       0       0       0       0       0       0       0       0   
2       0       0       0       0       0       0       0       0       0   
3       0       0       0       0       0       0       0       0       0   
4       0       0       0       0       0       0       0       0       0   

   pixel9    ...     pixel774  pixel775  pixel776  pixel777  pixel778  \
0       0    ...            0         0         0         0         0   
1       0    ...            0         0         0         0         0   
2       0    ...            0         0         0         0         0   
3       0    ...            0         0         0         0         0   
4       0    ...            0         0         0         0         0   

   pixel779  pixel780  pixel781  pixel782  pixel783  
0         0         0         0         0         0  
1         0         0         0         0         0  
2         0         0         0         0         0  
3         0         0         0         0         0  
4         0         0         0         0         0  

[5 rows x 784 columns], 0    1
1    0
2    1
3    4
4    0
Name: label, dtype: int64)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42000 entries, 0 to 41999
Columns: 784 entries, pixel0 to pixel783
dtypes: int64(784)
memory usage: 251.2 MB
None
In [5]:
import numpy as np
classes = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
num_class = len(classes)
samples = 10
fig = plt.figure(figsize=[15, 10])
im_index = 1
for i in classes:
    i_index = np.nonzero([i==y for y in train_y_data])
    #print(i_index)
    i_index_show = np.random.choice(i_index[0], samples, replace=False)
    print(i_index_show, time.time())
    for j in i_index_show:
        ax = plt.subplot(samples, num_class, im_index)
        ax.imshow(train_x_data.loc[j].reshape((28,28)))
        im_index = im_index + 1
        ax.axis('off')
(array([18728, 38911, 29940,  3728, 41256, 32423,  3599,  7996, 41355, 32530]), 1513496178.672991)
/usr/local/lib/python2.7/site-packages/ipykernel_launcher.py:14: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead
  
(array([20976,  5265, 26521, 17075, 41954,  7936, 16476, 31845, 23789,   708]), 1513496179.278652)
(array([25063,  4677, 22101, 19670, 35288, 17263, 39527, 17159, 19174, 24182]), 1513496179.828415)
(array([ 1198, 19375, 39986, 14812, 24147, 33269, 35975, 17606,  9827, 13836]), 1513496180.378412)
(array([24169, 22051, 37003, 30149, 30151,  5658, 18545,  6849, 33402,  4909]), 1513496181.006571)
(array([ 6544, 31383, 40749, 15719, 18827,  2917,  4875, 26055,   144, 41942]), 1513496181.562813)
(array([20459, 23420, 41051, 30173, 12670,  1150,  2575, 17813, 29001, 39903]), 1513496182.272629)
(array([ 2210, 14003, 33382, 31120, 32506, 25176, 16713, 28360, 31741, 30453]), 1513496182.821924)
(array([27942, 16383,  2066, 27957, 38065,  9397, 34033, 25259,  8491,  2906]), 1513496183.561259)
(array([ 5099,  7170, 28900, 38268, 39337, 31594,  6612, 14149, 24331, 14138]), 1513496184.135468)
<img src=""" style="box-sizing: border-box; border: 0px; vertical-align: middle; max-width: 100%; height: auto;" alt="">
In [6]:
'''
knn = KNeighborsClassifier(n_neighbors=5, weights='distance')
knn.fit(train_x_data, train_y_data)
knn_score = cross_val_score(knn, train_x_data, train_y_data, cv=5).mean()
print knn_score
'''
Out[6]:
"\nknn = KNeighborsClassifier(n_neighbors=5, weights='distance')\nknn.fit(train_x_data, train_y_data)\nknn_score = cross_val_score(knn, train_x_data, train_y_data, cv=5).mean()\nprint knn_score\n"

Knn spends huge time to provide the result, it is not unacceptable in real life. per time 0.06s, 42000*0.06/60 = 42min, too long time to calculate.

In [7]:
knn = KNeighborsClassifier(n_neighbors=5, weights='distance')
knn.fit(train_x_data, train_y_data)
Out[7]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='distance')
In [8]:
print(time.time())
knn.predict(train_x_data.loc[0].reshape(1, 784))
print(time.time())
1513496201.64
1513496201.7
/usr/local/lib/python2.7/site-packages/ipykernel_launcher.py:2: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead
  

since it will take lots of time to calculate the result, we split the calculation to two part

In [9]:
print(time.time())
submit = pd.DataFrame({'ImageId': range(len(test_data))})
initial_label = 0
submit['Label'] = initial_label
part_one = len(test_data)/2
submit.loc[range(part_one), 'Label'] = knn.predict(test_data[:part_one])
print(time.time())
1513496201.71
1513496997.43
In [10]:
print(time.time())
submit.loc[range(part_one, len(test_data)), 'Label'] = knn.predict(test_data[part_one:len(test_data)])
print(time.time())
1513496997.44
1513497781.03
In [14]:
submit.loc[range(len(test_data)), 'ImageId'] = range(1, len(test_data)+1)
In [15]:
submit.head()
Out[15]:
  ImageId Label
0 1 2
1 2 0
2 3 9
3 4 9
4 5 3
In [16]:
submit.to_csv("output.csv", index=False)
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值