原文链接:http://blog.csdn.net/wiking__acm/article/details/43491611
手写体数字识别是一个比较简单的问题。主要难点在于特征太多(每张 28×28 的图片有 784 个像素特征),所以先用 PCA 降维(下面的代码降到 64 维),再用 kNN 分类,就可以得到准确率相当不错的结果。
ipython notebook 下根据测试数据生成数字图案的代码:
%pylab
import pandas as pd
# Preview one digit from the test set as an image.
# NOTE(review): assumes every row of test.csv is one flattened 28x28 image
# (784 pixel values, no label column) -- confirm against the data file.
img = pd.read_csv('test.csv')
# Row index 1, i.e. the second image in the file.
p1 = img.values[1]
# Fold the flat pixel vector back into a 28x28 grid, row by row; this is the
# same layout the index i*28+j produces, without the hand-written loops.
# reshape raises ValueError if the row does not hold exactly 784 values.
pix = p1.reshape(28, 28)
# plt is provided by the %pylab magic at the top of the notebook cell.
plt.imshow(pix)
pca+knn 代码:
import csv
import sys

import numpy
import pandas as pd
from sklearn.cross_validation import cross_val_score
from sklearn.decomposition import PCA
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
# Load the labelled training set and the set we have to predict labels for.
input_df = pd.read_csv('train.csv', header=0)
submit_df = pd.read_csv('test.csv', header=0)
# merge the two DataFrames into one so PCA is fitted on every available row;
# ignore_index=True gives the merged frame a fresh 0..n-1 index directly.
df = pd.concat([input_df, submit_df], ignore_index=True)
# Restore the training file's column order after the concat.
# reindex(columns=...) replaces reindex_axis, which newer pandas removed.
df = df.reindex(columns=input_df.columns)
# Column 0 of the training data is used as the label, the rest as features.
features = input_df.values[:, 1:]
labels = input_df.values[:, 0]
# Reduce the feature columns to 64 principal components. The projection is
# fitted on train + test rows together, then applied to each set separately.
pca = PCA(n_components=64)
pca.fit(df.values[:, 1:])
features = pca.transform(features)
# NOTE(review): every column of submit_df is passed through, which assumes
# test.csv has no label column -- confirm against the data file.
pred_data = pca.transform(submit_df.values)
# k-nearest-neighbours classifier with the library's default settings.
clf = KNeighborsClassifier().fit(features, labels)
# To estimate accuracy before submitting, evaluate
# cross_val_score(clf, features, labels) here.
output = clf.predict(pred_data).astype(int)
# Image ids start at 1. Derive the count from the predictions instead of
# hard-coding the number of test rows, so ids and labels cannot drift apart.
ids = range(1, len(output) + 1)
# write to csv file
# The csv module needs a binary-mode file on Python 2, but a text-mode file
# opened with newline="" on Python 3.
if sys.version_info[0] < 3:
    predictions_file = open("KNN.csv", "wb")
else:
    predictions_file = open("KNN.csv", "w", newline="")
# The with-block closes the file even if one of the writes raises.
with predictions_file:
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["ImageId", "Label"])
    open_file_object.writerows(zip(ids, output))
# The parenthesised form prints the same text on Python 2 and Python 3.
print("done.")