与上一篇文章不同,这次使用完整的MNIST数据集,下载地址:MNIST。代码从 ./data/train.csv 读取数据:第一列是标签,其余784列是28×28图像的像素值。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import time
import cv2
from sklearn.cross_validation import train_test_split
# Extract HOG features: 784 raw pixel values -> 324 descriptor values per image
def get_hog_features(trainset):
    """Compute a HOG descriptor for every image row in ``trainset``.

    Each row is reshaped to a 28x28 image, cast to ``uint8`` (the 0-255
    range matches pixel intensities) and passed to an OpenCV HOG descriptor
    whose parameters are loaded from ``./hog.xml``.

    Args:
        trainset: iterable of array-like rows, each reshapable to (28, 28).

    Returns:
        ``numpy.ndarray`` of shape ``(n_images, 324)``.
    """
    descriptor = cv2.HOGDescriptor('./hog.xml')  # parameters come from the config file
    hog_vectors = [
        descriptor.compute(row.reshape(28, 28).astype(np.uint8))
        for row in trainset
    ]
    # First dimension is free; the second is the per-image descriptor length
    # (324 = 18*18 per the original author — depends on hog.xml, TODO confirm).
    return np.array(hog_vectors).reshape(-1, 324)
def Predict(testset, trainset, train_labels, n_neighbors=None):
    """Classify each test vector by a brute-force k-nearest-neighbour vote.

    For every row of ``testset`` the Euclidean distance to every row of
    ``trainset`` is computed, the closest ``n_neighbors`` training samples
    are selected, and the label that occurs most often among them is
    predicted. When several labels share the highest vote count, the
    smallest label wins.

    The index of each processed test sample is printed as a progress
    indicator, with a line break after every 100 samples.

    Args:
        testset: array-like of shape (n_test, D).
        trainset: array-like of shape (n_train, D).
        train_labels: array-like of shape (n_train,) with one label per
            training row.
        n_neighbors: number of neighbours that vote. ``None`` (the default)
            uses the module-level ``k``, so existing three-argument calls
            behave as before.

    Returns:
        ``numpy.ndarray`` with one predicted label per test row.

    Raises:
        ValueError: if ``trainset`` and ``train_labels`` differ in length,
            or if the neighbour count is not in ``1..n_train``.
    """
    num_neighbors = k if n_neighbors is None else n_neighbors

    # Cast to float so integer inputs (e.g. uint8 pixels) cannot wrap around
    # when two vectors are subtracted.
    train_matrix = np.asarray(trainset, dtype=np.float64)
    test_matrix = np.asarray(testset, dtype=np.float64)
    label_array = np.asarray(train_labels)

    if train_matrix.shape[0] != label_array.shape[0]:
        raise ValueError(
            'trainset has %d rows but train_labels has %d entries'
            % (train_matrix.shape[0], label_array.shape[0])
        )
    if not 1 <= num_neighbors <= label_array.shape[0]:
        raise ValueError(
            'number of neighbours must be between 1 and %d, got %r'
            % (label_array.shape[0], num_neighbors)
        )

    predict = []
    for index, test_vec in enumerate(test_matrix):  # test_vec shape (D,)
        print(index, end=" ")  # progress: index of the current test sample
        if (index + 1) % 100 == 0:
            print()

        # Distances from this test vector to every training vector at once.
        dists = np.linalg.norm(train_matrix - test_vec, axis=1)

        # A stable sort keeps the result deterministic: among equal
        # distances the training sample with the lower index is preferred.
        nearest = np.argsort(dists, kind="stable")[:num_neighbors]

        # Tally votes by label value rather than by position in a k-sized
        # list, so labels are not required to be smaller than k.
        candidates, votes = np.unique(label_array[nearest], return_counts=True)

        # ``candidates`` is sorted and argmax returns the first maximum, so
        # ties are resolved towards the smallest label.
        predict.append(candidates[np.argmax(votes)])

    return np.array(predict)
k = 10  # neighbour count read by Predict; the best value can be found by cross-validation
if __name__ == '__main__':
    # Script driver: load the CSV, extract HOG features, hold out 33% of the
    # samples, classify them with Predict and report timing and accuracy.
    print('Start reading data:')
    load_started = time.time()

    # Column 0 of the CSV holds the label; the remaining columns are pixels.
    table = pd.read_csv('./data/train.csv').values
    labels = table[:, 0]
    img = table[:, 1:]
    print(img.shape)
    print(labels.shape)

    features = get_hog_features(img)
    print(features.shape)

    # NOTE(review): train_test_split is imported at the top of the file from
    # sklearn.cross_validation, which was removed in scikit-learn 0.20;
    # sklearn.model_selection is its current home — confirm the installed version.
    split = train_test_split(
        features, labels, test_size=0.33, random_state=11111
    )
    train_features, test_features, train_labels, test_labels = split

    load_finished = time.time()
    print('read data cost %f seconds' % (load_finished - load_started))

    # kNN has no fitting step, so the "training" interval only brackets the
    # two status messages.
    print('Starting training:')
    print('knn do not need to train!')
    train_finished = time.time()
    print('training cost %f seconds' % (train_finished - load_finished))

    print('Starting predicting:')
    test_predict = Predict(test_features, train_features, train_labels)
    predict_finished = time.time()
    print('predicting cost %f seconds' % (predict_finished - train_finished))

    # Fraction of held-out samples whose predicted label equals the true one.
    hits = test_labels == test_predict.reshape(len(test_labels))
    accuracy = np.sum(hits) / len(test_labels)
    print('The accuracy is %f!' % accuracy)
'''
Start reading data:
(42000, 784)
(42000,)
(42000, 324)
read data cost 6.009209 seconds
Starting training:
knn do not need to train!
training cost 0.000033 seconds
Starting predicting:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...
to be continued
运行时间过长不算出accuracy了
'''