# Dataset: binarized MNIST; download address: MNIST
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import time
import cv2
from sklearn.cross_validation import train_test_split
def get_hog_features(trainset, config_path='./hog.xml', feature_dim=324):
    """Extract one HOG descriptor per image (784 raw pixels -> 324 features).

    Args:
        trainset: iterable of flattened images; each item is reshaped to
            28x28 and cast to uint8 before being handed to OpenCV.
        config_path: HOG descriptor configuration file passed to
            ``cv2.HOGDescriptor``.
        feature_dim: expected length of a single descriptor
            (324 = 18 * 18 with the default configuration).

    Returns:
        ``numpy.ndarray`` of shape ``(number_of_images, feature_dim)``.

    Raises:
        ValueError: if a computed descriptor does not contain exactly
            ``feature_dim`` values. Previously such a mismatch was hidden by
            ``reshape(-1, 324)``, which silently produced a wrong row count.
    """
    hog = cv2.HOGDescriptor(config_path)  # descriptor settings come from the config file

    descriptors = []
    for image in trainset:
        # uint8 spans 0-255, the same range as the pixel values.
        cv_img = image.reshape(28, 28).astype(np.uint8)
        # Flatten so the result is the same whether OpenCV returns a column
        # vector or a flat vector.
        descriptor = np.asarray(hog.compute(cv_img)).reshape(-1)
        if descriptor.size != feature_dim:
            raise ValueError(
                'HOG descriptor has %d values, expected %d; check %s'
                % (descriptor.size, feature_dim, config_path)
            )
        descriptors.append(descriptor)

    # First axis: one row per image; second axis: the descriptor values.
    return np.array(descriptors).reshape(-1, feature_dim)
def Train(trainset, train_labels, *, lr=None, negative_label=None,
          max_updates=None, patience=100000):
    """Train a binary perceptron with randomly sampled single-example updates.

    Each iteration draws one example at random (``np.random.randint``). If it
    is misclassified, i.e. ``yi * (w . xi + b) <= 0``, the parameters are
    updated with ``w += lr * yi * xi`` and ``b += lr * yi``.

    Training stops when either
      * more than ``patience`` consecutive draws were classified correctly, or
      * more than ``max_updates`` corrective updates have been applied.

    Args:
        trainset: 2-D array of shape ``(N, D)``, one feature vector per row.
        train_labels: sequence of ``N`` labels.
        lr: learning rate; defaults to the module-level ``learning_rate``.
        negative_label: label mapped to class -1 (every other label is +1);
            defaults to the module-level ``object_num``.
        max_updates: cap on the number of corrective updates; defaults to the
            module-level ``study_total``. The update counter used to be
            compared against the consecutive-correct bound, which left
            ``study_total`` unused.
        patience: number of consecutive correct draws after which the model
            is considered trained.

    Returns:
        ``(w, b)``: ``w`` is a float array of shape ``(D, 1)`` and ``b`` a float.

    Raises:
        ValueError: if ``train_labels`` is empty.
    """
    # Hyper-parameters that were not passed fall back to the module configuration.
    if lr is None:
        lr = learning_rate
    if negative_label is None:
        negative_label = object_num
    if max_updates is None:
        max_updates = study_total

    trainset = np.asarray(trainset)
    trainset_size = len(train_labels)
    if trainset_size == 0:
        raise ValueError('Train requires at least one training example')

    # Size the weights from the data instead of a global constant.
    # w keeps the (D, 1) column shape callers receive; w_flat is a 1-D view of
    # the same memory, so in-place updates on it also update w.
    w = np.zeros((trainset.shape[1], 1))
    w_flat = w[:, 0]
    b = 0.0

    study_count = 0     # corrective updates applied so far
    nochange_count = 0  # consecutive correctly classified draws

    while True:
        nochange_count += 1
        if nochange_count > patience:
            break

        # Pick one random example; the upper bound of randint is exclusive.
        index = np.random.randint(0, trainset_size)
        sample = trainset[index]
        yi = 1 if train_labels[index] != negative_label else -1

        # A non-positive functional margin means the example is misclassified.
        if yi * (sample.dot(w_flat) + b) <= 0:
            w_flat += lr * yi * sample
            b += lr * yi
            study_count += 1
            if study_count > max_updates:
                break
            nochange_count = 0

    return w, b
def Predict(test_set, w, b):
    """Classify every sample with the linear rule ``x . w + b > 0``.

    Args:
        test_set: 2-D array-like of shape ``(N, D)``, one sample per row.
        w: weight array of shape ``(D, 1)`` (or ``(D,)``).
        b: scalar bias.

    Returns:
        Boolean ``numpy.ndarray`` with one entry per sample, ``True`` where
        the score is strictly positive. Its shape is ``(N, 1)`` for a
        ``(D, 1)`` weight column and ``(N,)`` for a flat weight vector.
        Empty input yields an empty boolean array.
    """
    samples = np.asarray(test_set)
    if samples.size == 0:
        # Nothing to score; keep the trailing dimensions a non-empty call
        # would produce.
        return np.zeros((0,) + np.shape(w)[1:], dtype=bool)
    # One matrix product scores all samples at once instead of a Python loop.
    return (samples.dot(w) + b) > 0
# Module-level configuration for the perceptron script.
feature_length = 324 # dimensionality of one HOG feature vector
learning_rate = 0.0001 # perceptron learning rate (step applied on each update)
object_num = 0 # label mapped to the negative class (-1) in Train; every other label is +1
study_total = 10000 # intended maximum number of learning iterations
if __name__ == '__main__':
    print('Start reading data:')
    t_start = time.time()

    # Load the CSV into a pandas DataFrame; header=0 treats the first line as
    # the header so the data starts on the following line.
    # Each row holds the label in the first column followed by the pixel
    # columns (28 * 28 = 784).
    # NOTE(review): the file is expected to already contain binary labels
    # (presumably 0 stays 0 and every label > 0 was set to 1) - confirm.
    frame = pd.read_csv('./data/train_binary.csv', header=0)

    # Plain numpy array; (42000, 785) according to the sample output.
    table = frame.values
    labels = table[:, 0]   # first column: label
    img = table[:, 1:]     # remaining columns: pixel data
    print(img.shape)
    print(labels.shape)

    # HOG feature extraction, 784 -> 324 values per image.
    features = get_hog_features(img)
    print(features.shape)

    split = train_test_split(features, labels, test_size=0.33, random_state=11111)
    train_features, test_features, train_labels, test_labels = split
    print(train_features.shape)
    print(test_features.shape)

    t_loaded = time.time()
    print('read data cost %f seconds' % (t_loaded - t_start))

    print('Starting training:')
    w, b = Train(train_features, train_labels)
    t_trained = time.time()
    print('training cost %f seconds' % (t_trained - t_loaded))

    print('Starting predicting:')
    test_predict = Predict(test_features, w, b)
    t_predicted = time.time()
    print('predicting cost %f seconds' % (t_predicted - t_trained))

    # Label 0 is the negative class (-1) and label 1 the positive class (+1),
    # so the boolean predictions compare directly against the 0/1 labels.
    hits = test_labels == test_predict.reshape(len(test_labels))
    accuracy = np.sum(hits) / len(test_labels)
    print('The accuracy is: %f!' % accuracy)
'''
output:
Start reading data:
(42000, 784)
(42000,)
(42000, 324)
(28140, 324)
(13860, 324)
read data cost 6.194034 seconds
Starting training:
training cost 46.450333 seconds
Starting predicting:
predicting cost 0.081242 seconds
The accuracy is: 0.996609!
'''