MNIST数据集
MNIST数据集包括70000 张 28 ×28的手写数字灰度图像,图像数据已经被转换为28 × 28 = 784维的向量形式存储,标签对应的为10维向量存储,如:数字3对应的标签为[0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0]。数据集官网上分为四个,60000张图像的训练集,60000张图像的训练集标签,10000张测试集和10000张测试集标签。
直接去Mnist官网下载就行
最小二乘法实现数字分类原理
非齐次线性最小二乘的标准形式:
Ax=B
其中A、B分别为测量数据构成的矩阵与向量,X为待求参数向量。
其标准形式下的最小二乘解为:
x=〖(A^T A)〗^(-1) A^T B
在本次实验中,我们应用最小二乘模型对数字0和非0数字进行分类:
A_train矩阵应该通过训练集图片来构造。
B_train向量是对应的训练集数字标签。
解出x后让测试集的A_test矩阵与x相乘得到了预测的测试集标签b_(test,predict):如果 b_(test,predict) 为1 说明预测结果是正确的,否则为-1预测错误。根据真实的测试集标签b_test和b_(test,predict)可以算出来TP,TN,FP,FN,因此也就可以求得精确率、召回率、错误率等指标。在处理标签b_(test,predict)之前,要进行转换,将实际的标签0和非0转换为1和-1。
实验结果图
代码流程:
1、首先是分别读取训练集、训练集标签、测试集、测试集标签、对于读取文件首先是二进制读取,而且要解析文件头,即前四行数据信息代表着文件标志,图片数量、和图片长宽。
2、之后获得数据在缓冲中的偏移指针,目的是指向数据集中不同图像位置获取每张图片的像素信息,记录成四个向量,分别是train_images(60000,784),test_image(10000,784),train_label(60000),test_label(10000)。
3、之后构造一个特征a矩阵用来存放图像中的有效信息,即非0像素点,方法是遍历训练集60000张图的784个像素点,对于每一张图对应的某点像素元素平方和开根号,如果大于600则视作为A矩阵的一个特征点,从而构造出较少冗余信息的特征矩阵A.
4、然后调用numpy库里的最小二乘法函数qr(QR分解法),步骤是
5、解出x后让测试集的A_test矩阵与x相乘得到了预测的测试集标签b_(test,predict):如果 b_(test,predict)
为1
说明预测结果是正确的,否则为-1预测错误,即区分0与非0。根据真实的测试集标签b_test和b_(test,predict)可以算出来TP,TN,FP,FN,因此也就可以求得精确率、召回率、错误率等指标。
完整代码
import numpy as np
import struct
import prettytable as pt
import time
# 训练集文件
train_images_idx3_ubyte_file = r'E:\project\智能算法\mnist\train-images.idx3-ubyte'
# 训练集标签文件
train_labels_idx1_ubyte_file = r'E:\project\智能算法\mnist\train-labels.idx1-ubyte'
# 测试集文件
test_images_idx3_ubyte_file = r'E:\project\智能算法\mnist\t10k-images.idx3-ubyte'
# 测试集标签文件
test_labels_idx1_ubyte_file = r'E:\project\智能算法\mnist\t10k-labels.idx1-ubyte'
def decode_idx3_ubyte(idx3_ubyte_file):
"""
解析idx3文件的通用函数
:param idx3_ubyte_file: idx3文件路径
:return: 数据集
"""
# 读取二进制数据
bin_data = open(idx3_ubyte_file, 'rb').read()
# 解析文件头信息,依次为魔数、图片数量、每张图片高、每张图片宽
offset = 0
fmt_header = '>iiii' # 因为数据结构中前4行的数据类型都是32位整型,所以采用i格式,但我们需要读取前4行数据,所以需要4个i。我们后面会看到标签集中,只使用2个ii。
magic_number, num_images, num_rows, num_cols = struct.unpack_from(fmt_header, bin_data, offset)
print('魔数:%d, 图片数量: %d张, 图片大小: %d*%d' % (magic_number, num_images, num_rows, num_cols))
# 解析数据集
image_size = num_rows * num_cols
offset += struct.calcsize(fmt_header) # 获得数据在缓存中的指针位置,从前面介绍的数据结构可以看出,读取了前4行之后,指针位置(即偏移位置offset)指向0016。
print(offset)
fmt_image = '>' + str(
image_size) + 'B' # 图像数据像素值的类型为unsigned char型,对应的format格式为B。这里还有加上图像大小784,是为了读取784个B格式数据,如果没有则只会读取一个值(即一副图像中的一个像素值)
print(fmt_image, offset, struct.calcsize(fmt_image))
images = np.empty((num_images, num_rows, num_cols))
# plt.figure()
for i in range(num_images):
if (i + 1) % 10000 == 0:
print('已解析 %d' % (i + 1) + '张')
print(offset)
images[i] = np.array(struct.unpack_from(fmt_image, bin_data, offset)).reshape((num_rows, num_cols))
# print(images[i])
offset += struct.calcsize(fmt_image)
# plt.imshow(images[i],'gray')
# plt.pause(0.00001)
# plt.show()
# plt.show()
return images
def decode_idx1_ubyte(idx1_ubyte_file):
"""
解析idx1文件的通用函数
:param idx1_ubyte_file: idx1文件路径
:return: 数据集
"""
# 读取二进制数据
bin_data = open(idx1_ubyte_file, 'rb').read()
# 解析文件头信息,依次为魔数和标签数
offset = 0
fmt_header = '>ii'
magic_number, num_images = struct.unpack_from(fmt_header, bin_data, offset)
print('魔数:%d, 图片数量: %d张' % (magic_number, num_images))
# 解析数据集
offset += struct.calcsize(fmt_header)
fmt_image = '>B'
labels = np.empty(num_images)
for i in range(num_images):
if (i + 1) % 10000 == 0:
print('已解析 %d' % (i + 1) + '张')
labels[i] = struct.unpack_from(fmt_image, bin_data, offset)[0]
offset += struct.calcsize(fmt_image)
return labels
def load_train_images(idx_ubyte_file=train_images_idx3_ubyte_file):
"""
TRAINING SET IMAGE FILE (train-images-idx3-ubyte):
[offset] [type] [value] [description]
0000 32 bit integer 0x00000803(2051) magic number 用来标志文件的特性2051表示图片集 2049表示标签集
0004 32 bit integer 60000 number of images
0008 32 bit integer 28 number of rows
0012 32 bit integer 28 number of columns
0016 unsigned byte ?? pixel
0017 unsigned byte ?? pixel
........
xxxx unsigned byte ?? pixel
Pixels are organized row-wise. Pixel values are 0 to 255. 0 means background (white), 255 means foreground (black).
:param idx_ubyte_file: idx文件路径
:return: n*row*col维np.array对象,n为图片数量
"""
return decode_idx3_ubyte(idx_ubyte_file)
def load_train_labels(idx_ubyte_file=train_labels_idx1_ubyte_file):
"""
TRAINING SET LABEL FILE (train-labels-idx1-ubyte):
[offset] [type] [value] [description]
0000 32 bit integer 0x00000801(2049) magic number (MSB first)
0004 32 bit integer 60000 number of items
0008 unsigned byte ?? label
0009 unsigned byte ?? label
........
xxxx unsigned byte ?? label
The labels values are 0 to 9.
:param idx_ubyte_file: idx文件路径
:return: n*1维np.array对象,n为图片数量
"""
return decode_idx1_ubyte(idx_ubyte_file)
def load_test_images(idx_ubyte_file=test_images_idx3_ubyte_file):
"""
TEST SET IMAGE FILE (t10k-images-idx3-ubyte):
[offset] [type] [value] [description]
0000 32 bit integer 0x00000803(2051) magic number
0004 32 bit integer 10000 number of images
0008 32 bit integer 28 number of rows
0012 32 bit integer 28 number of columns
0016 unsigned byte ?? pixel
0017 unsigned byte ?? pixel
........
xxxx unsigned byte ?? pixel
Pixels are organized row-wise. Pixel values are 0 to 255. 0 means background (white), 255 means foreground (black).
:param idx_ubyte_file: idx文件路径
:return: n*row*col维np.array对象,n为图片数量
"""
return decode_idx3_ubyte(idx_ubyte_file)
def load_test_labels(idx_ubyte_file=test_labels_idx1_ubyte_file):
"""
TEST SET LABEL FILE (t10k-labels-idx1-ubyte):
[offset] [type] [value] [description]
0000 32 bit integer 0x00000801(2049) magic number (MSB first)
0004 32 bit integer 10000 number of items
0008 unsigned byte ?? label
0009 unsigned byte ?? label
........
xxxx unsigned byte ?? label
The labels values are 0 to 9.
:param idx_ubyte_file: idx文件路径
:return: n*1维np.array对象,n为图片数量
"""
return decode_idx1_ubyte(idx_ubyte_file)
def pretreat(train_labels, test_labels, train_images, test_images):
train_images_column = train_images.reshape(60000, 784, 1)
test_images_column = test_images.reshape(10000, 784, 1)
'区分0与非0'
for i in range(len(train_labels)):
if train_labels[i] == 0:
train_labels[i] = 1
elif train_labels[i] != 0:
train_labels[i] = -1 ## 5923个0 /60000 约1/10 正确
for i in range(len(test_labels)):
if test_labels[i] == 0:
test_labels[i] = 1
elif test_labels[i] != 0:
test_labels[i] = -1 ## 980个0 /10000 约1/10 正确
train_images_2D = train_images_column.reshape(60000, 784)
test_images_2D = test_images_column.reshape(10000, 784)
'''转置矩阵'''
train_images_2DT = train_images_2D.T
test_images_2DT = test_images_2D.T
return train_labels, test_labels, train_images_2DT, test_images_2DT
def show_result(labels, result, dataset):
TP = 0 # 正类预测为正类
FN = 0 # 正类预测为负类
FP = 0 # 负类预测为正类
TN = 0 # 负类预测为负类
for i in range(len(labels)):
if labels[i] == 1 and result[i] == 1:
TP = TP + 1
elif labels[i] == 1 and result[i] == -1:
FN = FN + 1
elif labels[i] == -1 and result[i] == 1:
FP = FP + 1
elif labels[i] == -1 and result[i] == -1:
TN = TN + 1
tb = pt.PrettyTable()
tb.field_names = [dataset, "预测为0", "预测非0", "总共"]
tb.add_row(["0", TP, FN, TP + FN])
tb.add_row(["非0", FP, TN, FP + TN])
tb.add_row(["总共", TP + FP, FN + TN, TP + FP + FN + TN])
print(tb)
error = (FN + FP) / (TP + FP + FN + TN) * 100
print('错误率为', "%.3f" % error, '%')
print('\n')
def tran1or0(result): ##这个函数来把大于0的转换为1,小于0的转换为0
result[result > 0] = 1
result[result < 0] = -1
result.reshape(len(result), 1)
return result
if __name__ == '__main__':
start1 = time.time()
train_images = load_train_images()
train_labels = load_train_labels()
test_images = load_test_images()
test_labels = load_test_labels()
[train_labels, test_labels, train_images_2DT, test_images_2DT] = pretreat(train_labels, test_labels, train_images,
test_images)
tt = 0
index = []
train_image_feature = np.zeros([493, 60000])
for i in range(784):
non_zero = np.linalg.norm(train_images_2DT[i, :], ord=0)
if non_zero >= 600:
train_image_feature[tt, :] = train_images_2DT[i, :]
tt = tt + 1
index.append(i)
test_image_feature = np.zeros([493, 10000])
test_image_feature = test_images_2DT[index, :]
A = np.hstack([np.ones([60000, 1]), train_image_feature.T])
A_test = np.hstack([np.ones([10000, 1]), test_image_feature.T])
b = train_labels
b_test = test_labels
print('进行QR分解中...')
q, r = np.linalg.qr(A)
print('已完成QR分解')
print('\n')
x = np.linalg.pinv(r).dot(q.T.dot(b))
result = A.dot(x)
tran1or0(result)
show_result(train_labels, result, '训练集')
result_test = A_test.dot(x)
tran1or0(result_test)
show_result(test_labels, result_test, '测试集') ##
end1 = time.time()
print("运行时间:%.2f秒" % (end1 - start1))