# -*- coding: utf-8 -*-
"""
Created on Mon Apr 15 12:52:24 2019
@author: LCCFM
"""
import numpy as np
import struct
import os
from collections import defaultdict
def normalize(data):
    """Binarize pixel values in place.

    Every non-zero entry of ``data`` becomes 1 and every zero entry becomes 0.

    Args:
        data: NumPy array of pixel intensities (any shape). Modified in place.

    Returns:
        The same array object that was passed in, now holding only 0s and 1s.
    """
    # A single vectorized assignment replaces the per-pixel Python loop: the
    # boolean mask is cast to the array's dtype (True -> 1, False -> 0).
    data[...] = data != 0
    return data
def transforms(imgs):
    """Reshape a batch of flattened square images into 2-D images.

    The previous implementation overwrote each row with the indices
    ``0..l-1`` and then tried to store a (28, 28) array inside a flat row,
    which always raised a broadcasting ``ValueError``. This version keeps the
    pixel values and performs the reshape on the whole batch.

    Args:
        imgs: NumPy array of shape (count, side * side), one flattened image
            per row (784 columns give 28 x 28 images).

    Returns:
        Array of shape (count, side, side) with the same pixel values. NumPy
        returns a view of ``imgs`` when the memory layout allows it.

    Raises:
        ValueError: if the row length is not a perfect square.
    """
    c, l = imgs.shape
    side = int(round(l ** 0.5))
    if side * side != l:
        raise ValueError(
            'cannot reshape rows of length %d into square images' % l
        )
    return imgs.reshape(c, side, side)
def _read_idx_images(path):
    """Parse an IDX3 image file into (rows, columns, binarized pixels)."""
    with open(path, 'rb') as fh:
        raw = fh.read()
    # Header: four big-endian unsigned 32-bit ints
    # (magic number, image count, rows, columns).
    header_size = struct.calcsize('>IIII')
    _, size, row, column = struct.unpack_from('>IIII', raw)
    # Pixels are unsigned bytes; view them directly instead of unpacking
    # every byte into a Python tuple first.
    pixels = np.frombuffer(raw, dtype=np.uint8,
                           count=size * row * column, offset=header_size)
    imgs = pixels.reshape(size, row * column)
    # Same binarization as normalize(): non-zero -> 1, zero -> 0.
    return row, column, (imgs != 0).astype(np.float32)


def _read_idx_labels(path, one_hot):
    """Parse an IDX1 label file into class indices or a one-hot matrix."""
    with open(path, 'rb') as fh:
        raw = fh.read()
    # Header: two big-endian unsigned 32-bit ints (magic number, label count).
    header_size = struct.calcsize('>II')
    _, size = struct.unpack_from('>II', raw)
    # Labels are stored as one unsigned byte each; widen them so they can be
    # used for indexing and arithmetic without uint8 overflow.
    labels = np.frombuffer(raw, dtype=np.uint8, count=size,
                           offset=header_size).astype(np.int64)
    if not one_hot:
        return labels
    # The number of classes is taken from the largest label in this file.
    encoded = np.zeros((size, np.max(labels) + 1))
    encoded[np.arange(size), labels] = 1
    return encoded


def read_data_sets(dir, one_hot=True):
    """Load the MNIST IDX image/label files found in ``dir``.

    Args:
        dir: directory containing the four uncompressed files
            't10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte',
            'train-images-idx3-ubyte' and 'train-labels-idx1-ubyte'.
        one_hot: when true, labels are returned as a (count, classes) matrix
            with a single 1 per row; otherwise as a 1-D array of class ids.

    Returns:
        A ``defaultdict(dict)`` with the keys:
            'img_shape': (rows, columns, 1) of the last image file parsed.
            'test' / 'train': dicts with
                'images': float32 array of shape (count, rows * columns)
                    whose pixels are binarized to 0 or 1.
                'labels': the labels in the format selected by ``one_hot``.
    """
    files = {
        'test': ['t10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte'],
        'train': ['train-images-idx3-ubyte', 'train-labels-idx1-ubyte']
    }
    data_set = defaultdict(dict)
    for key, (image_file, label_file) in files.items():
        row, column, imgs = _read_idx_images(os.path.join(dir, image_file))
        data_set['img_shape'] = (row, column, 1)
        data_set[key]['images'] = imgs
        data_set[key]['labels'] = _read_idx_labels(
            os.path.join(dir, label_file), one_hot)
    return data_set
def train(data_set):
    """Estimate Bernoulli naive Bayes parameters from the training split.

    Args:
        data_set: mapping whose 'train' entry holds
            'images': array of shape (num_image, dimsnum), and
            'labels': one-hot array of shape (num_image, labelnum); the
                class of a row is the position of its largest entry.

    Returns:
        A tuple ``(label_sum, label_shape)``:
            label_sum: array of length labelnum; entry c is the fraction of
                training images in class c, i.e. the prior p(w_c).
            label_shape: array of shape (labelnum, dimsnum); entry [c][k] is
                (sum of pixel k over class c + 1) / (images in class c + 2),
                the smoothed per-class pixel probability.

    Raises:
        ValueError: if images and labels have different numbers of rows.
    """
    imgs = data_set['train']['images']
    labels = data_set['train']['labels']
    num_image, dimsnum = imgs.shape
    num_label, labelnum = labels.shape
    if num_label != num_image:
        raise ValueError(
            'got %d images but %d labels' % (num_image, num_label)
        )

    # Class index of every image, then the number of images per class.
    classes = np.argmax(labels, axis=1)
    label_count = np.bincount(classes, minlength=labelnum).astype(np.float64)

    # Per-class pixel sums; add.at accumulates correctly even though the
    # same class index appears many times.
    label_shape = np.zeros((labelnum, dimsnum))
    np.add.at(label_shape, classes, imgs)

    # Add-one smoothing for a two-valued pixel keeps every probability
    # strictly between 0 and 1.
    label_shape = (label_shape + 1) / (label_count[:, np.newaxis] + 2)
    label_sum = label_count / num_image  # class priors p(w_i)
    return label_sum, label_shape
def test(data_set, pyjk1, pyj):
    """Classify the test split with a Bernoulli naive Bayes model.

    Each image is assigned the class j that maximizes
    ``pyj[j] * prod_k P(x_k | j)``, where ``P(x_k | j)`` is ``pyjk1[j][k]``
    when pixel k equals 1 and ``1 - pyjk1[j][k]`` otherwise. The product is
    evaluated as a sum of logarithms: multiplying hundreds of probabilities
    directly underflows to 0.0, which made every class tie and silently
    fall back to class 0.

    Args:
        data_set: mapping whose 'test' entry holds
            'images': array of shape (num, dimsnum), and
            'labels': one-hot array of shape (num, labelnum).
        pyjk1: array of shape (labelnum, dimsnum) with the per-class
            probability that each pixel equals 1.
        pyj: array of length labelnum with the class priors.

    Returns:
        A tuple ``(acc, num)``: the number of correctly classified images and
        the total number of test images.
    """
    imgs = data_set['test']['images']
    labels = data_set['test']['labels']
    num, dimsnum = imgs.shape
    num1, labelnum = labels.shape
    truth = np.argmax(labels, axis=1)

    cond = np.asarray(pyjk1, dtype=np.float64)[:labelnum, :dimsnum]
    prior = np.asarray(pyj, dtype=np.float64)[:labelnum]
    # log(0) = -inf is the correct score for an impossible event, so the
    # divide-by-zero warning is suppressed rather than treated as an error.
    with np.errstate(divide='ignore'):
        log_on = np.log(cond)
        log_off = np.log(1 - cond)
        log_prior = np.log(prior)

    acc = 0
    for i in range(num):
        lit = imgs[i] == 1  # anything other than exactly 1 counts as "off"
        scores = log_prior + np.where(lit, log_on, log_off).sum(axis=1)
        # argmax returns the first maximum, matching the original tie-break.
        if np.argmax(scores) == truth[i]:
            acc += 1
    return acc, num


# Despite its name this is not a pytest test case; opt out of collection so
# importing it into a test module does not produce a fixture error.
test.__test__ = False
if __name__ == '__main__':
    import sys

    # The data directory may be given as the first command-line argument;
    # without one, fall back to the original hard-coded location.
    data_dir = sys.argv[1] if len(sys.argv) > 1 else 'C:/Users/LCCFM/Desktop/data/'
    data_set = read_data_sets(data_dir)
    # train() returns the class priors and the per-class pixel probabilities;
    # test() expects them in the opposite order.
    label_sum, label_shape = train(data_set)
    acc, num = test(data_set, label_shape, label_sum)
    # Guard against an empty test split instead of raising ZeroDivisionError.
    accuracy = acc / num if num else 0.0
    print(accuracy)
    print('Test accuracy is: %f' % accuracy)