Bayesian classifiers are widely used for classification tasks, especially text classification. This post shows how to use naive Bayes, under the assumption that the features are independent, to recognize handwritten digits.
Basic principle:
For each candidate class, multiply the prior probability of the class by the conditional probability of each pixel value given that class; the image is assigned to the class for which this product is largest.
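Written out (a standard statement of the naive Bayes decision rule, added here for reference rather than taken from the original code), the predicted class is

    \hat{y} = \arg\max_{y} \; P(y) \prod_{j} P(x_j \mid y)

where x_j is the j-th pixel of the image, P(y) is the fraction of training samples with label y, and P(x_j | y) is estimated as the fraction of training samples of class y whose j-th pixel equals x_j.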
Code:
Build the classifier and save it as bayes.py.
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 8 15:27:37 2018
@author: Administrator
"""
from numpy import *
class Bayes():
    def __init__(self):
        self.length = -1        # number of features per sample
        self.label = dict()     # prior probability of each class
        self.vector = dict()    # training vectors grouped by class
    def fit(self, train_data: list, labels: list):
        if len(train_data) != len(labels):
            raise ValueError('Training data and labels have different lengths')
        self.length = len(train_data[0])   # number of features per sample
        labels_num = len(labels)           # total number of samples
        classes = set(labels)              # distinct labels
        for item in classes:               # proportion of each class among all samples
            this_labels = item
            self.label[this_labels] = labels.count(this_labels) / labels_num   # class prior
        for vector, label in zip(train_data, labels):
            if label not in self.vector:
                self.vector[label] = []
            self.vector[label].append(vector)
        #print('success')
        return self
    def test(self, test_data, labelSet):
        if self.length == -1:
            raise ValueError('No training data')
        Dict = dict()
        for this_labels in labelSet:
            p = 1
            alllabel = self.label[this_labels]     # prior P(class)
            allvector = self.vector[this_labels]   # training vectors of this class
            vector_num = len(allvector)
            allvector = array(allvector).T         # one row per feature
            for index in range(len(test_data)):
                vector = list(allvector[index])
                # conditional probability of this feature value given the class
                p *= vector.count(test_data[index]) / vector_num
            Dict[this_labels] = p * alllabel       # posterior score for this class
        # pick the class with the largest posterior score
        this_label = sorted(Dict, key=lambda x: Dict[x], reverse=True)[0]
        return this_label
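A minimal usage sketch on toy data (the vectors and labels below are made up, not from the post), just to show how fit and test are called:

from bayes import Bayes

train_data = [[1, 0, 1], [1, 1, 1], [0, 0, 0], [0, 1, 0]]   # 4 toy samples, 3 binary features
labels = [1, 1, 0, 0]
model = Bayes()
model.fit(train_data, labels)
# score the sample against both candidate classes and return the most likely one
print(model.test([1, 0, 1], [0, 1]))   # prints 1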
Next, read the sample files to build the training set and the test set; saved as sample.py.
# -*- coding: utf-8 -*-
"""
Spyder Editor
This is a temporary script file.
"""
#Process the sample data: given a directory path, build the arrays used as the
#training set and the test set.
from numpy import *
import os
import pandas as pd
class Sample():
    def __init__(self, train_file_route):
        self.train_file_route = train_file_route
    #read one digit file and flatten it into a list of ints
    def data_array(self, file_name):
        array = []
        fh = open(file_name)
        data = fh.readlines()
        fh.close()
        #number of columns in the digit file (first line, newline stripped)
        col_num = len(data[0].replace('\n', ''))
        for this_line in data:
            for j in range(col_num):
                array.append(int(this_line[j]))
        return array
    #extract the label from the file name
    def sep_label(self, file_name):
        filestr = file_name.split(".")[0]
        label = int(filestr.split("_")[0])
        return label
    #build the training data
    def train_data(self):
        labels = []
        train_file = os.listdir(self.train_file_route)
        num = len(train_file)
        #array holding the training data; rows: number of files, columns: 28*28 pixels
        train_arrary = zeros((num, 28**2))   #initialise with zeros
        for i in range(num):
            this_file_name = train_file[i]
            this_labels = self.sep_label(this_file_name)
            labels.append(this_labels)
            train_arrary[i, :] = self.data_array(self.train_file_route + this_file_name)
        return train_arrary, labels
#Build the confusion matrix, i.e. counts of real versus predicted labels,
#and compute the evaluation metric: accuracy.
def Evaluation_Index(real_data, forecast_data):
    rst = {'ID': list(range(1, len(real_data) + 1)),
           'real_data': real_data, 'forecast_data': forecast_data}
    df = pd.DataFrame(rst)
    #accuracy: proportion of samples classified correctly
    Accuracy = len(df[df.real_data == df.forecast_data]) / len(df)
    Confusion_Matrix = df.groupby(['real_data', 'forecast_data']).count()
    Confusion_Matrix = Confusion_Matrix.unstack()
    Confusion_Matrix = Confusion_Matrix.fillna(0).astype(int)
    Rec_Matrix = df[df.real_data != df.forecast_data]
    return Confusion_Matrix, round(Accuracy, 4), Rec_Matrix
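A brief sketch of how these pieces are used (the training directory path is the one from the driver script below; the data files are assumed to be 28x28 grids of '0'/'1' characters named like 3_12.txt, where the digit before the underscore is the label):

import sample

s = sample.Sample(r'E:/data/digit_data_copy/train/')
X, y = s.train_data()   # X: (num_files, 784) array, y: list of digit labels

# Evaluation_Index on made-up predictions, just to show the return values
cm, acc, errors = sample.Evaluation_Index([1, 1, 2, 3], [1, 2, 2, 3])
print(acc)      # 0.75
print(cm)       # counts of real vs. predicted labels
print(errors)   # the misclassified rows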
Driver script:
import sample
import importlib
import bayes
import bayes_promote
from datetime import datetime
%matplotlib inline
import matplotlib.pylab as plt
import seaborn as sns
importlib.reload(sample)
importlib.reload(bayes)
importlib.reload(bayes_promote)
sns.set()
#Main program: load the sample data and test handwritten digit recognition
train_route = r'E:/data/digit_data_copy/train/'
test_route = r'E:/data/digit_data_copy/test/'
#test_route = r'E:/data/digit_test/t1/'
func = sample.Sample(train_route)
train_data, labels = func.train_data()
func = sample.Sample(test_route)
test_data, real_data = func.train_data()
#Call the model
labelsall = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
#Recognize multiple handwritten digits (batch processing)
model = bayes.Bayes()
model.fit(train_data, labels)
def bayes_test1():
    num = len(test_data)
    forecast_data = []
    for i in range(num):
        label = model.test(test_data[i], labelsall)
        forecast_data.append(label)
    Confusion_Matrix, Accuracy, Rec_Matrix = sample.Evaluation_Index(real_data, forecast_data)
    return Confusion_Matrix, Accuracy, Rec_Matrix
if __name__ == '__main__':
    t1 = datetime.now()
    Confusion_Matrix, Accuracy, Rec_Matrix = bayes_test1()
    t2 = datetime.now()
    print('Time elapsed:', t2 - t1)
    print('Confusion matrix:\n', Confusion_Matrix)
    print('Accuracy:\n', Accuracy)
    # Confusion_Matrix.columns is a MultiIndex ('ID', predicted label);
    # get_level_values(1) works on current pandas (the old .labels attribute was removed)
    sns.heatmap(Confusion_Matrix.T, square=True, annot=True, fmt='d', cbar=False,
                xticklabels=Confusion_Matrix.index,
                yticklabels=Confusion_Matrix.columns.get_level_values(1))
    plt.xlabel('true label')
    plt.ylabel('predicted label')
Time elapsed: 12 minutes 47 seconds; accuracy: 0.8545. For a test set of only 2,000 samples this is slow, and the accuracy is only moderate. Part of the cost is likely in how the model is implemented: the likelihood of every pixel value is recounted from the raw training vectors for every test sample.
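One concrete way to speed this up (my own sketch, not part of the original code, and it assumes the pixels are binary 0/1 values) is to precompute per-class likelihood tables once with numpy instead of calling list.count for every class, pixel, and test sample:

import numpy as np

def build_tables(train_data, labels, classes):
    # for each class, the fraction of its training samples in which each pixel equals 1
    tables = {}
    for c in classes:
        vecs = np.array([v for v, l in zip(train_data, labels) if l == c])
        tables[c] = vecs.mean(axis=0)
    return tables

def predict(x, tables, priors):
    x = np.asarray(x)
    best_label, best_score = None, -1.0
    for c, p1 in tables.items():
        # P(pixel | class) is p1 where the pixel is 1 and (1 - p1) where it is 0
        likelihood = np.where(x == 1, p1, 1.0 - p1).prod()
        score = priors[c] * likelihood
        if score > best_score:
            best_label, best_score = c, score
    return best_label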
For comparison, test sklearn's built-in naive Bayes models: the multinomial variant (MultinomialNB) is used below, and the Gaussian variant (GaussianNB) is shown afterwards.
from sklearn.naive_bayes import MultinomialNB   #import the multinomial naive Bayes model
from time import perf_counter                   #time.clock was removed in Python 3.8
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, f1_score
t3 = perf_counter()
model = MultinomialNB()   #build the model
model.fit(train_data, labels)   #fit the model
forecast_data = model.predict(test_data)   #predict
Confusion_Matrix, Accuracy, Rec_Matrix = sample.Evaluation_Index(real_data, forecast_data)   #evaluation metrics
#the sklearn metrics could be used instead:
#accuracy_score(real_data, forecast_data)
#confusion_matrix(real_data, forecast_data)
t4 = perf_counter()
print('Time elapsed: {:.3f}s'.format(t4 - t3))
print('Accuracy:', Accuracy)
Confusion matrix: (output not shown here.)
Time elapsed: 0.043 s; accuracy: 0.829. The runtime is dramatically shorter than the hand-written classifier, but the accuracy drops slightly. Since no other metrics were examined, this alone is not enough to judge one model better than the other.
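The Gaussian variant mentioned above can be swapped in with a one-line change (a sketch only; it was not run here, so its timing and accuracy are not reported):

from sklearn.naive_bayes import GaussianNB

model = GaussianNB()   #Gaussian naive Bayes instead of multinomial
model.fit(train_data, labels)
forecast_data = model.predict(test_data)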
In practice the drawbacks of using naive Bayes as a classifier are fairly clear: when the data are high-dimensional and the features are strongly correlated with one another (so the independence assumption is violated), its classification performance is unimpressive.