# python机器学习---用贝叶斯算法实现垃圾邮件分类预测

import numpy
from os import listdir
import jieba
import operator
from gensim import corpora,models,similarities
from numpy import *

#贝叶斯算法的实现
class Bayes:
    """Naive Bayes classifier over discrete feature vectors.

    ``fit`` stores the training vectors grouped by label together with each
    label's prior probability. ``btest`` scores a vector against candidate
    labels: for every feature position it takes the fraction of stored
    vectors of that label holding exactly the same value, multiplies those
    fractions together and weights the product by the label's prior.

    No smoothing is applied, so one feature value never seen for a label
    drives that label's score to zero.
    """

    def __init__(self):
        self.length = -1  # number of features per vector; -1 means untrained
        self.labelcount = dict()  # {label: prior probability}
        self.vectorcount = dict()  # {label: [training vectors]}

    def fit(self, dataSet, labels):
        """Train on ``dataSet`` (sequence of equal-length vectors) and ``labels``.

        Any state from a previous call is discarded.

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if the two inputs differ in length or are empty.
        """
        if len(dataSet) != len(labels):
            raise ValueError("您输入的类别与我们的数据集个数不匹配")
        if len(dataSet) == 0:
            raise ValueError("训练数据集为空")

        # Build fresh containers instead of appending to the existing ones,
        # otherwise a second fit() would keep vectors from the earlier run.
        grouped = dict()
        for vect, label in zip(dataSet, labels):
            grouped.setdefault(label, []).append(vect)

        total = len(labels)
        self.length = len(dataSet[0])
        self.vectorcount = grouped
        # Prior of a label = share of training records carrying that label.
        self.labelcount = {
            label: len(vects) / total for label, vects in grouped.items()
        }
        print("训练结束")
        return self

    def btest(self, TestData, labelSet):
        """Return the label from ``labelSet`` with the highest score.

        A candidate label that never appeared in training scores 0. On ties
        the label that comes first in ``labelSet`` wins.

        Raises:
            ValueError: if the model is untrained, ``TestData`` does not have
                the trained number of features, or ``labelSet`` is empty.
        """
        if self.length == -1:
            raise ValueError("没有训练，先训练再测试")
        if len(TestData) != self.length:
            raise ValueError("测试数据的特征数量与训练数据不一致")

        sample = numpy.asarray(TestData)
        lbdict = dict()
        for thislb in labelSet:
            if thislb not in self.vectorcount:
                lbdict[thislb] = 0
                continue
            stored = numpy.asarray(self.vectorcount[thislb])
            vnum = len(stored)
            # Per feature column: how many stored vectors match the sample.
            hits = (stored == sample).sum(axis=0).tolist()
            p = 1
            for hit in hits:
                p = p * hit / vnum
            lbdict[thislb] = p * self.labelcount[thislb]

        if not lbdict:
            raise ValueError("候选类别集合为空")
        # max() returns the first maximal key, matching a stable descending sort.
        return max(lbdict, key=lbdict.get)

#进行训练
#从文件名得到分类信息
def seplabel(fname):
    """Derive the class label from a file name.

    The prefix is the text before the first "." and, within that, before
    the first "_". Returns 1 when the prefix is exactly "t", otherwise 0.
    """
    stem, _, _ = fname.partition(".")
    prefix, _, _ = stem.partition("_")
    return 1 if prefix == "t" else 0

# Build the vocabulary dictionary from every file in the training directory.
# The original loop referenced an undefined name `data`; each training file
# is now read explicitly before being tokenised.
# NOTE(review): the corpus is assumed to be UTF-8 encoded — confirm.
filelist = listdir("D:/python/train")
pieces = []
for fname in filelist:
    with open("D:/python/train/" + fname, "r", encoding="utf-8") as handle:
        data = handle.read()
    # Each token is followed by two spaces so the joined text can be
    # re-split on whitespace below.
    pieces.extend(word + "  " for word in jieba.cut(data))
dictdata = "".join(pieces)
texts = [dictdata.split()]
dictionary = corpora.Dictionary(texts)

#构建训练集数据向量以及对应的label
def traindataSet(dirname="D:/python/train", veclen=2048, encoding="utf-8"):
    """Vectorise every file in ``dirname`` and collect its label.

    Each file is tokenised with jieba, converted to a bag-of-words through
    the module-level ``dictionary`` and flattened to
    ``id, count, id, count, ...``; that sequence fills the start of a
    zero-initialised row of width ``veclen``.

    Args:
        dirname: directory holding the training files.
        veclen: width of each row; longer flattened vectors are truncated
            instead of raising on assignment.
        encoding: text encoding used to read the files.
            NOTE(review): UTF-8 is an assumption — confirm for the corpus.

    Returns:
        (labels, trainMat): list of labels from ``seplabel`` and a
        ``(number_of_files, veclen)`` numpy array.
    """
    trainfilelist = listdir(dirname)
    labels = []
    trainMat = numpy.zeros((len(trainfilelist), veclen))
    for i, fnamestr in enumerate(trainfilelist):
        labels.append(seplabel(fnamestr))
        # The original used an undefined name `data`; read the file here.
        with open(dirname + "/" + fnamestr, "r", encoding=encoding) as handle:
            data = handle.read()
        newdata = "".join(item + " " for item in jieba.cut(data))
        print(newdata)
        # Sparse bag-of-words for this document.
        new_vect = dictionary.doc2bow(newdata.split())
        # Flatten the pairs directly and clip to the row width.
        flat = [value for pair in new_vect for value in pair][:veclen]
        trainMat[i, :len(flat)] = flat
    return labels, trainMat

# Train the naive Bayes classifier on the vectorised training set.
labels, trainMat = traindataSet()
bys = Bayes()
bys.fit(trainMat, labels)

# Classify one message.
# NOTE(review): `testdata` is not defined anywhere in the visible source; it
# must hold the raw text of the email to classify before this point.
cutdata = jieba.cut(testdata)
newdata = "".join(word + "  " for word in cutdata)
new_vec = dictionary.doc2bow(newdata.split())
testMat = numpy.zeros((1, 2048))
# Flatten the bag-of-words pairs to id, count, id, count, ... and clip to
# the row width; assigning a longer sequence to the row would raise.
new_vec2 = [value for pair in new_vec for value in pair][:testMat.shape[1]]
testMat[0, :len(new_vec2)] = new_vec2
labels = [0, 1]
rst = bys.btest(testMat[0], labels)
# seplabel assigns 1 to training files whose name prefix is "t".
if rst == 1:
    print("不是垃圾邮件")
else:
    print("是垃圾邮件")


# ©️2019 CSDN 皮肤主题: 大白 设计师: CSDN官方博客