之前人工智能课程需要交一个小作业,参考网上文章,用贝叶斯做了一个邮件分类器,分享给大家。
代码
from re import sub
from os import listdir
from collections import Counter
from itertools import chain
from numpy import array
from jieba import cut
from sklearn.naive_bayes import MultinomialNB
import PIL.Image as im
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Global corpus store: one entry per processed mail file.
# Each element is the list of tokens extracted from a single file
# (filled in by getTopNWords via getWordsFromFile, and read again
# later when the feature vectors are built).
allWords = []
def getWordsFromFile(txtFile):
    """Read one mail file and return its list of tokens.

    Each line is stripped of surrounding whitespace, cleaned of
    punctuation/digit noise with a regex, segmented with jieba, and
    only tokens longer than one character are kept.
    """
    words = []
    with open(txtFile, encoding='utf8') as fp:
        for line in fp:
            # Remove surrounding whitespace, then noise characters.
            cleaned = sub(r'[.【】0-9、——。,!~\* ]', '', line.strip())
            # jieba segmentation; discard single-character tokens.
            words.extend(tok for tok in cut(cleaned) if len(tok) > 1)
    return words
def getTopNWords(topN, fileCount=151):
    """Tokenise the training mails and return the topN most common words.

    Processes files ``0.txt`` .. ``(fileCount-1).txt`` in the current
    directory (default 151, matching the original dataset) and appends
    each file's token list to the global ``allWords``, which later code
    relies on when building feature vectors.

    NOTE(review): the original comments say 0-125 are spam and 127-150
    are ham, which leaves 126.txt unaccounted for, while the labels
    below mark the first 127 files as spam — verify against the dataset.

    :param topN: number of most-frequent words to return
    :param fileCount: number of sequentially numbered mail files to read
    :return: list of the topN most common words across all files
    """
    for i in range(fileCount):
        allWords.append(getWordsFromFile(str(i) + '.txt'))
    # Count word frequencies over the whole corpus and keep the topN.
    freq = Counter(chain.from_iterable(allWords))
    return [word for word, _ in freq.most_common(topN)]
# The 400 most frequent words across the whole training corpus serve
# as the feature vocabulary.
topWords = getTopNWords(400)

# Build one feature vector per mail: the occurrence count of each of
# the 400 vocabulary words in that mail.
vector = []
for words in allWords:
    vector.append([words.count(w) for w in topWords])
print(vector[0])
vector = array(vector)

# Labels: 1 = spam, 0 = ham (first 127 files spam, last 24 ham).
labels = array([1]*127 + [0]*24)

# Train a multinomial naive Bayes classifier on the labelled vectors.
model = MultinomialNB()
model.fit(vector, labels)
#
def predict(txtFile):
    """Classify the mail in txtFile; return '垃圾邮件' (spam) or '正常邮件' (ham)."""
    mailWords = getWordsFromFile(txtFile)
    # Same feature encoding as training: counts of the top words,
    # reshaped into a single-sample 2-D array for sklearn.
    features = array([mailWords.count(w) for w in topWords])
    verdict = model.predict(features.reshape((1, -1)))
    if verdict == 1:
        return '垃圾邮件'
    return '正常邮件'
# --- Optional: render a word cloud from the corpus word frequencies. ---
# NOTE(review): left disabled in the original. The commented call below
# contains a stray double comma after font_path, and plt.show(wc) is not
# valid matplotlib usage (an Axes image via plt.imshow(wc) would be
# needed) — confirm and fix before enabling.
# wc = WordCloud(font_path=r'C:/Windows/Fonts/STZHONGS.TTF', ,background_color="white",max_words=1000,max_font_size=300,
# width=3000,height=3000).generate_from_frequencies(Counter(chain(*allWords)))
## Render the cloud with matplotlib.pyplot.
#plt.show(wc)
## Save the image to this filename.
# wc.to_file('wordsCloud.png')
# Classify the five held-out mails 151.txt–155.txt and print the verdicts.
print(predict('151.txt'))
print(predict('152.txt'))
print(predict('153.txt'))
print(predict('154.txt'))
print(predict('155.txt'))
数据集
下载链接:
链接: https://pan.baidu.com/s/1KU6LVdS2ZCqfjXM_hz0zJQ 提取码: tduq