本文的主要内容是:使用朴素贝叶斯算法,对给定的数据集完成邮件分类(正常邮件 / 垃圾邮件)任务。
目录
前言 朴素贝叶斯的基本原理
一、拉普拉斯修正
二、主要步骤
1.引入库
代码如下:
import os
import numpy as np
2.读入数据
代码如下(示例):
# Training folders, one per class.  The paths are machine-specific: point
# them at your own copy of the data set.
path_norm = 'C:/Users/86182/Desktop/机器学习/朴素贝叶斯算法实验/Emails/Training/normal'
norm_file_list = os.listdir(path_norm)

path_spam = 'C:/Users/86182/Desktop/机器学习/朴素贝叶斯算法实验/Emails/Training/spam'
spam_file_list = os.listdir(path_spam)
注:请将 path_norm 和 path_spam 改成你自己的训练集文件夹路径。
数据集内容如下:
垃圾邮件:
正常邮件:
3.数据预处理
# Data preprocessing
def clear_data(string):
    """Turn raw e-mail text into a list of unique lowercase tokens.

    A fixed set of punctuation marks (question mark, backslash, single and
    double quote, full stop, hash, round brackets, plus a forward slash that
    directly follows a backslash) is deleted, the text is lowercased and
    split on whitespace, and duplicate tokens are dropped.

    Args:
        string: Raw text of one e-mail.

    Returns:
        list[str]: The distinct tokens, in order of first appearance.
    """
    # '\\/' is the two-character sequence backslash + slash.  It used to be
    # spelled with a single backslash, which is an invalid escape sequence
    # that newer Python versions warn about; the value itself is unchanged.
    sign_list = ['?', '\\/', '\\', '\'', '.', '#', '"', '(', ')']
    for sign in sign_list:
        string = string.replace(sign, '')
    # split() without arguments already discards newlines and any other
    # surrounding whitespace, so no separate strip is required.
    tokens = string.lower().split()
    # dict.fromkeys removes duplicates while keeping first-seen order;
    # going through set() made the order differ from run to run.
    return list(dict.fromkeys(tokens))
4.获取词典
def get_word_dic(path_norm, norm_file_list):
    """Count, for every token, how many of the given files contain it.

    Despite the parameter names, the function is class-agnostic: pass the
    folder and file list of whichever class is being counted.

    Args:
        path_norm: Directory that holds the e-mail files.
        norm_file_list: Names of the files inside ``path_norm`` to read.

    Returns:
        dict[str, int]: token -> number of files in which it appears.
    """
    norm_word_dic = {}
    for file in norm_file_list:
        with open(os.path.join(path_norm, file), 'r', encoding='utf-8') as f:
            s = f.read()
        # clear_data yields each token at most once per file, so the totals
        # are document frequencies rather than raw occurrence counts.
        for word in clear_data(s):
            norm_word_dic[word] = norm_word_dic.get(word, 0) + 1
    return norm_word_dic
5.计算类别的概率值
# Compute the log-score of one class
def count_p(text, norm_word_dic, p_norm, doc_count=None):
    """Return log(prior * product of smoothed word likelihoods) for a class.

    The result is accumulated as a sum of logarithms.  Multiplying the raw
    probabilities first and taking the log afterwards underflows to 0 for
    long e-mails and then yields -inf.

    Args:
        text: Tokens of the e-mail being scored.
        norm_word_dic: token -> number of training e-mails of the class that
            contain the token.
        p_norm: Prior probability of the class; must be greater than 0.
        doc_count: Number of training e-mails of the class.  When omitted,
            the module-level ``m`` is used, which is only appropriate for
            the normal class; pass the spam count when scoring spam.

    Returns:
        log(p_norm) + sum over tokens of log((times + 1) / (doc_count + 1)).
    """
    if doc_count is None:
        doc_count = m
    log_p = np.log(p_norm)
    for word in text:
        # Number of training e-mails of this class containing the word.
        times = norm_word_dic.get(word, 0)
        # NOTE(review): a presence/absence feature has two values, so
        # textbook Laplace smoothing would add 2 to the denominator; the
        # existing "+ 1" is kept here - confirm which is intended.
        log_p += np.log((times + 1) / (doc_count + 1))
    return log_p
6.进行分类
# Number of training e-mails per class: m = normal, n = spam
m, n = len(norm_file_list), len(spam_file_list)
def _class_log_score(text, word_dic, doc_count, prior):
    """Return log(prior) plus the log smoothed likelihood of each token."""
    score = np.log(prior)
    for word in text:
        # NOTE(review): textbook Laplace smoothing for a two-valued feature
        # would use "+ 2" in the denominator; "+ 1" is kept - confirm.
        score += np.log((word_dic.get(word, 0) + 1) / (doc_count + 1))
    return score


def classify(text, norm_word_dic, spam_word_dic, m, n):
    """Label one tokenised e-mail as 'norm' or 'spam' and print both scores.

    Each class is scored in log space with its own training-set size.
    Previously the spam likelihood was smoothed with the normal-class count,
    and long e-mails underflowed to -inf for both classes, which always
    produced 'spam'.

    Args:
        text: Tokens of the e-mail to classify.
        norm_word_dic: token -> number of normal training e-mails containing it.
        spam_word_dic: token -> number of spam training e-mails containing it.
        m: Number of normal training e-mails.
        n: Number of spam training e-mails.

    Returns:
        'norm' when the normal score is strictly larger, otherwise 'spam'.
    """
    # Class priors with Laplace smoothing over the two classes.
    p_norm = (m + 1) / (m + n + 2)
    p_spam = (n + 1) / (m + n + 2)
    re_norm = _class_log_score(text, norm_word_dic, m, p_norm)
    re_spam = _class_log_score(text, spam_word_dic, n, p_spam)
    # The printed values are log-scores, not probabilities.
    print("类别为normal的概率为:", re_norm)
    print("类别为spam的概率为:", re_spam)
    return 'norm' if re_norm > re_spam else 'spam'
7.进行预测
def predict(test_path):
    """Classify every file in ``test_path`` and print the predicted label.

    Relies on the module-level ``norm_word_dic``, ``spam_word_dic``, ``m``
    and ``n`` that were built from the training set.

    Args:
        test_path: Directory containing the e-mails to classify.

    Returns:
        None. Results are written to standard output.
    """
    # sorted() gives a stable report order; os.listdir order is arbitrary.
    for file in sorted(os.listdir(test_path)):
        full_path = os.path.join(test_path, file)
        if not os.path.isfile(full_path):
            # Sub-directories cannot be opened as text files; skip them.
            continue
        with open(full_path, 'r', encoding='utf-8') as f:
            s = f.read()
        text = clear_data(s)
        result = classify(text, norm_word_dic, spam_word_dic, m, n)
        print("文件", file, "的预测结果为:", result, "文件")
        # Visual separator between the reports of consecutive files.
        print('-------------------------------')
# Read the test set and classify every e-mail in it.
# The path is machine-specific: change it to your own test folder.
test_path='C:/Users/86182/Desktop/机器学习/朴素贝叶斯算法实验/Emails/test'
predict(test_path)
8.预测结果
总结
不足之处:训练集和测试集太小,所以无法验证该算法的泛化能力。
源代码:
# In[1]:
import os
import numpy as np
# In[2]:
# Training folders, one per class.  The paths are machine-specific: point
# them at your own copy of the data set.
path_norm = 'C:/Users/86182/Desktop/机器学习/朴素贝叶斯算法实验/Emails/Training/normal'
norm_file_list = os.listdir(path_norm)

path_spam = 'C:/Users/86182/Desktop/机器学习/朴素贝叶斯算法实验/Emails/Training/spam'
spam_file_list = os.listdir(path_spam)
# In[3]:
# Data preprocessing
def clear_data(string):
    """Turn raw e-mail text into a list of unique lowercase tokens.

    A fixed set of punctuation marks (question mark, backslash, single and
    double quote, full stop, hash, round brackets, plus a forward slash that
    directly follows a backslash) is deleted, the text is lowercased and
    split on whitespace, and duplicate tokens are dropped.

    Args:
        string: Raw text of one e-mail.

    Returns:
        list[str]: The distinct tokens, in order of first appearance.
    """
    # '\\/' is the two-character sequence backslash + slash.  It used to be
    # spelled with a single backslash, which is an invalid escape sequence
    # that newer Python versions warn about; the value itself is unchanged.
    sign_list = ['?', '\\/', '\\', '\'', '.', '#', '"', '(', ')']
    for sign in sign_list:
        string = string.replace(sign, '')
    # split() without arguments already discards newlines and any other
    # surrounding whitespace, so no separate strip is required.
    tokens = string.lower().split()
    # dict.fromkeys removes duplicates while keeping first-seen order;
    # going through set() made the order differ from run to run.
    return list(dict.fromkeys(tokens))
# In[4]:
def get_word_dic(path_norm, norm_file_list):
    """Count, for every token, how many of the given files contain it.

    Despite the parameter names, the function is class-agnostic: pass the
    folder and file list of whichever class is being counted.

    Args:
        path_norm: Directory that holds the e-mail files.
        norm_file_list: Names of the files inside ``path_norm`` to read.

    Returns:
        dict[str, int]: token -> number of files in which it appears.
    """
    norm_word_dic = {}
    for file in norm_file_list:
        with open(os.path.join(path_norm, file), 'r', encoding='utf-8') as f:
            s = f.read()
        # clear_data yields each token at most once per file, so the totals
        # are document frequencies rather than raw occurrence counts.
        for word in clear_data(s):
            norm_word_dic[word] = norm_word_dic.get(word, 0) + 1
    return norm_word_dic
# In[5]:
# Per-class tables (token -> number of training e-mails containing it)
norm_word_dic, spam_word_dic = (
    get_word_dic(path_norm, norm_file_list),
    get_word_dic(path_spam, spam_file_list),
)
# In[6]:
# Number of training e-mails per class: m = normal, n = spam
m, n = len(norm_file_list), len(spam_file_list)
# In[7]:
# Compute the log-score of one class
def count_p(text, norm_word_dic, p_norm, doc_count=None):
    """Return log(prior * product of smoothed word likelihoods) for a class.

    The result is accumulated as a sum of logarithms.  Multiplying the raw
    probabilities first and taking the log afterwards underflows to 0 for
    long e-mails and then yields -inf.

    Args:
        text: Tokens of the e-mail being scored.
        norm_word_dic: token -> number of training e-mails of the class that
            contain the token.
        p_norm: Prior probability of the class; must be greater than 0.
        doc_count: Number of training e-mails of the class.  When omitted,
            the module-level ``m`` is used, which is only appropriate for
            the normal class; pass the spam count when scoring spam.

    Returns:
        log(p_norm) + sum over tokens of log((times + 1) / (doc_count + 1)).
    """
    if doc_count is None:
        doc_count = m
    log_p = np.log(p_norm)
    for word in text:
        # Number of training e-mails of this class containing the word.
        times = norm_word_dic.get(word, 0)
        # NOTE(review): a presence/absence feature has two values, so
        # textbook Laplace smoothing would add 2 to the denominator; the
        # existing "+ 1" is kept here - confirm which is intended.
        log_p += np.log((times + 1) / (doc_count + 1))
    return log_p
# In[8]:
def _class_log_score(text, word_dic, doc_count, prior):
    """Return log(prior) plus the log smoothed likelihood of each token."""
    score = np.log(prior)
    for word in text:
        # NOTE(review): textbook Laplace smoothing for a two-valued feature
        # would use "+ 2" in the denominator; "+ 1" is kept - confirm.
        score += np.log((word_dic.get(word, 0) + 1) / (doc_count + 1))
    return score


def classify(text, norm_word_dic, spam_word_dic, m, n):
    """Label one tokenised e-mail as 'norm' or 'spam' and print both scores.

    Each class is scored in log space with its own training-set size.
    Previously the spam likelihood was smoothed with the normal-class count,
    and long e-mails underflowed to -inf for both classes, which always
    produced 'spam'.

    Args:
        text: Tokens of the e-mail to classify.
        norm_word_dic: token -> number of normal training e-mails containing it.
        spam_word_dic: token -> number of spam training e-mails containing it.
        m: Number of normal training e-mails.
        n: Number of spam training e-mails.

    Returns:
        'norm' when the normal score is strictly larger, otherwise 'spam'.
    """
    # Class priors with Laplace smoothing over the two classes.
    p_norm = (m + 1) / (m + n + 2)
    p_spam = (n + 1) / (m + n + 2)
    re_norm = _class_log_score(text, norm_word_dic, m, p_norm)
    re_spam = _class_log_score(text, spam_word_dic, n, p_spam)
    # The printed values are log-scores, not probabilities.
    print("类别为normal的概率为:", re_norm)
    print("类别为spam的概率为:", re_spam)
    return 'norm' if re_norm > re_spam else 'spam'
# In[9]:
def predict(test_path):
    """Classify every file in ``test_path`` and print the predicted label.

    Relies on the module-level ``norm_word_dic``, ``spam_word_dic``, ``m``
    and ``n`` that were built from the training set.

    Args:
        test_path: Directory containing the e-mails to classify.

    Returns:
        None. Results are written to standard output.
    """
    # sorted() gives a stable report order; os.listdir order is arbitrary.
    for file in sorted(os.listdir(test_path)):
        full_path = os.path.join(test_path, file)
        if not os.path.isfile(full_path):
            # Sub-directories cannot be opened as text files; skip them.
            continue
        with open(full_path, 'r', encoding='utf-8') as f:
            s = f.read()
        text = clear_data(s)
        result = classify(text, norm_word_dic, spam_word_dic, m, n)
        print("文件", file, "的预测结果为:", result, "文件")
        # Visual separator between the reports of consecutive files.
        print('-------------------------------')
# In[10]:
# Read the test set and classify every e-mail in it.
# The path is machine-specific: change it to your own test folder.
test_path = 'C:/Users/86182/Desktop/机器学习/朴素贝叶斯算法实验/Emails/test'
predict(test_path)
# In[ ]: