朴素贝叶斯垃圾邮件分类(Naive Bayes spam classification in Python)

# -*- coding: utf-8 -*-

# @Time : 2020/4/16 21:09

# @Author : Zudy

'''

1. 基于朴素贝叶斯的垃圾邮件分类

'''

from sklearn import datasets

from time import time

import numpy

import re

import os

import random

def load_data(folder_path):
    """Load an email corpus from *folder_path* using sklearn's load_files.

    The returned Bunch holds:
      - data: the raw documents (bytes)
      - filenames: the path of every file
      - target: integer class labels (indexed from 0 per subdirectory)
      - target_names: the subdirectory names acting as class labels
    """
    print("Loading dataset ...")
    load_start = time()
    datalist = datasets.load_files(folder_path)
    # Report corpus size and how long loading took.
    print("summary: {0} documents in {1} categories.".format(
        len(datalist.data), len(datalist.target_names)))
    print("Load data in {0}seconds".format(time() - load_start))
    # Stop-word removal (kept disabled, as in the original):
    # datalist.data = [word for word in datalist.data if(word not in stopwords.words('english'))]
    return datalist

# 返回集合列表

def word_create(ori_data):
    """Build the vocabulary (unique lowercase tokens) from ori_data.data.

    Each document is utf-8-encoded bytes; ASCII and full-width (Chinese)
    punctuation is replaced by spaces before whitespace tokenization.

    Returns the vocabulary as a list of words.
    """
    print("\nVectorzing dataset ...")
    word_dic = set()
    vector_start = time()
    # Hoist the pattern out of the loop; raw string avoids invalid-escape
    # warnings while keeping the exact same pattern.
    punct = re.compile(r"[\s+\.\!\/_,$%^*(+\"\'-]+|[+——!,。?、~@#¥%……&*()<>]+")
    for doc in ori_data.data:
        # Documents arrive as bytes; decode before text processing.
        doc = str(doc, encoding="utf-8")
        doc = punct.sub(" ", doc)
        # Lowercase, split on whitespace, merge into the vocabulary set.
        word_dic |= set(doc.lower().split())
    # BUG FIX: the original printed the absolute timestamp (vectorTime)
    # instead of the elapsed time.
    print("Vectorzing time:{0}\nThe number of word_dictionary:{1}".format(
        time() - vector_start, len(word_dic)))
    return list(word_dic)

def doc_represent(wordDic, ori_data):
    """Build a binary document-term matrix for ori_data.data.

    Returns an int array of shape (n_documents, len(wordDic)) where entry
    [i][j] is 1 iff word j of wordDic occurs in document i.
    """
    # BUG FIX: numpy.int was removed in NumPy 1.24 -- use builtin int.
    doc_re = numpy.zeros((len(ori_data.data), len(wordDic)), dtype=int)
    represent_start = time()
    # Map word -> column index once; wordDic.index(word) inside the loop
    # was an O(len(wordDic)) scan per token.
    word_index = {word: j for j, word in enumerate(wordDic)}
    # Same preprocessing pattern as word_create.
    punct = re.compile(r"[\s+\.\!\/_,$%^*(+\"\'-]+|[+——!,。?、~@#¥%……&*()<>]+")
    for count, doc in enumerate(ori_data.data):
        # Decode bytes, strip punctuation, lowercase, tokenize.
        doc = str(doc, encoding="utf-8")
        doc = punct.sub(" ", doc)
        for word in doc.lower().split():
            j = word_index.get(word)
            if j is not None:
                # Mark presence (binary bag-of-words, not counts).
                doc_re[count][j] = 1
    # BUG FIX: the original printed representTime - time(), a negative value.
    print("Represent doc time:{0}\nThe number of doc:{1}".format(
        time() - represent_start, len(doc_re)))
    return doc_re

def pre_probabilty(ori_data):
    """Return the class priors [P(normal), P(spam)] with add-one smoothing.

    Relies on the module-level counts ``normal`` and ``spam`` assigned in
    the main script.
    """
    # Laplace smoothing: one pseudo-document per class in the denominator.
    total = len(ori_data.data) + 2.0
    return [(normal + 1.0) / total, (spam + 1.0) / total]

#计算每个词在正常邮件垃圾邮件中的数目

def wordNum_email(email_repre, wordDic):
    """Count, per word, how many normal / spam training emails contain it.

    Rows [0, normal) of email_repre are normal emails, rows
    [normal, normal + spam) are spam (``normal`` and ``spam`` are
    module-level globals set in the main script).
    Returns a (2, len(wordDic)) array: row 0 = normal counts, row 1 = spam.
    """
    # BUG FIX: numpy.int was removed in NumPy 1.24 -- use builtin int.
    num_word = numpy.zeros((2, len(wordDic)), dtype=int)
    for i in range(len(wordDic)):
        # Occurrences among the normal emails (first ``normal`` rows).
        for j in range(normal):
            num_word[0][i] += email_repre[j][i]
        # Occurrences among the spam emails (next ``spam`` rows).
        for j in range(normal, spam + normal):
            num_word[1][i] += email_repre[j][i]
    return num_word

#条件概率

def con_probabilty(email_repre, wordDic):
    """Laplace-smoothed conditional probabilities P(word | class).

    Returns a (2, len(wordDic)) double array: row 0 for the normal class,
    row 1 for spam, each entry rounded to 8 decimal places. Uses the
    module-level counts ``normal`` and ``spam``.
    """
    # Per-word occurrence counts in each class.
    word_num = wordNum_email(email_repre, wordDic)
    word_pro = numpy.zeros((2, len(wordDic)), dtype=numpy.double)
    for i, _ in enumerate(wordDic):
        # Add-one smoothing over the per-class document counts.
        word_pro[0][i] = round((word_num[0][i] + 1) / (normal + 2), 8)
        word_pro[1][i] = round((word_num[1][i] + 1) / (spam + 2), 8)
    return word_pro

#得到每个类别中的文档数

def class_num(path, class_name):
    """Return the number of files under path/class_name (recursive walk).

    Yields 0 when the directory does not exist, matching the original
    behavior (os.walk over a missing path produces nothing).
    """
    # os.path.join handles separators portably; summing len(files) per
    # directory replaces the one-by-one counter loop.
    target = os.path.join(path, class_name)
    return sum(len(files) for _, _, files in os.walk(target))

#测试

def test_spam(test_repre, pre_pro, con_pro):
    """Classify the test emails with naive Bayes and report accuracy.

    For each document, each class score starts at its (rounded) prior and
    is multiplied by P(word | class) for every word present; the larger
    score wins, ties are broken by a coin flip. Scoring assumes the first
    ``normal_test`` documents (module-level global) are normal and the
    rest spam. Returns the fraction classified correctly.

    NOTE(review): multiplying many small probabilities can underflow to
    0.0 for long documents; log-space scoring would be safer — left as-is
    to preserve behavior.
    """
    n_docs = len(test_repre)
    email_pro = numpy.zeros((n_docs, 2), dtype=numpy.double)
    email_judge = []
    normal_num = 0
    spam_num = 0
    for i in range(n_docs):
        # Seed each class score with its prior.
        email_pro[i][0] = round(pre_pro[0], 8)
        email_pro[i][1] = round(pre_pro[1], 8)
        # Multiply in the conditional probability of every present word.
        for j in range(len(test_repre[0])):
            if test_repre[i][j] != 0:
                email_pro[i][0] *= con_pro[0][j]
                email_pro[i][1] *= con_pro[1][j]
        if email_pro[i][0] > email_pro[i][1]:
            email_judge.append(0)
        elif email_pro[i][0] < email_pro[i][1]:
            email_judge.append(1)
        else:
            # Exact tie: decide by coin flip.
            email_judge.append(1 if random.random() > 0.5 else 0)
    # First normal_test docs are known normal, the remainder known spam.
    for i in range(normal_test):
        if email_judge[i] == 0:
            normal_num += 1
    for i in range(normal_test, n_docs):
        if email_judge[i] == 1:
            spam_num += 1
    print("email_judge=")
    print(email_judge)
    print("normal_num=" + str(normal_num) + "\nspam_num=" + str(spam_num))
    return (normal_num + spam_num) / n_docs

if __name__ == "__main__":
    # Training and test corpora: one subdirectory per class ("pos"/"neg").
    train_path = "D:/Python/Python_learning/Book_code/LH_mechine_learning/bayes/spamDataset/email/train1"
    test_path = "D:/Python/Python_learning/Book_code/LH_mechine_learning/bayes/spamDataset/email/test1"

    train_list = load_data(train_path)
    test_list = load_data(test_path)

    # Per-class document counts of the training set (read as globals by
    # pre_probabilty / wordNum_email / con_probabilty).
    normal = class_num(train_path, "pos")  # number of normal emails
    spam = class_num(train_path, "neg")    # number of spam emails

    # Vocabulary and vectorized training data.
    WordDictionary = word_create(train_list)
    docRepre = doc_represent(WordDictionary, train_list)

    # Prior and conditional probabilities of the naive Bayes model.
    prePro = pre_probabilty(train_list)
    conPro = con_probabilty(docRepre, WordDictionary)
    print("\npreProbablity:", prePro)
    print("conProbablity:", conPro)

    # Vectorize the test set and evaluate.
    testRepre = doc_represent(WordDictionary, test_list)
    normal_test = class_num(test_path, "pos")  # normal emails in the test set
    spam_test = class_num(test_path, "neg")    # spam emails in the test set
    test_accuracy = test_spam(testRepre, prePro, conPro)
    print ("test accuracy")
    print(test_accuracy)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值