贝叶斯垃圾邮件分类

贝叶斯垃圾邮件分类

数据地址
链接:https://pan.baidu.com/s/1_uNOllcSQoT3kn4ijb7ThQ
提取码:yxk2
导入需要的包

import numpy as np
import pandas as pd
# NOTE: sklearn.externals.joblib was removed in scikit-learn 0.23; on newer
# installs this import fails and the standalone `joblib` package is needed.
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

设置路径

VOCAB_PATH = './data/vocab.txt' # path the vocabulary is saved to
DATA_PATH = './data/final_data.csv' # path of the training data

读取数据划分训练集与测试集

# Load the labelled messages: column 0 is the class label, column 1 the text.
data = pd.read_csv(DATA_PATH, sep=',')
labels = data.iloc[:, 0]
datas = data.iloc[:, 1]

# Keep 80% of the rows for training; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    datas, labels, train_size=0.8, random_state=0
)

# Discard the original row labels so every split is indexed 0..n-1.
xTrain, yTrain, xTest, yTest = (
    part.reset_index(drop=True)
    for part in (x_train, y_train, x_test, y_test)
)

构建词表

def tokenize(message):
    """Split every entry of *message* into a list of its characters.

    Each entry is passed through ``str`` first, so non-string values are
    split into the characters of their text representation.

    Returns a list holding one character list per entry, in input order.
    """
    return [list(str(entry)) for entry in message]

def creatVocablist(doclist):
    """Return the distinct tokens found across all documents.

    Args:
        doclist: iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list containing every distinct token once. The order follows set
        iteration order and is therefore not guaranteed.
    """
    vocabset = set()
    for document in doclist:
        # Update in place: ``vocabset | set(document)`` would copy the whole
        # accumulated vocabulary once per document.
        vocabset.update(document)
    return list(vocabset)
# Build the vocabulary from the training messages.
doclist = tokenize(xTrain)
word_dict_set = creatVocablist(doclist)
vocab = pd.DataFrame(word_dict_set)
# Save the vocabulary to VOCAB_PATH without a header row or index column.
vocab.to_csv(VOCAB_PATH, header=None, index=None)

将数据转换为向量

# Number of vocabulary entries; used as the column count of the feature matrix.
num_features = len(vocab)
# Convert one message into a vocabulary-sized vector; positions of tokens that occur are set to 1.
def setOfWord2Vec(vocablist, inputSet):
    """Return a 0/1 presence vector of *inputSet* over *vocablist*.

    Args:
        vocablist: sequence of hashable vocabulary tokens; the position of a
            token in this sequence is its position in the result.
        inputSet: the message; it is converted with ``str`` and examined one
            character at a time.

    Returns:
        A list of ``len(vocablist)`` ints, with 1 where the token at that
        position appears in the message and 0 elsewhere. Characters missing
        from the vocabulary are ignored.
    """
    # One pass to build token -> first position (same result as list.index),
    # so each character costs a dict lookup instead of two list scans.
    positions = {}
    for idx, token in enumerate(vocablist):
        positions.setdefault(token, idx)

    returnVec = [0] * len(vocablist)
    for char in str(inputSet):
        idx = positions.get(char)
        if idx is not None:
            returnVec[idx] = 1
    return returnVec

def generateMat(data):
    """Build the feature matrix for a collection of messages.

    Args:
        data: sized iterable of messages (the script passes a pandas Series
            or a plain list).

    Returns:
        A NumPy array of shape ``(len(data), num_features)`` whose i-th row is
        ``setOfWord2Vec(word_dict_set, <i-th message>)``.

    Relies on the module-level ``num_features`` and ``word_dict_set``.
    """
    num_samples = len(data)
    feature = np.zeros((num_samples, num_features))
    # Walk the values positionally: ``data[i]`` is a label lookup on a pandas
    # Series and fails when its index is not exactly 0..n-1.
    for row, sample in enumerate(data):
        feature[row] = setOfWord2Vec(word_dict_set, sample)
    return feature

# Vectorise the training split first, then the test split.
xTrain_future, xTest_future = (
    generateMat(split) for split in (xTrain, xTest)
)

模型训练

# fit() returns the estimator itself, so construction and training can be chained.
model = MultinomialNB().fit(xTrain_future, yTrain)

模型测试

# Predict the held-out messages and report the fraction classified correctly.
predict_classify = model.predict(xTest_future)
count = sum(
    1 for truth, guess in zip(yTest, predict_classify) if truth == guess
)
print("准确率:", count / len(predict_classify))

保存模型

# Write the fitted classifier to disk.
joblib.dump(model, './data/bayes.pkl')

加载模型使用

# Reload the saved classifier and classify a single example message.
bayes_model = joblib.load('./data/bayes.pkl')
text = "可以借助坦克作战工具参与战斗"
datas = [text]
datas_future = generateMat(datas)
predict_classify = bayes_model.predict(datas_future)
# Bare expression: shows the prediction when run as a notebook cell.
predict_classify
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值