# 贝叶斯垃圾邮件分类
# 数据地址
# 链接:https://pan.baidu.com/s/1_uNOllcSQoT3kn4ijb7ThQ
# 提取码:yxk2
# 导入需要的包
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# NOTE: sklearn.externals.joblib was removed in scikit-learn 0.23; on newer
# versions replace the next line with ``import joblib``.
from sklearn.externals import joblib
# Paths
VOCAB_PATH = './data/vocab.txt'  # where the vocabulary is written
DATA_PATH = './data/final_data.csv'  # labelled training data

# Load the data and split it into a training set and a test set
data = pd.read_csv(DATA_PATH, sep=',')
datas = data.iloc[:, 1]  # second column: message text
labels = data.iloc[:, 0]  # first column: message class
x_train, x_test, y_train, y_test = train_test_split(
    datas, labels, train_size=0.8, random_state=0
)
# Drop the original row labels so every split is indexed 0..n-1
xTrain, yTrain, xTest, yTest = (
    part.reset_index(drop=True)
    for part in (x_train, y_train, x_test, y_test)
)
# 构建词表
def tokenize(message):
    """Split every message into a list of its individual characters.

    Args:
        message: Iterable of messages. Each item is converted with ``str()``
            before splitting, so non-string items are tokenized by their
            string form.

    Returns:
        A list with one entry per message; each entry is the list of
        single-character tokens of that message, in order.
    """
    # list(str) yields one element per character, which is exactly the
    # character-level tokenization the explicit append loops produced.
    return [list(str(line)) for line in message]
def creatVocablist(doclist):
    """Collect the distinct tokens that occur in any tokenized document.

    Args:
        doclist: Iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list of the unique tokens, ordered by first appearance.
    """
    # dict keys are unique and keep insertion order (guaranteed from
    # Python 3.7), so the vocabulary order is reproducible. Iterating a set
    # of str is not stable across interpreter runs because of hash
    # randomization, which would shuffle the feature indices between runs.
    return list(
        dict.fromkeys(token for document in doclist for token in document)
    )
# Build the vocabulary from the training messages
doclist = tokenize(xTrain)
word_dict_set = creatVocablist(doclist)
vocab = pd.DataFrame(word_dict_set)
# Save the vocabulary, one token per row, without header or index column
vocab.to_csv(VOCAB_PATH, header=False, index=False)

# Convert the data to vectors
num_features = len(vocab)  # one feature per vocabulary token
def setOfWord2Vec(vocablist, inputSet):
    """Encode one message as a binary presence vector over the vocabulary.

    Args:
        vocablist: Sequence of hashable vocabulary tokens; the position of a
            token in this sequence is its feature index.
        inputSet: The message. It is converted with ``str()`` and examined
            character by character.

    Returns:
        A list of ``len(vocablist)`` ints; element ``i`` is 1 when
        ``vocablist[i]`` occurs as a character of the message, otherwise 0.
    """
    # Build token -> index once instead of scanning the list (``in`` plus
    # ``.index``) for every character. setdefault keeps the first position
    # of a duplicated token, matching what list.index would return.
    position = {}
    for idx, token in enumerate(vocablist):
        position.setdefault(token, idx)

    returnVec = [0] * len(vocablist)
    # Only presence matters, so each distinct character is looked up once.
    for char in set(str(inputSet)):
        idx = position.get(char)
        if idx is not None:
            returnVec[idx] = 1
    return returnVec
def generateMat(data):
    """Vectorize a collection of messages into a 2-D feature matrix.

    Relies on the module-level ``word_dict_set`` (the vocabulary) and
    ``num_features`` (the row width).

    Args:
        data: Sized iterable of messages (e.g. a list or a pandas Series).

    Returns:
        A float numpy array of shape ``(len(data), num_features)``; row ``i``
        is the presence vector of the i-th message.
    """
    num_samples = len(data)
    feature = np.zeros((num_samples, num_features))
    # Iterate over the values instead of indexing data[i]: on a pandas Series
    # data[i] is a label lookup and only equals the i-th value when the index
    # happens to be 0..n-1.
    for row, message in enumerate(data):
        feature[row] = setOfWord2Vec(word_dict_set, message)
    return feature
# Vectorize both splits
xTrain_future = generateMat(xTrain)
xTest_future = generateMat(xTest)

# Model training
model = MultinomialNB()
model.fit(xTrain_future, yTrain)

# Model evaluation
predict_classify = model.predict(xTest_future)
# Count the test messages whose predicted label equals the true label.
# Pairing the values with zip avoids label-based yTest[i] lookups.
count = sum(
    1
    for predicted, actual in zip(predict_classify, yTest)
    if predicted == actual
)
print("准确率:", count/len(predict_classify))
# Save the model
MODEL_PATH = './data/bayes.pkl'  # single definition of the model file path
joblib.dump(model, MODEL_PATH)

# Load the model and use it
# NOTE: joblib.load unpickles the file and can execute arbitrary code; only
# load model files that come from a trusted source.
bayes_model = joblib.load(MODEL_PATH)
text = "可以借助坦克作战工具参与战斗"
datas = [text]
datas_future = generateMat(datas)
predict_classify = bayes_model.predict(datas_future)
# A bare expression only displays a value inside a notebook; print it so the
# prediction is also visible when this file is run as a script.
print(predict_classify)