数据集链接: https://plg.uwaterloo.ca/cgi-bin/cgiwrap/gvcormac/foo06
1. 数据集介绍
数据集中包含大约 64620 封中文邮件, 其中垃圾邮件有 42854 封, 正常的邮件有 21766 封. 通过使用朴素贝叶斯算法实现对垃圾邮件的分类.
2. 案例实现
2.0 导入所需库
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import codecs
import re
import jieba
import time
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.model_selection import train_test_split
import joblib
from collections import Counter
import random
# Intermediate CSV files produced by the pipeline stages below:
filename1 = 'data/trec06c/data1.csv'  # sampled raw emails (content + label)
filename2 = 'data/trec06c/data2.csv'  # cleaned, word-segmented emails
filename3 = 'data/trec06c/data3.csv'  # term-frequency vectors + numeric labels
2.1 读取邮件数据
def email_from_file():
    """Read the trec06c corpus, sample spam/ham emails, and save them as CSV.

    Walks the index file, loads every referenced email (GBK-decoded,
    decode errors ignored), then samples `sample_number` spam and
    `sample_number` ham emails and writes them to `filename1`.
    """
    email_labels = []
    email_contents = []
    # Number of emails to sample from each class.
    sample_number = 500
    # Read the master index; `with` closes the handle instead of leaking it.
    with open('data/trec06c/full/index', errors='ignore') as index_file:
        for line in index_file:
            # Each index line is "<label> <relative path>", e.g. "spam ../data/000/000".
            label, data = line.strip().split()
            # Strip the leading ".." and anchor the path under the corpus root.
            file_name = 'data/trec06c' + data[2:]
            # Emails are GBK-encoded; close each file deterministically.
            with codecs.open(file_name, 'r', 'gbk', errors='ignore') as f:
                file_data = f.read()
            email_labels.append(label)
            email_contents.append(file_data)
    # Randomly sample `sample_number` spam and `sample_number` ham emails.
    email_data = pd.DataFrame({'content': email_contents, 'label': email_labels})
    spam_email = email_data[email_data['label'] == 'spam'].sample(sample_number)
    ham_email = email_data[email_data['label'] == 'ham'].sample(sample_number)
    # Concatenate both samples and persist them.
    email_data = pd.concat([spam_email, ham_email])
    email_data.to_csv(filename1)
2.2 邮件数据处理
def preprocessin_email():
    """Clean and word-segment the sampled emails, writing the result to `filename2`.

    For each email: strip newlines, drop every non-CJK character, collapse
    whitespace, then tokenize with jieba (tokens joined by single spaces).
    """
    # 1. Load the sampled emails produced by email_from_file()
    #    (use the shared filename constant instead of a duplicated literal).
    email_data = pd.read_csv(filename1)
    # Limit the working set to keep preprocessing time manageable.
    email_data = email_data[:1000]
    # 2. Preprocess each email body.
    contents = []
    start = time.time()
    for index, email in enumerate(email_data['content'], 1):
        # 1. Remove line breaks.
        email = email.replace('\n', ' ')
        # 2. Keep CJK characters only (raw string avoids invalid-escape warnings).
        email = re.sub(r'[^\u4e00-\u9fff]', '', email)
        # 3. Collapse redundant whitespace.
        email = ' '.join(email.split())
        # 4. Segment into words.
        email = ' '.join(jieba.lcut(email))
        contents.append(email)
        # Progress report; the full pass takes roughly 2.5 minutes.
        if index % 100 == 0:
            print('已预处理 %5d 封邮件, 用时: %.2fs' % (index, time.time() - start))
    # 3. Persist the cleaned corpus (dropped the dead `data =` binding).
    pd.DataFrame({'content': contents, 'label': email_data['label']}).to_csv(filename2)
2.3 邮件数据向量化
def email_vectorize():
    """Turn the cleaned emails into term-frequency vectors and save them.

    Writes `filename3`: a CSV whose columns are per-word counts and whose
    final column is the numeric label (0 = ham, 1 = spam).
    """
    # 1. Load the cleaned, word-segmented corpus.
    email = pd.read_csv(filename2)
    # 2. Build the term-frequency matrix, filtering stop words.
    stop_words = []
    # Stop-word list is GBK-encoded; `with` closes the handle instead of leaking it.
    with open('data/trec06c/stoplist.txt', 'r', encoding='gbk') as f:
        for word in f:
            stop_words.append(word.strip())
    transformer = CountVectorizer(stop_words=stop_words)
    x = transformer.fit_transform(email['content']).toarray()
    print(x.shape)
    # Encode labels: 0 = ham (normal), 1 = spam.
    y = np.where(email['label'].values == 'ham', 0, 1)
    # 3. Persist features with the label appended as the last column.
    data = pd.DataFrame(x)
    data[x.shape[1]] = y
    data.to_csv(filename3)
2.4 模型训练与评估
def email_training():
    """Train and evaluate a multinomial naive-Bayes spam classifier.

    Loads the vectorized data, splits it 80/20 stratified by label, fits
    MultinomialNB, saves the model to disk, and prints the test accuracy.
    """
    # 1. Load features/labels. index_col=0 drops the CSV row index written
    #    by to_csv; without it the index was silently used as a feature.
    data = pd.read_csv(filename3, index_col=0)
    x = data.iloc[:, :-1]
    y = data.iloc[:, -1]
    # 2. Stratified split keeps the spam/ham ratio equal in both sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
    # 3. Fit the model (word counts suit the multinomial event model).
    estimator = MultinomialNB()
    estimator.fit(x_train, y_train)
    # 4. Persist the fitted model.
    joblib.dump(estimator, 'model/multinomialnb.pth')
    # 5. Evaluate on the held-out set.
    accuracy = estimator.score(x_test, y_test)
    print('预测准确率:', accuracy)
if __name__ == '__main__':
    # Run the full pipeline: sample -> preprocess -> vectorize -> train.
    email_from_file()
    # Fix: the preprocessing function is named preprocessin_email;
    # the original call to email_processing() raised NameError.
    preprocessin_email()
    email_vectorize()
    email_training()