机器学习：使用逻辑回归对垃圾邮件分类

m0_56970979

已于 2024-04-26 12:18:39 修改

阅读量212

点赞数 3

文章标签：机器学习 逻辑回归 分类 python sklearn

于 2024-04-26 12:11:34 首次发布

本文链接：https://blog.csdn.net/m0_56970979/article/details/138214795

版权

先附上代码

from sklearn.model_selection import train_test_split # 拆分数据集
from sklearn.feature_extraction.text import CountVectorizer # 文本数据向量化
from sklearn.linear_model import LogisticRegression # 分类模型
from sklearn.metrics import accuracy_score # 精度
from sklearn.datasets import load_iris
import string # 字符串处理

def text_process(text):
    """
    Clean a single text sample.

    Contraction fragments, newlines and a few chat abbreviations are
    substituted first; afterwards every ASCII punctuation character is removed.

    :param text: input text
    :return: the cleaned text
    """
    # (pattern, replacement) pairs, applied one after another in this order.
    substitutions = (
        ("'m", ''),
        ("'re", ''),
        ("'s", ''),
        ('\n', ''),
        (' c u ', ' see you '),
        ("n't", ' not'),
        (' u ', ' you '),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    # Delete every character listed in string.punctuation in a single pass.
    punctuation_table = str.maketrans('', '', string.punctuation)
    return text.translate(punctuation_table)

def file_process(filename):
    """
    Read the data file (all collected samples) and clean every message.

    Each non-blank line is split on the tab character: the first field is the
    label and the second field is the message text. The message is lowercased
    and passed through ``text_process``. Blank lines (for example a trailing
    empty line at the end of the file) are skipped instead of aborting the load.

    :param filename: path of the data file, read as UTF-8
    :return: ``(contents, labels)`` - the cleaned message texts and the
        matching labels, where 1 means the first field was ``'ham'`` and 0
        means anything else
    :raises IndexError: if a non-blank line contains no tab separator
    """
    labels = []    # one label per sample
    contents = []  # one cleaned message per sample
    with open(filename, 'r', encoding='utf-8') as f:
        # Iterate the file object directly rather than loading all lines first.
        for line in f:
            # In the data file the label and the message are separated by a tab.
            target = line.split('\t')
            # A blank line carries no sample; indexing target[1] would fail on it.
            if len(target) < 2 and not line.strip():
                continue
            label = 1 if target[0] == 'ham' else 0
            # Lowercase the message before cleaning it.
            content = text_process(target[1].lower())
            labels.append(label)
            contents.append(content)
    return contents, labels

# Load the cleaned messages and their labels from the data file.
contents, label = file_process("spam_set.txt")

# The first 4000 samples are used for training, the remaining ones for testing.
train_size = 4000
train_texts, test_texts = contents[:train_size], contents[train_size:]
Y_train_vertors, Y_test_vertors = label[:train_size], label[train_size:]

# Vectorise the text: the vectorizer is fitted on the training texts only and
# then applied unchanged to the test texts.
vectorizer = CountVectorizer()
X_train_vectors = vectorizer.fit_transform(train_texts)
X_test_vectors = vectorizer.transform(test_texts)

# Train the classifier on the training split.
clf = LogisticRegression(random_state=0)
clf.fit(X_train_vectors, Y_train_vertors)

# Predict on the held-out samples (class labels and log-probabilities).
res = clf.predict(X_test_vectors)
res_prob = clf.predict_log_proba(X_test_vectors)

# Estimate the model's accuracy on the test split.
# NOTE: the original author reports roughly 97.8% here; not verified.
score = accuracy_score(Y_test_vertors, res)
print(score)