参考链接:
- http://blog.lisp4fun.com/2018/03/09/bayes
- https://www.kaggle.com/nnitiwe/spam-detection-with-sklearn
数据集:
- spam.csv:在 Kaggle 网站下载
- ChnSentiCorp情感分析酒店评论:https://pan.baidu.com/s/1hsF1Zbm
垃圾邮件分类
'''
垃圾邮件识别
'''
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# [1] Load the data
spam_file = r"E:\机器学习数据集\sms-spam-collection-dataset\spam.csv"
to_drop = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
# Only v1 (the ham/spam tag) and v2 (the message text) are used below, so the
# three unnamed columns are discarded straight after loading.
df = pd.read_csv(spam_file, engine='python').drop(columns=to_drop)
# Numeric target column: spam -> 0, ham -> 1.
label_codes = {'spam': 0, 'ham': 1}
df['encoded_label'] = df['v1'].map(label_codes)
print(df.head())

# [2] Prepare the data
# Split into train and test; features are the message texts (v2) and the
# targets are the numeric labels derived from v1.
# NOTE(review): test_size=0.7 leaves only 30% of the rows for training —
# confirm this ratio is intended.
messages = df['v2']
targets = df['encoded_label']
train_data, test_data, train_label, test_label = train_test_split(
    messages, targets, test_size=0.7, random_state=0)

# Turn each message into a token-count vector. The vocabulary is fitted on
# the training messages only and then reused for the test messages.
c_v = CountVectorizer(decode_error='ignore')
train_data = c_v.fit_transform(train_data)
test_data = c_v.transform(test_data)
# Optional: visualise the document-term matrix with
# plt.matshow(train_data.toarray()); plt.show()

# Train a multinomial naive Bayes classifier and evaluate it
from sklearn import naive_bayes as nb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

clf = nb.MultinomialNB()
model = clf.fit(train_data, train_label)
predicted_label = model.predict(test_data)

print("train score:", clf.score(train_data, train_label))
print("test score:", clf.score(test_data, test_label))
print("Classifier Accuracy:", accuracy_score(test_label, predicted_label))
print("Classifier Report:\n", classification_report(test_label, predicted_label))
print("Confusion Matrix:\n", confusion_matrix(test_label, predicted_label))
结果如下:
ChnSentiCorp酒店评价数据文本分析
'''
读取文本数据集ChnSentiCorp情感分析酒店评论,将其转化为词向量
'''
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import pathlib
# [1] Load the data
# Root folder of the hotel-review corpus; it is passed to
# read_files_from_dir() below, which labels files by sub-folder name.
data_dir = r"D:\Datasets\ChnSentiCorp情感分析酒店评论"
def read_files_from_dir(dir):
    '''
    Collect the ChnSentiCorp hotel-review files stored under ``dir``.

    Each immediate sub-directory of ``dir`` is treated as one class: every
    file found anywhere beneath it is returned together with that
    sub-directory's name as its label. Files placed directly in ``dir``
    are ignored.

    Args:
        dir: Path of the dataset root folder.

    Returns:
        A two-element list ``[file_names, labels]`` of equal-length lists,
        where ``labels[i]`` is the class-folder name of ``file_names[i]``.
        Both lists are empty when ``dir`` does not exist or contains no
        sub-directories. Entries are ordered by class-folder name, then by
        traversal order with names sorted inside each folder.
    '''
    file_names = []
    labels = []
    # Only the first level below ``dir`` defines the classes, so take the
    # first os.walk() entry instead of iterating over the whole tree (deeper
    # entries would be joined onto ``dir`` and point at missing folders).
    _, class_dirs, _ = next(os.walk(dir), (dir, [], []))
    for directory in sorted(class_dirs):
        class_root = os.path.join(dir, directory)
        for root, sub_dirs, files in os.walk(class_root):
            # Sorting in place makes os.walk descend in a stable order.
            sub_dirs.sort()
            for file in sorted(files):
                # Join with the folder actually being visited so that files
                # in nested folders get a path that really exists.
                file_names.append(os.path.join(root, file))
                labels.append(directory)
    return [file_names, labels]
# Gather every review file path together with its folder-name label.
files_path,labels = read_files_from_dir(data_dir)
# Show the first path as a quick sanity check (raises IndexError when no
# file was found).
print(files_path[0])
# Convert the text labels into numeric labels
from sklearn.preprocessing import LabelEncoder
# Build the encoder
le = LabelEncoder()
# Encode
labels = le.fit_transform(labels)
def read_data(files_path):
    '''
    Load the text content of every file listed in ``files_path``.

    Args:
        files_path: Iterable of file paths to read.

    Returns:
        A list with one string per path, in the same order as the input,
        each file being decoded as UTF-8.
    '''
    return [
        pathlib.Path(path).read_text(encoding='utf-8')
        for path in files_path
    ]
data = read_data(files_path)
# Every document needs exactly one label. Raise explicitly instead of using
# assert, which is stripped when Python runs with -O.
if len(labels) != len(data):
    raise ValueError(
        "got {} documents but {} labels".format(len(data), len(labels)))

# [2] Prepare the data
# Shuffle documents and labels together. The shuffle is seeded so that the
# whole run is reproducible, in line with the fixed random_state of the
# train/test split below.
data, labels = shuffle(data, labels, random_state=0)
# Split into train and test (20% of the documents are held out for testing)
train_data, test_data, train_label, test_label = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=0)

# [3] Turn each document into a token-count vector with CountVectorizer.
# The vocabulary is fitted on the training documents only and then reused
# for the test documents.
# NOTE(review): the vectorizer runs with its default word tokenizer; for
# unsegmented Chinese text this likely yields clause-sized tokens — consider
# word segmentation or analyzer='char' and re-check the accuracy.
c_v = CountVectorizer(decode_error='ignore')
train_data = c_v.fit_transform(train_data)
test_data = c_v.transform(test_data)
# Optional: visualise the document-term matrix with
# plt.matshow(train_data.toarray()); plt.show()

# [4] Train a multinomial naive Bayes classifier and evaluate it
from sklearn import naive_bayes as nb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

clf = nb.MultinomialNB()
model = clf.fit(train_data, train_label)
predicted_label = model.predict(test_data)

print("train score:", clf.score(train_data, train_label))
print("test score:", clf.score(test_data, test_label))
print("Classifier Accuracy:", accuracy_score(test_label, predicted_label))
print("Classifier Report:\n", classification_report(test_label, predicted_label))
print("Confusion Matrix:\n", confusion_matrix(test_label, predicted_label))
结果如下:
从结果可以看出在这项任务中朴素贝叶斯方法的表现一般,分类器只获得了80%左右的准确率。