基于朴素贝叶斯实现文本分类

彤小彤_tong

已于 2022-11-13 18:29:56 修改

阅读量973

点赞数 4

文章标签： python

于 2022-11-13 17:53:36 首次发布

本文链接：https://blog.csdn.net/m0_64669072/article/details/127832376

版权

基于朴素贝叶斯实现文本分类

数据集介绍

数据集为网上公开的新闻数据，共包含10个类别。

模型选择

贝叶斯分类
贝叶斯公式
朴素贝叶斯
拉普拉斯平滑引入
若某个属性的条件概率为0，则会导致整体概率为0。为了避免这种情况，引入拉普拉斯平滑：在统计条件概率时给每个属性的计数加上一个固定的平滑值（例如 alpha=1），使条件概率不再为0。

文本分类过程

sklearn包名介绍

sklearn.metrics: Metrics 度量（主要各种指标）
Model Selection Interface
Classification metrics
Regression metrics
Multilabel ranking metrics
Clustering metrics
Biclustering metrics
Pairwise metrics
sklearn.naive_bayes: Naive Bayes 朴素贝叶斯
导入包：

import random
import jieba
from sklearn import model_selection
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import re, string  #re库是Python关于正则表达式的一个内置模块,用于字符串匹配

数据准备
数据预处理：主要是去除文本中的标点符号，并分词
jieba分词，主要是过滤其中特殊字符

def text_to_words(file_path):
    """Read a labelled text file and return tokenized sentences with labels.

    Each line is split on the ``_!_`` separator: the second field is taken
    as the label and the last field as the text. The text is stripped of
    surrounding whitespace, cleaned of punctuation, and segmented with
    ``jieba.lcut`` in precise (non-full) mode.

    Args:
        file_path: Path to a UTF-8 encoded data file.

    Returns:
        A tuple ``(sentence_arr, lab_arr)`` where ``sentence_arr`` is a list
        of token lists and ``lab_arr`` is the list of label strings, in file
        order.
    """
    # Raw string so the regex escapes reach the regex engine unchanged
    # instead of being treated as (invalid) Python string escapes.
    punctuation = re.compile(
        r"[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”！，。？、~@#￥%……&*（）《》：]+"
    )
    sentence_arr = []
    lab_arr = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.split("_!_")
            # Blank or malformed lines have no label field; skip them
            # rather than failing with IndexError.
            if len(fields) < 2:
                continue
            lab_arr.append(fields[1])
            sentence = punctuation.sub("", fields[-1].strip())
            sentence_arr.append(jieba.lcut(sentence, cut_all=False))
    return sentence_arr, lab_arr

加载停用词表

def load_stopwords(file_path):
    """Load a stopword list, one entry per line.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        A list with one string per line of the file, each stripped of
        leading and trailing whitespace (blank lines yield empty strings).
    """
    # The context manager guarantees the file handle is closed, which the
    # bare open(...).readlines() form did not.
    with open(file_path, encoding="utf8") as f:
        return [line.strip() for line in f]

遍历数据，去除停用词，统计词频，生成字典

def get_dict(sentence_arr, stopwords):
    """Count word frequencies across tokenized sentences, ignoring stopwords.

    Only tokens for which ``str.isalpha()`` is true are counted, which
    excludes spaces, digits, punctuation and empty strings.

    Args:
        sentence_arr: Iterable of tokenized sentences (each an iterable of
            strings).
        stopwords: Collection of words to exclude from the counts.

    Returns:
        A list of ``(word, count)`` tuples sorted by count, highest first.
        Words with equal counts keep the order in which they were first seen.
    """
    # Build the set once so each membership test is O(1) instead of a scan
    # over the whole stopword list.
    stopword_set = set(stopwords)
    word_dic = {}
    for sentence in sentence_arr:
        for word in sentence:
            if word.isalpha() and word not in stopword_set:
                # Start from 0 so the stored value is the true occurrence count.
                word_dic[word] = word_dic.get(word, 0) + 1
    # Sort by frequency (not by the word itself) so that the leading entries
    # are the most common words.
    return sorted(word_dic.items(), key=lambda item: item[1], reverse=True)

构建特征词表：取词典 word_dic 中排在最前面的 word_num 个单词作为特征词，即词向量维度为 word_num；return: 特征词列表

def get_feature_words(word_dic, word_num):
    """Return the leading words of a word/count sequence.

    Args:
        word_dic: Iterable of entries whose first element is the word,
            e.g. ``(word, count)`` tuples.
        word_num: Maximum number of words to take from the front.

    Returns:
        A list holding the first element of each of the first ``word_num``
        entries, in their original order.
    """
    return [
        entry[0]
        for position, entry in enumerate(word_dic)
        if position < word_num
    ]

文本特征表示，根据特征词，将数据集中的句子转化为特征向量

def get_text_features(train_data_list, test_data_list, feature_words):
    """Encode tokenized texts as binary presence vectors over feature_words.

    Args:
        train_data_list: Tokenized training texts (each a sequence of words).
        test_data_list: Tokenized test texts (each a sequence of words).
        feature_words: Ordered vocabulary; one vector position per word.

    Returns:
        A tuple ``(train_feature_list, test_feature_list)``. Each element is
        a list with one vector per text, holding 1 where the feature word
        occurs in the text and 0 otherwise.
    """
    def vectorize(tokens):
        # A set gives constant-time membership checks per feature word.
        present = set(tokens)
        return [int(term in present) for term in feature_words]

    def vectorize_all(corpus):
        return [vectorize(tokens) for tokens in corpus]

    return vectorize_all(train_data_list), vectorize_all(test_data_list)

调用上述函数，完成词表构建

# Pipeline: load and tokenize the data, build the word-frequency list,
# split into train/test sets, then encode both sets as feature vectors.
sentences_arr, lab_arr = text_to_words('news_classify_data.txt')   # read the dataset; get tokenized sentences and labels
stopwords = load_stopwords('stopwords_cn.txt')         # load the stopword file into a list
word_dic = get_dict(sentences_arr,stopwords)           # drop stopwords and count word frequencies
# NOTE(review): word_dic is built from the full dataset before the split
# below, so the vocabulary also reflects the held-out test sentences.
# Split the data into training and test sets (test_size=0.1).
train_data_list, test_data_list, train_class_list, test_class_list = model_selection.train_test_split(sentences_arr, 
                                                                                                      lab_arr, 
                                                                                                      test_size=0.1)
feature_words =  get_feature_words(word_dic,1000)   # build the feature-word list; vector dimension is at most 1000

# Encode the training and test sentences as feature vectors.
train_feature_list,test_feature_list = get_text_features(train_data_list,test_data_list,feature_words)

查看特征向量，分析输出结果

# Inspect the vocabulary and one encoded training sample.
print(feature_words[:10])  # first ten feature words
# list.index raises ValueError if '年' or '月' is not in feature_words.
print(feature_words.index('年'),feature_words.index('月'))
print(train_data_list[0])  # first training sentence as a token list
print(train_feature_list[0])  # feature vector built from that same sentence

模型创建、训练和评估

# Train a multinomial Naive Bayes classifier; alpha=1.0 is the additive
# smoothing parameter, and class priors are fitted from the training labels
# (fit_prior=True, class_prior=None).
classifier = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
classifier.fit(train_feature_list, train_class_list)
predict = classifier.predict(test_feature_list)
# accuracy_score takes (y_true, y_pred); pass the ground truth first so the
# call matches the documented signature.
test_accuracy = accuracy_score(test_class_list, predict)
print("accuracy_score:%.4f" % test_accuracy)

模型预测

def load_sentence(sentence):
    """Strip punctuation from a raw sentence and segment it with jieba.

    Args:
        sentence: The raw input text.

    Returns:
        The list of tokens produced by ``jieba.lcut`` in precise (non-full)
        mode after punctuation and whitespace have been removed.
    """
    # Raw string so the regex escapes reach the regex engine unchanged
    # instead of being treated as (invalid) Python string escapes.
    sentence = re.sub(
        r"[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”！，。？、~@#￥%……&*（）《》：]+",
        "",
        sentence,
    )
    return jieba.lcut(sentence, cut_all=False)
# Category names, indexed by the integer value of the predicted label.
# NOTE(review): assumes the labels in the data file are integers 0-9 that
# index into this list - confirm against the dataset.
lab = [ '文化', '娱乐', '体育', '财经','房产', '汽车', '教育', '科技', '国际', '证券']

# Sample headline to classify.
# Alternative sample: '【中国稳健前行】应对风险挑战必须发挥制度优势'
p_data = '快评：欠薪俱乐部获一线生机，中国足协为何总是心太软，还债金元足球到几时？'
sentence = load_sentence(p_data)
# Wrap in a list: get_text_features expects a list of tokenized sentences.
sentence = [sentence]
print('分词结果:', sentence)
# Build the feature vector. The same single-sentence list is passed as both
# the train and test argument; only the first returned list is used below.
p_words = get_text_features(sentence, sentence, feature_words)
res = classifier.predict(p_words[0])
# predict returns an array with one entry per input row; index the single
# prediction explicitly instead of converting the whole array with int().
print("所属类型：", lab[int(res[0])])