主要思路如下:
- MyBayes类包含词频统计和概率计算相关的属性。
- load方法读取CSV文件,对每个样本分词,并统计每个类别下每个词的词频。
- freq_to_prob方法将词频转化为条件概率,并计算各类别的先验概率。
- get_word_prob方法计算句子中各词在某个类别下的条件概率之积,get_class_prob方法再乘以该类别的先验概率,得到句子在该类别下的(未归一化)概率得分。
- classify方法对样本进行分类:
  1. 分词
  2. 对每个类别计算句子概率
  3. 按概率从高到低排序并输出各类别及其概率(排在首位的即为预测类别)

关键点:
- 使用jieba分词
- defaultdict统计词频
- 将词频转化为条件概率
- 使用贝叶斯公式(先验概率 × 各词条件概率之积)计算各类别的概率得分
- 返回概率最大的分类结果
实现了一个简单完整的文本分类器流程:
- 学习阶段:统计词频,计算概率
- 分类阶段:计算概率,排序并输出结果
适用于简单文本分类任务,后续可以增加更好的特征处理与优化模型实现更高效与精确的分类。
import jieba
from collections import defaultdict
import pandas as pd
import time
# Initialise jieba explicitly at import time; presumably this front-loads the
# dictionary loading so the first lcut() call is not slowed down -- confirm
# against the jieba documentation.
jieba.initialize()
class MyBayes:
    """Naive Bayes text classifier trained from a labelled review CSV.

    Training (``load`` followed by ``freq_to_prob``) counts jieba tokens per
    label and converts the counts into add-one-smoothed word likelihoods and
    label priors. ``classify`` then scores a sentence against every label
    seen during training.

    Attributes:
        word_freqs: ``label -> {word: raw count}`` gathered by ``load``.
        label_times: ``label -> number of reviews`` while loading; replaced
            by ``label -> prior probability`` once ``freq_to_prob`` has run.
        word_prob: ``label -> {word: smoothed P(word | label)}`` plus a
            ``'<unk>'`` entry used for words not seen under that label.

    Note:
        Scores are plain products of probabilities, so a very long sentence
        can underflow to ``0.0`` for every label.
    """

    def __init__(self, path):
        """Create the classifier and train it immediately from the CSV at *path*."""
        self.word_freqs = defaultdict(dict)
        self.label_times = defaultdict(int)
        self.load(path)

    def load(self, path):
        """Read the training CSV at *path* and build the probability tables.

        Each row's ``review`` column is tokenised with ``jieba.lcut`` and the
        tokens are counted under the row's ``label`` column.

        Intended to be called once per instance: ``freq_to_prob`` overwrites
        the counts in ``label_times`` with priors, so a second call would mix
        new counts into already-normalised values.
        """
        df = pd.read_csv(path)
        for _, row in df.iterrows():
            label_name = row['label']
            words = jieba.lcut(row['review'])
            self.label_times[label_name] += 1
            word_freq = self.word_freqs[label_name]
            for word in words:
                word_freq[word] = word_freq.get(word, 0) + 1
        self.freq_to_prob()

    def freq_to_prob(self):
        """Turn the raw counts into label priors and smoothed word likelihoods.

        ``label_times`` is replaced by ``label -> count / total count``.

        ``word_prob[label][word]`` becomes ``(count + 1) / (N + V + 1)`` where
        ``N`` is the number of tokens counted under the label and ``V`` is the
        number of distinct words across all labels. ``'<unk>'`` gets
        ``1 / (N + V + 1)``. The extra ``+ 1`` reserves one slot for unknown
        words, so each label's distribution over the vocabulary plus
        ``'<unk>'`` sums to exactly 1 and no single probability can reach or
        exceed 1. The denominator is always at least 1, so a label whose
        reviews produced no tokens no longer causes a division by zero.
        """
        total_label_times = sum(self.label_times.values())
        self.label_times = {
            label: count / total_label_times
            for label, count in self.label_times.items()
        }

        # Vocabulary shared by all labels: a word seen only under another
        # label falls back to '<unk>', which equals its add-one estimate.
        vocabulary_size = len(set().union(*self.word_freqs.values()))

        self.word_prob = defaultdict(dict)
        for classname, word_dict in self.word_freqs.items():
            denominator = sum(word_dict.values()) + vocabulary_size + 1
            class_prob = self.word_prob[classname]
            for word, count in word_dict.items():
                class_prob[word] = (count + 1) / denominator
            class_prob['<unk>'] = 1 / denominator

    def get_word_prob(self, words, classname):
        """Return the product of ``P(word | classname)`` over *words*.

        Words without an entry for *classname* use that label's ``'<unk>'``
        probability. An empty *words* yields ``1``.
        """
        result = 1
        for word in words:
            class_prob = self.word_prob[classname]
            result *= class_prob.get(word, class_prob['<unk>'])
        return result

    def get_class_prob(self, words, classname):
        """Return ``prior(classname) * get_word_prob(words, classname)``.

        This is the unnormalised score of *classname* for the token list.
        """
        p_x = self.label_times[classname]
        p_w_x = self.get_word_prob(words, classname)
        return p_x * p_w_x

    def classify(self, sentence):
        """Tokenise *sentence*, score every known label and print the ranking.

        Label ``1`` is displayed as "好评"; every other label as "差评".

        Returns:
            The ``[display_name, score]`` pairs sorted by descending score
            (the first entry is the predicted class).
        """
        words = jieba.lcut(sentence)
        results = [
            ["好评" if classname == 1 else "差评",
             self.get_class_prob(words, classname)]
            for classname in self.label_times
        ]
        results.sort(key=lambda item: item[1], reverse=True)
        for display_name, prob in results:
            print("属于{}的概率{}".format(display_name, prob))
        return results
if __name__ == "__main__":
    # Demo entry point: train on the practice CSV, then print the ranked
    # class scores for a single sample review.
    classifier = MyBayes('./文本分类练习.csv')
    classifier.classify('不好吃')