Example: a news classifier.
Reference: this article.
- First, read the text out of every file, cut it into words, store the word lists, and return a list of words sorted from highest to lowest frequency:
import os
import random

import jieba
from sklearn.naive_bayes import MultinomialNB
from matplotlib import pyplot as plt


def TextProcess(folder_path, test_size):
    """Walk the news folders and cut every article into words.
    Params:
        folder_path: directory that holds one sub-folder per class
        test_size: fraction of the data reserved for the test set
    Return:
        word list sorted by frequency (high to low), training data and labels, test data and labels
    """
    file_list = os.listdir(folder_path)               # all sub-folder names (one per class)
    data_list = []                                    # holds the cut articles
    class_list = []                                   # holds the class label of each article
    for each in file_list:                            # iterate over the class folders
        new_folder = os.path.join(folder_path, each)  # path of this class folder
        files = os.listdir(new_folder)                # all file names inside it
        j = 1                                         # file counter
        for file in files:                            # iterate over the files
            if j > 100:                               # read at most 100 files per class
                break
            with open(os.path.join(new_folder, file), 'r', encoding='utf-8') as f:
                raw = f.read()                        # read the whole article
            word_cut = jieba.cut(raw, cut_all=False)  # cut it with jieba (precise mode)
            word_list = list(word_cut)                # turn the generator into a list
            data_list.append(word_list)
            class_list.append(each)
            j += 1
    all_data = list(zip(data_list, class_list))       # pair every article with its label
    random.shuffle(all_data)                          # shuffle to keep the split random
    index = int(len(all_data) * test_size) + 1        # index where train and test are split
    train_set = all_data[index:]
    test_set = all_data[:index]
    train_data, train_class = zip(*train_set)         # unzip back into data and labels
    test_data, test_class = zip(*test_set)
    all_word_dict = {}                                # word -> frequency in the training data
    for word_list in train_data:
        for word in word_list:
            if word in all_word_dict:
                all_word_dict[word] += 1
            else:
                all_word_dict[word] = 1
    all_word_tuple_list = sorted(all_word_dict.items(), key=lambda f: f[1], reverse=True)  # sort by frequency, descending
    all_word_list, all_word_nums = zip(*all_word_tuple_list)  # unzip
    all_word_list = list(all_word_list)
    return all_word_list, train_data, train_class, test_data, test_class


if __name__ == '__main__':
    folder_path = 'E:/Data/Sample'
    test_size = 0.2
    all_word_list, train_data, train_class, test_data, test_class = TextProcess(folder_path, test_size)
    print(all_word_list)
The output looks like this:
As the output shows, many unimportant words appear with very high frequency, so we need to write a function to strip these words out.
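Note that TextProcess above expects a specific folder layout: every sub-folder of folder_path is named after a class and contains up to 100 UTF-8 text files of that class. A minimal sketch of building such a layout for a quick test (the toy_sample folder, the class names and the article texts below are made up for illustration, not part of the original data):

import os

# Hypothetical toy layout: toy_sample/<class name>/<article>.txt
toy_docs = {
    'sports':  ['比赛 昨晚 结束 主队 获胜'],
    'finance': ['股市 今日 上涨 成交量 放大'],
}
for label, docs in toy_docs.items():
    os.makedirs(os.path.join('toy_sample', label), exist_ok=True)
    for i, text in enumerate(docs):
        with open(os.path.join('toy_sample', label, f'{i}.txt'), 'w', encoding='utf-8') as f:
            f.write(text)
# TextProcess('toy_sample', 0.2) would then walk these two class folders.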
- Extract the feature words:

def WordsMaker(file_name):
    """Turn the removable (stop) words into a set"""
    Words_set = set()
    with open(file_name, 'r', encoding='utf-8') as f:
        for each_line in f.readlines():
            each = each_line.strip()
            if len(each) > 0:            # keep the line only if it is non-empty
                Words_set.add(each)
    return Words_set


def words_dict(all_word_list, deleteN, stop_set=set()):
    """Pick the feature words
    Params:
        all_word_list: words sorted by frequency
        deleteN: how many of the most frequent words to drop
        stop_set: set of irrelevant (stop) words
    """
    features = []
    n = 1
    for each_word in range(deleteN, len(all_word_list), 1):
        if n > 1000:                     # keeping 1000 feature words is enough
            break
        if not all_word_list[each_word].isdigit() and all_word_list[each_word] not in stop_set and 1 < len(all_word_list[each_word]) < 5:
            # keep the word if it is not a number, not a stop word, and its length is between 2 and 4
            features.append(all_word_list[each_word])
            n += 1
    return features


if __name__ == '__main__':
    file_name = 'E:/Data/stopwords_cn.txt'
    Word_set = WordsMaker(file_name)
    feature_list = words_dict(all_word_list, 100, Word_set)
    print(feature_list)
Result:
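WordsMaker assumes the stop-word file stores one word per line; E:/Data/stopwords_cn.txt is just a local copy of a common Chinese stop-word list. A small sketch for trying it out (the file name and the five stop words below are made up for illustration):

# Hypothetical: write a tiny stop-word file (one word per line) and load it back.
sample_stopwords = ['的', '了', '在', '是', '我们']   # made-up sample, not the real list
with open('stopwords_demo.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(sample_stopwords))

demo_set = WordsMaker('stopwords_demo.txt')
print(demo_set)   # e.g. {'的', '了', '在', '是', '我们'} (set order may vary)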
- Vectorize the documents according to feature_list:

def TextFeatures(train_data_list, test_data_list, feature_words):
    def text_feature(text, feature_words):
        text_words = set(text)
        # 1 if the feature word appears in this document, otherwise 0
        features = [1 if word in text_words else 0 for word in feature_words]
        return features
    train_feature_list = [text_feature(text, feature_words) for text in train_data_list]
    test_feature_list = [text_feature(text, feature_words) for text in test_data_list]
    return train_feature_list, test_feature_list
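To see what TextFeatures actually returns, here is a tiny made-up example: each document becomes a 0/1 vector whose i-th entry records whether the i-th feature word occurs in that document (the documents and feature words below are invented for illustration):

# Hypothetical toy input to illustrate the 0/1 presence vectors.
feature_words = ['股市', '比赛', '上涨']
train_docs = [['股市', '今日', '上涨'], ['比赛', '结束']]
test_docs = [['股市', '比赛']]

train_vec, test_vec = TextFeatures(train_docs, test_docs, feature_words)
print(train_vec)   # [[1, 0, 1], [0, 1, 0]]
print(test_vec)    # [[1, 1, 0]]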
- Build the naive Bayes classifier:

def Classify(train_feature_list, test_feature_list, train_class_list, test_class_list):
    classifier = MultinomialNB().fit(train_feature_list, train_class_list)   # fit a multinomial naive Bayes model
    test_accuracy = classifier.score(test_feature_list, test_class_list)     # accuracy on the test set
    return test_accuracy
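Classify only reports the test accuracy. If you also want to label a brand-new article, one possible extension (a sketch, not part of the original code; the function name and arguments are hypothetical) is to keep the fitted model and call its predict method on a vectorized document:

from sklearn.naive_bayes import MultinomialNB

# Sketch: reuse a fitted model to label one new, already-vectorized article.
def ClassifyAndPredict(train_feature_list, train_class_list, new_feature_vector):
    clf = MultinomialNB().fit(train_feature_list, train_class_list)
    return clf.predict([new_feature_vector])[0]   # predicted class label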
- Find the best number of high-frequency words to delete:

if __name__ == '__main__':
    folder_path = 'E:/Data/Sample'
    all_word_list, train_data, train_class, test_data, test_class = TextProcess(folder_path, 0.2)
    file_path = 'E:/Data/stopwords_cn.txt'
    stopset = WordsMaker(file_path)
    test_accuracy = []
    deleteNs = range(0, 1000, 20)
    for deleteN in deleteNs:
        feature_list = words_dict(all_word_list, deleteN, stopset)
        train_feature_list, test_feature_list = TextFeatures(train_data, test_data, feature_list)
        accuracy = Classify(train_feature_list, test_feature_list, train_class, test_class)
        test_accuracy.append(accuracy)
    plt.plot(deleteNs, test_accuracy)
    plt.show()
Deleting around 450 high-frequency words looks like a good choice.
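Instead of reading the best value off the plot by eye, it can also be picked programmatically from the deleteNs and test_accuracy lists built in the loop above (a small sketch):

# Sketch: pick the deleteN with the highest test accuracy from the sweep above.
best_deleteN, best_acc = max(zip(deleteNs, test_accuracy), key=lambda pair: pair[1])
print(best_deleteN, best_acc)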
if __name__ == '__main__':
    folder_path = 'E:/Data/Sample'
    all_word_list, train_data, train_class, test_data, test_class = TextProcess(folder_path, 0.2)
    file_path = 'E:/Data/stopwords_cn.txt'
    stopset = WordsMaker(file_path)
    test_accuracy = []
    feature_list = words_dict(all_word_list, 450, stopset)
    train_feature_list, test_feature_list = TextFeatures(train_data, test_data, feature_list)
    accuracy = Classify(train_feature_list, test_feature_list, train_class, test_class)
    test_accuracy.append(accuracy)
    ave = lambda c: sum(c) / len(c)
    print(ave(test_accuracy))

Result: