def _read_tokens(file_path):
    """Read one whitespace-pre-tokenized UTF-8 document and return its tokens.

    Each line of the file is stripped and split on single spaces; tokens from
    all lines are concatenated into one flat list.
    """
    tokens = []
    with open(file_path, 'r', encoding='utf-8') as fp:
        for ln in fp:
            tokens.extend(ln.strip().split(' '))
    return tokens


def get_traindata():
    """Build the training document lists, per-class label vectors, and vocabulary.

    Expects a directory layout ``F:/train_data/<class_folder>/<doc_file>`` where
    every document is already tokenized (space-separated tokens, UTF-8).

    For each class, all of its documents are concatenated into one pseudo-document;
    TF-IDF is computed over these class documents and, for every class, every term
    whose weight reaches the 6th-largest weight of that class is kept in the
    vocabulary. The vocabulary is also written to ``F:/myVocab.txt``.

    Returns:
        sum_list: list[list[str]] — token list of every individual document,
            in class-folder order.
        sum_dict: dict[str, list[int]] — for each class name, a 0/1 indicator
            vector over all documents (1 where the document belongs to the class).
        vocab_list: list[str] — the selected vocabulary terms.
    """
    folder_path = 'F:/train_data'  # hard-coded dataset root, kept for interface compatibility
    folder_list = os.listdir(folder_path)
    sum_list = []
    corpus = []
    sum_dict = {}
    print('正在生成训练集总文本向量...')
    # Pass 1: collect the token list of every document.
    for folder in folder_list:
        new_folder_path = folder_path + '/' + str(folder)
        for file in os.listdir(new_folder_path):
            sum_list.append(_read_tokens(new_folder_path + '/' + file))
    print(len(sum_list))
    print('生成完毕!')
    sum_num = len(sum_list)   # total number of documents
    past_num = 0              # documents consumed by previous classes
    print('正在生成词典...')
    # Pass 2: build one concatenated pseudo-document per class and its label vector.
    for folder in folder_list:
        new_folder_path = folder_path + '/' + str(folder)
        files = os.listdir(new_folder_path)
        train_num = len(files)
        class_tokens = []
        for file in files:
            class_tokens.extend(_read_tokens(new_folder_path + '/' + file))
        # NOTE: str() of the token list (brackets, quotes, commas included) is fed
        # to CountVectorizer, which re-tokenizes it with its own analyzer. Quirky,
        # but preserved from the original design.
        corpus.append(str(class_tokens))
        # 0/1 indicator over all documents: 1 for this class's contiguous slice.
        sum_dict[str(folder)] = (
            [0] * past_num + [1] * train_num + [0] * (sum_num - past_num - train_num)
        )
        past_num += train_num
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    # NOTE(review): get_feature_names() was removed in scikit-learn >= 1.2;
    # switch to get_feature_names_out() when upgrading.
    word = vectorizer.get_feature_names()
    weight = tfidf.toarray()
    vocab = set()
    for i in range(len(weight)):
        ranked = sorted(weight[i], reverse=True)  # avoid shadowing builtin `list`
        # Keep every term whose weight reaches the 6th-largest weight of this
        # class; guard against a feature space with fewer than 6 terms.
        threshold = ranked[min(5, len(ranked) - 1)]
        for j in range(len(word)):
            if weight[i][j] >= threshold:
                vocab.add(j)
    vocab_list = [word[i] for i in vocab]
    print('生成的词典为:%s' % str(vocab_list))
    # Explicit UTF-8 so Chinese terms don't raise UnicodeEncodeError under a
    # non-UTF-8 locale; context manager guarantees the file is closed.
    with open('F:/myVocab.txt', 'w', encoding='utf-8') as vocab_file:
        vocab_file.write(str(vocab_list))
    return sum_list, sum_dict, vocab_list  # document lists, class vectors, vocabulary
def createVocablist(dataSet): #去重复词创建词库
vocabSet=
手写朴素贝叶斯文本分类
最新推荐文章于 2023-11-13 15:41:59 发布
本文深入探讨了朴素贝叶斯算法在文本分类中的应用,详细介绍了如何从头开始手写朴素贝叶斯分类器。通过实例分析,解释了特征选择、概率估计和分类决策的过程,帮助读者理解其工作原理。
摘要由CSDN通过智能技术生成