import pandas as pd
import numpy as np
import functools
from gensim import corpora, models
from jieba import analyse
import jieba.posseg as posseg
import jieba
import gensim
import warnings
import math
import os
warnings.filterwarnings('ignore')
# A text-preprocessing class: word segmentation, stop-word removal, and so on
class Preprocessing_NLP(object):
    def __init__(self, stop_path, sentence):
        # stop_path: path to the stop-word file
        # sentence: the text to preprocess, or the path to a text file
        self.stop_path = stop_path
        if os.path.isfile(sentence):  # check whether `sentence` is a file path
            self.sentences = ''.join([i.replace('\n', '') for i in open(sentence, encoding='utf8').readlines()])
        else:
            self.sentences = sentence

    # Load the stop-word list
    def get_stop_word(self):
        stop_list = open(file=self.stop_path, encoding='utf8').readlines()
        # Strip the newline characters
        return [i.replace('\n', '') for i in stop_list]

    # Segment the text
    def seg_to_list(self, pos=False):
        if not pos:
            # Segment without part-of-speech tagging
            seg_list = jieba.cut(self.sentences)
        else:
            # Segment with part-of-speech tagging
            seg_list = posseg.cut(self.sentences)
        return list(seg_list)

    # Remove stop words
    def word_filter(self, pos=False):
        stop_list = self.get_stop_word()
        filter_list = []
        # The pos flag decides whether part-of-speech information is available.
        # Without POS tagging, every word is tagged 'n' so that all words are kept.
        for seg in self.seg_to_list(pos):
            if not pos:
                word = seg
                flag = 'n'
            else:
                word = seg.word  # posseg pairs expose .word / .flag as attributes, not methods
                flag = seg.flag
            # Drop words in the stop list and words shorter than two characters
            if word not in stop_list and len(word) > 1:
                filter_list.append(word)
        return filter_list

    # Load the data: segment it and remove stop words
    def load_data(self, pos=False):
        doc_list = []
        filter_list = self.word_filter(pos)
        doc_list += filter_list
        return doc_list
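A minimal usage sketch of the class above; the stop-word file name and the input text are hypothetical placeholders, shown only to illustrate the call pattern:

# Hypothetical stop-word file and sample text
prep = Preprocessing_NLP(stop_path='stopwords.txt', sentence='这是一段需要提取关键词的示例文本')
word_list = prep.load_data()  # segmented words with stop words removed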
2. The TF-IDF class
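The class below implements the standard smoothed variant of the weighting scheme. For a word w in the target text, with N the total number of documents in doc_list and df(w) the number of documents containing w:

tf(w) = count(w) / len(word_list)
idf(w) = log(N / (df(w) + 1))
tf-idf(w) = tf(w) * idf(w)

The +1 in the denominator smooths the idf so it never divides by zero; a word unseen in the corpus falls back to a default idf of log(N).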
import math
class Tfidf(object):
    # Three parameters: the document list, the preprocessed target text, and the number of keywords
    def __init__(self, doc_list, word_list, keyword_num):
        self.doc_list = doc_list
        self.word_list = word_list
        self.keyword_num = keyword_num
        self.tf_dic = self.get_tf_dic()

    # Compute the tf values
    def get_tf_dic(self):
        tf_dic = {}
        for word in self.word_list:
            tf_dic[word] = tf_dic.get(word, 0.0) + 1.0
        word_count = len(self.word_list)
        for k, v in tf_dic.items():
            tf_dic[k] = float(v) / word_count
        return tf_dic

    # Compute the idf values
    def get_idf_dic(self):
        idf_dic = dict()
        # Total number of documents
        doc_count = len(self.doc_list)
        # Count how many documents each word appears in; the set deduplicates within a document
        for doc in self.doc_list:
            for word in set(doc):
                idf_dic[word] = idf_dic.get(word, 0.0) + 1.0
        # Apply the idf formula; the +1 in the denominator is smoothing, so it is never zero
        for k, v in idf_dic.items():
            idf_dic[k] = math.log(doc_count / (v + 1.0))
        # A word absent from the dictionary is assumed to occur in one document by default
        default_idf = math.log(doc_count / 1.0)
        return idf_dic, default_idf

    # Compute the tf-idf values
    def get_tfidf(self):
        tfidf_dic = dict()
        idf_dic, default_idf = self.get_idf_dic()  # compute once instead of per word
        for word in self.word_list:
            tf = self.tf_dic.get(word, 0)
            idf = idf_dic.get(word, default_idf)
            tfidf_dic[word] = tf * idf
        # Sort the (word, score) pairs; after sorting, the dict becomes a list
        ordered_tfidf_dic = sorted(tfidf_dic.items(), key=lambda x: x[1],
                                   reverse=True)[:self.keyword_num]
        return ordered_tfidf_dic
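A toy run of the class above; the two-document corpus is made up purely for illustration:

# Hypothetical toy corpus: a list of tokenized documents
doc_list = [['自然', '语言', '处理', '关键词'], ['语言', '模型', '训练']]
word_list = ['自然', '语言', '处理', '关键词']  # the preprocessed target text
tfidf = Tfidf(doc_list, word_list, keyword_num=2)
print(tfidf.get_tfidf())  # the top-2 (word, score) pairs, highest tf-idf first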
3. The topic model class
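The idea of this extractor: train an LSI or LDA model over the corpus, compute a topic distribution for every word and for the whole document, then rank words by the cosine similarity between the two distributions and keep the top keyword_num. For topic vectors u and v, the calsim helper below computes

cos(u, v) = (Σ u_i · v_i) / sqrt(Σ u_i² · Σ v_i²)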
class TopicModel(object):
    def __init__(self, doc_list, keyword_num, model='LSI', num_topics=4):
        # Use the gensim interface to turn the text into vectors; build the word space first
        self.dictionary = corpora.Dictionary([doc_list])  # build the dictionary
        # Vectorize against the dictionary: each word becomes a one-word bag-of-words document
        corpus = [self.dictionary.doc2bow([doc]) for doc in doc_list]
        # Compute tf-idf for each word to get a weighted vector representation
        self.tfidf_model = models.TfidfModel(corpus)
        self.tfidf_corpus = self.tfidf_model[corpus]
        # Initialize the number of keywords and topics
        self.keyword_num = keyword_num
        self.num_topics = num_topics
        # Choose which topic model to load
        if model == 'LSI':
            self.model = self.train_lsi()
        else:
            self.model = self.train_lda()
        # Get the topic-word distribution of the dataset
        word_dic = self.word_dictionary(doc_list)
        self.wordtopic_dic = self.get_wordtopic(word_dic)

    # Vectorization without the gensim interface (kept for reference):
    # def doc2bowvec(self, word_list):
    #     vec_list = [1 if word in word_list else 0 for word in self.dictionary]
    #     return vec_list

    # Word-space construction: the usual approach when no gensim interface is available
    def word_dictionary(self, doc_list):
        dictionary = []
        for doc in doc_list:
            # Mind the difference between extend and append here; it is easy to get wrong
            dictionary.append(doc)
        dictionary = list(set(dictionary))
        return dictionary

    # Get the topic distribution of every word in the dataset
    def get_wordtopic(self, word_dic):
        wordtopic_dic = {}
        for word in word_dic:
            singlist = [word]
            # Weight the single word with the tf-idf model
            word_corpus = self.tfidf_model[self.dictionary.doc2bow(singlist)]
            # Compute the word's topic vector
            word_topic = self.model[word_corpus]
            wordtopic_dic[word] = word_topic
        return wordtopic_dic

    # Topic model training
    def train_lsi(self):
        lsi = models.LsiModel(self.tfidf_corpus, id2word=self.dictionary,
                              num_topics=self.num_topics)
        return lsi

    def train_lda(self):
        lda = models.LdaModel(self.tfidf_corpus, id2word=self.dictionary,
                              num_topics=self.num_topics)
        return lda

    # Compute the similarity between each word's topic distribution and the document's,
    # and take the keyword_num most similar words as keywords
    def get_simword(self, word_list):
        # Weighted vector of the document
        sentcorpus = self.tfidf_model[self.dictionary.doc2bow(word_list)]
        # Topic vector of the document,
        # e.g. [(0, 0.03457821), (1, 0.034260772), (2, 0.8970413), (3, 0.034119748)]
        senttopic = self.model[sentcorpus]

        # Cosine similarity; in l1 and l2 each pair is (topic index, value)
        def calsim(l1, l2):
            a, b, c = 0.0, 0.0, 0.0
            for t1, t2 in zip(l1, l2):
                x1 = t1[1]
                x2 = t2[1]
                a += x1 * x2
                b += x1 * x1
                c += x2 * x2
            sim = a / math.sqrt(b * c) if not (b * c) == 0.0 else 0.0
            return sim

        # Similarity between the input text and every word's topic distribution
        sim_dic = {}
        for k, v in self.wordtopic_dic.items():
            # Only score words that actually occur in the document
            if k not in word_list:
                continue
            sim = calsim(v, senttopic)
            sim_dic[k] = sim
        # Sort the (word, similarity) pairs; sorting the dict directly would rank keys only
        ordered_sim_dic = sorted(sim_dic.items(), key=lambda x: x[1], reverse=True)
        return ordered_sim_dic[:self.keyword_num]
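An end-to-end sketch tying the three classes together. The file names are hypothetical, and with a single input document the idf statistics are degenerate; a real run would build doc_list from a multi-document corpus:

if __name__ == '__main__':
    # Hypothetical stop-word file and news article
    prep = Preprocessing_NLP(stop_path='stopwords.txt', sentence='news.txt')
    word_list = prep.load_data()
    # TF-IDF keywords: here the single document also serves as the corpus
    print(Tfidf([word_list], word_list, keyword_num=10).get_tfidf())
    # Topic-model keywords via LSI (pass model='LDA' for LDA)
    topic_model = TopicModel(word_list, keyword_num=10, model='LSI')
    print(topic_model.get_simword(word_list))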