import gc
import time
from collections import defaultdict

import numpy as np
import tqdm
from gensim import corpora, models, similarities

from sentence import Sentence


class SentenceSimilarity:
    """Keep a list of ``Sentence`` objects built from raw sentences.

    NOTE(review): ``seg`` is presumably a word segmenter consumed by
    ``Sentence`` -- confirm against the ``sentence`` module.
    """

    def __init__(self, seg):
        """Store ``seg``; it is passed to every ``Sentence`` created later."""
        self.seg = seg

    def set_sentences(self, sentences):
        """Wrap every item of ``sentences`` in a ``Sentence`` and store them.

        Each ``Sentence`` is constructed from the raw item, ``self.seg`` and
        the item's position in ``sentences``.  The new list replaces
        ``self.sentences`` and its length is recorded in
        ``self.sentences_num``.
        """
        self.sentences = [
            Sentence(raw, self.seg, position)
            for position, raw in enumerate(sentences)
        ]
        self.sentences_num = len(self.sentences)

    # Get the sentences after word segmentation.
def get_cuted_sentences(self):
    """Return the segmented form of every stored sentence.

    Calls ``get_cuted_sentence()`` on each element of ``self.sentences`` and
    collects the results, in order, in a newly created list.
    """
    return [item.get_cuted_sentence() for item in self.sentences]

# Simple model that has to be built before the other, more complex models.
def simple_model(self, min_frequency = 1):
self.texts=self.get_cuted_sentences()#删除低频词
frequency =defaultdict(int)for text inself.texts:for token intext:
frequency[token]+= 1self.texts= [[token for token in text if frequency[token] > min_freq