def get_word_weight(weightfile="", weightpara=2.7e-4):
    """
    Get the weight of each word as weightpara / (weightpara + word_fre / sum_fre_words)
    :param weightfile: path to the word-frequency file (one "word frequency" pair per line)
    :param weightpara: the smoothing parameter a of SIF weighting
    :return: word2weight[word] = weight : a dict of word weights
    """
    if weightpara <= 0:  # when the parameter makes no sense, use unweighted
        weightpara = 1.0
    word2weight = {}
    word2fre = {}
    with open(weightfile, encoding='UTF-8') as f:
        lines = f.readlines()
    # sum_num_words = 0
    sum_fre_words = 0
    for line in lines:
        word_fre = line.split()
        # sum_num_words += 1
        if len(word_fre) >= 2:
            word2fre[word_fre[0]] = float(word_fre[1])
            sum_fre_words += float(word_fre[1])
        else:
            print(line)
    # Effect: the more frequent a word is, the lower its weight.
    # See: https://github.com/sunyilgdx/SIFRank_zh/issues/14
    for key, value in word2fre.items():
        word2weight[key] = weightpara / (weightpara + value / sum_fre_words)
        # word2weight[key] = 1.0  # method of RVA
    return word2weight
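To see the weighting in action, here is a small self-contained sketch (the toy frequency file and its contents are made up for illustration): it writes a tiny dict.txt-style file and confirms that high-frequency words receive lower SIF weights a / (a + p(w)).

import tempfile

# Hypothetical toy frequency file in the "word frequency" format expected above.
toy_freq = "的 900000\n天气 1200\n裤子 300\n"
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="UTF-8") as f:
    f.write(toy_freq)
    path = f.name

w2w = get_word_weight(weightfile=path, weightpara=2.7e-4)
# The very frequent "的" ends up with a much smaller weight than the rare "裤子",
# since weight(w) = a / (a + freq(w) / total_freq).
print(sorted(w2w.items(), key=lambda kv: kv[1]))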
class InputTextObj:
    """Represent the input text in which we want to extract keyphrases"""
    def __init__(self, zh_model, text=""):
        """
        :param zh_model: the pipeline of Chinese tokenization and POS-tagging
        :param text: the input text
        """
        self.considered_tags = {'n', 'np', 'ns', 'ni', 'nz', 'a', 'd', 'i', 'j', 'x', 'g'}
        self.tokens = []
        self.tokens_tagged = []
        # self.tokens = zh_model.cut(text)
        word_pos = zh_model.cut(text)
        self.tokens = [wp[0] for wp in word_pos]
        self.tokens_tagged = [(wp[0], wp[1]) for wp in word_pos]
        assert len(self.tokens) == len(self.tokens_tagged)
        for i, token in enumerate(self.tokens):
            # stop-word handling: re-tag stop words as "u" so the NP grammar won't pick them up
            if token.lower() in stopword_dict:
                self.tokens_tagged[i] = (token, "u")
            if token == '-':
                self.tokens_tagged[i] = (token, "-")
        self.keyphrase_candidate = extractor.extract_candidates(self.tokens_tagged, zh_model)
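As a usage sketch, assuming zh_model is a thulac tokenizer (as in SIFRank_zh) and that the module-level stopword_dict and extractor used inside the class are already loaded, construction looks roughly like this:

import thulac

# Assumption: the default thulac model is used here instead of the project's
# custom model path and user dictionary.
zh_model = thulac.thulac()

text_obj = InputTextObj(zh_model, "今天天气真好啊")
print(text_obj.tokens_tagged)        # e.g. [('今天', 't'), ('天气', 'n'), ('真', 'd'), ...]
print(text_obj.keyphrase_candidate)  # noun-phrase candidates with (start, end) spans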
def extract_candidates(tokens_tagged, no_subset=False):
    """
    Based on part of speech, return a list of candidate phrases.
    :param tokens_tagged: the (word, POS) pairs of the input text, see @InputTextObj
    :param no_subset: if true, won't keep a candidate that is a subset of another candidate
    :return keyphrase_candidate: list of candidate phrases: [tuple(string, tuple(start_index, end_index))]
    """
    np_parser = nltk.RegexpParser(GRAMMAR_zh)  # noun-phrase parser
    keyphrase_candidate = []
    np_pos_tag_tokens = np_parser.parse(tokens_tagged)
    count = 0
    for token in np_pos_tag_tokens:
        if isinstance(token, nltk.tree.Tree) and token._label == "NP":
            np = ''.join(word for word, tag in token.leaves())
            length = len(token.leaves())
            start_end = (count, count + length)
            count += length
            keyphrase_candidate.append((np, start_end))
        else:
            count += 1  # keep the running index correct for non-NP tokens
    return keyphrase_candidate
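The actual GRAMMAR_zh pattern is defined elsewhere in the repo; the sketch below uses an illustrative adjective/noun grammar (an assumption, not the project's exact rule) purely to show how nltk.RegexpParser groups tagged tokens into NP subtrees.

import nltk

# Illustrative grammar only: optional adjectives/nouns followed by a noun.
GRAMMAR_demo = "NP: {<n.*|a>*<n.*>}"
parser = nltk.RegexpParser(GRAMMAR_demo)

tagged = [('今天', 't'), ('天气', 'n'), ('真', 'd'), ('好', 'a'), ('啊', 'u')]
tree = parser.parse(tagged)
for subtree in tree.subtrees(filter=lambda t: t.label() == "NP"):
    print(''.join(w for w, tag in subtree.leaves()))   # -> 天气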
def get_sent_segmented(tokens):
    """
    Split the token list into segments at '.' and '。'.
    The splitting is a bit peculiar: a segment is only closed once it is at least 16 tokens long;
    otherwise no split happens.
    :param tokens:
    :return: list of token segments
    """
    min_seq_len = 16
    sents_sectioned = []
    if len(tokens) <= min_seq_len:
        sents_sectioned.append(tokens)
    else:
        position = 0
        for i, token in enumerate(tokens):
            if token == '.' or token == '。':
                if i - position >= min_seq_len:
                    sents_sectioned.append(tokens[position:i + 1])
                    position = i + 1
        if len(tokens[position:]) > 0:
            sents_sectioned.append(tokens[position:])
    return sents_sectioned
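A quick check of that behaviour on made-up token lists: short inputs come back as a single segment, and a '。' only closes a segment once at least 16 tokens have accumulated.

short = ['今', '天', '天气', '真', '好', '。', '出', '门', '吧']
print(get_sent_segmented(short))                    # one segment: too short to split

longer = ['词'] * 20 + ['。'] + ['词'] * 20 + ['。']
segments = get_sent_segmented(longer)
print(len(segments), [len(s) for s in segments])    # 2 [21, 21]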
def get_tokenized_words_embeddings(self, sents_tokened):
    """
    @see EmbeddingDistributor
    :param sents_tokened: list of tokenized sentences (each a list of word strings)
    :return: tensor with shape (len(sents), 3, max_len, embedding dimension)
    """
    max_len = max([len(sent) for sent in sents_tokened])
    elmo_embedding = self.elmo.sents2elmo(sents_tokened, output_layer=-2)
    # pad each sentence's (3, seq_len, 1024) embedding to (3, max_len, 1024) so they can be stacked
    elmo_embedding = [np.pad(emb, pad_width=((0, 0), (0, max_len - emb.shape[1]), (0, 0)), mode='constant')
                      for emb in elmo_embedding]
    elmo_embedding = torch.from_numpy(np.array(elmo_embedding))
    return elmo_embedding
output_layer: the target layer to output.
    0 for the word encoder
    1 for the first LSTM hidden layer
    2 for the second LSTM hidden layer
    -1 for an average of 3 layers (default)
    -2 for all 3 layers
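With output_layer=-2, sents2elmo returns one (3, seq_len, 1024) array per sentence, so sentences of different lengths must be padded before stacking. A minimal sketch of just that padding step, using dummy arrays so no ELMo model is needed:

import numpy as np
import torch

# Dummy stand-ins for sents2elmo(...) output: two sentences of length 4 and 7.
elmo_embedding = [np.random.rand(3, 4, 1024), np.random.rand(3, 7, 1024)]

max_len = max(emb.shape[1] for emb in elmo_embedding)
padded = [np.pad(emb, pad_width=((0, 0), (0, max_len - emb.shape[1]), (0, 0)), mode='constant')
          for emb in elmo_embedding]
batch = torch.from_numpy(np.array(padded))
print(batch.shape)   # torch.Size([2, 3, 7, 1024])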
""" Embeddings Alignment :param elmo_embeddings: The embeddings from elmo :param tokens_segmented: The list of tokens list <class 'list'>: [['今', '天', '天气', '真', '好', '啊'],['潮水', '退', '了', '就', '知道', '谁', '没', '穿', '裤子']] :return: """ token_emb_map = {} n = 0 for i in range(0, len(tokens_segmented)): # 一词多义,将相同词的emb append到一起 for j, token in enumerate(tokens_segmented[i]): # 获取第一层的embedding, 1 for the first LSTM hidden layer emb = elmo_embeddings[i, 1, j, :] if token notin token_emb_map: token_emb_map[token] = [emb] else: token_emb_map[token].append(emb) n += 1 # 求每个token的平均值 anchor_emb_map = {} for token, emb_list in token_emb_map.items(): average_emb = emb_list[0] for j in range(1, len(emb_list)): average_emb += emb_list[j] average_emb /= float(len(emb_list)) anchor_emb_map[token] = average_emb # 替换掉elmo最后一层的词对应的词向量 for i in range(0, elmo_embeddings.shape[0]): for j, token in enumerate(tokens_segmented[i]): emb = anchor_emb_map[token] elmo_embeddings[i, 2, j, :] = emb
def get_weight_list(word2weight_pretrain, word2weight_finetune, tokenized_sents, lamda, database=""):
    weight_list = []
    for word in tokenized_sents:
        word = word.lower()
    if word in word2weight:
        return word2weight[word]

    if word in stop_words:
        return 0.0

    if word in english_punctuations or word in chinese_punctuations:  # the OOV word is a punctuation mark
        return 0.0

    if method == "max_weight":  # return the max weight of the words in tokenized_sents
        max = 0.0
        for w in tokenized_sents:
            if w in word2weight and word2weight[w] > max:
                max = word2weight[w]
        return max
    return 0.0
In short, the weights looked up here are simply the per-word weights computed from dict.txt by get_word_weight above.
Next, get the per-layer average vector of the whole passage:
def get_weighted_average(tokenized_sents, sents_tokened_tagged, weight_list, embeddings_list, embeddings_type="elmo"):
    # weight_list = get_normalized_weight(weight_list)
    assert len(tokenized_sents) == len(weight_list)
    num_words = len(tokenized_sents)
    e_test_list = []
    if embeddings_type == "elmo" or embeddings_type == "elmo_sectioned":
        # assert num_words == embeddings_list.shape[1]
        sum = torch.zeros((3, 1024))
        for i in range(0, 3):
            for j in range(0, num_words):
                if sents_tokened_tagged[j][1] in considered_tags:
                    e_test = embeddings_list[i][j]
                    e_test_list.append(e_test)
                    sum[i] += e_test * weight_list[j]
            sum[i] = sum[i] / float(num_words)
        return sum
    else:
        print('Other embedding types are ignored here.')
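A dummy-data sketch of what this returns (considered_tags is defined locally here only as a stand-in for the repo's module-level tag set): one 3 x 1024 matrix per document, where row i is the weighted sum of layer i's kept-word vectors divided by the total number of words.

import torch

considered_tags = {'n', 'a'}                 # stand-in so the sketch runs without the repo's globals

tokens = ['天气', '真', '好']
tagged = [('天气', 'n'), ('真', 'd'), ('好', 'a')]
weights = [0.9, 0.2, 0.7]
emb = torch.rand(3, len(tokens), 1024)       # dummy per-layer token embeddings

avg = get_weighted_average(tokens, tagged, weights, emb, embeddings_type="elmo")
print(avg.shape)                             # torch.Size([3, 1024])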
    for phrase, dist_list in dist_all.items():
        sum_dist = 0.0
        for dist in dist_list:
            sum_dist += dist
        if phrase in stop_words:
            sum_dist = 0.0
        final_dist[phrase] = sum_dist / float(len(dist_list))
    return final_dist
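To make the aggregation concrete, a toy example with made-up distances: each candidate's final score is the mean of its occurrence-level distances, and stop-word candidates are forced to zero.

dist_all = {'天气': [0.82, 0.78], '裤子': [0.65], '就': [0.40]}
stop_words = {'就'}

final_dist = {}
for phrase, dist_list in dist_all.items():
    sum_dist = sum(dist_list)
    if phrase in stop_words:
        sum_dist = 0.0
    final_dist[phrase] = sum_dist / float(len(dist_list))

print(final_dist)   # '天气' averages to about 0.80, '裤子' stays 0.65, '就' is zeroed out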