6-24 根据队友找的提取关键词的模型,处理数据集+定义求杰卡德相似度的函数
创新实训记录10
处理数据集
根据队友找到的模型(这里只采用了一个无监督的SingleRank模型,没有进行模型融合;如果想要优化,可以从关键词提取这一步入手),处理sentence_textcnn_deal2.json中每个子空间上的句子,提取出每篇论文在该子空间上的关键词。处理结果是生成五个npy文件,分别对应五个子空间,每个文件的内容是一个字典:字典的key是论文id,value是一个集合,包含该论文在该子空间上的所有关键词。
def test_unsupervised_run(num):
    """Run one keyphrase-extraction model on the module-level ``test_file``.

    ``num`` is an index into the model list built below (for example, index 1
    selects ``SingleRank``). The text to analyse is not a parameter: it is
    read from the global name ``test_file``, which the caller must set before
    calling this function.

    Returns the keyphrases from the extractor's ``get_n_best(n=10)`` result,
    with the accompanying scores dropped.
    """
    def run_model(model_cls, document):
        # Extractor pipeline: load the text, select candidates, weight them,
        # then keep only the phrase from each (phrase, score) pair.
        extractor = model_cls()
        extractor.load_document(document)
        extractor.candidate_selection()
        extractor.candidate_weighting()
        best = extractor.get_n_best(n=10)
        return [phrase for phrase, _score in best]

    # NOTE(review): these classes are presumably the unsupervised models of
    # the pke package; they must already be imported at module level.
    available_models = [
        TopicRank,
        SingleRank,
        MultipartiteRank,
        PositionRank,
        TopicalPageRank,
        ExpandRank,
        TextRank,
        TfIdf,
        KPMiner,
        YAKE,
        FirstPhrases,
    ]
    chosen = available_models[num]
    print("testing {}".format(chosen))
    return run_model(chosen, test_file)
if __name__ == '__main__':
    # One dictionary per subspace: entry i maps paper_id -> set of keywords
    # collected from the sentences whose TextCNN label is i. Labels other
    # than 0-3 all go to the last dictionary (index 4).
    subspace_keywords = [{} for _ in range(5)]

    # Each line of the input file is one JSON record describing one sentence.
    with open('sentence_textcnn_deal2.json', 'r') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at the end).
                continue
            record = json.loads(line)
            paper_id = record['paper_id']
            label = record['TextCNN_label']
            content = record['content'].strip()
            if not content:
                continue

            # test_unsupervised_run() reads its input from the module-level
            # name test_file, so this must stay a global assignment.
            test_file = content
            keyphrases = test_unsupervised_run(1)  # index 1 == SingleRank

            # Split every keyphrase into single words and pool them.
            keywords = set()
            for phrase in keyphrases:
                keywords.update(phrase.split())

            # Merge into the paper's existing keyword set. update() is
            # required here: set.add(keywords) would try to insert the set
            # itself as one element and raise TypeError (sets are unhashable).
            subspace = int(label) if label in (0, 1, 2, 3) else 4
            subspace_keywords[subspace].setdefault(paper_id, set()).update(keywords)

    # Write one .npy file per subspace (each dict is pickled by numpy).
    for subspace, keyword_dict in enumerate(subspace_keywords):
        np.save('sub{}_keyword.npy'.format(subspace), keyword_dict)
    print('数据已写入')

    # Sanity check: reload subspace 0 and show the keywords of paper 102.
    # This raises KeyError if paper 102 has no sentence in subspace 0.
    sample = np.load('sub0_keyword.npy', allow_pickle=True).item()
    print(sample[102])
最后生成了对应五个子空间的文件。
数据格式如下:
定义求杰卡德相似度的函数
定义求杰卡德相似度的函数,参数有三个,分别是id1、id2和subid。调用该函数,返回两篇论文在指定子空间上关键词集合的杰卡德相似度;若任一论文在该子空间上没有关键词,则返回0。
import numpy as np
# Jaccard similarity helper; id1, id2 and subid are all expected to be ints.
def getJac1(id1, id2, subid):
    """Return the Jaccard similarity of two papers' keywords in one subspace.

    The keyword dictionary is loaded from ``sub<subid>_keyword.npy`` (path
    relative to the current working directory); it maps a paper id to that
    paper's collection of keywords.

    Args:
        id1: Id of the first paper.
        id2: Id of the second paper.
        subid: Subspace number used to build the file name.

    Returns:
        float: ``|A & B| / |A | B|`` for the two keyword sets. ``0.0`` is
        returned when either paper is missing from the dictionary or when
        both keyword sets are empty.
    """
    filename = 'sub' + str(subid) + '_keyword.npy'
    # NOTE: allow_pickle=True unpickles the file contents; only load files
    # that were generated by this project.
    key_dict = np.load(filename, allow_pickle=True).item()

    if id1 not in key_dict or id2 not in key_dict:
        return 0.0

    keywords1 = set(key_dict[id1])
    keywords2 = set(key_dict[id2])
    union_size = len(keywords1 | keywords2)
    if union_size == 0:
        # Both sets are empty: define the similarity as 0 instead of
        # dividing by zero.
        return 0.0
    return len(keywords1 & keywords2) / union_size