一、数据介绍
本项目的主要任务是利用抓取的医疗语料库对四种文本相似度计算方法进行实验。本项目初始文件有两个,分别是
- file_corpus.txt:医疗文本
- medfw.txt:词文件,包含单词、词频与词性
二、代码介绍
train_word2vec.py
:利用word2vec对file_corpus.txt进行词向量训练,生成voc.txt词向量文件
import os
import logging
import re
import time
import codecs
import jieba
from gensim.models import word2vec
file_corpus = '../data/file_corpus.txt' # raw medical corpus, one text per line
file_userdict = '../data/medfw.txt' # custom jieba dictionary (word, frequency, POS)
file_voc = '../data/voc.txt' # output: word vectors in word2vec text format
# Regex that captures "sentence-like" runs of text: consecutive CJK ideographs
# (U+4E00..U+9FD5), ASCII letters/digits and a few symbols (+ # & . _ %).
# Because the pattern is wrapped in a capture group, re_han.split() returns
# those runs interleaved with the separator fragments.
re_han = re.compile(u"([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)")
class MySentences(object):
    """Iterable of tokenized sentences for gensim Word2Vec.

    Streams the corpus file line by line, keeps only the sentence-like
    runs matched by ``re_han``, segments them with jieba (augmented by a
    user dictionary) and yields one token list per corpus line.
    """

    def __init__(self, file_corpus, file_userdict):
        self.file_corpus = file_corpus
        # Register the custom dictionary so jieba recognizes domain terms
        # (e.g. medical vocabulary) missing from its built-in lexicon.
        jieba.load_userdict(file_userdict)

    def __iter__(self):
        """Yield the jieba token list of every corpus line."""
        with codecs.open(self.file_corpus, 'r', encoding='utf-8') as fin:
            for raw_line in fin:
                tokens = []
                # re_han.split returns captured text runs interleaved with
                # separators; only the runs themselves match re_han again.
                for piece in re_han.split(raw_line.strip()):
                    if re_han.match(piece):
                        tokens.extend(jieba.lcut(piece))
                yield tokens
if __name__ == '__main__':
    started = time.time()
    # MySentences is a lazy iterable: lines are tokenized on demand, so the
    # corpus is never held in memory all at once.
    sentences = MySentences(file_corpus, file_userdict)
    # Surface gensim's training progress on stdout.
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    # Train 50-dimensional vectors with a 5-token window; min_count=1 keeps
    # every word so the idf step later sees the full vocabulary.
    model = word2vec.Word2Vec(sentences, vector_size=50, window=5,
                              min_count=1, workers=6)
    # Persist the vectors in word2vec's plain-text format for the other scripts.
    model.wv.save_word2vec_format(file_voc, binary=False)
    print('-------------------------------------------')
    print("Training word2vec model cost %.3f seconds...\n" % (time.time() - started))
voc.txt词向量文件如图所示:
load_data.py
:加载数据文件
import codecs
import numpy as np
import json
def load_voc(file_voc):
    """Load a word2vec text-format vector file.

    :param file_voc: path to the vector file; the first line is the header
        ``"<vocab_size> <vector_dim>"`` and every following line is
        ``"<word> <v1> <v2> ..."``.
    :return: dict mapping each word to its ``numpy`` float32 vector.
    """
    embedding = dict()
    # Context manager guarantees the handle is released; the original
    # implementation opened the file and never closed it.
    with codecs.open(file_voc, 'r', encoding='utf-8') as vector_file:
        vector_file.readline()  # skip the "<size> <dim>" header line
        for line in vector_file:
            items = line.split(' ')
            # items[0] is the word; the remaining fields parse as float32
            # (numpy's parser tolerates the trailing newline).
            embedding[items[0]] = np.array(items[1:], dtype='float32')
    return embedding
def load_idf(file_idf):
    """Read the idf JSON file and return it as a dict (word -> idf value).

    :param file_idf: path to the JSON file written by compute_idf.py.
    :return: dict parsed from the file.
    """
    with codecs.open(file_idf, 'r', encoding='utf-8') as fh:
        return json.load(fh)
compute_idf.py
:对训练出来的词,计算其在语料库中idf,生成idf.txt文件
from __future__ import absolute_import
import multiprocessing
import codecs
import math
import json
import time
from load_data import *
file_corpus = '../data/file_corpus.txt' # corpus file (one document per line)
file_voc = '../data/voc.txt' # word2vec vector file produced by train_word2vec.py
file_idf = '../data/idf.txt' # output: idf values serialized as JSON
class ComIdf(object):
    """Compute an idf value for every word of the vocabulary over a corpus.

    ``parts()`` fans the per-word computation out over a process pool and
    writes the result to ``file_idf`` as a JSON object (word -> idf).
    """

    def __init__(self, file_corpus, file_voc, file_idf):
        self.file_corpus = file_corpus
        self.file_voc = file_voc
        self.file_idf = file_idf
        self.voc = load_voc(self.file_voc)      # word -> vector dict
        self.corpus_data = self.load_corpus()   # every corpus line
        self.N = len(self.corpus_data)          # document count

    def load_corpus(self):
        """Return all lines of the corpus file as a list.

        Uses a context manager; the original left the file handle open.
        """
        with codecs.open(self.file_corpus, 'r', encoding='utf-8') as f:
            return f.readlines()

    def com_idf(self, word):
        """Return ``{word: idf}`` for a single word.

        ``n`` counts every substring occurrence of *word* across all lines
        (note: occurrence count, not document frequency).  A word that never
        occurs gets idf 0.0 — the original raised ZeroDivisionError there.
        """
        n = sum(line.count(word) for line in self.corpus_data)
        idf = math.log(1.0 * self.N / n + 1) if n else 0.0
        return {word: idf}

    def parts(self):
        """Compute idf for the whole vocabulary in parallel and save JSON."""
        words = set(self.voc.keys())
        multiprocessing.freeze_support()  # no-op unless frozen on Windows
        # Leave a couple of cores free, but never request fewer than one
        # process (cpu_count() - 2 could be <= 0 on small machines).
        processes = max(1, multiprocessing.cpu_count() - 2)
        # The with-block closes and joins the pool; the original leaked it.
        with multiprocessing.Pool(processes=processes) as pool:
            results = pool.map(self.com_idf, words)
        idf_dict = dict()
        for item in results:
            k = list(item.keys())[0]
            v = list(item.values())[0]
            idf_dict[k] = idf_dict.get(k, 0) + v
        with codecs.open(self.file_idf, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps the Chinese words human-readable.
            f.write(json.dumps(idf_dict, ensure_ascii=False, indent=2,
                               sort_keys=False))
if __name__ == '__main__':
    started = time.time()
    # Build the helper (this loads the vectors and the corpus), then
    # compute the idf of every word and write idf.txt.
    idf_runner = ComIdf(file_corpus, file_voc, file_idf)
    idf_runner.parts()
    print('-------------------------------------------')
    print("Computing idf of words cost %.3f seconds...\n" % (time.time() - started))
idf.txt文件如图所示:
get_sentence.py
:统计语料库中存在的句子,生成file_sentence.txt文件;考虑计算量问题,本实验只取了出现频率最高的前10000个句子
import codecs
import re

# Runs of CJK ideographs / ASCII alphanumerics plus a few symbols count as
# "sentence-like" text; everything else acts as a separator when splitting.
re_han = re.compile(u"([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)")


def get_sentence(file_corpus_path='../data/file_corpus.txt',
                 file_sentence_path='../data/file_sentence.txt',
                 top_n=10000):
    """Extract the most frequent sentence-like blocks from the corpus.

    Splits every corpus line with ``re_han``, counts how often each block
    longer than 10 characters occurs, and writes the *top_n* most frequent
    ones (one per line, most frequent first) to *file_sentence_path*.

    The paths and the cut-off are now parameters whose defaults preserve the
    original behavior, and both files are opened with ``with`` so they are
    closed even when an error occurs.
    """
    counts = dict()
    with codecs.open(file_corpus_path, 'r', encoding='utf-8') as fin:
        for line in fin:
            for blk in re_han.split(line.strip()):
                # Only the captured runs match re_han again; blocks of 10
                # characters or fewer are too generic to keep.
                if re_han.match(blk) and len(blk) > 10:
                    counts[blk] = counts.get(blk, 0) + 1
    # Rank sentences by frequency, most common first.
    ranked = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    with codecs.open(file_sentence_path, 'w', encoding='utf-8') as fout:
        for sentence, _ in ranked[:top_n]:
            fout.write(sentence + '\n')


if __name__ == '__main__':
    # Guarded so importing this module no longer triggers the extraction
    # (the original ran get_sentence() at import time).
    get_sentence()
file_sentence.txt文件如图所示:
similarity.py
:四种文本相似度计算方法的实现:cosine,cosine+idf,bm25,jaccard
# encoding:utf-8
from __future__ import absolute_import
import jieba
import time
from scipy import spatial
import numpy as np
from Utils.load_data import *
file_voc = './data/voc.txt' # word2vec vectors (paths here are relative to the project root)
file_idf = './data/idf.txt' # idf weights (JSON) produced by compute_idf.py
file_userdict = './data/medfw.txt' # custom jieba dictionary
class SSIM(object):
    """Sentence-similarity toolbox.

    Implements four measures: plain word2vec cosine, idf-weighted cosine,
    BM25 and character-level Jaccard, dispatched through ``ssim()``.
    """

    def __init__(self):
        t1 = time.time()
        self.voc = load_voc(file_voc)   # word -> vector
        print("Loading word2vec vector cost %.3f seconds...\n" % (time.time() - t1))
        t1 = time.time()
        self.idf = load_idf(file_idf)   # word -> idf weight
        print("Loading idf data cost %.3f seconds...\n" % (time.time() - t1))
        jieba.load_userdict(file_userdict)

    def _bag_cosine(self, vecs1, vecs2):
        """Cosine similarity of two bags of word vectors (each bag is summed).

        Returns 0.0 when either bag is empty or sums to the zero vector;
        the original code produced nan / raised in those degenerate cases
        (e.g. a sentence with no in-vocabulary word).
        """
        if not vecs1 or not vecs2:
            return 0.0
        v1 = np.array(vecs1).sum(axis=0)
        v2 = np.array(vecs2).sum(axis=0)
        if not np.any(v1) or not np.any(v2):
            return 0.0
        return 1 - spatial.distance.cosine(v1, v2)

    def M_cosine(self, s1, s2):
        """Unweighted word2vec cosine similarity of two sentences."""
        v1 = [self.voc[w] for w in jieba.lcut(s1) if w in self.voc]
        v2 = [self.voc[w] for w in jieba.lcut(s2) if w in self.voc]
        return self._bag_cosine(v1, v2)

    def M_idf(self, s1, s2):
        """Idf-weighted word2vec cosine similarity of two sentences."""
        # Words missing from the idf table fall back to a neutral weight of 1.
        v1 = [1.0 * self.idf.get(w, 1) * self.voc[w]
              for w in jieba.lcut(s1) if w in self.voc]
        v2 = [1.0 * self.idf.get(w, 1) * self.voc[w]
              for w in jieba.lcut(s2) if w in self.voc]
        return self._bag_cosine(v1, v2)

    def M_bm25(self, s1, s2, s_avg=10, k1=2.0, b=0.75):
        """BM25 score of query *s1* against document *s2*.

        :param s_avg: assumed average document length.
        :param k1: term-frequency saturation parameter.
        :param b: length-normalization parameter.

        Note: term frequency is measured with substring counts on the raw
        string *s2* (the denominator is always positive since k1 > 0).
        """
        bm25 = 0
        for w in jieba.lcut(s1):
            idf_s = self.idf.get(w, 1)
            tf_num = s2.count(w) * (k1 + 1)
            tf_den = s2.count(w) + k1 * (1 - b + b * len(s2) / s_avg)
            bm25 += idf_s * (tf_num / tf_den)
        return bm25

    def M_jaccard(self, s1, s2):
        """Character-level Jaccard similarity: |s1 ∩ s2| / |s1 ∪ s2|.

        Two empty strings are defined as identical (1.0) instead of
        raising ZeroDivisionError as the original did.
        """
        set1 = set(s1)
        set2 = set(s2)
        union = set1.union(set2)
        if not union:
            return 1.0
        return 1.0 * len(set1.intersection(set2)) / len(union)

    def ssim(self, s1, s2, model='cosine'):
        """Dispatch to one of the four measures.

        :param model: 'cosine' (default), 'idf', 'bm25' or 'jaccard';
            unknown values fall back to 'cosine'.
        """
        if model == 'idf':
            f_ssim = self.M_idf
        elif model == 'bm25':
            f_ssim = self.M_bm25
        elif model == 'jaccard':
            f_ssim = self.M_jaccard
        else:
            f_ssim = self.M_cosine
        return f_ssim(s1, s2)
# Module-level singleton: the vectors and idf table are loaded once at
# import time (this performs file I/O when `import similarity` runs).
sm = SSIM()
# Shorthand so callers can write similarity.ssim(s1, s2, model=...).
ssim = sm.ssim
test.py
:测试文件,对设定好的5个句子,按照不同的算法得出最相似的结果,生成test_result.txt文件
import codecs
import similarity
import json
import time
def test():
    """For each query sentence and each similarity method, find the most
    similar sentence in file_sentence.txt and append the result (plus
    timing) to test_result.txt as pretty-printed JSON.
    """
    test_data = [u'临床表现及实验室检查即可做出诊断',
                 u'面条汤等容易消化吸收的食物为佳',
                 u'每天应该摄入足够的维生素A',
                 u'视患者情况逐渐恢复日常活动',
                 u'术前1天开始预防性运用广谱抗生素']
    model_list = ['cosine', 'idf', 'bm25', 'jaccard']
    # with-statement closes the handle even on error (the original kept
    # it open for the whole run and only closed it at the end).
    with codecs.open('./data/file_sentence.txt', 'r', encoding='utf-8') as file_sentence:
        train_data = file_sentence.readlines()
    for model in model_list:
        t1 = time.time()
        dataset = dict()
        result = dict()
        for s1 in test_data:
            dataset[s1] = dict()
            # Score the query against every candidate sentence.
            for s2 in train_data:
                s2 = s2.strip()
                if s1 != s2:
                    sim = similarity.ssim(s1, s2, model=model)
                    dataset[s1][s2] = dataset[s1].get(s2, 0) + sim
        for r in dataset:
            # Keep only the single best (sentence, score) pair per query.
            top = sorted(dataset[r].items(), key=lambda x: x[1], reverse=True)
            result[r] = top[0]
        # Mode 'a': results of all four methods accumulate in one file.
        with codecs.open('./data/test_result.txt', 'a', encoding='utf-8') as f:
            f.write('--------------The result of %s method------------------\n ' % model)
            f.write('\tThe computing cost %.3f seconds\n' % (time.time() - t1))
            f.write(json.dumps(result, ensure_ascii=False, indent=2, sort_keys=False))
            f.write('\n\n')


if __name__ == '__main__':
    # Guarded so importing this module does not run the benchmark
    # (the original called test() at import time).
    test()
其结果如下:
--------------The result of cosine method------------------
The computing cost 24.719 seconds
{
“临床表现及实验室检查即可做出诊断”: [
“临床表现及实验室检查即可作出诊断”,
0.9985765838835549
],
“面条汤等容易消化吸收的食物为佳”: [
“面条等容易消化吸收的食物为佳”,
0.929288047242335
],
“每天应该摄入足够的维生素A”: [
“维生素和微量元素的饮食”,
0.8553203804391963
],
“视患者情况逐渐恢复日常活动”: [
“术后根据患者情况逐渐恢复日常活动”,
0.9533089675637009
],
“术前1天开始预防性运用广谱抗生素”: [
“术前1天开始口服甲硝唑和新霉素”,
0.8705011525311505
]
}
--------------The result of idf method------------------
The computing cost 28.208 seconds
{
“临床表现及实验室检查即可做出诊断”: [
“临床表现和实验室检查即可作出诊断”,
0.9906356955821258
],
“面条汤等容易消化吸收的食物为佳”: [
“面条等容易消化吸收的食物为佳”,
0.9607584778121556
],
“每天应该摄入足够的维生素A”: [
“人们每天必须摄入一定量蛋白质”,
0.9011069764513063
],
“视患者情况逐渐恢复日常活动”: [
“术后根据患者情况逐渐恢复日常活动”,
0.9750193238540082
],
“术前1天开始预防性运用广谱抗生素”: [
“术前3天开始局部应用抗生素点眼”,
0.91921473224723
]
}
--------------The result of bm25 method------------------
The computing cost 8.368 seconds
{
“临床表现及实验室检查即可做出诊断”: [
“临床表现及检查即可做出诊断”,
4.624040308455444
],
“面条汤等容易消化吸收的食物为佳”: [
“面条等容易消化吸收的食物为佳”,
14.400130547777492
],
“每天应该摄入足够的维生素A”: [
“医生应该具备足够的同情心”,
5.043998585649506
],
“视患者情况逐渐恢复日常活动”: [
“术后根据患者情况逐渐恢复日常活动”,
6.08201372756448
],
“术前1天开始预防性运用广谱抗生素”: [
“术前3天开始局部应用抗生素点眼”,
5.60157455543482
]
}
--------------The result of jaccard method------------------
The computing cost 0.409 seconds
{
“临床表现及实验室检查即可做出诊断”: [
“临床表现及实验室检查即可作出诊断”,
0.8823529411764706
],
“面条汤等容易消化吸收的食物为佳”: [
“面条等容易消化吸收的食物为佳”,
0.9333333333333333
],
“每天应该摄入足够的维生素A”: [
“医生应该具备足够的同情心”,
0.3157894736842105
],
“视患者情况逐渐恢复日常活动”: [
“术后根据患者情况逐渐恢复日常活动”,
0.7058823529411765
],
“术前1天开始预防性运用广谱抗生素”: [
“术前应用抗生素预防感染”,
0.42105263157894735
]
}