上篇文章中笔者已经实现了对实体关系的抽取,形成了对应的三元组,这篇主要讲如何根据抽取的三元组去生成对应的医疗知识图谱。
前述部分
- 知识图谱的构建方式有很多。一种是基于Protege构建:它使用本体逐层构建,需要手动定义每一层的关系,最终结果保存为RDF或者OWL文件,具体的参考流程可以查看:Protege本体构建。另一种方法是使用工业界比较常用的Neo4j图数据库,它操作简单,分为桌面版和社区版,可以根据需要自行下载。笔者就是基于Neo4j实现数据的批量读取,最终生成医疗知识图谱。
1、数据的格式:
2、根据已知的数据去构建对应的医疗知识图谱
- 2.1构建实体类型
- 2.2构建实体关系
- 2.3 访问Neo4j图数据库接口[self.g = Graph("http://10.10.108.75:7474", username="neo4j", password="123456789")],这部分由开发者自行设定,桌面版本把地址改成localhost即可。另外,Neo4j默认的登录账号是neo4j,密码是neo4j,首次登录后会提示修改密码。默认端口是7474(HTTP)和7687(Bolt)。
class MedicalGraph:
    """Builds a medical knowledge graph in Neo4j from a JSON data file."""

    def __init__(self, uri="http://10.10.108.75:7474", username="neo4j",
                 password="123456789"):
        """Locate the data file and connect to the graph database.

        Args:
            uri: HTTP address of the Neo4j server; use
                "http://localhost:7474" for a local desktop install.
            username: Neo4j account name.
            password: Neo4j account password.
        """
        # os.path.dirname works with any path separator; splitting the
        # absolute path on '/' produced an empty directory on Windows.
        cur_dir = os.path.dirname(os.path.abspath(__file__))
        # Data file that the graph is built from.
        self.data_path = os.path.join(cur_dir, 'data/realText.json')
        # Connection to the graph database.
        self.g = Graph(uri, username=username, password=password)
- 2.4 构建图谱中的节点代码如下:
def create_graphnodes(self):
    """Create every entity node of the knowledge graph.

    Disease nodes are created first through ``create_diseases_nodes``
    because they carry the full property set; the six remaining entity
    types are created as plain named nodes through ``create_node``.
    The number of nodes of each type is printed after it is created.
    """
    # read_nodes returns eight entity collections followed by ten
    # relation lists; only the entity collections are needed here.
    (_diseases, drugs, prescriptions, zhenghou, symptoms, disease_infos,
     cur_ways, bingji) = self.read_nodes()[:8]

    # The disease is the central node of the graph.
    self.create_diseases_nodes(disease_infos)

    entity_sets = (
        ('Drug', drugs),
        ('Prescription', prescriptions),
        ('Zhenghou', zhenghou),
        ('Cur_way', cur_ways),
        ('Bingji', bingji),
        ('Symptom', symptoms),
    )
    for label, names in entity_sets:
        # Drop blank names so that no node with an empty name is created.
        names = set(names) - {''}
        self.create_node(label, names)
        print(len(names))
构造节点时需要先定义一个中心节点(疾病),再关联它与其他所有实体之间的关系。这部分是根据笔者的需求设计的,读者可以结合自身需求去修改。具体代码如下:
def create_diseases_nodes(self, disease_infos):
    """Create one "Disease" node per entry of ``disease_infos``.

    Args:
        disease_infos: iterable of dicts. Each dict must contain the key
            'disease' (used as the node name); the other keys listed in
            ``property_keys`` are copied onto the node when present.
    """
    # Properties stored on each disease node:
    #   cur_way          - treatment methods (a list)
    #   prescription     - prescription
    #   chinese_medicine - herbal ingredients (a list)
    #   suggestion       - advice / precautions
    #   desc             - description of the disease
    property_keys = ('symptoms', 'zhenghou', 'bingji', 'cur_way',
                     'prescription', 'chinese_medicine', 'suggestion',
                     'desc')
    count = 0
    for disease_dict in disease_infos:
        # A record may lack an optional field (read_nodes never defaults
        # 'bingji', for example); fall back to '' instead of raising
        # KeyError.
        properties = {key: disease_dict.get(key, '') for key in property_keys}
        # "Disease" is the node label.
        node = Node("Disease", name=disease_dict['disease'], **properties)
        self.g.create(node)
        count += 1
        print(count)
- 2.5实体之间关系的构建代码如下:
'''创建实体关系边'''
def create_graphrels(self):
    """Create every relationship (edge) type of the knowledge graph.

    Reads the relation lists from ``read_nodes`` and passes each one to
    ``create_relationship`` together with the start label, end label,
    relationship type and relationship name.
    """
    # read_nodes returns 18 objects: eight entity collections followed by
    # the ten relation lists unpacked here.
    (rels_disease_symptoms, rels_disease_drug, rels_symptoms_prescription,
     rels_disease_zhengji, rels_disease_prescription, rels_symptoms_drug,
     rels_disease_bingji, rels_zhengji_bingji, rels_disease_cur_way,
     rels_zhengji_cur_way) = self.read_nodes()[8:]

    # (start label, end label, edges, relationship type, relationship name)
    # Each edge is stored as [start name, end name], so the label order
    # must match the order in which the pairs were appended.
    relation_specs = (
        ('Disease', 'Prescription', rels_disease_prescription, 'rels_disease_prescription', '疾病对应的方剂'),
        ('Disease', 'Drug', rels_disease_drug, 'rels_disease_drug', '疾病对应药物'),
        ('Disease', 'Zhenghou', rels_disease_zhengji, 'rels_disease_zhenghou', '疾病对应的症候'),
        ('Disease', 'Symptom', rels_disease_symptoms, 'rels_disease_symptoms', '疾病对应症状'),
        ('Symptom', 'Prescription', rels_symptoms_prescription, 'rels_symptoms_prescription', '症状对应的方剂'),
        ('Symptom', 'Drug', rels_symptoms_drug, 'rels_symptoms_drug', '症状对应的药物'),
        ('Zhenghou', 'Bingji', rels_zhengji_bingji, 'rels_zhengji_bingji', '症候对应的病机'),
        ('Disease', 'Bingji', rels_disease_bingji, 'rels_disease_bingji', '疾病对应的病机'),
        ('Disease', 'Cur_way', rels_disease_cur_way, 'rels_disease_cur_way', '疾病对应的治法'),
        ('Zhenghou', 'Cur_way', rels_zhengji_cur_way, 'rels_zhengji_cur_way', '症候对应的治法'),
    )
    for spec in relation_specs:
        self.create_relationship(*spec)
'''创建实体关联边'''
def create_relationship(self, start_node, end_node, edges, rel_type, rel_name):
    """Create one relationship per distinct edge between existing nodes.

    Args:
        start_node: label of the start nodes.
        end_node: label of the end nodes.
        edges: iterable of [start name, end name] pairs; duplicates are
            ignored.
        rel_type: relationship type used in the graph.
        rel_name: value stored in the relationship's ``name`` property.

    A failure on one edge is printed and does not stop the remaining
    edges from being processed.
    """
    # De-duplicate on the (start, end) pair itself, keeping first-seen
    # order; joining on a separator string would break for names that
    # contain the separator.
    unique_edges = list(dict.fromkeys((edge[0], edge[1]) for edge in edges))
    total = len(unique_edges)

    # Labels and relationship types cannot be passed as Cypher parameters,
    # so they are interpolated. Node names and the relationship name are
    # passed as parameters so that quotes inside a name cannot break or
    # alter the statement.
    query = (
        "match(p:%s),(q:%s) where p.name=$p and q.name=$q "
        "create (p)-[rel:%s{name:$rel_name}]->(q)"
        % (start_node, end_node, rel_type)
    )

    count = 0
    for p, q in unique_edges:
        try:
            self.g.run(query, {'p': p, 'q': q, 'rel_name': rel_name})
        except Exception as e:
            print(e)
        else:
            count += 1
            print(rel_type, count, total)
3、医疗知识图谱可视化
- 1、实体可视化
- 2、关系可视化
4、完整知识图谱代码如下:
import os
import json
from py2neo import Graph,Node
class MedicalGraph:
    """Builds a medical knowledge graph in Neo4j from a JSON-lines file.

    Each line of the data file is one JSON object describing a disease.
    Fields read from it:
        disease          - disease name (required)
        symptoms         - symptoms (list)
        zhenghou         - syndrome
        bingji           - pathogenesis (list)
        cur_way          - treatment methods (list)
        prescription     - prescription
        chinese_medicine - herbal ingredients (list)
        suggestion       - advice / precautions
        desc             - description of the disease
    """

    def __init__(self, uri="http://10.10.108.75:7474", username="neo4j",
                 password="123456789"):
        """Locate the data file and connect to the graph database.

        Args:
            uri: HTTP address of the Neo4j server; use
                "http://localhost:7474" for a local desktop install.
            username: Neo4j account name.
            password: Neo4j account password.
        """
        # os.path.dirname works with any path separator; splitting the
        # absolute path on '/' produced an empty directory on Windows.
        cur_dir = os.path.dirname(os.path.abspath(__file__))
        # Data file that the graph is built from.
        self.data_path = os.path.join(cur_dir, 'data/realText.json')
        # Connection to the graph database.
        self.g = Graph(uri, username=username, password=password)

    def read_nodes(self):
        """Parse the data file into entity sets and relation lists.

        Blank lines are skipped. Every field except 'disease' is optional;
        a missing field contributes no entities or relations and is stored
        as '' in the per-disease info dict.

        Returns:
            An 18-tuple, in this order:
            set of diseases, set of drugs, set of prescriptions, set of
            syndromes, set of symptoms, list of per-disease info dicts,
            set of treatment methods, set of pathogenesis terms, then ten
            relation lists of [start, end] pairs: disease-symptom,
            disease-drug, symptom-prescription, disease-syndrome,
            disease-prescription, symptom-drug, disease-pathogenesis,
            syndrome-pathogenesis, disease-treatment, syndrome-treatment.
        """
        # Entity collections.
        disease_list = []
        symptom_list = []
        zhengji_list = []       # syndromes
        bingji_list = []        # pathogenesis terms
        cur_way_list = []       # treatment methods
        prescription_list = []
        drugs = []              # herbal medicines
        disease_infos = []      # one property dict per disease

        # Relation collections (ten kinds).
        rels_disease_drug = []
        rels_disease_prescription = []
        rels_disease_zhengji = []
        rels_disease_symptoms = []
        rels_symptoms_prescription = []
        rels_symptoms_drug = []         # many-to-many
        rels_disease_cur_way = []
        rels_disease_bingji = []
        rels_zhengji_bingji = []
        rels_zhengji_cur_way = []

        count = 0
        # The context manager closes the file even if a line fails to parse.
        with open(self.data_path, encoding='utf-8') as data_file:
            for line in data_file:
                if not line.strip():
                    continue
                count += 1
                print(count)
                data_json = json.loads(line)

                disease = data_json['disease']
                disease_list.append(disease)

                # Every property read later by create_diseases_nodes gets
                # a default, including 'bingji'.
                disease_dict = {
                    'disease': disease,
                    'symptoms': data_json.get('symptoms', ''),
                    'chinese_medicine': data_json.get('chinese_medicine', ''),
                    'prescription': data_json.get('prescription', ''),
                    'zhenghou': data_json.get('zhenghou', ''),
                    'cur_way': data_json.get('cur_way', ''),
                    'bingji': data_json.get('bingji', ''),
                    'suggestion': data_json.get('suggestion', ''),
                    'desc': data_json.get('desc', ''),
                }

                symptoms = data_json.get('symptoms', [])
                medicines = data_json.get('chinese_medicine', [])
                cur_ways = data_json.get('cur_way', [])
                bingji = data_json.get('bingji', [])
                # None marks "no syndrome field", as opposed to an empty one.
                zhenghou = data_json.get('zhenghou')

                # Disease -> symptom, and symptom -> drug (many-to-many).
                symptom_list += symptoms
                for symptom in symptoms:
                    rels_disease_symptoms.append([disease, symptom])
                    for drug in medicines:
                        rels_symptoms_drug.append([symptom, drug])

                # Disease -> drug.
                drugs += medicines
                for drug in medicines:
                    rels_disease_drug.append([disease, drug])

                # Disease -> prescription, and symptom -> prescription.
                if 'prescription' in data_json:
                    prescription = data_json['prescription']
                    prescription_list.append(prescription)
                    rels_disease_prescription.append([disease, prescription])
                    for symptom in symptoms:
                        rels_symptoms_prescription.append([symptom, prescription])

                # Disease -> syndrome.
                if zhenghou is not None:
                    zhengji_list.append(zhenghou)
                    rels_disease_zhengji.append([disease, zhenghou])

                # Disease -> treatment, and syndrome -> treatment.
                cur_way_list += cur_ways
                for cur in cur_ways:
                    rels_disease_cur_way.append([disease, cur])
                    if zhenghou is not None:
                        rels_zhengji_cur_way.append([zhenghou, cur])

                # Disease -> pathogenesis, and syndrome -> pathogenesis.
                bingji_list += bingji
                for bing in bingji:
                    rels_disease_bingji.append([disease, bing])
                    if zhenghou is not None:
                        rels_zhengji_bingji.append([zhenghou, bing])

                disease_infos.append(disease_dict)

        # NOTE: the entity sets may still contain '' here; callers strip it.
        return (
            set(disease_list), set(drugs), set(prescription_list),
            set(zhengji_list), set(symptom_list), disease_infos,
            set(cur_way_list), set(bingji_list),
            rels_disease_symptoms, rels_disease_drug,
            rels_symptoms_prescription, rels_disease_zhengji,
            rels_disease_prescription, rels_symptoms_drug,
            rels_disease_bingji, rels_zhengji_bingji,
            rels_disease_cur_way, rels_zhengji_cur_way,
        )

    def create_node(self, label, nodes):
        """Create one node with the given label for every name in ``nodes``.

        Args:
            label: node label, e.g. 'Drug'.
            nodes: sized collection of node names.
        """
        total = len(nodes)
        for count, node_name in enumerate(nodes, start=1):
            self.g.create(Node(label, name=node_name))
            print(count, total)

    def create_diseases_nodes(self, disease_infos):
        """Create one "Disease" node per entry of ``disease_infos``.

        Args:
            disease_infos: iterable of dicts. Each dict must contain the
                key 'disease' (used as the node name); the other keys
                listed in ``property_keys`` are copied onto the node when
                present.
        """
        property_keys = ('symptoms', 'zhenghou', 'bingji', 'cur_way',
                         'prescription', 'chinese_medicine', 'suggestion',
                         'desc')
        count = 0
        for disease_dict in disease_infos:
            # Fall back to '' for an absent optional field instead of
            # raising KeyError.
            properties = {key: disease_dict.get(key, '')
                          for key in property_keys}
            # "Disease" is the node label.
            node = Node("Disease", name=disease_dict['disease'], **properties)
            self.g.create(node)
            count += 1
            print(count)

    def create_graphnodes(self):
        """Create every entity node of the knowledge graph.

        Disease nodes are created first because they carry the full
        property set; the six remaining entity types are created as plain
        named nodes. The number of nodes of each type is printed after it
        is created.
        """
        (_diseases, drugs, prescriptions, zhenghou, symptoms, disease_infos,
         cur_ways, bingji) = self.read_nodes()[:8]

        # The disease is the central node of the graph.
        self.create_diseases_nodes(disease_infos)

        entity_sets = (
            ('Drug', drugs),
            ('Prescription', prescriptions),
            ('Zhenghou', zhenghou),
            ('Cur_way', cur_ways),
            ('Bingji', bingji),
            ('Symptom', symptoms),
        )
        for label, names in entity_sets:
            # Drop blank names so no node with an empty name is created.
            names = set(names) - {''}
            self.create_node(label, names)
            print(len(names))

    def create_graphrels(self):
        """Create every relationship (edge) type of the knowledge graph."""
        # read_nodes returns eight entity collections followed by the ten
        # relation lists unpacked here.
        (rels_disease_symptoms, rels_disease_drug, rels_symptoms_prescription,
         rels_disease_zhengji, rels_disease_prescription, rels_symptoms_drug,
         rels_disease_bingji, rels_zhengji_bingji, rels_disease_cur_way,
         rels_zhengji_cur_way) = self.read_nodes()[8:]

        # (start label, end label, edges, relationship type, relationship
        # name). Each edge is stored as [start name, end name], so the
        # label order must match the order the pairs were appended in.
        relation_specs = (
            ('Disease', 'Prescription', rels_disease_prescription, 'rels_disease_prescription', '疾病对应的方剂'),
            ('Disease', 'Drug', rels_disease_drug, 'rels_disease_drug', '疾病对应药物'),
            ('Disease', 'Zhenghou', rels_disease_zhengji, 'rels_disease_zhenghou', '疾病对应的症候'),
            ('Disease', 'Symptom', rels_disease_symptoms, 'rels_disease_symptoms', '疾病对应症状'),
            ('Symptom', 'Prescription', rels_symptoms_prescription, 'rels_symptoms_prescription', '症状对应的方剂'),
            ('Symptom', 'Drug', rels_symptoms_drug, 'rels_symptoms_drug', '症状对应的药物'),
            ('Zhenghou', 'Bingji', rels_zhengji_bingji, 'rels_zhengji_bingji', '症候对应的病机'),
            ('Disease', 'Bingji', rels_disease_bingji, 'rels_disease_bingji', '疾病对应的病机'),
            ('Disease', 'Cur_way', rels_disease_cur_way, 'rels_disease_cur_way', '疾病对应的治法'),
            ('Zhenghou', 'Cur_way', rels_zhengji_cur_way, 'rels_zhengji_cur_way', '症候对应的治法'),
        )
        for spec in relation_specs:
            self.create_relationship(*spec)

    def create_relationship(self, start_node, end_node, edges, rel_type, rel_name):
        """Create one relationship per distinct edge between existing nodes.

        Args:
            start_node: label of the start nodes.
            end_node: label of the end nodes.
            edges: iterable of [start name, end name] pairs; duplicates
                are ignored.
            rel_type: relationship type used in the graph.
            rel_name: value stored in the relationship's ``name`` property.

        A failure on one edge is printed and does not stop the remaining
        edges from being processed.
        """
        # De-duplicate on the (start, end) pair itself, keeping first-seen
        # order; joining on a separator string would break for names that
        # contain the separator.
        unique_edges = list(dict.fromkeys((edge[0], edge[1]) for edge in edges))
        total = len(unique_edges)

        # Labels and relationship types cannot be Cypher parameters, so
        # they are interpolated. Names are passed as parameters so quotes
        # inside a name cannot break or alter the statement.
        query = (
            "match(p:%s),(q:%s) where p.name=$p and q.name=$q "
            "create (p)-[rel:%s{name:$rel_name}]->(q)"
            % (start_node, end_node, rel_type)
        )

        count = 0
        for p, q in unique_edges:
            try:
                self.g.run(query, {'p': p, 'q': q, 'rel_name': rel_name})
            except Exception as e:
                print(e)
            else:
                count += 1
                print(rel_type, count, total)

    def export_data(self, out_dir='dict'):
        """Write the entity names to text files, one name per line.

        Blank names are dropped and names are written in sorted order so
        that repeated runs produce identical files.

        Args:
            out_dir: directory that receives the files; created if it does
                not exist. Defaults to 'dict' relative to the working
                directory.
        """
        (disease_list, drugs, prescription_list, zhengji_list, symptom_list,
         _disease_infos, cur_way_list, bingji_list) = self.read_nodes()[:8]

        exports = (
            ('drug.txt', drugs),
            ('symptoms.txt', symptom_list),
            ('disease.txt', disease_list),
            ('prescription_list.txt', prescription_list),
            ('zhenghou.txt', zhengji_list),
            ('bingji.txt', bingji_list),
            ('cur_way.txt', cur_way_list),
        )
        os.makedirs(out_dir, exist_ok=True)
        for file_name, names in exports:
            cleaned = sorted(set(names) - {''})
            # The context manager closes each file even if a write fails.
            with open(os.path.join(out_dir, file_name), 'w', encoding='utf-8') as out_file:
                out_file.write('\n'.join(cleaned))
if __name__ == '__main__':
    # Constructing the handler already runs __init__ (data path lookup and
    # database connection); calling handler.__init__() again would only
    # repeat that work.
    handler = MedicalGraph()
    # Create the nodes of the knowledge graph.
    handler.create_graphnodes()
    # Create the relationships between the nodes.
    handler.create_graphrels()
    # Optional: dump the entity names to dictionary files for later use.
    # handler.export_data()
5、总结
- 本篇是基于已有的rdf数据或者json数据批量写入,最终生成知识图谱。由于数据有限,该领域内的图谱规模不是很大,但是笔者在此基础上实现了医疗知识图谱的单轮问答和多轮问答。需要相关数据集或者源码的读者可以发邮件给我:sessioncookies@163.com,欢迎大家点赞收藏。更多干货下一期继续讲解:针对这些数据去做医疗实体识别的工作,这部分内容会比较多,主要涉及基于规则的前后向最大匹配算法以及BiLSTM-CRF实体识别等,并分别进行实验对比分析。