很长时间没写这个相关的内容了,因此重新看了下基础,这一篇写的很透彻,建议每个想做知识图谱的人至少看一下:
https://zhuanlan.zhihu.com/p/32122644?from_voters_page=true
我发现白嫖的太严重了,只看只收藏不点赞,我要加门槛了。
安装和说明
安装rdflib库:
pip install rdflib
RDFlib是一个可以快速构建知识图谱的工具。
优点如下:
快速构建任意三元结构体,满足用户写owl本体文件和数据生成需求;
自带SPARQL查询接口;
带了很多标准的命名定义;
缺点如下:
本身不支持推理,无法做到像jena fuseki那样智能,例如之前章节提到的,hasActedIn和hasActor是一对反转的对象属性,生成hasActedIn即可反推hasActor,但如果使用rdflib,即使你在owl本体中设置了反转属性,数据生成的时候依旧要自行根据规则,写入hasActedIn的数据和hasActor的数据。使用sparql可以完成一部分静态规则的补充推理,但仅仅是补充。
无法动态推理,即更新数据后不能自动完成推理。动态推理指的是:同样的规则可以对新导入的数据直接生效,不需要再花时间重新生成图谱数据,显然rdflib不具有这样的优势。
是否可以配合其它插件完成推理,目前查到的是可以的,正在研究中,后续有结果再更新。
本体构建
基于前面章节,protege设计的结构,我们写这个本体,本体构建逻辑见logic函数:
OWLHelper.py
from rdflib import Graph, Namespace, Literal, URIRef
from rdflib.namespace import DC, DCTERMS, DOAP, FOAF, SKOS, OWL, RDF, RDFS, VOID, XMLNS, XSD
class MyOntology:
    """Build the kg_movie OWL ontology in an rdflib graph and write it as Turtle.

    The ontology declares a top-level ``MovieRelevant`` class with three
    subclasses (``Actor``, ``Movie``, ``Genre``), their datatype properties,
    the object properties linking them, and disjointness axioms.
    ``logic()`` is the entry point.
    """

    # Local names of the xsd:string datatype properties declared per class.
    # Each class additionally gets one xsd:integer ``<prefix>_id`` property.
    _ACTOR_TEXT_PROPERTIES = (
        "actor_bio",
        "actor_chName",
        "actor_foreName",
        "actor_nationality",
        "actor_constellation",
        "actor_birthplace",
        "actor_birthday",
        "actor_repWorks",
        "actor_achiem",
        "actor_brokerage",
    )
    _MOVIE_TEXT_PROPERTIES = (
        "movie_bio",
        "movie_chName",
        "movie_foreName",
        "movie_prodTime",
        "movie_prodCompany",
        "movie_director",
        "movie_screenwriter",
        "movie_genre",
        "movie_star",
        "movie_length",
        "movie_releaseTime",
        "movie_language",
        "movie_achiem",
    )
    _GENRE_TEXT_PROPERTIES = ("genre_name",)

    def __init__(self):
        self.g = Graph()
        # Namespace under which every ontology term is minted.
        self.ns = Namespace("http://kg_movie.org/ontology#")

    def _insert_property(self, key, key_type, key_domain, key_range):
        """Declare property ``key`` with the given rdf:type, rdfs:domain and rdfs:range."""
        self.g.add((key, RDF.type, key_type))
        self.g.add((key, RDFS.domain, key_domain))
        self.g.add((key, RDFS.range, key_range))

    def _namespace_bind(self):
        """Bind the prefixes used when the graph is serialised."""
        self.g.bind("owl", OWL)
        self.g.bind("xsd", XSD)
        self.g.bind("rdf", RDF)
        self.g.bind("rdfs", RDFS)
        self.g.bind("foaf", FOAF)
        self.g.bind("", self.ns)

    def logic(self):
        """Populate the graph with the whole ontology, then save and print it."""
        self.__create_industry()
        self.__actor_onto_create()
        self.__movie_onto_create()
        self.__genre_onto_create()
        self.__obj_property_create()
        self.__class_disjoint()
        self._namespace_bind()
        self._serialise(save_url="kg_movie_owl.ttl", is_print=True)

    def _serialise(self, save_url=None, is_print=False, format="turtle"):
        """Write the graph to ``save_url`` and/or print its serialisation.

        Args:
            save_url: Destination passed to ``Graph.serialize``; nothing is
                written when it is ``None``.
            is_print: When true, print the serialised text to stdout.
            format: rdflib serialisation format name.
        """
        if save_url is not None:
            self.g.serialize(destination=save_url, format=format)
        if is_print:
            # Serialising to a destination does not return the text, so the
            # printable form has to be produced by a separate call.
            text = self.g.serialize(format=format)
            if isinstance(text, bytes):  # older rdflib releases return bytes
                text = text.decode("utf-8")
            print(text)

    def _declare_class(self, class_name, id_property, text_properties):
        """Declare an OWL class under MovieRelevant together with its datatype properties."""
        owl_class = self.ns[class_name]
        self.g.add((owl_class, RDF.type, OWL.Class))
        # Every concrete class hangs off the shared top-level base class.
        self.g.add((owl_class, RDFS.subClassOf, self.ns.MovieRelevant))
        self._insert_property(
            self.ns[id_property], OWL.DatatypeProperty, owl_class, XSD.integer
        )
        for name in text_properties:
            self._insert_property(
                self.ns[name], OWL.DatatypeProperty, owl_class, XSD.string
            )

    def __create_industry(self):
        """Create the top-level MovieRelevant base class as a subclass of owl:Thing."""
        movie_relevant = self.ns.MovieRelevant
        self.g.add((movie_relevant, RDF.type, OWL.Class))
        # Every top-level base class belongs to owl:Thing.
        self.g.add((movie_relevant, RDFS.subClassOf, OWL.Thing))

    def __actor_onto_create(self):
        """Declare the Actor class and its datatype properties."""
        # A separate base class per knowledge graph (MovieRelevant here, e.g.
        # MedicineRelevant for a medical graph) keeps unrelated ontologies
        # apart while all of them still sit under owl:Thing.
        self._declare_class("Actor", "actor_id", self._ACTOR_TEXT_PROPERTIES)

    def __movie_onto_create(self):
        """Declare the Movie class and its datatype properties."""
        self._declare_class("Movie", "movie_id", self._MOVIE_TEXT_PROPERTIES)

    def __genre_onto_create(self):
        """Declare the Genre class and its datatype properties."""
        self._declare_class("Genre", "genre_id", self._GENRE_TEXT_PROPERTIES)

    def __obj_property_create(self):
        """Declare hasActor, hasGenre and hasActedIn, and mark the actor pair as inverses."""
        has_actor = self.ns.hasActor
        has_genre = self.ns.hasGenre
        has_acted_in = self.ns.hasActedIn
        self._insert_property(has_actor, OWL.ObjectProperty, self.ns.Movie, self.ns.Actor)
        self._insert_property(has_genre, OWL.ObjectProperty, self.ns.Movie, self.ns.Genre)
        self._insert_property(has_acted_in, OWL.ObjectProperty, self.ns.Actor, self.ns.Movie)
        # Inverse-property axioms, stated in both directions.
        self.g.add((has_actor, OWL.inverseOf, has_acted_in))
        self.g.add((has_acted_in, OWL.inverseOf, has_actor))

    def __class_disjoint(self):
        """State that Movie is disjoint with Actor and with Genre."""
        # NOTE(review): Actor/Genre disjointness is not asserted here --
        # confirm whether that omission is intentional.
        self.g.add((self.ns.Movie, OWL.disjointWith, self.ns.Actor))
        self.g.add((self.ns.Movie, OWL.disjointWith, self.ns.Genre))
这里有一个小小的区别,就是多了个MovieRelevant类别,之前是Thing下面有Movie/Actor/Genre,现在变成了Thing下面为MovieRelevant,MovieRelevant下面是Movie/Actor/Genre,多一个层级方便构建不同体系的本体文件。
数据构建
把本体和数据分开存,本体文件体积小,数据体积大,方便不同场景使用;
OWLHelper.py
from rdflib import Graph, Namespace, Literal, URIRef
from rdflib.namespace import DC, DCTERMS, DOAP, FOAF, SKOS, OWL, RDF, RDFS, VOID, XMLNS, XSD
class MyKGMovieDataHelper:
    """Collect movie, actor and genre instance data as RDF triples and write them as Turtle.

    Instances are named ``<Kind>_<id>`` inside the ontology namespace; the
    numeric id (not the name, which may repeat) is used as the key.
    """

    def __init__(self):
        self.g = Graph()
        # Namespace shared with the ontology file.
        self.ns_onto = Namespace("http://kg_movie.org/ontology#")

    def _namespace_bind(self):
        """Bind the prefixes used when the graph is serialised."""
        self.g.bind("owl", OWL)
        self.g.bind("xsd", XSD)
        self.g.bind("rdf", RDF)
        self.g.bind("rdfs", RDFS)
        self.g.bind("foaf", FOAF)
        self.g.bind("", self.ns_onto)

    def _serialise(self, save_url=None, is_print=False, format="turtle"):
        """Write the graph to ``save_url`` and/or print its serialisation.

        Args:
            save_url: Destination passed to ``Graph.serialize``; nothing is
                written when it is ``None``.
            is_print: When true, print the serialised text to stdout.
            format: rdflib serialisation format name.
        """
        if save_url is not None:
            self.g.serialize(destination=save_url, format=format)
        if is_print:
            # Serialising to a destination does not return the text, so the
            # printable form has to be produced by a separate call.
            text = self.g.serialize(format=format)
            if isinstance(text, bytes):  # older rdflib releases return bytes
                text = text.decode("utf-8")
            print(text)

    def _instance_uri(self, kind, instance_id):
        """Return the URI ``<namespace><kind>_<instance_id>`` of an instance."""
        return URIRef(self.ns_onto + kind + "_" + str(instance_id))

    def _add_instance(self, kind, prefix, instance_id, fields):
        """Add one typed instance with its id and every non-None literal field.

        Args:
            kind: Ontology class local name, e.g. ``"Movie"``.
            prefix: Property-name prefix, e.g. ``"movie"``.
            instance_id: Primary key; stored as an xsd:integer literal.
            fields: Mapping of property suffix to raw value.
        """
        key = self._instance_uri(kind, instance_id)
        self.g.add((key, RDF.type, self.ns_onto[kind]))
        self.g.add(
            (key, self.ns_onto[prefix + "_id"], Literal(instance_id, datatype=XSD.integer))
        )
        for suffix, value in fields.items():
            # Literal(None) would be stored as the text "None"; a missing
            # value is represented by the absence of the triple instead.
            if value is None:
                continue
            self.g.add((key, self.ns_onto[prefix + "_" + suffix], Literal(value)))

    # Movie names are not unique, so the id is used as the key of the RDF instance.
    def _insert_movie(self, id, bio, chName, foreName, prodTime, prodCompany, director, screenwriter, genre, star, length, releaseTime, language, achiem):
        """Add a Movie instance; fields passed as ``None`` are omitted."""
        self._add_instance("Movie", "movie", id, {
            "bio": bio,
            "chName": chName,
            "foreName": foreName,
            "prodTime": prodTime,
            "prodCompany": prodCompany,
            "director": director,
            "screenwriter": screenwriter,
            "genre": genre,
            "star": star,
            "length": length,
            "releaseTime": releaseTime,
            "language": language,
            "achiem": achiem,
        })

    # The id is the unique key, which avoids duplicate instances.
    def _insert_actor(self, id, bio, chName, foreName, nationality, constellation, birthplace, birthday, repWorks, achiem, brokerage):
        """Add an Actor instance; fields passed as ``None`` are omitted."""
        self._add_instance("Actor", "actor", id, {
            "bio": bio,
            "chName": chName,
            "foreName": foreName,
            "nationality": nationality,
            "constellation": constellation,
            "birthplace": birthplace,
            "birthday": birthday,
            "repWorks": repWorks,
            "achiem": achiem,
            "brokerage": brokerage,
        })

    def _insert_genre(self, id, name):
        """Add a Genre instance; a ``None`` name is omitted."""
        self._add_instance("Genre", "genre", id, {"name": name})

    def _insert_has_acted_in_property(self, actor_id, movie_id):
        """Link ``Actor_<actor_id>`` to ``Movie_<movie_id>`` via hasActedIn."""
        actor = self._instance_uri("Actor", actor_id)
        movie = self._instance_uri("Movie", movie_id)
        self.g.add((actor, self.ns_onto.hasActedIn, movie))

    def _insert_has_genre_property(self, movie_id, genre_id):
        """Link ``Movie_<movie_id>`` to ``Genre_<genre_id>`` via hasGenre."""
        movie = self._instance_uri("Movie", movie_id)
        genre = self._instance_uri("Genre", genre_id)
        self.g.add((movie, self.ns_onto.hasGenre, genre))

    def test(self):
        """Insert one sample movie and write the graph to ``data.ttl``."""
        bio = "大话西游之月光宝盒》是周星驰彩星电影公司1994年制作和出品的一部经典的无厘头搞笑片,改编依据是吴承恩所撰写的神怪小说《西游记》,该片是《大话西游》系列的第一部,由刘镇伟导演,技安编剧,周星驰制作,周星驰、莫文蔚、吴孟达、蓝洁瑛等人主演。该片主要讲述了唐僧师徒前去西天取经之前的孙悟空因要杀唐僧被观音菩萨惩罚转世为至尊宝而后遇见白骨精、蜘蛛精的传奇故事。该片于1995年1月22日在香港首映并入围第十五届香港电影金像奖最佳编剧奖,周星驰凭借该片获得第二届香港电影评论学会奖最佳男主角奖。该片第二部《大话西游之大圣娶亲》已于1995年2月4日正式上映。2014年10月24日该片重映。编辑摘要"
        chname = "大话西游之月光宝盒"
        fore = "西游记第一百零一回之月光宝盒,月光宝盒,齐天大圣东游记"
        prod = None
        prodc = "彩星电影公司"
        self._insert_movie(1, bio, chname, fore, prod, prodc, None, None, None, None, None, None, None, None)
        self._namespace_bind()
        self._serialise(save_url="data.ttl")
本体和数据生成
这里复用了之前章节写的SQLHelper,方便打开读取数据库数据。
MovieKG.py
from SQLHelper import SQL_OP
from OWLHelper import MyKGMovieDataHelper, MyOntology
def run():
    """Generate the ontology file, then export the database tables to ``kg_movie_data.ttl``.

    The ontology is produced by ``MyOntology.logic()``. Five tables are read
    through ``SQL_OP`` and every row is handed to the matching
    ``MyKGMovieDataHelper`` insert method using its leading columns.
    """
    ontology_builder = MyOntology()
    ontology_builder.logic()  # writes the ontology file

    data_builder = MyKGMovieDataHelper()
    # NOTE(review): connection settings are hard-coded and the connection is
    # not closed in this function -- confirm how SQL_OP expects to be released.
    db = SQL_OP(ip="localhost", user="root", psd=".root", db="kg_movie", charset='utf8')
    db.open()

    # (rows, insert method, number of leading columns forwarded); every query
    # runs, in this order, before any row is inserted.
    jobs = (
        (db.get_execute("select * from movie"), data_builder._insert_movie, 14),
        (db.get_execute("select * from actor"), data_builder._insert_actor, 11),
        (db.get_execute("select * from genre"), data_builder._insert_genre, 2),
        (
            db.get_execute("select * from actor_to_movie"),
            data_builder._insert_has_acted_in_property,
            2,
        ),
        (
            db.get_execute("select * from movie_to_genre"),
            data_builder._insert_has_genre_property,
            2,
        ),
    )
    for rows, insert, width in jobs:
        for row in rows:
            # Index each column explicitly rather than slicing the row.
            insert(*[row[column] for column in range(width)])

    data_builder._namespace_bind()
    data_builder._serialise(save_url="kg_movie_data.ttl")
数据查询
分别打开本体文件和数据文件,使用Graph() + Graph()就可以合并图数据。
MovieKG.py
def query_run():
    """Merge the ontology and data Turtle files and print one SPARQL result set as JSON."""
    from rdflib import Graph

    # Parse the ontology first, then the instance data, each into its own graph.
    loaded = []
    for path in ("kg_movie_owl.ttl", "kg_movie_data.ttl"):
        graph = Graph()
        graph.parse(path, format="turtle")
        loaded.append(graph)
    merged = loaded[0] + loaded[1]

    # Alternative example queries, each preceded by these prefix lines:
    #   prefix : <http://kg_movie.org/ontology#>
    #   prefix owl: <http://www.w3.org/2002/07/owl#>
    #   prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    #   prefix xsd: <http://www.w3.org/2001/XMLSchema#>
    #
    # List every genre with its name:
    #   SELECT * WHERE {
    #   ?genre rdf:type :Genre .
    #   ?genre :genre_name ?genre_name .
    #   }
    #
    # Biography of the actor named '周星驰':
    #   SELECT * WHERE {
    #   ?x :actor_chName '周星驰'.
    #   ?x :actor_bio ?n.
    #   }
    #
    # Names of the movies the actor named '朱茵' acted in:
    #   SELECT * WHERE {
    #   ?p :actor_chName '朱茵'.
    #   ?p :hasActedIn ?movie.
    #   ?movie :movie_chName ?out
    #   }

    # Active query: one actor / movie pair linked by hasActedIn.
    sparql = (
        "\n"
        "prefix : <http://kg_movie.org/ontology#>\n"
        "SELECT * WHERE {\n"
        "?p :hasActedIn ?m.\n"
        "} limit 1\n"
    )
    result = merged.query(sparql)
    payload = result.serialize(format='json')
    print(payload.decode("utf-8"))
推理我后面再研究研究吧。
参考:
https://rdflib.readthedocs.io/en/stable/gettingstarted.html