按照 Word 的标题样式(Heading 1~3)设置三级标题后,即可自动提取章、节、小节,它们之间的默认关系都是“包含”。此外,还可以用“粗体+下划线”标注特定关键词,使其被一并提取。
# coding:utf-8
import re
import docx
import xlrd
import difflib
import base64
import pandas as pd
from time import sleep
import pyhanlp
import rdflib
from rdflib import RDF, Namespace, URIRef, Literal
from rdflib.namespace import OWL, RDF, RDFS
# Let pandas print every column and every row instead of truncating.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)
# Regex for special characters: everything except Chinese, English letters and digits.
#pattern = '([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005A\u0061-\u007a])'
# Module-level RDF graph that the helper functions below add triples to.
graph = rdflib.Graph()
#graph.parse("E:\\foaf.rdf", format="xml")
# Namespaces used when building linked data.
BOOK = Namespace('https://www.example.com/wiki#')
BAIDU = Namespace('https://baike.baidu.com/item/')  # Baidu Baike, used to link keyword explanations
# Bind namespace prefixes (PREFIX) on the graph.
for _prefix, _namespace in (("owl", OWL), ("book", BOOK), ("baidu", BAIDU)):
    graph.bind(_prefix, _namespace)
def is_number(value):
    """Return False when ``value`` is a ``str`` and True for anything else.

    Note that the check is only "not a string": ``None``, lists and other
    non-string objects are all reported as numbers.
    """
    return not isinstance(value, str)
# ---------------- OWL file construction ----------------------------------------
def addClass(classname, subclass=OWL.Thing):
    """Declare an ontology class in the module-level ``graph``.

    ``classname`` is typed as ``owl:Class`` and made an ``rdfs:subClassOf``
    ``subclass`` (``owl:Thing`` by default).

    Returns:
        ``classname`` itself, so the caller can keep using it as the class node.
    """
    for predicate, target in ((RDF.type, OWL.Class), (RDFS.subClassOf, subclass)):
        graph.add((classname, predicate, target))
    return classname
def addRel(relname, rdomain, rrange, subrel=OWL.topObjectProperty):
    """Declare an object property (relation) in the module-level ``graph``.

    Args:
        relname: node used as the property.
        rdomain: iterable of classes, each added as an ``rdfs:domain``.
        rrange: iterable of classes, each added as an ``rdfs:range``.
        subrel: parent property; ``owl:topObjectProperty`` by default.

    Returns:
        ``relname`` itself.
    """
    graph.add((relname, RDF.type, OWL.ObjectProperty))
    graph.add((relname, RDFS.subPropertyOf, subrel))
    # Domains are added first, then ranges.
    for predicate, classes in ((RDFS.domain, rdomain), (RDFS.range, rrange)):
        for cls in classes:
            graph.add((relname, predicate, cls))
    return relname
def addData(dataname, ddomain, subdata=OWL.topDataProperty):
    """Declare a datatype property in the module-level ``graph``.

    Args:
        dataname: node used as the property.
        ddomain: iterable of classes, each added as an ``rdfs:domain``.
        subdata: parent property; ``owl:topDataProperty`` by default.

    Returns:
        ``dataname`` itself.
    """
    for predicate, target in (
        (RDF.type, OWL.DatatypeProperty),
        (RDFS.subPropertyOf, subdata),
    ):
        graph.add((dataname, predicate, target))
    for domain_class in ddomain:
        graph.add((dataname, RDFS.domain, domain_class))
    return dataname
def addIndiv(iname, itype):
    """Add an individual: assert ``iname rdf:type itype`` in the module-level ``graph``.

    Returns:
        ``iname`` itself.
    """
    type_triple = (iname, RDF.type, itype)
    graph.add(type_triple)
    return iname
def setNameValue(oind, oname, ovalue):
    """Set a data-property value on an individual (OWL data properties).

    The text in ``ovalue`` is not stored verbatim: a summary obtained from
    ``pyhanlp.HanLP.extractSummary(ovalue, 3)`` is stored instead, which is how
    body text with complex formulas or other special characters is handled.

    Args:
        oind: the individual (subject).
        oname: the property (predicate).
        ovalue: the source text to summarise.
    """
    summary_repr = str(pyhanlp.HanLP.extractSummary(ovalue, 3))
    # Per the original note, the string form is wrapped in "[" and "]";
    # the first and last characters are dropped.
    trimmed = summary_repr[1:-1]
    graph.add((oind, oname, Literal(trimmed)))
def setAnnotation(oind, rdfstype, value):
    """Attach an annotation directly (no OWL property), e.g. a label or comment.

    Adds ``(oind, rdfstype, Literal(value))`` to the module-level ``graph``.
    """
    annotation = Literal(value)
    graph.add((oind, rdfstype, annotation))
    #graph.add((oind, RDFS.comment, Literal("The class of all animal types")))  # add a description
def addSPO(s, p, o):
    """Add the subject-predicate-object triple ``(s, p, o)`` to the module-level ``graph``."""
    triple = (s, p, o)
    graph.add(triple)
def read_para(doc):
    """Collect the text of every non-blank paragraph of a document.

    Args:
        doc: object exposing ``paragraphs``; each paragraph must provide
            ``text`` and ``style.name``.

    Returns:
        A list of strings in document order. Paragraphs whose style name
        starts with "Heading" are emitted as ``"<style name>:<text>"``; all
        other paragraphs are emitted as their plain text. Paragraphs that are
        empty or whitespace-only are skipped.
    """
    collected = []
    for paragraph in doc.paragraphs:
        style_name = paragraph.style.name
        text = paragraph.text
        if not text.strip():
            continue
        is_heading = style_name.startswith("Heading")
        collected.append(f"{style_name}:{text}" if is_heading else text)
    return collected
# Knowledge-point title at the start of a line, written like "1.ABC ":
# one or two digits, a dot, non-digit text, then a whitespace character that
# is immediately followed by a word character.
# A raw string is used so that "\.", "\D" and "\s" reach the regex engine
# without relying on invalid string-literal escapes.
_KEY_PATTERN = re.compile(r'\b^[0-9]{1,2}\.\D*\s\b')


def getKeys(linestring):
    """Extract the knowledge-point title that opens ``linestring``.

    Args:
        linestring: one paragraph of text.

    Returns:
        The matched title with surrounding whitespace stripped
        (e.g. ``"1.ABC"`` for ``"1.ABC def"``), or ``None`` when the line does
        not start with such a title.
    """
    found = _KEY_PATTERN.search(linestring)
    return found.group().strip() if found else None
def tagWords(para):
    """Return the text of every run in ``para`` that is both underlined and bold.

    Args:
        para: object exposing ``runs``; each run must provide ``underline``,
            ``bold`` and ``text``.

    Returns:
        List of run texts, in run order. A run qualifies only when both
        ``underline`` and ``bold`` compare equal to ``True``; the equality
        comparison (rather than plain truthiness) is intentional so that
        other truthy values are not accepted.
    """
    return [
        run.text
        for run in para.runs
        if run.underline == True and run.bold == True  # noqa: E712
    ]
def _uri_local_name(text):
    """Return ``text`` stripped, with each remaining whitespace character replaced by "_".

    Names taken from the document are appended to a namespace to form URIs,
    and a URI must not contain whitespace.
    """
    return re.sub(r"\s", "_", text.strip())


def _add_heading_indiv(title, cls, parent, include_of):
    """Create the individual for one heading and link it below ``parent``.

    The individual is typed as ``cls`` and labelled with its normalised name.
    When ``parent`` is ``None`` no "includes" link is added.
    """
    name = _uri_local_name(title)
    indiv = addIndiv(BOOK[name], cls)
    setAnnotation(indiv, RDFS.label, name)
    if parent is not None:
        addSPO(parent, include_of, indiv)
    return indiv


def docx_main(bookdocfile, bookname):
    """Build the book graph from a Word document.

    Classes for book / chapter / section / lesson / knowledge point and the
    two relations ("包含" = includes, "关联" = related) are declared first.
    The document is then walked paragraph by paragraph:

    * "Heading 1" / "Heading 2" / "Heading 3" paragraphs become chapter /
      section / lesson individuals, each linked to its parent with "包含".
    * A body paragraph that starts with a knowledge-point title (see
      ``getKeys``) becomes a knowledge-point individual, linked to the current
      lesson with "关联" when there is one.
    * Every bold+underlined run (see ``tagWords``) becomes an individual in
      the Baidu Baike namespace, typed like the current heading and linked to
      it with "关联". Tags that appear before the first heading are skipped
      because there is nothing to attach them to.

    Args:
        bookdocfile: path (or file object) handed to ``docx.Document``.
        bookname: title of the book; used for the book individual's URI and
            as its label.

    Side effects:
        Adds triples to the module-level ``graph`` and sets the module globals
        ``obj0``, ``obj1``, ``obj2``, ``obj3``, ``obj`` and ``docm``.
    """
    # These stay module globals for backward compatibility; they are reset
    # here so a second call never sees parents left over from a previous book.
    global obj0, obj1, obj2, obj3, obj, docm
    obj = obj1 = obj2 = obj3 = None
    tclass = None  # class of the current heading
    tindiv = None  # individual of the current heading

    # Ontology classes.
    bname = addClass(BOOK["书名"])
    obj0 = addIndiv(BOOK[_uri_local_name(bookname)], bname)
    chapter = addClass(BOOK["章"], bname)
    section = addClass(BOOK["节"], chapter)
    lesson = addClass(BOOK["小节"], section)
    knowkey = addClass(BOOK["知识点"], lesson)
    # Relations.
    include_of = addRel(BOOK["包含"], [chapter, section], [lesson])
    relate_of = addRel(BOOK["关联"], [chapter, section], [lesson])
    # Label the book with its title (not with the URI built from it).
    setAnnotation(obj0, RDFS.label, bookname)

    docm = docx.Document(bookdocfile)
    for para in docm.paragraphs:
        text = para.text
        if text.strip() == '':
            continue
        # Decide by paragraph style, never by searching the text, so body
        # text that merely mentions "Heading" is not mistaken for a heading.
        style_name = para.style.name
        if style_name == "Heading 1":
            obj1 = _add_heading_indiv(text, chapter, obj0, include_of)
            obj2 = obj3 = None  # a new chapter closes the open section/lesson
            obj, tclass, tindiv = obj1, chapter, obj1
        elif style_name == "Heading 2":
            obj2 = _add_heading_indiv(text, section, obj1, include_of)
            obj3 = None  # a new section closes the open lesson
            obj, tclass, tindiv = obj2, section, obj2
        elif style_name == "Heading 3":
            obj3 = _add_heading_indiv(text, lesson, obj2, include_of)
            obj, tclass, tindiv = obj3, lesson, obj3
        elif not style_name.startswith("Heading"):
            # A body paragraph may open with a knowledge-point title.
            ks = getKeys(text)
            if ks is not None:
                ko = addIndiv(BOOK[_uri_local_name(ks)], knowkey)
                setAnnotation(ko, RDFS.label, ks)
                if obj3 is not None:
                    addSPO(obj3, relate_of, ko)
                print("知识点:", ks)

        # Bold+underlined words become individuals related to the current heading.
        if tindiv is None:
            continue
        for raw_tag in tagWords(para):
            tag = raw_tag.strip()
            if not tag:
                continue
            ti = addIndiv(BAIDU[_uri_local_name(tag)], tclass)
            setAnnotation(ti, RDFS.label, tag)
            addSPO(tindiv, relate_of, ti)
def corelation(stra, strb):
    """Return a similarity score for two strings.

    The score is ``difflib.SequenceMatcher.quick_ratio()`` for the pair, a
    float between 0.0 and 1.0.
    """
    matcher = difflib.SequenceMatcher(a=stra, b=strb)
    return matcher.quick_ratio()
def main(sfile, rdffile, bookname):
    """Convert a Word document into a knowledge graph and save it twice.

    Args:
        sfile: source document passed to ``docx_main``.
        rdffile: output path for the RDF/XML serialisation.
        bookname: title of the book, passed to ``docx_main``.

    The Turtle copy is written next to ``rdffile`` with the extension
    replaced by ``.ttl`` (``.ttl`` is appended when there is no extension).
    """
    import os  # local import: this module does not import os at file level

    docx_main(sfile, bookname)  # build the graph from the book
    graph.serialize(rdffile, format="xml")  # RDF/XML; other formats are possible
    # Swap the real extension instead of assuming it is three characters long.
    ttlfile = os.path.splitext(rdffile)[0] + '.ttl'
    graph.serialize(ttlfile, format="turtle")
# ==================
if __name__ == "__main__":
    # main() takes exactly (sfile, rdffile, bookname); the extra leading
    # '图书' argument made this call raise TypeError and has been removed.
    main('我的书.docx', 'E:/我的书.rdf', '我的书名')
    print('处理完成!')