知识工程课程实践

1. 整体任务说明

主题:基于天津大学基本信息构建知识库,并实现基于模板匹配的知识库问答系统
目的:实践内容包括知识获取、知识表示、知识存储、知识管理、知识问答以及推理。通过实现一个知识库问答系统,将本课程所学的理论知识以及实践操作结合,并融会贯通到实际应用中。
所需软件以及编程环境:python3、Apache Jena Fuseki

2. 数据准备

本文选用的数据包括智算学部所有硕士生导师的信息、所有专业的信息、天津大学所有院系的信息、天津大学所有职能部门的信息,本文主要以智算学部所有导师的信息为例。
从数据所在的网址中获取请求URL和请求方式,在Chrome浏览器中打开网页,在网页中右击,选择“检查”,在Network中的Doc下面可以找到,如图所示:
在这里插入图片描述
然后在pycharm中来编写抓取数据的代码。
第一步:获取要爬取的网页

# Address of the page that lists the supervisors to crawl.
url = 'http://cic.tju.edu.cn/jyjx/yjsjy/yjsdsml.htm'
# A timeout keeps the script from blocking forever if the server stalls.
strhtml = requests.get(url, timeout=10)
# Decode the body as UTF-8 instead of relying on the guessed encoding.
strhtml.encoding = 'utf-8'

第二步:从Elements中找到要爬取的内容所在的位置
在这里插入图片描述
找到网页中对应的块,然后提取出块中的信息。
以智算学部所有导师的信息为例,需要通过教师名字上的超链接进入教师的个人网页,然后爬取数据。在这里我先爬取每位教师个人网页的链接,并保存在list中,再逐个读取这些链接并进入对应的网页,获取每位教师的信息。

def get_teacher_id(url):
    """Collect the distinct teacher-page links found on the listing page.

    :param url: address of the page that lists the teachers.
    :return: list of ``href`` values in first-seen order without duplicates.
        Values are returned exactly as written in the page, so they may be
        relative links.
    """
    # Without a timeout, requests waits indefinitely on a stalled server.
    response = requests.get(url, timeout=10)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    anchors = soup.select('#vsb_content > table > tbody > tr > td > a')
    # An anchor without an href yields None, which cannot be fetched later.
    hrefs = [a.get('href') for a in anchors if a.get('href') is not None]
    # dict.fromkeys drops duplicates in one pass and keeps the original order.
    return list(dict.fromkeys(hrefs))

第三步:因为并不是所有教师主页中都包含相同格式的内容,而且在后续将数据转为RDF时,需要按照标签来进行转换。因此我们设定每位老师需要爬取的信息,提取具有相同内容的教师的信息,若某位教师主页内容与我们设定的不同,我们则跳过这位老师。最后将所有爬取到的教师的信息以json格式保存到json文档中。

# Crawl every teacher's personal page and save the parsed records as JSON.
for page_url in teacher_url:
    print(page_url)
    response = requests.get(page_url)
    response.encoding = 'utf-8'

    # Parse each page once; calling getinfo twice doubled the work and the
    # debug output for every teacher that was kept.
    record = getinfo(page_url, response.text)
    if record == 0:
        continue  # page layout differs from the expected one, skip this teacher
    final.append(record)

with open(os.path.join('E:/kg', 'teacher.json'), 'w', encoding='utf-8') as opt_file:
    # ensure_ascii=False keeps the Chinese text readable in the file.
    json.dump(final, opt_file, ensure_ascii=False)

部分结果如图所示:
在这里插入图片描述
完整代码如下:

# -*- coding: utf-8 -*-
import re
import requests
from bs4 import BeautifulSoup
import json
import os

def get_teacher_id(url):
    """Collect the distinct teacher-page links found on the listing page.

    :param url: address of the page that lists the teachers.
    :return: list of ``href`` values in first-seen order without duplicates.
        Values are returned exactly as written in the page, so they may be
        relative links.
    """
    # Without a timeout, requests waits indefinitely on a stalled server.
    response = requests.get(url, timeout=10)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    anchors = soup.select('#vsb_content > table > tbody > tr > td > a')
    # An anchor without an href yields None, which cannot be fetched later.
    hrefs = [a.get('href') for a in anchors if a.get('href') is not None]
    # dict.fromkeys drops duplicates in one pass and keeps the original order.
    return list(dict.fromkeys(hrefs))

def getinfo(teacher_id, teacher_info_html):
    """Parse one teacher page into a record dictionary.

    Only the second ``<p>`` inside ``.v_news_content`` is read. All whitespace
    is removed from its text, and the text is then cut at a fixed sequence of
    field labels.

    :param teacher_id: value stored under the ``'id'`` key of the record.
    :param teacher_info_html: HTML source of the teacher page.
    :return: dict with the keys id, name, title, faculty, course, type, email,
        field, direction and homepage (``course`` and ``type`` are lists), or
        ``0`` when the paragraph is missing/empty or any label is absent.
    """
    # Record keys paired with the labels that introduce them, in page order.
    layout = (
        ('name', "姓名:"),
        ('title', "职称:"),
        ('faculty', "所在系别:"),
        ('course', "主讲课程:"),
        ('type', "导师类型:"),
        ('email', "电子邮件:"),
        ('field', "研究领域:"),
        ('direction', "研究方向:"),
        ('homepage', "个人主页:"),
    )
    list_valued = ('course', 'type')  # fields holding several items

    soup = BeautifulSoup(teacher_info_html, 'html.parser')
    info = []
    for paragraph in soup.select('.v_news_content p')[1:2]:
        print(paragraph)
        info.append(paragraph.text.strip())
    print(len(info))
    print(info)
    if not info or info == ['']:
        print("no infomation")
        return 0

    all_info = "".join(info[0].split())
    print(all_info)

    record = {'id': teacher_id}
    remainder = all_info
    final_index = len(layout) - 1
    for index, (key, label) in enumerate(layout):
        pieces = remainder.split(label)
        if len(pieces) == 1:
            # The label is missing, so the page does not follow the layout.
            print("no infomation")
            return 0
        remainder = pieces[1]
        if index == final_index:
            value = remainder
        else:
            # The value runs up to the label of the following field.
            value = remainder.split(layout[index + 1][1])[0]
        if key in list_valued:
            value = re.split('[、,/。]', value)
            print(value)
            print(len(value))
        else:
            print(value)
        record[key] = value

    print(record['course'], record['type'], record)
    return record

if __name__ == '__main__':
    # Imported here so this block is self-contained; urljoin turns the
    # relative links of the listing page into fetchable URLs.
    from urllib.parse import urljoin

    try:
        final = []
        url = 'http://cic.tju.edu.cn/jyjx/yjsjy/yjsdsml.htm'
        teacher_links = get_teacher_id(url)
        # This link is excluded from the crawl; guard the removal so a change
        # of the listing page does not abort the run with ValueError.
        excluded = '../../info/1067/1147.htm'
        if excluded in teacher_links:
            teacher_links.remove(excluded)
        print(teacher_links)
        print(len(teacher_links))
        for link in teacher_links:
            print(link)
            # requests rejects URLs without a scheme, so resolve the link
            # against the listing page (absolute links pass through unchanged).
            response = requests.get(urljoin(url, link))
            response.encoding = 'utf-8'

            # Parse once and reuse the result; 0 means the layout was unexpected.
            record = getinfo(link, response.text)
            if record == 0:
                continue
            final.append(record)

        # 'E:kg' (without the slash) is relative to the current directory of
        # drive E; 'E:/kg' is the absolute folder.
        with open(os.path.join('E:/kg', 'teacher.json'), 'w', encoding='utf-8') as opt_file:
            json.dump(final, opt_file, ensure_ascii=False)
    except requests.exceptions.ConnectionError:
        print('Handle Exception')

然后将数据转为RDF数据,供后面进行SPARQL语句查询。

第一步:定义三元组的格式,如代码所示,其中‘%05d’会被教师的顺序所代替,“%s”会被原始数据代替

def _teacher_triple(predicate):
    """Return the N-Triples template for one ``teacher_<predicate>`` property.

    The result keeps two placeholders: ``%05d`` for the teacher number and
    ``%s`` for the literal value.
    """
    prefix = "http://kg.course/homework/teacher_"
    return "<" + prefix + "%05d> <" + prefix + predicate + "> \"%s\" ."


teacher_id = _teacher_triple("id")
name = _teacher_triple("name")
title = _teacher_triple("title")
faculty = _teacher_triple("faculty")
course = _teacher_triple("course")
type = _teacher_triple("type")
email = _teacher_triple("email")
field = _teacher_triple("field")
direction = _teacher_triple("direction")
homepage = _teacher_triple("homepage")

第二步:逐行读取数据,然后按照定义好的三元组,将数据存为实体关系三元组。以教师——主讲课程和教师——类型三元组为例,每名教师可能教授几门课程,每门课程都要和对应的教师建立实体关系三元组,每名教师可能有多种类型,每种类型都要和对应的教师建立实体关系三元组

 # A teacher may have several courses and several supervisor types; each
 # item becomes its own triple (all courses first, then all types).
 for template, key in ((course, 'course'), (type, 'type')):
     for value in load_dict[i][key]:
         line = template % (i+1, value)
         print(line)
         triples.append(line)
         triples_sum += 1

第三步:在后续操作中,需要对问题文本进行分词、词性标注,为了避免教师姓名分词出现错误,要提前制作教师姓名词性字典,代码如下所示


# Output file for the teacher-name dictionary (opened for writing as UTF-8).
file_3 = open('E:/kg/triple/teachers_name.txt', 'w', encoding='UTF-8')

完整代码如下:

#!/usr/bin/env python
#encoding=utf-8

import random
import sys
import os
import json

def _teacher_triple(predicate):
    """Return the N-Triples template for one ``teacher_<predicate>`` property.

    The result keeps two placeholders: ``%05d`` for the teacher number and
    ``%s`` for the literal value.
    """
    prefix = "http://kg.course/homework/teacher_"
    return "<" + prefix + "%05d> <" + prefix + predicate + "> \"%s\" ."


teacher_id = _teacher_triple("id")
name = _teacher_triple("name")
title = _teacher_triple("title")
faculty = _teacher_triple("faculty")
course = _teacher_triple("course")
type = _teacher_triple("type")
email = _teacher_triple("email")
field = _teacher_triple("field")
direction = _teacher_triple("direction")
homepage = _teacher_triple("homepage")

def _nt_escape(value):
    """Escape a value so it forms a legal N-Triples string literal."""
    # Backslash first, otherwise the escapes added below would be doubled.
    return (str(value)
            .replace('\\', '\\\\')
            .replace('"', '\\"')
            .replace('\n', '\\n')
            .replace('\r', '\\r'))


# Load the crawled teacher records (a JSON list of dictionaries).
with open(os.path.join('E:/kg/triple', 'teacher.json'), 'r', encoding='utf-8') as load_f:
    load_dict = json.load(load_f)
print(load_dict)
print(len(load_dict))

triples_sum = 0
triples = []

# The name dictionary is written alongside the triples; the context manager
# closes it, which the original open() without close() never did.
with open('E:/kg/triple/teachers_name.txt', 'w', encoding='UTF-8') as file_3:
    for i, teacher in enumerate(load_dict):
        print(i)
        number = i + 1  # teachers are numbered from 1 in the subject URI

        triples.append(teacher_id % (number, _nt_escape(teacher['id'])))
        triples.append(name % (number, _nt_escape(teacher['name'])))
        # One "<name> nr" line per teacher for the segmentation dictionary.
        file_3.write(teacher['name'] + ' ' + 'nr' + '\n')
        triples.append(title % (number, _nt_escape(teacher['title'])))
        triples.append(faculty % (number, _nt_escape(teacher['faculty'])))

        print(teacher['course'], len(teacher['course']))
        # Multi-valued fields: one triple per course and per supervisor type.
        for course_1 in teacher['course']:
            course_str = course % (number, _nt_escape(course_1))
            print(course_str)
            triples.append(course_str)

        for type_1 in teacher['type']:
            type_str = type % (number, _nt_escape(type_1))
            print(type_str)
            triples.append(type_str)

        triples.append(email % (number, _nt_escape(teacher['email'])))
        triples.append(field % (number, _nt_escape(teacher['field'])))
        triples.append(direction % (number, _nt_escape(teacher['direction'])))
        triples.append(homepage % (number, _nt_escape(teacher['homepage'])))

triples_sum = len(triples)

# The original path began with a stray single quote ("'E:/kg/..."), which is
# not a usable path on drive E.
filename = "E:/kg/triple/teacher_%d_triples.nt" % triples_sum
with open(filename, "w", encoding='utf-8') as fd:
    fd.write("\n".join(triples))

部分结果如下:
在这里插入图片描述

3.知识库导入Apache Jena Fuseki

首先从官网下载Jena Fuseki,解压到指定位置安装。
在这里插入图片描述
然后启动cmd,进入Jena Fuseki所在的位置。然后启动Jena Fuseki,并创建数据库名称,命令如下图所示:
在这里插入图片描述
然后从浏览器中输入localhost:3030,进入Jena Fuseki。从dataset中选择刚刚建立的数据库testds,并将创建的RDF数据上传到数据库中。上传成功后,如下图所示
在这里插入图片描述
接下来就是设计问答系统

4.设计问答系统

第一步:要通过SPARQLWrapper包来连接数据库

# SPARQLWrapper client pointed at the "testds" dataset of the local Fuseki server.
sparql_base = SPARQLWrapper("http://localhost:3030/testds")

第二步:设计SPARQL查询语句的模板

# SPARQL templates. Doubled braces survive str.format() as literal braces.
SPARQL_PREAMBLE = u"\nPREFIX school: <http://kg.course/homework/>\n"

# SELECT DISTINCT query.
SPARQL_TEM = (
    u"{preamble}\n"
    u"SELECT DISTINCT {select} WHERE {{\n"
    u"{expression}\n"
    u"}}\n"
)

# COUNT query; {count} names the variable that receives the total.
SPARQL_TEM_count = (
    u"{preamble}\n"
    u"SELECT (COUNT({select}) AS {count}) WHERE {{\n"
    u"{expression}\n"
    u"}}\n"
)

# ASK (yes/no) query.
SPARQL_ASK_TEM = (
    u"{preamble}\n"
    u"ASK WHERE{{\n"
    u"{expression}\n"
    u"}}\n"
)

第三步:设计正则匹配
首先要对问句列表中的问句进行分词处理。为了避免教师姓名、学院、专业、职能部门名称的分词出现错误,需要先导入外部字典,代码如下所示:

# Load the external user dictionary into jieba before any question is segmented.
jieba.load_userdict("all_name.txt")

分词代码如下所示:

# Sample questions answered by the system.
default_questions = [
    "天津大学有哪些学院?",
    "化工学院的简介是什么?",
    "化工学院电话?",
    "化工学院的网址是什么?",
    "天津大学有哪些职能部门?",
    "研究生院的介绍是什么?",
    "研究生院的电话是什么?",
    "研究生院的网址是什么?",
    "智算学部有哪些专业?",
    "教授类型有多少老师?",
    "老师类型有哪些?",
    "硕士生导师类型有哪些老师?",
    "硕士生导师类型有多少老师?",
    "动画专业的专业培养是什么?",
    "计算机专业的考研就业情况?",
    "王晓飞老师主讲了哪些课?",
    "王晓飞老师主讲了几门课?",
    "王晓飞老师的研究方向是什么?",
    "王晓飞老师是博士生导师吗?",
    "王晓飞老师的个人主页是什么?",
]
questions = list(default_questions)

# Segment and POS-tag every question; each (word, flag) pair is wrapped in a
# Word whose token is the UTF-8 encoded word.
seg_lists = [
    [Word(token.encode("utf-8"), tag) for token, tag in pseg.cut(sentence)]
    for sentence in questions
]

然后设置关键词,使正则匹配时可以根据关键词匹配到正确的问题

# Keyword predicates used by the matching rules.
def _any_token(*tokens):
    """Return a pattern matching a word equal to any one of ``tokens``."""
    pattern = W(tokens[0])
    for token in tokens[1:]:
        pattern = pattern | W(token)
    return pattern


# Supervisor types and people.
tutor_type_master = _any_token("硕士生导师", "硕导", "硕士导师", "硕士生")
tutor_type_PhD = _any_token("博士生导师", "博导", "博士导师", "博士生")
teacher = W(pos="nr") | W(pos="x")
whose = _any_token("谁", "哪些")
quantity = _any_token("多少", "几", "几门")

# Institutions and their attributes.
institution = _any_token("学院", "职能部门")
college = W(pos="nr")
attribute = _any_token("简介", "电话", "网址", "介绍")

# Titles, majors and their attributes.
teacher_title = _any_token("老师")
class_1 = _any_token('类型')
teacher_title_name = _any_token("教授")
college_1 = _any_token("智算学部")
major = _any_token('计算机专业', '动画专业', '软件工程专业')
development = _any_token('培养')
work = _any_token('考研', '就业')

# Teacher attributes.
direction = _any_token("方向", "研究方向")
page = _any_token("个人主页", "主页")

接下来编写正则匹配规则。以第一个Rule为例,condition表示当遇见关键词institution和whose时,就采用how_many_institution_question这个查询函数

# Matching rules: each Rule pairs a token pattern with a query builder.
def _spanning(head, tail):
    """Pattern: ``head``, then lazily any number of words, then ``tail``."""
    return head + Star(Any(), greedy=False) + tail


rules = [
    # Which institutions (colleges or offices) are there?
    Rule(condition=Star(Any(), greedy=False) + whose + institution,
         action=how_many_institution_question),
    # What is the phone number (or another attribute) of an institution?
    Rule(condition=_spanning(college, attribute),
         action=what_attribute_institution_question),
    # Which teachers have a given supervisor type?
    Rule(condition=_spanning(tutor_type_master, whose),
         action=who_is_master_tutor_question),
    # How many teachers have a given supervisor type?
    Rule(condition=_spanning(tutor_type_master, quantity),
         action=how_many_teachers_are_master_tutor_question),
    # Which teacher titles exist?
    Rule(condition=_spanning(teacher_title, class_1),
         action=teacher_title_question),
    # How many teachers are professors?
    Rule(condition=_spanning(teacher_title_name, quantity),
         action=how_many_professor_question),
    # Which majors does the college offer?
    Rule(condition=_spanning(college_1, whose),
         action=which_majors_in_cal_question),
    # What is the training programme of a major?
    Rule(condition=_spanning(major, development),
         action=what_development_question),
    # What are the postgraduate/employment prospects of a major?
    Rule(condition=_spanning(major, work),
         action=how_work_question),
    # Which courses does a teacher give?
    Rule(condition=_spanning(teacher, whose),
         action=what_courses_teacher_question),
    # How many courses does a teacher give?
    Rule(condition=_spanning(teacher, quantity),
         action=how_many_courses_teacher_question),
    # What is a teacher's research direction?
    Rule(condition=_spanning(teacher, direction),
         action=what_direction_teacher_question),
    # Is a teacher a PhD supervisor?
    Rule(condition=_spanning(teacher, tutor_type_PhD),
         action=teacher_is_PhD_tutor_question),
    # What is a teacher's homepage?
    Rule(condition=_spanning(teacher, page),
         action=what_homepage_teacher_question),
]

编写查询函数,部分查询函数如下所示:

# What is a teacher's research direction?
def what_direction_teacher_question(x):
    """Build the SELECT query for a teacher's research direction.

    :param x: matched Word objects (tokens are UTF-8 encoded bytes).
    :return: SPARQL string for the first word tagged "nr", or None if there
        is no such word.
    """
    for word in x:
        if word.pos != "nr":
            continue
        pattern = (u"?teacherid school:teacher_name \"{person}\". "
                   u"?teacherid school:teacher_direction ?x0.").format(
                       person=word.token.decode("utf-8"))
        return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, select=u"?x0",
                                 expression=INDENT + pattern)
    return None

# Is a teacher a PhD supervisor?
def teacher_is_PhD_tutor_question(x):
    """Build the ASK query testing whether a teacher has the type "博导".

    :param x: matched Word objects (tokens are UTF-8 encoded bytes).
    :return: SPARQL string for the first word tagged "nr", or None if there
        is no such word.
    """
    for word in x:
        if word.pos != "nr":
            continue
        pattern = (u"?teacherid school:teacher_name \"{person}\". "
                   u"?teacherid school:teacher_type \"博导\".").format(
                       person=word.token.decode("utf-8"))
        return SPARQL_ASK_TEM.format(preamble=SPARQL_PREAMBLE,
                                     expression=INDENT + pattern)
    return None

# What is a teacher's homepage?
def what_homepage_teacher_question(x):
    """Build the SELECT query for a teacher's homepage.

    :param x: matched Word objects (tokens are UTF-8 encoded bytes).
    :return: SPARQL string for the first word tagged "nr", or None if there
        is no such word.
    """
    for word in x:
        if word.pos != "nr":
            continue
        pattern = (u"?teacherid school:teacher_name \"{person}\". "
                   u"?teacherid school:teacher_homepage ?x0.").format(
                       person=word.token.decode("utf-8"))
        return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, select=u"?x0",
                                 expression=INDENT + pattern)
    return None

完整代码如下:

# coding: utf-8
# standard import
import re
from refo import finditer, Predicate, Star, Any
import jieba.posseg as pseg
from jieba import suggest_freq
import jieba
from SPARQLWrapper import SPARQLWrapper, JSON
import io

import importlib,sys
importlib.reload(sys)

# Load the external user dictionary into jieba before any question is segmented.
jieba.load_userdict("all_name.txt")
# SPARQLWrapper client pointed at the "testds" dataset of the local Fuseki server.
sparql_base = SPARQLWrapper("http://localhost:3030/testds")

# SPARQL config
# Query templates. Doubled braces survive str.format() as literal braces.
SPARQL_PREAMBLE = u"\nPREFIX school: <http://kg.course/homework/>\n"

# SELECT DISTINCT query.
SPARQL_TEM = (
    u"{preamble}\n"
    u"SELECT DISTINCT {select} WHERE {{\n"
    u"{expression}\n"
    u"}}\n"
)

# COUNT query; {count} names the variable that receives the total.
SPARQL_TEM_count = (
    u"{preamble}\n"
    u"SELECT (COUNT({select}) AS {count}) WHERE {{\n"
    u"{expression}\n"
    u"}}\n"
)

# ASK (yes/no) query.
SPARQL_ASK_TEM = (
    u"{preamble}\n"
    u"ASK WHERE{{\n"
    u"{expression}\n"
    u"}}\n"
)

# Indentation placed in front of the graph pattern inside WHERE.
INDENT = " " * 4

class Word(object):
    """A segmented word together with its part-of-speech tag."""

    def __init__(self, token, pos):
        # token: the word itself; pos: its part-of-speech flag.
        self.token, self.pos = token, pos


class W(Predicate):
    """Predicate over Word objects: regex on the token and on the POS tag."""

    def __init__(self, token=".*", pos=".*"):
        # "$" combined with re.match() requires the whole string to match.
        self.token = re.compile(token + "$")
        self.pos = re.compile(pos + "$")
        super().__init__(self.match)

    def match(self, word):
        """Return a truthy match object only if both token and tag match."""
        text = word.token.decode('utf-8')  # tokens are stored as UTF-8 bytes
        token_hit = self.token.match(text)
        pos_hit = self.pos.match(word.pos)
        return token_hit and pos_hit


class Rule(object):
    """Pairs a word pattern with the function that builds the query."""

    def __init__(self, condition=None, action=None):
        assert condition and action
        self.condition, self.action = condition, action

    def apply(self, sentence):
        """Run ``action`` on the words of ``sentence`` matched by ``condition``.

        Every match returned by finditer contributes the words inside its
        span, in order; ``action`` is called even when nothing matched.
        """
        matched = [
            word
            for hit in finditer(self.condition, sentence)
            for word in sentence[slice(*hit.span())]
        ]
        if __name__ == '__main__':
            print("----------applying %s----------" % self.action.__name__)
        return self.action(matched)

# Which institutions are there?
def how_many_institution_question(x):
    """Build the SELECT query listing all colleges or all offices.

    :param x: matched Word objects (tokens are UTF-8 encoded bytes).
    :return: SPARQL string for the first institution keyword found, or None.
    """
    kinds = {"学院": "college", "职能部门": "office"}
    for word in x:
        kind = kinds.get(word.token.decode("utf-8"))
        if kind is None:
            continue
        pattern = "?id school:{institution}_name ?x0".format(institution=kind)
        return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, select="?x0",
                                 expression=INDENT + pattern)
    return None

# What is a given attribute of an institution?
def what_attribute_institution_question(x):
    """Build the SELECT query for one attribute of a college or office.

    The words are scanned in order; the query is produced as soon as both an
    institution name and an attribute keyword have been seen.

    :param x: matched Word objects (tokens are UTF-8 encoded bytes).
    :return: SPARQL string, or None when either part is missing.
    """
    colleges = {
        "机械工程学院", "精密仪器与光电子工程学院", "电气自动化与信息工程学院",
        "微电子学院", "建筑工程学院", "建筑学院", "化工学院",
        "环境科学与工程学院", "管理与经济学部", "马克思主义学院", "理学院",
        "生命科学学院", "医学部", "教育学院", "国际教育学院",
    }
    offices = {
        "纪委、监察室", "研究生院", "党委离退休工作处", "发展战略研究中心",
        "党委教师工作部、人事处", "科学技术发展研究院(医科建设办公室)",
        "国际合作与交流处、港澳台事务办公室", "审计处", "保卫处",
    }
    attributes = {
        "简介": "introduction",
        "介绍": "introduction",
        "电话": "phone",
        "网址": "network",
    }

    name = kind = attribute = None
    for word in x:
        text = word.token.decode("utf-8")
        if text in colleges:
            name, kind = text, "college"
        if text in offices:
            name, kind = text, "office"
        attribute = attributes.get(text, attribute)
        if name is not None and attribute is not None:
            pattern = "?id school:{institution}_name \"{name}\".?id school:{institution}_{attribute} ?x0.".format(
                institution=kind, attribute=attribute, name=name)
            return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, select="?x0",
                                     expression=INDENT + pattern)
    return None

# What is the training programme of a major?
def what_development_question(x):
    """Build the SELECT query for the ``project_cultivation`` of a major.

    :param x: matched Word objects (tokens are UTF-8 encoded bytes).
    :return: SPARQL string for the first known major found, or None.
    """
    majors = ("动画专业", "计算机专业", "软件工程专业")
    for word in x:
        major_name = word.token.decode("utf-8")
        if major_name not in majors:
            continue
        pattern = "?projectid school:project_name \"{type}\". ?projectid school:project_cultivation ?x0.".format(type=major_name)
        return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, select="?x0",
                                 expression=INDENT + pattern)
    return None

# What are the postgraduate/employment prospects of a major?
def how_work_question(x):
    """Build the SELECT query for the ``project_work`` of a major.

    :param x: matched Word objects (tokens are UTF-8 encoded bytes).
    :return: SPARQL string for the first known major found, or None.
    """
    majors = ("计算机专业", "动画专业", "软件工程专业")
    for word in x:
        major_name = word.token.decode("utf-8")
        if major_name not in majors:
            continue
        pattern = "?projectid school:project_name \"{type}\". ?projectid school:project_work ?x0.".format(type=major_name)
        return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, select="?x0",
                                 expression=INDENT + pattern)
    return None

# Which majors does the college offer?
def which_majors_in_cal_question(x):
    """Build the SELECT query listing every ``project_name``.

    :param x: matched Word objects (tokens are UTF-8 encoded bytes).
    :return: SPARQL string if "智算学部" or "哪些" occurs in ``x``, else None.
    """
    triggers = ("智算学部", "哪些")
    if not any(word.token.decode("utf-8") in triggers for word in x):
        return None
    pattern = "?projectid school:project_name ?x0"
    return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, select="?x0",
                             expression=INDENT + pattern)

# Which teacher titles exist?
def teacher_title_question(x):
    """Build the SELECT query listing every ``teacher_title`` value.

    :param x: matched Word objects (tokens are UTF-8 encoded bytes).
    :return: SPARQL string if "老师" or "类型" occurs in ``x``, else None.
    """
    triggers = ("老师", "类型")
    if not any(word.token.decode("utf-8") in triggers for word in x):
        return None
    pattern = "?teacherid school:teacher_title ?x0."
    return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, select="?x0",
                             expression=INDENT + pattern)

# How many teachers hold the title "教授"?
def how_many_professor_question(x):
    """Build the COUNT query over teachers whose title is "教授".

    :param x: matched Word objects (tokens are UTF-8 encoded bytes).
    :return: SPARQL string if "教授" or "多少" occurs in ``x``, else None.
    """
    triggers = ("教授", "多少")
    if not any(word.token.decode("utf-8") in triggers for word in x):
        return None
    pattern = "?teacherid school:teacher_title \"教授\". ?teacherid school:teacher_name ?teacher."
    return SPARQL_TEM_count.format(preamble=SPARQL_PREAMBLE, select="?teacher",
                                   count="?x0", expression=INDENT + pattern)

# Which teachers are master's supervisors?
def who_is_master_tutor_question(x):
    """Build the SELECT query listing the names of master's supervisors.

    The original inserted whichever word triggered the branch into the type
    literal, so reaching "哪些" first produced the meaningless type
    "哪些导师". Every supervisor keyword now maps to "硕士生导师", the value
    the original generated for the token "硕士生".

    :param x: matched Word objects (tokens are UTF-8 encoded bytes).
    :return: SPARQL string if a master's-supervisor keyword occurs, else None.
    """
    # NOTE(review): assumes the data stores the type as "硕士生导师" - confirm.
    aliases = ("硕士生导师", "硕导", "硕士导师", "硕士生")
    for word in x:
        if word.token.decode("utf-8") in aliases:
            pattern = "?x school:teacher_type \"硕士生导师\". ?x school:teacher_name ?x0."
            return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, select="?x0",
                                     expression=INDENT + pattern)
    return None

# How many teachers are master's supervisors?
def how_many_teachers_are_master_tutor_question(x):
    """Build the COUNT query over master's supervisors.

    The original inserted whichever word triggered the branch into the type
    literal, so reaching "多少" first produced the meaningless type
    "多少导师". Every supervisor keyword now maps to "硕士生导师", the value
    the original generated for the token "硕士生".

    :param x: matched Word objects (tokens are UTF-8 encoded bytes).
    :return: SPARQL string if a master's-supervisor keyword occurs, else None.
    """
    # NOTE(review): assumes the data stores the type as "硕士生导师" - confirm.
    aliases = ("硕士生导师", "硕导", "硕士导师", "硕士生")
    for word in x:
        if word.token.decode("utf-8") in aliases:
            pattern = "?teachers school:teacher_type \"硕士生导师\"."
            return SPARQL_TEM_count.format(preamble=SPARQL_PREAMBLE,
                                           select="?teachers", count="?x0",
                                           expression=INDENT + pattern)
    return None

# Which courses does a teacher give?
def what_courses_teacher_question(x):
    """Build the SELECT query listing a teacher's courses.

    :param x: matched Word objects (tokens are UTF-8 encoded bytes).
    :return: SPARQL string for the first word tagged "nr", or None if there
        is no such word.
    """
    for word in x:
        if word.pos != "nr":
            continue
        pattern = (u"?teacherid school:teacher_name \"{person}\". "
                   u"?teacherid school:teacher_course ?x0.").format(
                       person=word.token.decode("utf-8"))
        return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, select=u"?x0",
                                 expression=INDENT + pattern)
    return None

# How many courses does a teacher give?
def how_many_courses_teacher_question(x):
    """Build the COUNT query over a teacher's courses.

    :param x: matched Word objects (tokens are UTF-8 encoded bytes).
    :return: SPARQL string for the first word tagged "nr", or None if there
        is no such word.
    """
    for word in x:
        if word.pos != "nr":
            continue
        pattern = (u"?teacherid school:teacher_name \"{person}\". "
                   u"?teacherid school:teacher_course ?courses.").format(
                       person=word.token.decode("utf-8"))
        return SPARQL_TEM_count.format(preamble=SPARQL_PREAMBLE,
                                       select=u"?courses", count=u"?x0",
                                       expression=INDENT + pattern)
    return None

# What is a teacher's research direction?
def what_direction_teacher_question(x):
    """Build the SELECT query for a teacher's research direction.

    :param x: matched Word objects (tokens are UTF-8 encoded bytes).
    :return: SPARQL string for the first word tagged "nr", or None if there
        is no such word.
    """
    for word in x:
        if word.pos != "nr":
            continue
        pattern = (u"?teacherid school:teacher_name \"{person}\". "
                   u"?teacherid school:teacher_direction ?x0.").format(
                       person=word.token.decode("utf-8"))
        return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, select=u"?x0",
                                 expression=INDENT + pattern)
    return None

# Is a teacher a PhD supervisor?
def teacher_is_PhD_tutor_question(x):
    """Build the ASK query testing whether a teacher has the type "博导".

    :param x: matched Word objects (tokens are UTF-8 encoded bytes).
    :return: SPARQL string for the first word tagged "nr", or None if there
        is no such word.
    """
    for word in x:
        if word.pos != "nr":
            continue
        pattern = (u"?teacherid school:teacher_name \"{person}\". "
                   u"?teacherid school:teacher_type \"博导\".").format(
                       person=word.token.decode("utf-8"))
        return SPARQL_ASK_TEM.format(preamble=SPARQL_PREAMBLE,
                                     expression=INDENT + pattern)
    return None

# What is a teacher's homepage?
def what_homepage_teacher_question(x):
    """Build the SELECT query for a teacher's homepage.

    :param x: matched Word objects (tokens are UTF-8 encoded bytes).
    :return: SPARQL string for the first word tagged "nr", or None if there
        is no such word.
    """
    for word in x:
        if word.pos != "nr":
            continue
        pattern = (u"?teacherid school:teacher_name \"{person}\". "
                   u"?teacherid school:teacher_homepage ?x0.").format(
                       person=word.token.decode("utf-8"))
        return SPARQL_TEM.format(preamble=SPARQL_PREAMBLE, select=u"?x0",
                                 expression=INDENT + pattern)
    return None

def encode(s):
    """Return the code points of ``s`` in binary, separated by spaces."""
    return ' '.join(format(ord(ch), 'b') for ch in s)

if __name__ == "__main__":
    # Sample questions answered by the system.
    default_questions = [
        "天津大学有哪些学院?",
        "化工学院的简介是什么?",
        "化工学院电话?",
        "化工学院的网址是什么?",
        "天津大学有哪些职能部门?",
        "研究生院的介绍是什么?",
        "研究生院的电话是什么?",
        "研究生院的网址是什么?",
        "智算学部有哪些专业?",
        "教授类型有多少老师?",
        "老师类型有哪些?",
        "硕士生导师类型有哪些老师?",
        "硕士生导师类型有多少老师?",
        "动画专业的专业培养是什么?",
        "计算机专业的考研就业情况?",
        "王晓飞老师主讲了哪些课?",
        "王晓飞老师主讲了几门课?",
        "王晓飞老师的研究方向是什么?",
        "王晓飞老师是博士生导师吗?",
        "王晓飞老师的个人主页是什么?"
    ]
    questions = list(default_questions)

    # Segment and POS-tag every question. Tokens are stored as UTF-8 bytes
    # because W.match and the query builders decode them.
    seg_lists = []
    for question in questions:
        words = pseg.cut(question)
        seg_lists.append([Word(word.encode("utf-8"), flag) for word, flag in words])

    # Keyword predicates used by the matching rules.
    tutor_type_master = (W("硕士生导师") | W("硕导") | W("硕士导师") | W("硕士生"))
    tutor_type_PhD = (W("博士生导师") | W("博导") | W("博士导师") | W("博士生"))
    teacher = (W(pos="nr") | W(pos="x"))
    whose = (W("谁") | W("哪些"))
    quantity = (W("多少") | W("几") | W("几门"))

    institution = (W("学院") | W("职能部门"))
    college = (W(pos="nr"))
    attribute = (W("简介") | W("电话") | W("网址") | W("介绍"))

    teacher_title = (W("老师"))
    class_1 = (W('类型'))
    teacher_title_name = (W("教授"))
    college_1 = (W("智算学部"))
    major = (W('计算机专业') | W('动画专业') | W('软件工程专业'))
    development = (W('培养'))
    work = (W('考研') | W('就业'))

    direction = (W("方向") | W("研究方向"))
    page = (W("个人主页") | W("主页"))

    # Matching rules: each Rule pairs a token pattern with a query builder.
    rules = [
        # Which institutions (colleges or offices) are there?
        Rule(condition=Star(Any(), greedy=False) + whose + institution, action=how_many_institution_question),
        # What is the phone number (or another attribute) of an institution?
        Rule(condition=college + Star(Any(), greedy=False) + attribute, action=what_attribute_institution_question),
        # Which teachers have a given supervisor type?
        Rule(condition=tutor_type_master + Star(Any(), greedy=False) + whose, action=who_is_master_tutor_question),
        # How many teachers have a given supervisor type?
        Rule(condition=tutor_type_master + Star(Any(), greedy=False) + quantity, action=how_many_teachers_are_master_tutor_question),
        # Which teacher titles exist?
        Rule(condition=teacher_title + Star(Any(), greedy=False) + class_1, action=teacher_title_question),
        # How many teachers are professors?
        Rule(condition=teacher_title_name + Star(Any(), greedy=False) + quantity, action=how_many_professor_question),
        # Which majors does the college offer?
        Rule(condition=college_1 + Star(Any(), greedy=False) + whose, action=which_majors_in_cal_question),
        # What is the training programme of a major?
        Rule(condition=major + Star(Any(), greedy=False) + development, action=what_development_question),
        # What are the postgraduate/employment prospects of a major?
        Rule(condition=major + Star(Any(), greedy=False) + work, action=how_work_question),
        # Which courses does a teacher give?
        Rule(condition=teacher + Star(Any(), greedy=False) + whose, action=what_courses_teacher_question),
        # How many courses does a teacher give?
        Rule(condition=teacher + Star(Any(), greedy=False) + quantity, action=how_many_courses_teacher_question),
        # What is a teacher's research direction?
        Rule(condition=teacher + Star(Any(), greedy=False) + direction, action=what_direction_teacher_question),
        # Is a teacher a PhD supervisor?
        Rule(condition=teacher + Star(Any(), greedy=False) + tutor_type_PhD, action=teacher_is_PhD_tutor_question),
        # What is a teacher's homepage?
        Rule(condition=teacher + Star(Any(), greedy=False) + page, action=what_homepage_teacher_question)
    ]

    # The context manager closes result.txt even when a query raises; the
    # original opened the file and never closed it.
    with open('result.txt', 'w', encoding='UTF-8') as file_3:
        # Matching and querying, one segmented question at a time.
        for seg in seg_lists:
            # Show the segmented question and copy it to the result file.
            tokens = [s.token.decode('utf-8') for s in seg]
            for token in tokens:
                print(token)
            file_3.write(''.join(tokens))
            print()

            for rule in rules:
                query = rule.apply(seg)
                if query is None:
                    continue  # this rule produced no query for the question
                print(query)
                file_3.write(query + '\n')

                sparql_base.setQuery(query)
                sparql_base.setReturnFormat(JSON)
                results = sparql_base.query().convert()

                if "results" in results:
                    # SELECT answer: one binding per solution.
                    bindings = results["results"]["bindings"]
                    if not bindings:
                        print("No answer found :(")
                        print('\n')
                        continue
                    for result in bindings:
                        value = result["x0"]["value"]
                        print("Result: ", value)
                        file_3.write("Result: " + value + '\n')
                        print('\n')
                    file_3.write('\n')
                else:
                    # ASK answer: a single boolean.
                    answer = results["boolean"]
                    print("Result: ", answer)
                    verdict = "True" if str(answer) == "True" else "False"
                    file_3.write(u"Result: " + verdict + '\n')

最终的输出结果部分如图所示:
在这里插入图片描述在这里插入图片描述在这里插入图片描述

5.结语

其余的数据处理过程类似,希望可以与大家互相交流~~

参考资料:
天津大学《知识工程》课程;
https://blog.csdn.net/u010744489/article/details/105923730。
在这里插入图片描述

评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值