# -*- coding: utf-8 -*-
import os
import re
import operator
from neo4j import neo4j  # project-local helper module (assumption), not the official neo4j driver
import numpy as np
from pyltp import SentenceSplitter
from pyltp import Segmentor
# from pyltp import Postagger
# from pyltp import NamedEntityRecognizer
# from rediscluster import StrictRedisCluster
# from pyltp import CustomizedSegmentor
LTP_DATA_DIR = r'E:\model\ltp_data'  # path to the LTP model directory
OUTSIDE_DIC_DIR = r'E:\model\lexic.txt'  # path to the external lexicon
OUTSIDE_ENTITY_NW_DIR = r'E:\model\ltp_data\entity_nw.txt'  # path to the external entity list
BAYES_VOCAB_DIR = 'bayes_vocab_list.txt'
BAYES_DIC_INDEX_DIR = 'bayes_dic_index.txt'
BAYES_DIC_VEC_DIR = 'bayes_dic_vec.txt'
# Model loading
segmentor = Segmentor()  # segmentation model
segmentor.load_with_lexicon(os.path.join(LTP_DATA_DIR, 'cws.model'), OUTSIDE_DIC_DIR)
# Customized segmentation (optional)
# customized_segmentor = CustomizedSegmentor()  # initialise the instance
# customized_segmentor.load(os.path.join(LTP_DATA_DIR, 'cws.model'), os.path.join(LTP_DATA_DIR, 'new.model'))
# postagger = Postagger()  # part-of-speech model
# postagger.load(os.path.join(LTP_DATA_DIR, 'pos.model'))
# recognizer = NamedEntityRecognizer()  # named-entity model
# recognizer.load(os.path.join(LTP_DATA_DIR, 'ner.model'))
# def redis_cluster():
#     redis_nodes = [{'host': '99.12.90.102', 'port': 17000},
#                    {'host': '99.12.90.102', 'port': 17001},
#                    {'host': '99.12.90.102', 'port': 17002},
#                    {'host': '99.12.90.6', 'port': 17003},
#                    {'host': '99.12.90.6', 'port': 17004},
#                    {'host': '99.12.90.6', 'port': 17005}]
#     try:
#         redis_conn = StrictRedisCluster(startup_nodes=redis_nodes)
#         return redis_conn
#     except Exception as e:
#         print(e)
#
#
# r = redis_cluster()
# Sentence splitting
# def sentence_splitter(sentences):
#     sentence_list = SentenceSplitter.split(sentences)
#     return sentence_list
# Word segmentation
def segment(sentence):
words = segmentor.segment(sentence)
# words = customized_segmentor.segment(sentence)
words_list = list(words)
return words_list
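# A minimal usage sketch. The exact token boundaries depend on the loaded
# cws.model and the external lexicon, so the output shown is illustrative only:
#   segment('五险一金是什么?')  # -> ['五险一金', '是', '什么', '?']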
# Part-of-speech tagging
# def pos_tag(words):
#     pos_tags = postagger.postag(words)  # POS tagging
#     return pos_tags
# Load the external entity list and the Naive Bayes training artifacts
def load_outside_entity():
    list_nw = []
    bayes_vocab_list = []
    bayes_dic_index = {}
    bayes_dic_vec = {}
    try:
        # One entity name per line
        with open(OUTSIDE_ENTITY_NW_DIR, 'r', encoding='UTF-8') as nw_in_file:
            for line in nw_in_file:
                list_nw.append(line.rstrip('\n'))
        # One vocabulary word per line
        with open(BAYES_VOCAB_DIR, 'r', encoding='UTF-8') as bayes_list_in_file:
            for line in bayes_list_in_file:
                bayes_vocab_list.append(line.rstrip('\n'))
        # Each line maps a class key to its answer template: "key,template"
        with open(BAYES_DIC_INDEX_DIR, 'r', encoding='UTF-8') as bayes_dic_index_in_file:
            for line in bayes_dic_index_in_file:
                key, value = line.rstrip('\n').split(',', 1)
                bayes_dic_index.setdefault(key, value)
        # Each line maps a class key to its probability vector: "key:[v1, v2, ...]"
        with open(BAYES_DIC_VEC_DIR, 'r', encoding='UTF-8') as bayes_dic_vec_in_file:
            for line in bayes_dic_vec_in_file:
                key, vec_text = line.rstrip('\n').split(':', 1)
                vec_text = vec_text.replace('[', '').replace(']', '')
                bayes_dic_vec.setdefault(key, [float(val) for val in vec_text.split(',')])
    except Exception as e:
        print(e)
    return list_nw, bayes_vocab_list, bayes_dic_index, bayes_dic_vec
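# Inferred on-disk formats, one record per line (the sample values below are
# illustrative assumptions, not taken from the real model files):
#   entity_nw.txt         an entity name, e.g.  但丁密码
#   bayes_vocab_list.txt  a vocabulary word, e.g.  导演
#   bayes_dic_index.txt   "key,template", e.g.  0,entity 导演 entity
#   bayes_dic_vec.txt     "key:[v1, v2, ...]", e.g.  0:[-3.2, -4.1, -0.9]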
# Question abstraction: replace known entity mentions with the placeholder 'nw'
def question_extraction(sentence, list_nw):
    try:
        dic_nw = {}
        nw_index = 0
        for index in range(len(list_nw)):
            if sentence.find(list_nw[index]) > -1:
                sentence = sentence.replace(list_nw[index], 'nw')
                # Remember which entity the i-th placeholder stands for
                dic_nw.setdefault('nw' + str(nw_index), list_nw[index])
                nw_index += 1
        words = segment(sentence)
        print('Segmentation result:', words)
    except Exception as e:
        print(e)
        return
    return words, dic_nw
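# A worked sketch (token boundaries depend on the loaded model, so the split
# shown is illustrative): with '但丁密码' present in list_nw,
#   question_extraction('电影但丁密码的导演是谁?', list_nw)
# segments the abstracted sentence '电影nw的导演是谁?' and returns roughly
#   (['电影', 'nw', '的', '导演', '是', '谁', '?'], {'nw0': '但丁密码'})
# Note the placeholders map back correctly only when the list_nw order matches
# the entities' order of appearance in the question.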
# Convert a token list into a term-count vector over the Bayes vocabulary
def set_words2vec(bayes_vocab_list, input_set):
    return_vec = [0] * len(bayes_vocab_list)
    for word in input_set:
        if word in bayes_vocab_list:
            return_vec[bayes_vocab_list.index(word)] += 1
    return return_vec
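# Example with a hypothetical four-word vocabulary:
#   bayes_vocab_list = ['考勤', '管理员', '是', '谁']
#   set_words2vec(bayes_vocab_list, ['考勤', '管理员', '是', '谁'])  # -> [1, 1, 1, 1]
# Tokens absent from the vocabulary are silently ignored.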
# Naive Bayes classifier: score each class by the dot product of the question
# vector with that class's probability vector, then take the argmax
def classify_nb(vec2classify, bayes_dic_index, bayes_dic_vec):
    try:
        dic_var = {}
        for key in bayes_dic_vec:
            dic_var.setdefault(key, sum(vec2classify * bayes_dic_vec.get(key)))
        # max over (score, key) pairs; the highest score wins
        max_zip_dic = max(zip(dic_var.values(), dic_var.keys()))
    except Exception as e:
        print(e)
        return
    return bayes_dic_index.get(max_zip_dic[1])
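# A numeric sketch with made-up values: for vec2classify = np.array([1, 0, 1])
# and bayes_dic_vec = {'0': [-1.2, -3.0, -0.7], '1': [-2.5, -0.4, -2.2]},
# the scores are {'0': -1.9, '1': -4.7}; class '0' wins, and its answer
# template is then looked up in bayes_dic_index.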
"""
'在中国上映的但丁密码的票房怎么样?'
'电影但丁密码的导演是谁?'
'李根在但丁密码中饰演的角色是谁?'
'忘记签到怎么办?'
'五险一金是什么?'
'养老保险的缴纳比例是多少?'
'看护假如何申请?'
短时外出假如何申请?
应届本科毕业生如何落户?
员工落户条件是什么?
人力资源室办公地点在哪里?
公积金个人缴纳比例是多少
渠道团队的经理是谁?
养老保险属于什么?
考勤管理员是谁
医疗保险享受待遇是什么
"""
if __name__ == "__main__":
try:
list_nw, bayes_vocab_list, bayes_dic_index, bayes_dic_vec = load_outside_entity()
sentence = '考勤管理员是谁?'
print('原句:', sentence)
word_list, dic_nw = question_extraction(sentence, list_nw)
print('问题抽象结果:', word_list)
this_doc = array(set_words2vec(bayes_vocab_list, word_list))
classify_result = classify_nb(this_doc, bayes_dic_index, bayes_dic_vec)
lst_tmp = classify_result.split(' ')
print('问题分类结果:', classify_result)
word_index = 0
nw_index = 0
for word in lst_tmp:
match_result = re.match('nw', word)
if match_result is not None and operator.eq(match_result.group(), 'nw'):
lst_tmp[word_index] = dic_nw['nw' + str(nw_index)]
nw_index += 1
word_index += 1
print('问题抽象解析:', lst_tmp)
if lst_tmp[0] == 'entity': # 反向关系 实体-关系-实体 fe1
print(neo4j.get_cypher('fe1').format('{', lst_tmp[2], '}', lst_tmp[1]))
result = neo4j.selectByCQL(neo4j.get_cypher('fe1').format('{', '\'' + lst_tmp[2] + '\'', '}', lst_tmp[1]))
print(list(result))
elif lst_tmp[2] == 'entity': # 正向关系 实体-关系-实体 ze1
print(neo4j.get_cypher('ze1').format('{', lst_tmp[0], '}', lst_tmp[1]))
result = neo4j.selectByCQL(neo4j.get_cypher('ze1').format('{', '\'' + lst_tmp[0] + '\'', '}', lst_tmp[1]))
print(list(result))
elif lst_tmp[2] == 'att': # 正向关系 实体-属性-值 za1
print(neo4j.get_cypher('za1').format(lst_tmp[0], lst_tmp[1]))
result = neo4j.selectByCQL(neo4j.get_cypher('za1').format('\'' + lst_tmp[0] + '\'', lst_tmp[1]))
print(list(result))
except Exception as e:
print(e)