
# -*- coding: utf-8 -*-
import os

from neo4j import neo4j  # project-local query helper, not the official driver API (see sketch below)
from numpy import array

from pyltp import SentenceSplitter
from pyltp import Segmentor
# from pyltp import Postagger
# from pyltp import NamedEntityRecognizer
# from rediscluster import StrictRedisCluster
# from pyltp import CustomizedSegmentor
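
# NOTE: `neo4j.get_cypher()` / `neo4j.selectByCQL()` are not part of the official
# neo4j Python driver, so `neo4j` above is assumed to be a project-local helper.
# A minimal sketch of the interface this script relies on -- the template texts
# are assumptions inferred from the .format() calls in __main__:
#
# CYPHER_TEMPLATES = {
#     # '{0}'/'{2}' receive literal '{'/'}' so str.format() can emit '{name: ...}'
#     'ze1': "MATCH (a {0}name: {1}{2})-[r:{3}]->(b) RETURN b.name",
#     'fe1': "MATCH (a)-[r:{3}]->(b {0}name: {1}{2}) RETURN a.name",
#     'za1': "MATCH (a {{name: {0}}}) RETURN a.{1}",
# }
#
# def get_cypher(key):
#     return CYPHER_TEMPLATES[key]
#
# def selectByCQL(cql):
#     with driver.session() as session:  # `driver` built via neo4j.GraphDatabase.driver()
#         return [record.values() for record in session.run(cql)]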


LTP_DATA_DIR = r'E:\model\ltp_data'  # path to the LTP model directory
OUTSIDE_DIC_DIR = r'E:\model\lexic.txt'  # path to the external lexicon
OUTSIDE_ENTITY_NW_DIR = r'E:\model\ltp_data\entity_nw.txt'  # path to the external entity list
BAYES_VOCAB_DIR = 'bayes_vocab_list.txt'
BAYES_DIC_INDEX_DIR = "bayes_dic_index.txt"
BAYES_DIC_VEC_DIR = "bayes_dic_vec.txt"
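
# Expected on-disk formats, as parsed by load_outside_entity() below:
#   entity_nw.txt        -- one entity name per line
#   bayes_vocab_list.txt -- one vocabulary token per line
#   bayes_dic_index.txt  -- one "<class_key>,<answer_template>" pair per line
#   bayes_dic_vec.txt    -- one "<class_key>:[w1, w2, ...]" weight vector per line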

# Load the models
segmentor = Segmentor()  # segmentation model
segmentor.load_with_lexicon(os.path.join(LTP_DATA_DIR, 'cws.model'), OUTSIDE_DIC_DIR)
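# pyltp's load_with_lexicon() reads the external lexicon as a plain-text file
# with one custom word per line.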

# Personalized segmentation
# customized_segmentor = CustomizedSegmentor()  # initialize an instance
# customized_segmentor.load(os.path.join(LTP_DATA_DIR, 'cws.model'), os.path.join(LTP_DATA_DIR, 'new.model'))

# postagger = Postagger()  # POS tagging model
# postagger.load(os.path.join(LTP_DATA_DIR, 'pos.model'))

# recognizer = NamedEntityRecognizer()  # named entity recognition model
# recognizer.load(os.path.join(LTP_DATA_DIR, 'ner.model'))


# def redis_cluster():
#     redis_nodes = [{'host': '99.12.90.102', 'port': 17000},
#                    {'host': '99.12.90.102', 'port': 17001},
#                    {'host': '99.12.90.102', 'port': 17002},
#                    {'host': '99.12.90.6', 'port': 17003},
#                    {'host': '99.12.90.6', 'port': 17004},
#                    {'host': '99.12.90.6', 'port': 17005}
#                   ]
#     try:
#         redis_conn = StrictRedisCluster(startup_nodes=redis_nodes)
#         return redis_conn
#     except Exception as e:
#         print(e)
#
#
# r = redis_cluster()


# Sentence splitting
# def sentence_splitter(sentences):
#     sentence_list = SentenceSplitter.split(sentences)
#     return sentence_list


# Word segmentation
def segment(sentence):
    words = segmentor.segment(sentence)
    # words = customized_segmentor.segment(sentence)
    words_list = list(words)
    return words_list
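
# Illustrative example (exact tokens depend on the loaded model and lexicon):
#   segment('电影但丁密码的导演是谁?') -> ['电影', '但丁密码', '的', '导演', '是', '谁', '?']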


# POS tagging
# def pos_tag(words):
#     pos_tags = postagger.postag(words)  # part-of-speech tagging
#     return pos_tags


# Load the external entity list and the trained Bayes model files.
# I/O errors propagate to the caller, which wraps this call in try/except.
def load_outside_entity():
    list_nw = []
    bayes_vocab_list = []
    bayes_dic_index = {}
    bayes_dic_vec = {}

    # One entity name per line
    with open(OUTSIDE_ENTITY_NW_DIR, 'r', encoding='UTF-8') as nw_in_file:
        for line in nw_in_file:
            list_nw.append(line.rstrip('\n'))

    # One vocabulary token per line
    with open(BAYES_VOCAB_DIR, 'r', encoding='UTF-8') as bayes_list_in_file:
        for line in bayes_list_in_file:
            bayes_vocab_list.append(line.rstrip('\n'))

    # Each line maps a class key to its answer template: "<key>,<template>"
    with open(BAYES_DIC_INDEX_DIR, 'r', encoding='UTF-8') as bayes_dic_index_in_file:
        for line in bayes_dic_index_in_file:
            key, value = line.rstrip('\n').split(',', 1)
            bayes_dic_index.setdefault(key, value)

    # Each line holds a class key and its weight vector: "<key>:[w1, w2, ...]"
    with open(BAYES_DIC_VEC_DIR, 'r', encoding='UTF-8') as bayes_dic_vec_in_file:
        for line in bayes_dic_vec_in_file:
            key, raw_vec = line.rstrip('\n').split(':', 1)
            vec = [float(val) for val in raw_vec.strip('[]').split(',')]
            bayes_dic_vec.setdefault(key, vec)

    return list_nw, bayes_vocab_list, bayes_dic_index, bayes_dic_vec


# Question abstraction: replace known entity mentions with the placeholder 'nw'
def question_extraction(sentence, list_nw):
    try:
        dic_nw = {}
        nw_index = 0
        for index in range(len(list_nw)):
            if sentence.find(list_nw[index]) > -1:
                sentence = sentence.replace(list_nw[index], 'nw')
                # Remember which entity each placeholder stands for
                dic_nw.setdefault('nw' + str(nw_index), list_nw[index])
                nw_index += 1
        words = segment(sentence)
        print('Segmentation result:', words)
    except Exception as e:
        print(e)
        return

    return words, dic_nw
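
# Illustrative example (assuming '但丁密码' is in list_nw):
#   question_extraction('电影但丁密码的导演是谁?', list_nw)
#   -> (['电影', 'nw', '的', '导演', '是', '谁', '?'], {'nw0': '但丁密码'})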


# Convert the document's tokens into a bag-of-words count vector
def set_words2vec(bayes_vocab_list, input_set):
    return_vec = [0]*len(bayes_vocab_list)
    for word in input_set:
        if word in bayes_vocab_list:
            return_vec[bayes_vocab_list.index(word)] += 1
    return return_vec
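
# Example: set_words2vec(['导演', '是', '谁'], ['是', '谁', '谁']) -> [0, 1, 2]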


# Naive Bayes classifier: score each class by the dot product of the question's
# count vector and that class's weight vector, then return the answer template
# of the best-scoring class
def classify_nb(vec2classify, bayes_dic_index, bayes_dic_vec):
    try:
        dic_var = {}
        for key in bayes_dic_vec:
            dic_var.setdefault(key, sum(vec2classify * bayes_dic_vec.get(key)))

        # (score, key) tuples compare by score first, so max() picks the argmax
        max_zip_dic = max(zip(dic_var.values(), dic_var.keys()))

    except Exception as e:
        print(e)
        return

    return bayes_dic_index.get(max_zip_dic[1])
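
# Illustrative example (hypothetical two-class weights):
#   vec2classify = array([1, 0, 2])
#   bayes_dic_vec = {'c1': [0.1, 0.5, 0.2], 'c2': [0.4, 0.1, 0.3]}
#   scores: c1 = 1*0.1 + 2*0.2 = 0.5, c2 = 1*0.4 + 2*0.3 = 1.0
#   -> returns bayes_dic_index.get('c2')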


"""
'在中国上映的但丁密码的票房怎么样?'
'电影但丁密码的导演是谁?'
'李根在但丁密码中饰演的角色是谁?'
'忘记签到怎么办?'
'五险一金是什么?'
'养老保险的缴纳比例是多少?'
'看护假如何申请?'
短时外出假如何申请?
应届本科毕业生如何落户?
员工落户条件是什么?
人力资源室办公地点在哪里?
公积金个人缴纳比例是多少
渠道团队的经理是谁?
养老保险属于什么?
考勤管理员是谁
医疗保险享受待遇是什么
"""
if __name__ == "__main__":
    try:
        list_nw, bayes_vocab_list, bayes_dic_index, bayes_dic_vec = load_outside_entity()
        sentence = '考勤管理员是谁?'
        print('Original question:', sentence)
        word_list, dic_nw = question_extraction(sentence, list_nw)
        print('Question abstraction result:', word_list)

        this_doc = array(set_words2vec(bayes_vocab_list, word_list))
        classify_result = classify_nb(this_doc, bayes_dic_index, bayes_dic_vec)
        lst_tmp = classify_result.split(' ')
        print('Question classification result:', classify_result)

        # Put the original entities back in place of the 'nw' placeholders
        word_index = 0
        nw_index = 0
        for word in lst_tmp:
            if word.startswith('nw'):
                lst_tmp[word_index] = dic_nw['nw' + str(nw_index)]
                nw_index += 1
            word_index += 1
        print('Question abstraction resolved:', lst_tmp)

        if lst_tmp[0] == 'entity':  # reverse relation: entity-[relation]->entity (template 'fe1')
            print(neo4j.get_cypher('fe1').format('{', lst_tmp[2], '}', lst_tmp[1]))
            result = neo4j.selectByCQL(neo4j.get_cypher('fe1').format('{', '\'' + lst_tmp[2] + '\'', '}', lst_tmp[1]))
            print(list(result))
        elif lst_tmp[2] == 'entity':  # forward relation: entity-[relation]->entity (template 'ze1')
            print(neo4j.get_cypher('ze1').format('{', lst_tmp[0], '}', lst_tmp[1]))
            result = neo4j.selectByCQL(neo4j.get_cypher('ze1').format('{', '\'' + lst_tmp[0] + '\'', '}', lst_tmp[1]))
            print(list(result))
        elif lst_tmp[2] == 'att':  # forward attribute lookup: entity-attribute-value (template 'za1')
            print(neo4j.get_cypher('za1').format(lst_tmp[0], lst_tmp[1]))
            result = neo4j.selectByCQL(neo4j.get_cypher('za1').format('\'' + lst_tmp[0] + '\'', lst_tmp[1]))
            print(list(result))

    except Exception as e:
        print(e)

 
