三、贝叶斯模型

__author__ = 'ding'
'''
pyspark 贝叶斯模型的处理
'''
import numpy as np
from pyhanlp import *
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vector, Vectors
from scipy.sparse import csr_matrix
from .Bayes import NaiveBayesModel
import jieba.posseg as pseg
import jieba
from pypinyin import pinyin,lazy_pinyin


class ModelProcess:
    """Abstract a natural-language movie question, classify it against
    trained question templates with a Spark MLlib Naive Bayes model, and
    restore the matched template into a concrete, system-recognizable query.
    """

    def __init__(self):
        # Slot tag (e.g. 'nnt') -> concrete word captured from the question.
        self.abstractDict = dict()
        # Template index predicted by the classifier for the last query.
        self.modelIndex = 0
        # Template-index (str) -> template-string table.
        self.questionsPattern = self.loadQuestionsPattern()
        # Index (str) -> word table used to vectorize sentences.
        self.vocabulary = self.loadVocabulary()
        # Trained Naive Bayes classifier (training happens once, here).
        self.nbModel = self.loadClassifierModel()

    def analyQuery(self, querySentence):
        """Run the full pipeline on one question: abstract it, classify it
        to a template, and substitute the captured words back in.

        Returns the final, system-recognizable sentence."""
        print("原始句子:" + querySentence)
        print("========HanLP分词开始========")

        # Step 1: abstract the question (replace entity words with slot tags).
        abstr = self.queryAbstract(querySentence)
        print('句子抽象化结果:' + abstr)

        # Step 2: classify the abstracted question to pick a template.
        strPatt = self.queryClassify(abstr)
        print('句子套用模板结果:' + strPatt)

        # Step 3: restore the template with the captured words.
        finalPattern = self.queryExtenstion(strPatt)
        print("原始句子替换成系统可识别的结果:" + finalPattern)
        return finalPattern

    # Question abstraction
    def queryAbstract(self, querySentence):
        """POS-tag the question and replace entity words with slot tags
        (nz / nnt / nnr / x / ng); the original words are remembered in
        self.abstractDict for later restoration."""
        # Load the custom movie dictionary BEFORE tokenization so the
        # segmenter knows the domain vocabulary.  (The original code called
        # this after pseg.cut(); that only worked because the cut is lazy.)
        # NOTE(review): reloading on every call is wasteful; consider
        # hoisting this to __init__.
        jieba.load_userdict('./movie.txt')
        terms = pseg.cut(querySentence)

        abstractQuery = ''
        nrCount = 0  # number of person names (nr) seen so far
        for word, cx in terms:
            print(word, cx)
            if cx == 'nz':  # other proper noun -> movie-title slot
                abstractQuery += 'nz '
                self.abstractDict['nz'] = word
            elif cx == "nr" and nrCount == 0:  # first person name
                abstractQuery += "nnt "
                self.abstractDict["nnt"] = word
                nrCount += 1
            elif cx == 'nr' and nrCount == 1:  # second person name
                abstractQuery += "nnr "
                self.abstractDict["nnr"] = word
                nrCount += 1
            elif cx == 'x':  # non-word token (numbers / symbols)
                abstractQuery += "x "
                self.abstractDict["x"] = word
            elif cx == 'ng':  # noun morpheme
                abstractQuery += "ng "
                self.abstractDict["ng"] = word
            else:
                abstractQuery += word + " "
        print("========HanLP分词结束========")
        return abstractQuery

    # Restore the matched template to a concrete sentence
    def queryExtenstion(self, queryPattern):
        """Substitute the words captured during abstraction back into the
        matched template.  If the template lacks any captured slot it cannot
        be the right template, so an error string is returned instead."""
        for key, value in list(self.abstractDict.items()):
            if key in queryPattern:
                queryPattern = queryPattern.replace(key, value)
            else:
                # A captured slot has no place in this template: mismatch.
                # (The original kept looping and compared the remaining keys
                # against the error string itself.)
                queryPattern = '没有匹配到正确的模板'
                break
        extendedQuery = queryPattern
        # Release the per-sentence state so the next question starts clean.
        self.abstractDict.clear()
        return extendedQuery

    # Load the vocabulary
    def loadVocabulary(self):
        """Read ./question/vocabulary.txt ('index:word' per line) into an
        index (str) -> word dict."""
        vocabulary = dict()
        with open('./question/vocabulary.txt', 'r', encoding='utf-8') as fp:
            for line in fp:
                # Split on the first ':' only, so words containing ':' survive.
                index, _, word = line.partition(':')
                vocabulary[index] = word.rstrip('\n')
        return vocabulary

    # Load a file and return its content
    def loadFile(self, filename):
        """Read a question-template file and return its lines joined by '`'
        (the sentence delimiter expected by loadClassifierModel)."""
        with open(filename, 'r', encoding='utf-8') as fp:
            return '`'.join(fp.readlines())

    # Segment a sentence and match it against the vocabulary to build a vector
    def sentenceToVector(self, sentence):
        """Return a 0/1 numpy vector of len(self.vocabulary) marking which
        vocabulary words occur in the segmented sentence."""
        vector = np.zeros(len(self.vocabulary))
        for term in HanLP.segment(sentence):
            word = term.word
            try:
                key = int(self.get_key(self.vocabulary, word)[0])
            except (IndexError, ValueError):
                # IndexError: word not in the vocabulary; ValueError: its key
                # is not numeric.  Anything else should surface, not be hidden
                # by the bare except the original used.
                continue
            vector[key] = 1
        return vector

    # Reverse lookup: keys of dic whose value equals value
    def get_key(self, dic, value):
        """Return all keys in *dic* that map to *value* (possibly empty)."""
        return [k for k, v in dic.items() if v == value]

    # Train and return the Naive Bayes model
    def loadClassifierModel(self):
        """Build LabeledPoint training data from the 14 question-template
        files and train a Spark MLlib Naive Bayes classifier on it."""
        # (label, file) pairs; the float label matches the index keys in
        # question_classification.txt.
        categories = [
            (0.0, "./question/【0】评分.txt"),
            (1.0, "./question/【1】上映.txt"),
            (2.0, "./question/【2】风格.txt"),
            (3.0, "./question/【3】剧情.txt"),
            (4.0, "./question/【4】某电影有哪些演员出演.txt"),
            (5.0, "./question/【5】演员简介.txt"),
            (6.0, "./question/【6】某演员出演过的类型电影有哪些.txt"),
            (7.0, "./question/【7】某演员演了什么电影.txt"),
            (8.0, "./question/【8】演员参演的电影评分【大于】.txt"),
            (9.0, "./question/【9】演员参演的电影评分【小于】.txt"),
            (10.0, "./question/【10】某演员出演过哪些类型的电影.txt"),
            (11.0, "./question/【11】演员A和演员B合作了哪些电影.txt"),
            (12.0, "./question/【12】某演员一共演过多少电影.txt"),
            (13.0, "./question/【13】演员出生日期.txt"),
        ]
        train_list = []
        for label, filename in categories:
            for sentence in self.loadFile(filename).split("`"):
                if not sentence.strip():
                    # Skip blank sentences so no all-zero samples pollute
                    # training (the original trained one per file).
                    continue
                array = self.sentenceToVector(sentence)
                train_list.append(LabeledPoint(label, Vectors.dense(array)))

        conf = SparkConf().setAppName('NaiveBayesTest').setMaster('local[*]')
        sc = SparkContext(conf=conf)
        distData = sc.parallelize(train_list, numSlices=10)
        nb_model = NaiveBayes.train(distData)
        return nb_model

    # Load the question templates
    def loadQuestionsPattern(self):
        """Read ./question/question_classification.txt ('index:template'
        per line) into an index (str) -> template dict."""
        questionsPattern = dict()
        with open('./question/question_classification.txt', 'r', encoding='utf-8') as fp:
            for line in fp:
                # Split on the first ':' only, so templates containing ':' survive.
                index, _, pattern = line.partition(':')
                questionsPattern[index] = pattern.rstrip('\n')
        return questionsPattern

    # Classify with the Naive Bayes model and return the matching template
    def queryClassify(self, sentence):
        """Predict the template index for an abstracted sentence, store it in
        self.modelIndex, and return the corresponding template string."""
        array = self.sentenceToVector(sentence)
        v = Vectors.dense(array)
        self.modelIndex = int(self.nbModel.predict(v))
        print("the model index is " + str(self.modelIndex))
        return self.questionsPattern[str(self.modelIndex)]

data 相关资料:
https://pan.baidu.com/s/1YjNqa68jVMR2KDxMnImUgA
pyspark 环境的搭建方法可以自行搜索。
使用 pip install pyspark 安装时可能会失败,多重试几次即可成功。
具体实现原理可以参考 https://blog.csdn.net/Appleyk/article/details/80422055 这篇博客,这里不再展开。

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值