__author__ = 'ding'
'''
Question classification for the movie knowledge-base QA system: a question is abstracted
with jieba POS tagging, converted into a bag-of-words vector over a vocabulary, and
classified by a pyspark (mllib) Naive Bayes model into one of the question templates.
'''
import numpy as np
from pyhanlp import *
from pyspark import SparkContext, SparkConf
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors
import jieba
import jieba.posseg as pseg
class ModelProcess:
def __init__(self):
self.abstractDict = dict()
self.modelIndex = 0
self.questionsPattern = self.loadQuestionsPattern()
self.vocabulary = self.loadVocabulary()
self.nbModel = self.loadClassifierModel()
    def analyQuery(self, querySentence):
        print("Original sentence: " + querySentence)
        print("======== segmentation start ========")
        # Abstract the sentence: replace recognised entities with POS placeholders
        abstr = self.queryAbstract(querySentence)
        print('Abstracted sentence: ' + abstr)
        # Match the abstracted sentence against the trained question templates
        strPatt = self.queryClassify(abstr)
        print('Matched question template: ' + strPatt)
        finalPattern = self.queryExtenstion(strPatt)
        print("Restored, system-recognisable query: " + finalPattern)
        return finalPattern
    # Abstract the question: replace recognised entities with part-of-speech placeholders
    def queryAbstract(self, querySentence):
        # Load the custom movie dictionary so jieba recognises titles and names
        # (ideally this would be loaded only once, e.g. in __init__)
        jieba.load_userdict('./movie.txt')
        terms = pseg.cut(querySentence)
        abstractQuery = ''
        nrCount = 0
        for word, cx in terms:
            print(word, cx)
if cx == 'nz':
abstractQuery += 'nz '
self.abstractDict['nz'] = word
elif cx == "nr" and nrCount == 0:
abstractQuery += "nnt "
self.abstractDict["nnt"] = word
nrCount += 1
elif cx == 'nr' and nrCount == 1:
abstractQuery += "nnr "
self.abstractDict["nnr"] = word
nrCount += 1
elif cx == 'x':
abstractQuery += "x "
self.abstractDict["x"] = word
elif cx == 'ng':
abstractQuery += "ng "
self.abstractDict["ng"] = word
else:
abstractQuery += word + " "
        print("======== segmentation end ========")
return abstractQuery
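    # For illustration: with the movie user dictionary loaded, a question such as
    # "章子怡出演了哪些电影" typically abstracts to something like "nnt 出演 了 哪些 电影",
    # with self.abstractDict == {'nnt': '章子怡'} (exact tags depend on jieba's tagging).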
    # Restore the matched template back into a concrete sentence
    def queryExtenstion(self, queryPattern):
        keys = self.abstractDict.keys()
        for key in keys:
            if key in queryPattern:
                value = self.abstractDict[key]
                queryPattern = queryPattern.replace(key, value)
            else:
                # an abstracted entity has no slot in this template, so the match is rejected
                queryPattern = 'No matching template was found'
        extendedQuery = queryPattern
        # This sentence is done; clear the abstraction dict for the next query
        self.abstractDict.clear()
        return extendedQuery
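    # e.g. a matched template "nnt 出演 了 哪些 电影" with abstractDict {'nnt': '章子怡'}
    # is restored to "章子怡 出演 了 哪些 电影".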
    # Load the vocabulary: one "index:word" pair per line, keyed by the index string
def loadVocabulary(self):
vocabulary = dict()
with open('./question/vocabulary.txt', 'r', encoding='utf-8') as fp:
lines = fp.readlines()
for line in lines:
vocabulary[line.split(':')[0]] = line.split(':')[1].replace('\n', '')
return vocabulary
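    # A sketch of the assumed vocabulary.txt layout: one "index:word" pair per line,
    # e.g. (illustrative entries only)
    #   0:电影
    #   1:评分
    # so self.vocabulary maps the index string to the word, and sentenceToVector can set
    # vector[int(index)] = 1 for every known word in a sentence.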
    # Load a question-template file and join its lines into one backtick-separated string
    def loadFile(self, filename):
        content = ''
        with open(filename, 'r', encoding='utf-8') as fp:
            lines = fp.readlines()
            for line in lines:
                # strip the trailing newline so each template is a clean sentence
                content += line.strip() + '`'
        return content
    # Segment a sentence and turn it into a bag-of-words vector over the vocabulary
    def sentenceToVector(self, sentence):
        vector = np.zeros(len(self.vocabulary))
        segment = HanLP.segment(sentence)
        for term in segment:
            word = term.word
            try:
                key = int(self.get_key(self.vocabulary, word)[0])
                vector[key] = 1
            except (IndexError, ValueError):
                # the word is not in the vocabulary; leave its slot at 0
                continue
        return vector
    # Return the list of keys in dic whose value equals the given value
def get_key(self, dic, value):
return [k for k, v in dic.items() if v == value]
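    # e.g. get_key({'0': '评分', '1': '上映'}, '上映') returns ['1']; an empty list means
    # the word is out of vocabulary, which sentenceToVector treats as "skip this word".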
    # Build the training set from the question-template files and train the Naive Bayes model
    def loadClassifierModel(self):
        train_list = list()
        # One template file per question class; the numeric label is the class index
        questionFiles = [
            (0.0, "./question/【0】评分.txt"),  # rating of a movie
            (1.0, "./question/【1】上映.txt"),  # release date
            (2.0, "./question/【2】风格.txt"),  # genre
            (3.0, "./question/【3】剧情.txt"),  # plot
            (4.0, "./question/【4】某电影有哪些演员出演.txt"),  # actors in a movie
            (5.0, "./question/【5】演员简介.txt"),  # actor biography
            (6.0, "./question/【6】某演员出演过的类型电影有哪些.txt"),  # an actor's movies of a given genre
            (7.0, "./question/【7】某演员演了什么电影.txt"),  # movies by an actor
            (8.0, "./question/【8】演员参演的电影评分【大于】.txt"),  # an actor's movies rated above a value
            (9.0, "./question/【9】演员参演的电影评分【小于】.txt"),  # an actor's movies rated below a value
            (10.0, "./question/【10】某演员出演过哪些类型的电影.txt"),  # genres an actor has appeared in
            (11.0, "./question/【11】演员A和演员B合作了哪些电影.txt"),  # movies two actors made together
            (12.0, "./question/【12】某演员一共演过多少电影.txt"),  # how many movies an actor has made
            (13.0, "./question/【13】演员出生日期.txt"),  # actor's date of birth
        ]
        for label, filename in questionFiles:
            scoreQuestions = self.loadFile(filename)
            sentences = scoreQuestions.split("`")
            for sentence in sentences:
                array = self.sentenceToVector(sentence)
                train_list.append(LabeledPoint(label, Vectors.dense(array)))
        # Train the classifier on a local Spark context
        conf = SparkConf().setAppName('NaiveBayesTest').setMaster('local[*]')
        sc = SparkContext(conf=conf)
        distData = sc.parallelize(train_list, numSlices=10)
        nb_model = NaiveBayes.train(distData)
        return nb_model
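    # Optional sketch, not part of the original flow: the trained classifier could be
    # persisted instead of being retrained every time ModelProcess is constructed,
    # using pyspark's own save/load API (the path is illustrative only):
    #   from pyspark.mllib.classification import NaiveBayesModel
    #   nb_model.save(sc, './nbModel')                    # right after NaiveBayes.train(...)
    #   nb_model = NaiveBayesModel.load(sc, './nbModel')  # on later runs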
    # Load the question templates, keyed by class index
def loadQuestionsPattern(self):
questionsPattern = dict()
with open('./question/question_classification.txt', 'r', encoding='utf-8') as fp:
lines = fp.readlines()
for line in lines:
questionsPattern[line.split(':')[0]] = line.split(':')[1].replace('\n', '')
return questionsPattern
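    # A sketch of the assumed question_classification.txt layout: one "index:template"
    # pair per line, where the template still contains the abstract placeholders,
    # e.g. (illustrative entries only)
    #   0:nz 的 评分 是 多少
    #   13:nnt 的 出生 日期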
    # Classify the abstracted sentence with the Naive Bayes model and return the matching question template
def queryClassify(self, sentence):
array = self.sentenceToVector(sentence)
v = Vectors.dense(array)
index = self.nbModel.predict(v)
self.modelIndex = int(index)
print("the model index is " + str(self.modelIndex))
return self.questionsPattern[str(self.modelIndex)]
# Related data (question templates, vocabulary, movie dictionary):
#   https://pan.baidu.com/s/1YjNqa68jVMR2KDxMnImUgA
# Setting up pyspark: `pip install pyspark` works, although it may take a few attempts.
# The underlying approach is described at https://blog.csdn.net/Appleyk/article/details/80422055
# and is not repeated here.
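
# A minimal usage sketch, assuming the ./question templates, vocabulary.txt and the
# ./movie.txt user dictionary from the data link above are in place; the sample
# question ("what is the rating of Hero") is only an illustration.
if __name__ == '__main__':
    process = ModelProcess()
    print(process.analyQuery('英雄的评分是多少'))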