#!/usr/bin/env python3
# coding=utf-8
from numpy import array, log, ones
# from rediscluster import StrictRedisCluster
#
# def redis_cluster():
#     redis_nodes = [{'host': '99.12.90.102', 'port': 17000},
#                    {'host': '99.12.90.102', 'port': 17001},
#                    {'host': '99.12.90.102', 'port': 17002},
#                    {'host': '99.12.90.6', 'port': 17003},
#                    {'host': '99.12.90.6', 'port': 17004},
#                    {'host': '99.12.90.6', 'port': 17005}]
#     try:
#         redis_conn = StrictRedisCluster(startup_nodes=redis_nodes)
#         return redis_conn
#     except Exception as e:
#         print(e)
#
#
# r = redis_cluster()
"""
P(问题类别 | 单词1,单词2,单词3) = P(单词1,单词2,单词3 | 问题类别) * P(问题类别) / P(单词1,单词2,单词3)
因为分母都相同,所以只用比较分子即可--->P(单词1,单词2,单词3 | 问题类别) P(问题类别)
每个单词之间都是相互独立的---->P(单词1 | 问题类别)P(单词2 | 问题类别)P(单词3 | 问题类别)*P(问题类别)
P(单词1 | 问题类别) = 单词1在样本中出现的总次数/样本句子中总的单词数
P(问题类别) = 样本该问题类别的条数/样本问题类别的总条数
"""
# Load the question templates
def load_data(in_file_name):
    posting_list = []
    class_vec = []
    dic = {}
    dic_index = {}
    line_index = 0
    try:
        # 'with' guarantees the file is closed even if parsing fails
        with open(in_file_name, 'r', encoding='UTF-8') as in_file:
            for text_line in in_file:
                # each line: space-separated tokens, ';', then the class label
                posting_list.append(text_line.split(';')[0].split(' '))
                que = text_line.split(';')[1].replace('\n', '')
                class_vec.append(que)
                dic.setdefault(que, 'p' + str(line_index))
                dic_index.setdefault(line_index, que)
                line_index += 1
    except Exception as e:
        print(e)
        return
    return posting_list, class_vec, dic, dic_index
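# Illustrative input line (format inferred from the parsing above; the real
# template file's contents are an assumption): "nnt movie list;3" parses into
# the tokens ['nnt', 'movie', 'list'] with class label '3'.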
# Build a list of the unique words that appear across all documents
def create_vocab_list(data_set):
    vocab_set = set()
    for document in data_set:
        vocab_set = vocab_set | set(document)  # union in each document's words
    return list(vocab_set)
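# For example, create_vocab_list([['a', 'b'], ['b', 'c']]) returns
# ['a', 'b', 'c'] in some order (set iteration order is arbitrary).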
# Convert a document's tokens into a word-count vector over the vocabulary
# (despite the "set" in the name this is a bag-of-words model: counts, not 0/1)
def set_words2vec(vocab_list, input_set):
    return_vec = [0] * len(vocab_list)
    for word in input_set:
        if word in vocab_list:
            return_vec[vocab_list.index(word)] += 1
    return return_vec
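# For example, set_words2vec(['a', 'b', 'c'], ['b', 'b']) returns [0, 2, 0].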
# Naive Bayes training: estimate per-class log-probabilities from word vectors
def train(train_matrix, train_category, dic):
    try:
        num_train_docs = len(train_matrix)
        num_words = len(train_matrix[0])
        # p_model = 1/float(num_train_docs)  # the templates are exclusive and equally likely, so the prior P(class) cancels out
        dic_matrix = {}
        dic_num = {}
        dic_vec = {}
        for i in range(num_train_docs):
            # start counts at 1 (Laplace smoothing) so that no word probability
            # is zero, which would zero out the whole product; these arrays
            # accumulate the per-word frequencies of each class
            dic_matrix['p' + str(i) + '_num'] = ones(num_words)
            # smoothing constant for the denominator (the author's choice;
            # classic Laplace smoothing would use the vocabulary size here)
            dic_num['p' + str(i) + '_denom'] = num_train_docs
        for i in range(num_train_docs):
            dic_matrix[dic.get(train_category[i]) + '_num'] += train_matrix[i]
            dic_num[dic.get(train_category[i]) + '_denom'] += sum(train_matrix[i])
        # take logs to avoid underflow and floating-point rounding errors
        for i in range(num_train_docs):
            dic_vec[i] = log(dic_matrix['p' + str(i) + '_num'] / dic_num['p' + str(i) + '_denom'])
    except Exception as e:
        print(e)
        return
    return dic_vec
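# A minimal classification sketch (not part of the original script, which only
# trains and saves the model): score a new question's count vector against each
# template's log-probability vector and return the best-matching class. The
# name classify_sketch and its signature are assumptions for illustration.
def classify_sketch(vec2classify, dic_vec, dic_index):
    best_index, best_score = None, float('-inf')
    for i, log_prob_vec in dic_vec.items():
        # sum of count * log P(word | template) == log of the naive Bayes product
        score = sum(vec2classify * log_prob_vec)
        if score > best_score:
            best_index, best_score = i, score
    return dic_index.get(best_index)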
# Entry point: load the templates, build the vocabulary, train, and save
if __name__ == "__main__":
    in_file_name = "question_template.txt"  # question templates
    out_file_name1 = "bayes_vocab_list.txt"
    out_file_name2 = "bayes_dic_index.txt"
    out_file_name3 = "bayes_dic_vec.txt"
    list_posts, list_classes, dic_vocab, dic_index = load_data(in_file_name)
    my_vocab_list = create_vocab_list(list_posts)
    try:
        # 'with' closes both files even if a write fails; note the append mode:
        # re-running the script appends duplicate lines to the output files
        with open(out_file_name1, 'a', encoding='UTF-8') as out_file1, \
                open(out_file_name2, 'a', encoding='UTF-8') as out_file2:
            for val in my_vocab_list:
                out_file1.write("%s\n" % val)
            for key, val in dic_index.items():
                out_file2.write("%s,%s\n" % (key, val))
    except Exception as e:
        print(e)
    # convert every template into a count vector over the shared vocabulary
    train_mat = []
    for post_in_doc in list_posts:
        train_mat.append(set_words2vec(my_vocab_list, post_in_doc))
    dic_vec = train(array(train_mat), array(list_classes), dic_vocab)
    try:
        with open(out_file_name3, 'a', encoding='UTF-8') as out_file3:
            for key, val in dic_vec.items():
                out_file3.write("%s:%s\n" % (key, list(val)))
    except Exception as e:
        print(e)
    print('-----------')