#!/usr/bin/python
# coding=utf-8
from numpy import *
# from rediscluster import StrictRedisCluster


# def redis_cluster():
#
#     redis_nodes = [{'host': '99.12.90.102', 'port': 17000},
#                    {'host': '99.12.90.102', 'port': 17001},
#                    {'host': '99.12.90.102', 'port': 17002},
#                    {'host': '99.12.90.6', 'port': 17003},
#                    {'host': '99.12.90.6', 'port': 17004},
#                    {'host': '99.12.90.6', 'port': 17005}
#                   ]
#
#     try:
#         redis_conn = StrictRedisCluster(startup_nodes=redis_nodes)
#         return redis_conn
#     except Exception as e:
#         print(e)
#
#
# r = redis_cluster()

"""
P(category | word1, word2, word3) = P(word1, word2, word3 | category) * P(category) / P(word1, word2, word3)

    The denominator is the same for every category, so only the numerators need to be compared ---> P(word1, word2, word3 | category) * P(category)

      Assuming the words are independent of one another ----> P(word1 | category) * P(word2 | category) * P(word3 | category) * P(category)

P(word1 | category) = total occurrences of word1 in the samples / total number of words in the sample sentences

P(category) = number of sample lines labelled with this category / total number of sample lines
"""

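# The product of many small probabilities underflows in floating point, so the scores
# below are computed in log space. A minimal sketch of the scoring this file sets up
# (the prior P(category) is dropped, see the commented-out p_model in train(), because
# the question templates are treated as mutually exclusive with roughly equal priors):
#
#     score(category) = log P(word1 | category) + log P(word2 | category) + ...
#
# and the predicted category is the one with the highest score.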

# Load the question templates
def load_data(in_file_name):
    posting_list = []   # tokenized question for each template line
    class_vec = []      # category label for each template line
    dic = {}            # category label -> class id 'p<line_index>'
    dic_index = {}      # line index -> category label
    try:
        # each line is expected to look like "word1 word2 ...;category"
        with open(in_file_name, 'r', encoding='UTF-8') as in_file:
            line_index = 0
            for text_line in in_file:
                posting_list.append(text_line.split(';')[0].split(' '))
                que = text_line.split(';')[1].replace('\n', '')
                class_vec.append(que)
                dic.setdefault(que, 'p' + str(line_index))
                dic_index.setdefault(line_index, que)
                line_index += 1
    except Exception as e:
        print(e)
        return

    return posting_list, class_vec, dic, dic_index
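
# The template file itself is not shown here; from the parsing in load_data() each line
# is assumed to look like this hypothetical example:
#
#     word1 word2 word3;category_label
#
# i.e. a pre-tokenized, space-separated question, a ';', then its category label.
# Hypothetical usage: posts, classes, dic, dic_index = load_data('question_template.txt')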


# Build a list of all distinct words that appear in any document
def create_vocab_list(data_set):
    vocab_set = set([])
    for document in data_set:
        vocab_set = vocab_set | set(document)
    return list(vocab_set)
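
# Illustrative example (hypothetical input): create_vocab_list([['a', 'b'], ['b', 'c']])
# returns a list containing 'a', 'b' and 'c' exactly once; the order is not guaranteed
# because the words pass through a set.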


# Convert a document's tokens into a word-count vector over the vocabulary
def set_words2vec(vocab_list, input_set):
    return_vec = [0]*len(vocab_list)
    for word in input_set:
        if word in vocab_list:
            return_vec[vocab_list.index(word)] += 1
    return return_vec
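
# Illustrative example (hypothetical vocabulary): with vocab_list = ['a', 'b', 'c'],
# set_words2vec(vocab_list, ['b', 'c', 'b']) returns [0, 2, 1] -- a bag-of-words count
# vector aligned with vocab_list; words outside the vocabulary are ignored.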


# Naive Bayes training function: estimate per-class log-probabilities from the word vectors
def train(train_matrix, train_category, dic):
    try:
        num_train_docs = len(train_matrix)
        num_words = len(train_matrix[0])
        # p_model = 1/float(num_train_docs)  # as the question model is exclusive and independent, the p_model is not necessary

        dic_matrix = {}
        dic_num = {}
        dic_vec = {}
        for i in range(num_train_docs):
            dic_matrix['p' + str(i) + '_num'] = ones(num_words)  # start every word count at 1 (Laplace-style smoothing) so no zero probability wipes out the whole product
            dic_num['p' + str(i) + '_denom'] = num_train_docs    # matching smoothing term for the per-class word-count denominator

        for i in range(num_train_docs):
            dic_matrix[dic.get(train_category[i]) + '_num'] += train_matrix[i]
            dic_num[dic.get(train_category[i]) + '_denom'] += sum(train_matrix[i])

        # take the log of each per-word probability to avoid underflow and floating-point rounding errors
        for i in range(num_train_docs):
            dic_vec[i] = log(dic_matrix['p' + str(i) + '_num'] / dic_num['p' + str(i) + '_denom'])

    except Exception as e:
        print(e)
        return

    return dic_vec
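
# This script only trains and persists the model; the classifier below is a minimal
# sketch added for illustration (the name `classify` and its use are assumptions, not
# part of the original pipeline). The predicted category is the one whose
# log-probability vector from train() gives the highest score for a question's
# word-count vector.
def classify(word_vector, dic_vec, dic_index):
    best_index, best_score = None, None
    for i, log_prob_vec in dic_vec.items():
        # sum of count(word) * log P(word | category_i): the log of the naive Bayes numerator
        score = sum(array(word_vector) * log_prob_vec)
        if best_score is None or score > best_score:
            best_index, best_score = i, score
    return dic_index.get(best_index)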


# Test entry point: train on the question templates and write out the model files
if __name__ == "__main__":
    in_file_name = "question_template.txt"  # question_template
    out_file_name1 = "bayes_vocab_list.txt"
    out_file_name2 = "bayes_dic_index.txt"
    out_file_name3 = "bayes_dic_vec.txt"

    list_posts, list_classes, dic_vocab, dic_index = load_data(in_file_name)
    my_vocab_list = create_vocab_list(list_posts)

    try:
        out_file1 = open(out_file_name1, 'a', encoding='UTF-8')
        out_file2 = open(out_file_name2, 'a', encoding='UTF-8')
        for val in my_vocab_list:
            out_file1.write("%s\n" % val)
        out_file1.flush()

        for key, val in dic_index.items():
            out_file2.write("%s,%s\n" % (key, val))
        out_file2.flush()

    except Exception as e:
        print(e)
    finally:
        out_file1.close()
        out_file2.close()

    train_mat = []
    for post_in_doc in list_posts:
        train_mat.append(set_words2vec(my_vocab_list, post_in_doc))
    dic_vec = train(array(train_mat), array(list_classes), dic_vocab)
    try:
        out_file3 = open(out_file_name3, 'a', encoding='UTF-8')

        for key, val in dic_vec.items():
            out_file3.write("%s:%s\n" % (key, list(val)))
        out_file3.flush()

    except Exception as e:
        print(e)
    finally:
        out_file3.close()
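
    # Output summary (inferred from the writes above; the downstream consumer is not
    # part of this file):
    #   bayes_vocab_list.txt - one vocabulary word per line
    #   bayes_dic_index.txt  - "class_index,category_label" per line
    #   bayes_dic_vec.txt    - "class_index:[list of log-probabilities]" per line
    # Note the files are opened in append mode ('a'), so re-running the script appends
    # duplicate entries unless the old files are removed first.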

    print('-----------')

 
