手写朴素贝叶斯文本分类

本文深入探讨了朴素贝叶斯算法在文本分类中的应用,详细介绍了如何从头开始手写朴素贝叶斯分类器。通过实例分析,解释了特征选择、概率估计和分类决策的过程,帮助读者理解其工作原理。
摘要由CSDN通过智能技术生成
def get_traindata():   # 输出总的文本矩阵,文本向量,词典
    """Build training data from the corpus rooted at F:/train_data.

    Layout assumption: one sub-folder per class, each containing plain-text
    files of space-separated tokens (pre-segmented Chinese text — TODO confirm).

    The vocabulary is chosen by concatenating each class into one
    pseudo-document, computing tf-idf over those pseudo-documents, and
    keeping every term whose weight reaches the 6th-largest weight of its
    class. The vocabulary is also written to F:/myVocab.txt.

    Returns:
        sum_list (list[list[str]]): every document as a token list.
        sum_dict (dict[str, list[int]]): per class name, a 0/1 vector over
            all documents marking membership in that class.
        vocab_list (list[str]): the selected vocabulary terms.
    """
    folder_path = 'F:/train_data'
    folder_list = os.listdir(folder_path)
    sum_list = []
    corpus = []
    sum_dict = {}
    print('正在生成训练集总文本向量...')
    # Pass 1: read every document into its own flat token list.
    for folder in folder_list:
        new_folder_path = folder_path + '/' + str(folder)
        files = os.listdir(new_folder_path)
        for file in files:
            rs = []
            with open(new_folder_path + '/' + file, 'r', encoding='utf-8') as fp:
                for ln in fp:
                    rs.extend(ln.strip().split(' '))
            sum_list.append(rs)
    print(len(sum_list))
    print('生成完毕!')
    sum_num = len(sum_list)     # total number of documents across all classes
    past_num = 0                # documents consumed by the classes seen so far
    print('正在生成词典...')
    # Pass 2: concatenate each class into one pseudo-document and build the
    # class's 0/1 membership vector (files are revisited in the same order
    # as pass 1, so vector positions line up with sum_list).
    for folder in folder_list:
        new_folder_path = folder_path + '/' + str(folder)
        files = os.listdir(new_folder_path)
        train_num = len(files)
        class_list = []
        for file in files:
            with open(new_folder_path + '/' + file, 'r', encoding='utf-8') as fp:
                for ln in fp:
                    # BUG FIX: the original accumulated every line's tokens in
                    # a growing `rs` and re-extended class_list on every line,
                    # so earlier lines' words were duplicated once per later
                    # line, inflating their term frequency for tf-idf.
                    class_list.extend(ln.strip().split(' '))
        corpus.append(str(class_list))
        sum_dict[str(folder)] = ([0] * past_num + [1] * train_num
                                 + [0] * (sum_num - past_num - train_num))
        past_num += train_num

    # tf-idf over the per-class pseudo-documents.
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    # NOTE(review): get_feature_names() was removed in scikit-learn >= 1.2;
    # switch to get_feature_names_out() when upgrading.
    word = vectorizer.get_feature_names()
    weight = tfidf.toarray()
    vocab = set()
    for i in range(len(weight)):
        row_sorted = sorted(weight[i], reverse=True)  # renamed: original shadowed builtin `list`
        # Threshold = 6th-largest weight of this class; clamp the index so a
        # class with fewer than 6 distinct terms no longer raises IndexError.
        threshold = row_sorted[min(5, len(row_sorted) - 1)]
        for j in range(len(word)):
            if weight[i][j] >= threshold:
                vocab.add(j)
    vocab_list = [word[i] for i in vocab]

    print('生成的词典为:%s' % str(vocab_list))
    # Persist the vocabulary; `with` guarantees the handle is closed even on error.
    with open('F:/myVocab.txt', 'w', encoding='utf-8') as vocab_file:
        vocab_file.write(str(vocab_list))
    return sum_list, sum_dict, vocab_list    # 返回总的文档列表、总的分类向量、词典


def createVocablist(dataSet):  #去重复词创建词库
    vocabSet=
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值