贝叶斯分类器 Python 实现:朴素贝叶斯分类器底层实现(Python)

# coding:utf-8

from collections import defaultdict

import numpy as np

class NativeByes(object):
    """Naive Bayes text classifier over word-presence features.

    Training records, for every class, how many documents contain (or do not
    contain) each vocabulary word.  Classification picks the class with the
    highest log posterior, using Laplace (add-one) smoothing so that unseen
    word/class combinations never yield a zero probability.
    """

    def __init__(self):
        # Document frequency of each word: word -> number of training
        # documents that contain it.
        self._dp_dict = None
        # Vocabulary: list of distinct words; a word's position is its
        # feature index.
        self._word_dict = None
        # Per-class document counts: label -> number of training documents.
        self._pc_dict = None
        # Per-class feature counts, keyed by 'c{label}-f{index}-v{value}',
        # where value is 1 if the word is present in the document, else 0.
        self._pwc_dict = None

    @staticmethod
    def _feature_key(label, index, weight):
        """Return the count-table key for one (class, feature, value) cell."""
        # Features are discretised by presence.  Truncating the fractional
        # 1/dp weights with int() would map every word that occurs in more
        # than one document to 0 and silently drop it from the model.
        present = 1 if weight > 0 else 0
        return 'c{}-f{}-v{}'.format(label, index, present)

    def create_dataset(self):
        """Return a small toy corpus and its labels.

        :return: ``(data_set, labels)`` where ``data_set`` is a list of
            tokenised documents and ``labels`` holds one 0/1 label per
            document.
        """
        data_set = [
            ['my', 'dog', 'has', 'flea', 'problem', 'help', 'please'],
            ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
            ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
            ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
            ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
            ['quite', 'buying', 'worthless', 'dog', 'food', 'stupid'],
        ]
        lables = [0, 1, 0, 1, 0, 1]
        return data_set, lables

    def crate_worddic(self, data_set):
        """Build the vocabulary and document frequencies from a corpus.

        :param data_set: iterable of documents, each an iterable of words.
        :return: ``(vocabulary, dp_dict)``; ``vocabulary`` lists every
            distinct word in order of first appearance and ``dp_dict`` maps
            each word to the number of documents containing it.
        """
        vocabulary = []
        dp_dict = defaultdict(int)
        for data in data_set:
            seen_in_doc = set()
            for word in data:
                # Count each word at most once per document.
                if word in seen_in_doc:
                    continue
                seen_in_doc.add(word)
                # First-appearance order keeps feature indices stable
                # between runs (a plain list(set) order is arbitrary).
                if word not in dp_dict:
                    vocabulary.append(word)
                dp_dict[word] += 1
        return vocabulary, dp_dict

    def build_vec(self, words, word_dic):
        """Convert a tokenised document into a weighted vocabulary vector.

        Every occurrence of a vocabulary word adds ``1 / document_frequency``
        to that word's slot; words outside the vocabulary are ignored.

        :param words: iterable of words making up the document.
        :param word_dic: vocabulary list to index against; when ``None`` the
            vocabulary stored by :meth:`train` is used.
        :return: ``numpy`` array with one entry per vocabulary word.
        """
        if word_dic is None:
            word_dic = self._word_dict
        dp_dict = self._dp_dict

        # One-off word -> index map instead of an O(V) list.index() per word;
        # setdefault keeps the first index if the vocabulary has duplicates.
        index_of = {}
        for idx, word in enumerate(word_dic):
            index_of.setdefault(word, idx)

        vec = np.zeros(len(word_dic))
        for word in words:
            idx = index_of.get(word)
            if idx is None:
                continue
            # .get avoids inserting into the defaultdict and guards against a
            # vocabulary word with no recorded document frequency.
            dp = dp_dict.get(word, 0)
            if dp > 0:
                vec[idx] += 1.0 / dp
        return vec

    def train(self, data_set, labels):
        """Fit the classifier on a labelled corpus.

        :param data_set: sequence of documents, each a sequence of words.
        :param labels: sequence with the class label of each document.
        :return: None; the learned tables are stored on the instance.
        """
        word_dict, dp_dict = self.crate_worddic(data_set)
        self._word_dict = word_dict
        self._dp_dict = dp_dict

        # label -> number of documents with that label.
        pc_dict = defaultdict(int)
        # 'c{label}-f{index}-v{value}' -> number of documents of that class
        # whose feature `index` has presence value `value`.
        pwc_dict = defaultdict(int)

        for doc_idx, words in enumerate(data_set):
            label = labels[doc_idx]
            pc_dict[label] += 1
            vec = self.build_vec(words, word_dict)
            for feat_idx, weight in enumerate(vec):
                pwc_dict[self._feature_key(label, feat_idx, weight)] += 1

        self._pc_dict = pc_dict
        self._pwc_dict = pwc_dict

    def classify(self, words):
        """Return the most probable class label for a tokenised document.

        The score of class ``c`` is
        ``log P(c) + sum_i log((count(c, i, v_i) + 1) / (N_c + 2))``
        where ``N_c`` is the number of training documents of class ``c`` and
        ``v_i`` is the presence (0/1) of vocabulary word ``i``.

        :param words: iterable of words making up the document.
        :return: the best-scoring label, or ``''`` if no class was trained.
        """
        pc_dict = self._pc_dict
        pwc_dict = self._pwc_dict

        vec = self.build_vec(words, self._word_dict)
        total_docs = float(sum(pc_dict.values()))

        best_label = ''
        # Log posteriors are negative, so start below every possible score.
        best_score = float('-inf')
        for c, pc in pc_dict.items():
            score = np.log(pc / total_docs)
            # Two possible values per feature (absent/present) -> add 2.
            denom = pc + 2.0
            for feat_idx, weight in enumerate(vec):
                key = self._feature_key(c, feat_idx, weight)
                # .get keeps classification read-only on the trained table.
                count = pwc_dict.get(key, 0)
                score += np.log((count + 1) / denom)
            if score > best_score:
                best_label = c
                best_score = score
        return best_label

if __name__ == '__main__':
    # Demo: train on the built-in toy corpus, classify one ad-hoc sample,
    # then report for every training document whether the prediction
    # matches its real label.
    byes = NativeByes()
    data_set, labels = byes.create_dataset()
    byes.train(data_set, labels)

    sample = ['dog', 'stupid', 'dog']
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print('sample {} classified as {}'.format(sample, byes.classify(sample)))

    for doc, real_label in zip(data_set, labels):
        lb = byes.classify(doc)
        print('{},cls is {},real is {}'.format(lb == real_label, lb, real_label))

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值