朴素贝叶斯分类器Numpy实现【原创代码】

最新推荐文章于 2023-07-04 10:37:08 发布

JintuZheng

最新推荐文章于 2023-07-04 10:37:08 发布

阅读量872

点赞数 1

分类专栏：个人杂记

本文链接：https://blog.csdn.net/rizero/article/details/112645578

版权

个人杂记专栏收录该内容

26 篇文章 1 订阅

订阅专栏

问题：已知

day	outlook	temperature	humidity	wind	Playtennis
1	sunny	hot	high	weak	no
2	sunny	hot	high	strong	no
3	overcast	hot	high	weak	yes
4	rain	mild	high	weak	yes
5	rain	cool	normal	weak	yes
6	rain	cool	normal	strong	no
7	overcast	cool	normal	strong	yes
8	sunny	mild	high	weak	no
9	sunny	cool	normal	weak	yes
10	rain	mild	normal	weak	yes
11	sunny	mild	normal	strong	yes
12	overcast	mild	high	strong	yes
13	overcast	hot	normal	weak	yes
14	rain	mild	high	strong	no

当出现：<Outlook=sunny, Temperature=cool, Humidity=high, Wind=strong> 时，

这一天是否适合打网球？

数据集：

sunny,hot,high,weak,no
sunny,hot,high,strong,no
overcast,hot,high,weak,yes
rain,mild,high,weak,yes
rain,cool,normal,weak,yes
rain,cool,normal,strong,no
overcast,cool,normal,strong,yes
sunny,mild,high,weak,no
sunny,cool,normal,weak,yes
rain,mild,normal,weak,yes
sunny,mild,normal,strong,yes
overcast,mild,high,strong,yes
overcast,hot,normal,weak,yes
rain,mild,high,strong,no

上面数据保存为 input.txt

代码：

import numpy as np
""" This is JintuZheng's Bayesion Classifier homework.
"""

class Bayesion_Classifier_Builder(object):
    """Naive Bayes classifier for categorical data stored as integer codes.

    ``fields_dict`` is a sequence with one ``{label: code}`` dict per column
    of the data set; the last dict describes the class column.  Rows are
    encoded through those dicts into an integer NumPy table, and every
    probability is estimated by counting rows in that table.
    """

    def __init__(self,fields_dict):
        self.fields_dict = fields_dict
        # Mapping of the class column (always the last field).
        self.class_field_mapping = self.fields_dict[-1]
        self.dataRaw = []  # encoded rows, as plain lists
        self.data_np_table = None  # encoded rows as a 2-D array, set by load_dataset
        self.data_wash_correct_dict = {}  # known-bad value -> corrected value
        self.rows = 0  # number of rows in data_np_table

    @staticmethod
    def _split_fields(line: str) -> list:
        """Split one comma-separated line into whitespace-trimmed fields."""
        return [word.strip() for word in line.strip().split(',')]

    def _class_codes(self) -> list:
        """Return the distinct class codes present in the table, ascending.

        Sorting gives ``classifier`` and ``Predict`` one shared, deterministic
        ordering instead of relying on set iteration order.
        """
        return sorted(set(self.data_np_table[:, -1].tolist()))

    def class_analysis(self): # Category distribution analysis
        """Print the relative frequency of each class, ordered by class code."""
        class_data = self.data_np_table[:, -1]
        class_rate = [
            int(np.count_nonzero(class_data == class_number)) / self.rows
            for class_number in self._class_codes()
        ]
        print(class_rate)

    def data_correct(self,know_error:str) -> str:
        """Return the registered correction for a known-bad value.

        Raises:
            KeyError: if ``know_error`` has no entry in
                ``data_wash_correct_dict``.
        """
        return self.data_wash_correct_dict[know_error]

    def load_dataset(self, path:str):
        """Load and encode a comma-separated data file, replacing prior data.

        Blank lines are skipped.  A line whose field count does not match
        ``fields_dict`` or that contains an unknown label is reported on
        stdout and skipped, so the resulting table is always rectangular.
        """
        self.dataRaw.clear() #clear all data Raw
        expected_fields = len(self.fields_dict)
        with open(path,'r',encoding='utf8') as f:
            for l in f:
                if not l.strip():
                    continue
                _words = self._split_fields(l)
                if len(_words) != expected_fields:
                    print('expected {} fields, got {}'.format(expected_fields, len(_words)))
                    print('--> line data load failed --> {} '.format(_words))
                    continue
                try: # Data format's change
                    map_encode_list = [ self.line_encoder(c_idx,wi) for c_idx,wi in enumerate(_words)]
                except (KeyError, IndexError) as _e:
                    print(_e)
                    print('--> line data load failed --> {} '.format(_words))
                    continue
                self.dataRaw.append(map_encode_list)

        if self.dataRaw:
            self.data_np_table = np.array(self.dataRaw) # change data to numpy table
        else:
            # Keep the table 2-D even when no row could be loaded.
            self.data_np_table = np.empty((0, expected_fields), dtype=int)
        self.rows = self.data_np_table.shape[0]

    def line_encoder(self, c_idx: int, wi: str) -> int:
        """Return the integer code of label ``wi`` in column ``c_idx``.

        Raises:
            KeyError: if ``wi`` is not a declared label of that column.
            IndexError: if ``c_idx`` is past the last declared field (when
                ``fields_dict`` is a list).
        """
        return self.fields_dict[c_idx][wi]

    def _probability_base(self,class_number:int, index_np_array:np.ndarray) -> float:
        """Estimate P(class | features) for one class.

        The numerator is ``P(class) * prod_i P(x_i | class)``; the denominator
        is ``prod_i P(x_i)``, i.e. the evidence is approximated by the product
        of the per-feature marginals.  All terms are row-count ratios.

        Returns ``0.0`` when the table is empty, the class never occurs, or a
        feature value never occurs in the data (the ratio would be 0/0).
        """
        fields_sum = index_np_array.shape[0] #Tables' cols -1
        class_data = self.data_np_table[:,fields_sum] #class data
        in_class = class_data == class_number
        class_count = int(np.count_nonzero(in_class))
        if self.rows == 0 or class_count == 0:
            return 0.0

        p_denominator = 1.0 # product of feature marginals
        p_numerator = class_count/self.rows # starts at P(class)
        for index_col in range(fields_sum):
            matches = self.data_np_table[:,index_col] == index_np_array[index_col]
            p_denominator *= int(np.count_nonzero(matches))/self.rows
            p_numerator *= int(np.count_nonzero(matches & in_class))/class_count

        if p_denominator == 0:
            # A feature value absent from the training data: no evidence at all.
            return 0.0
        return float(p_numerator/p_denominator)

    def classifier(self, index_np_array:np.ndarray) -> list:
        """Return one probability estimate per class present in the data.

        The list is ordered by ascending class code.  When the number of
        encoded features does not match the table, an error is printed and
        ``None`` is returned.
        """
        expected = self.data_np_table.shape[1]-1
        if index_np_array.shape[0] != expected:
            print('NP Array.Size Error~ size for: {}!={}'.format(index_np_array.shape[0],expected))
            return None

        return [self._probability_base(class_number,index_np_array) for class_number in self._class_codes()]

    def Predict(self, data_describe:str) -> str :
        """Predict the class label for one comma-separated feature string.

        Prints the per-class probabilities and returns the label of the most
        probable class.  Returns ``None`` when the input cannot be encoded,
        has the wrong number of features, or no training data is loaded.
        """
        _words = self._split_fields(data_describe)
        try: # Data format's change
            map_encode_list = [ self.line_encoder(c_idx,wi) for c_idx,wi in enumerate(_words)]
        except (KeyError, IndexError) as _e:
            print(_e)
            print('Data format wrong --> {}'.format(_words))
            return None

        class_probabilities = self.classifier(np.array(map_encode_list)) # calculate all probabilities for all classes
        if not class_probabilities:
            return None
        print(class_probabilities)

        # Translate the winning position back to a class code, then to its
        # label, so the result does not depend on the mapping's key order.
        best_code = self._class_codes()[class_probabilities.index(max(class_probabilities))]
        label_of = {code: label for label, code in self.class_field_mapping.items()}
        return label_of[best_code]

    def batchPredict(self,path:str) ->list:
        """Predict a label for every non-blank line of a file.

        Lines may be labelled or unlabelled.  A trailing class label is
        dropped only when the line has a full set of fields and its last
        field is a known class label, so feature values that merely start
        with a label (e.g. ``normal`` vs ``no``) are left untouched.
        """
        results = []
        full_width = len(self.fields_dict)
        with open(path,'r',encoding='utf8') as f:
            for l in f:
                if not l.strip():
                    continue
                words = self._split_fields(l)
                if len(words) == full_width and words[-1] in self.class_field_mapping:
                    words = words[:-1]
                try:
                    results.append(self.Predict(','.join(words)))
                except Exception as _e:
                    # Best-effort batch: report the failing line and carry on.
                    print(_e)
                    print('--> line data load failed --> {} '.format(l))
        return results

# Field declarations: one {label: integer code} dict per data-set column.

outlook_filed = {'sunny':0, 'rain':1, 'overcast':2}
temperature_filed = {'hot':0, 'mild':1, 'cool':2}
humidity_filed = {'high':0,'normal':1}
wind_filed = {'weak':0, 'strong':1 }
Playtennis_filed={'yes':0, 'no':1 }

# Column order must match the order of the comma-separated fields in the
# data file; the class column (Playtennis) goes last.
fileds = [outlook_filed, temperature_filed, humidity_filed, wind_filed, Playtennis_filed]
BS = Bayesion_Classifier_Builder(fileds)
BS.load_dataset('input.txt')  # training rows: four features + class label
BS.class_analysis()  # prints the class distribution
print(BS.Predict('sunny,cool,high,strong'))

输出：

在这里插入图片描述

JintuZheng

关注

1
点赞
踩
5

收藏

觉得还不错? 一键收藏
2
评论
朴素贝叶斯分类器Numpy实现【原创代码】

问题：已知dayoutlooktemperaturehumiditywindPlaytennis1sunnyhothighweakno2sunnyhothighstrongno3overcasthothighweakyes4rainmildhighweakyes5raincoolnormalweakyes6raincoolnormalstrongno7overcastcooln
复制链接

扫一扫

专栏目录