朴素贝叶斯分类器Numpy实现【原创代码】

问题:已知

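| day | outlook | temperature | humidity | wind | Playtennis |
|-----|----------|-------------|----------|--------|------------|
| 1 | sunny | hot | high | weak | no |
| 2 | sunny | hot | high | strong | no |
| 3 | overcast | hot | high | weak | yes |
| 4 | rain | mild | high | weak | yes |
| 5 | rain | cool | normal | weak | yes |
| 6 | rain | cool | normal | strong | no |
| 7 | overcast | cool | normal | strong | yes |
| 8 | sunny | mild | high | weak | no |
| 9 | sunny | cool | normal | weak | yes |
| 10 | rain | mild | normal | weak | yes |
| 11 | sunny | mild | normal | strong | yes |
| 12 | overcast | mild | high | strong | yes |
| 13 | overcast | hot | normal | weak | yes |
| 14 | rain | mild | high | strong | no |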

当出现以下情况时:<Outlook=sunny, Temperature=cool, Humidity=high, Wind=strong>

问这一天是否适合于打网球?

数据集:

sunny,hot,high,weak,no
sunny,hot,high,strong,no
overcast,hot,high,weak,yes
rain,mild,high,weak,yes
rain,cool,normal,weak,yes
rain,cool,normal,strong,no
overcast,cool,normal,strong,yes
sunny,mild,high,weak,no
sunny,cool,normal,weak,yes
rain,mild,normal,weak,yes
sunny,mild,normal,strong,yes
overcast,mild,high,strong,yes
overcast,hot,normal,weak,yes
rain,mild,high,strong,no

上面数据保存为 input.txt

代码:

import numpy as np
""" This is JintuZheng's Bayesion Classifier homework.
"""

class Bayesion_Classifier_Builder(object):
    """Naive Bayes classifier for categorical data encoded as integers.

    Every column of the training table is translated from its textual value
    to an integer code through the per-column dictionaries in
    ``fields_dict``; the last dictionary describes the class (label) column.
    """

    def __init__(self, fields_dict):
        """Store the column encoders and start with an empty training table.

        Args:
            fields_dict: Sequence of ``{text: code}`` dicts, one per column in
                file order. The last entry encodes the class column.
        """
        self.fields_dict = fields_dict
        self.class_field_mapping = self.fields_dict[-1]
        self.dataRaw = []  # encoded rows as plain Python lists
        self.data_np_table = None  # same rows as a 2-D numpy array
        self.data_wash_correct_dict = {}  # known-bad value -> corrected value
        self.rows = 0

    def _class_codes(self) -> list:
        """Return the distinct class codes present in the table, ascending.

        A sorted list (rather than a bare ``set``) gives ``classifier`` and
        ``Predict`` one well-defined ordering to agree on.
        """
        return sorted(set(self.data_np_table[:, -1].tolist()))

    def class_analysis(self):
        """Print the relative frequency of each class found in the table.

        The printed list is ordered by ascending class code.
        """
        class_data = self.data_np_table[:, -1]
        class_rate = [
            int(np.count_nonzero(class_data == class_number)) / self.rows
            for class_number in self._class_codes()
        ]
        print(class_rate)

    def data_correct(self, know_error: str) -> str:
        """Look up the correction registered for a known bad value.

        Raises:
            KeyError: If ``know_error`` has no registered correction.
        """
        return self.data_wash_correct_dict[know_error]

    def load_dataset(self, path: str):
        """Read a comma-separated training file and encode it into the table.

        Lines consisting only of a newline are skipped. A line that cannot be
        encoded is reported on stdout and skipped, so one bad row does not
        abort the whole load.

        Args:
            path: Path of a UTF-8 text file with one sample per line.
        """
        self.dataRaw.clear()  # drop rows from any previous load
        with open(path, 'r', encoding='utf8') as f:
            for line in f:
                if line == '\n':
                    continue
                _words = line.replace('\n', '').split(',')
                try:
                    map_encode_list = [
                        self.line_encoder(c_idx, wi)
                        for c_idx, wi in enumerate(_words)
                    ]
                except Exception as _e:
                    # Best effort: report the bad row and keep loading.
                    print(_e)
                    print('--> line data load failed --> {} '.format(_words))
                else:
                    self.dataRaw.append(map_encode_list)

        self.data_np_table = np.array(self.dataRaw)
        self.rows = self.data_np_table.shape[0]

    def line_encoder(self, c_idx: int, wi: str) -> int:
        """Translate the text value ``wi`` of column ``c_idx`` to its code.

        Raises:
            IndexError: If ``c_idx`` is beyond the configured columns.
            KeyError: If ``wi`` is not a known value of that column.
        """
        return self.fields_dict[c_idx][wi]

    def _probability_base(self, class_number: int, index_np_array: np.ndarray) -> float:
        """Score one class for an encoded sample.

        Computes ``P(c) * prod_i P(x_i | c) / prod_i P(x_i)`` from raw counts
        in the training table. The denominator is the product of the
        per-feature marginals, so the scores of all classes need not sum to 1.

        Args:
            class_number: Integer code of the class to score.
            index_np_array: 1-D array of encoded feature values.

        Returns:
            The score as a float; ``0.0`` when the class or one of the
            feature values never occurs in the training table (the ratio
            would otherwise be a division by zero).
        """
        fields_sum = index_np_array.shape[0]  # number of feature columns
        class_data = self.data_np_table[:, fields_sum]
        class_mask = class_data == class_number
        class_count = int(np.count_nonzero(class_mask))
        if class_count == 0:
            return 0.0

        p_denominator = 1
        p_numerator = class_count / self.rows  # prior P(class)
        for index_col in range(fields_sum):
            value_mask = self.data_np_table[:, index_col] == index_np_array[index_col]
            # Marginal P(x_i).
            p_denominator *= int(np.count_nonzero(value_mask)) / self.rows
            # Conditional P(x_i | class).
            both = int(np.count_nonzero(value_mask & class_mask))
            p_numerator *= both / class_count

        if p_denominator == 0:
            # A feature value unseen in training: numerator is 0 as well.
            return 0.0
        return p_numerator / p_denominator

    def classifier(self, index_np_array: np.ndarray) -> list:
        """Score every class present in the training table for one sample.

        Args:
            index_np_array: 1-D array of encoded feature values; its length
                must equal the number of table columns minus one.

        Returns:
            A list of scores ordered by ascending class code, or ``None``
            (after printing a message) when the sample has the wrong length.
        """
        expected = self.data_np_table.shape[1] - 1
        if index_np_array.shape[0] != expected:
            print('NP Array.Size Error~ size for: {}!={}'.format(index_np_array.shape[0], expected))
            return None

        return [
            self._probability_base(class_number, index_np_array)
            for class_number in self._class_codes()
        ]

    def Predict(self, data_describe: str) -> str:
        """Predict the class label of one comma-separated feature description.

        The per-class scores are printed before the label is returned.

        Args:
            data_describe: Feature values in column order, e.g.
                ``'sunny,cool,high,strong'``.

        Returns:
            The text label of the highest-scoring class, or ``None`` when the
            description cannot be encoded or has the wrong number of fields.
        """
        _words = data_describe.replace('\n', '').split(',')
        try:
            map_encode_list = [
                self.line_encoder(c_idx, wi) for c_idx, wi in enumerate(_words)
            ]
        except Exception as _e:
            print(_e)
            print('Data format wrong --> {}'.format(_words))
            return None

        class_probabilities = self.classifier(np.array(map_encode_list))
        if class_probabilities is None:
            return None
        print(class_probabilities)

        # Map the winning position back to a class *code* first, then to its
        # label, so a class that is missing from the training data cannot
        # shift the result onto the wrong label.
        best_position = class_probabilities.index(max(class_probabilities))
        best_code = self._class_codes()[best_position]
        label_by_code = {code: label for label, code in self.class_field_mapping.items()}
        return label_by_code[best_code]

    def batchPredict(self, path: str) -> list:
        """Predict a label for every non-empty line of a file.

        A line may carry a trailing class label (same layout as the training
        file); only that final field is dropped before prediction, so feature
        values that merely contain a label's text (``normal`` vs ``no``) are
        left intact.

        Args:
            path: Path of a UTF-8 text file with one sample per line.

        Returns:
            One entry per processed line, as returned by ``Predict``. Lines
            that raise are reported on stdout and contribute no entry.
        """
        results = []
        column_count = len(self.fields_dict)
        with open(path, 'r', encoding='utf8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    words = line.split(',')
                    if len(words) == column_count and words[-1] in self.class_field_mapping:
                        words = words[:-1]
                    results.append(self.Predict(','.join(words)))
                except Exception as _e:
                    # Best effort: report the bad line and continue.
                    print(_e)
                    print('--> line data load failed --> {} '.format(line))
        return results

# Field declarations: each dict maps one column's text values to integer
# codes, listed in the same order as the columns of the dataset file.

outlook_filed = {'sunny':0, 'rain':1, 'overcast':2}
temperature_filed = {'hot':0, 'mild':1, 'cool':2}
humidity_filed = {'high':0,'normal':1}
wind_filed = {'weak':0, 'strong':1 }
Playtennis_filed={'yes':0, 'no':1 }  # class (label) column, must be last

fileds = [outlook_filed, temperature_filed, humidity_filed, wind_filed, Playtennis_filed]
BS = Bayesion_Classifier_Builder(fileds)
# The dataset listed in the article is saved as input.txt, so load that file.
BS.load_dataset('input.txt')
BS.class_analysis()  # prints the class frequencies
# Query from the problem statement: Outlook=sunny, Temperature=cool,
# Humidity=high, Wind=strong.
print(BS.Predict('sunny,cool,high,strong'))

输出:

在这里插入图片描述

  • 1
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值