问题:已知
day | outlook | temperature | humidity | wind | Playtennis |
---|---|---|---|---|---|
1 | sunny | hot | high | weak | no |
2 | sunny | hot | high | strong | no |
3 | overcast | hot | high | weak | yes |
4 | rain | mild | high | weak | yes |
5 | rain | cool | normal | weak | yes |
6 | rain | cool | normal | strong | no |
7 | overcast | cool | normal | strong | yes |
8 | sunny | mild | high | weak | no |
9 | sunny | cool | normal | weak | yes |
10 | rain | mild | normal | weak | yes |
11 | sunny | mild | normal | strong | yes |
12 | overcast | mild | high | strong | yes |
13 | overcast | hot | normal | weak | yes |
14 | rain | mild | high | strong | no |
问:当出现 <Outlook=sunny, Temperature=cool, Humidity=high, Wind=strong> 时,
这一天是否适合打网球?
数据集:
sunny,hot,high,weak,no
sunny,hot,high,strong,no
overcast,hot,high,weak,yes
rain,mild,high,weak,yes
rain,cool,normal,weak,yes
rain,cool,normal,strong,no
overcast,cool,normal,strong,yes
sunny,mild,high,weak,no
sunny,cool,normal,weak,yes
rain,mild,normal,weak,yes
sunny,mild,normal,strong,yes
overcast,mild,high,strong,yes
overcast,hot,normal,weak,yes
rain,mild,high,strong,no
上面数据保存为 input.txt
代码:
import numpy as np
""" This is JintuZheng's Bayesion Classifier homework.
"""
class Bayesion_Classifier_Builder(object):
    """Naive Bayes classifier over categorical attributes encoded as integers.

    ``fields_dict`` is a sequence of dicts, one per table column in file
    order. Each dict maps a category name to its integer code. The last
    entry describes the class (label) column.

    Call :meth:`load_dataset` before any analysis or prediction method.
    """

    def __init__(self, fields_dict):
        self.fields_dict = fields_dict
        # Mapping {label name: code} of the class column (last field).
        self.class_field_mapping = self.fields_dict[-1]
        self.dataRaw = []  # encoded rows as plain lists
        self.data_np_table = None  # encoded rows as a 2-D numpy table
        self.data_wash_correct_dict = {}  # known-bad value -> corrected value
        self.rows = 0

    def _class_codes(self) -> list:
        """Return the class codes present in the table, in ascending order."""
        return sorted(set(self.data_np_table[:, -1].tolist()))

    def class_analysis(self):  # Category distribution analysis
        """Print the relative frequency of every class present in the table."""
        class_data = self.data_np_table[:, -1]
        class_rate = [
            int(np.count_nonzero(class_data == class_number)) / self.rows
            for class_number in self._class_codes()
        ]
        print(class_rate)

    def data_correct(self, know_error: str) -> str:
        """Return the registered correction for a known-bad value.

        Raises:
            KeyError: if ``know_error`` has no entry in
                ``data_wash_correct_dict``.
        """
        return self.data_wash_correct_dict[know_error]

    def load_dataset(self, path: str):
        """Read a comma-separated file and replace the encoded table with it.

        Blank lines are skipped. A line that has the wrong number of columns
        or contains an unknown category is reported on stdout and skipped.
        """
        self.dataRaw.clear()  # clear all data Raw
        expected_cols = len(self.fields_dict)
        with open(path, 'r', encoding='utf8') as f:
            for line in f:
                if not line.strip():
                    continue
                _words = [w.strip() for w in line.strip().split(',')]
                try:
                    if len(_words) != expected_cols:
                        # Too few columns would otherwise build a ragged table.
                        raise ValueError(
                            'expected {} columns, got {}'.format(
                                expected_cols, len(_words)))
                    self.dataRaw.append(
                        [self.line_encoder(c_idx, wi)
                         for c_idx, wi in enumerate(_words)])
                except (KeyError, IndexError, ValueError) as _e:
                    print(_e)
                    print('--> line data load failed --> {} '.format(_words))
        if self.dataRaw:
            self.data_np_table = np.array(self.dataRaw)
        else:
            # Keep the table 2-D so column slicing still works with no rows.
            self.data_np_table = np.empty((0, expected_cols), dtype=int)
        self.rows = self.data_np_table.shape[0]

    def line_encoder(self, c_idx: int, wi: str) -> int:
        """Return the integer code of category ``wi`` in column ``c_idx``.

        Raises:
            IndexError: if ``c_idx`` is not a valid column index.
            KeyError: if ``wi`` is not a known category of that column.
        """
        return self.fields_dict[c_idx][wi]

    def _probability_base(self, class_number: int, index_np_array: np.ndarray) -> float:
        """Return P(class) * prod P(x_i | class) / prod P(x_i) for one class.

        ``index_np_array`` holds the encoded attribute values, one per
        attribute column. Returns 0.0 when the class or one of the attribute
        values never occurs in the table (the numerator is zero there too),
        instead of dividing by zero.
        """
        fields_sum = index_np_array.shape[0]  # Tables' cols -1
        class_data = self.data_np_table[:, fields_sum]  # class data
        in_class = class_data == class_number
        class_count = int(np.count_nonzero(in_class))
        if self.rows == 0 or class_count == 0:
            return 0.0

        p_denominator = 1  # prod P(x_i)
        p_numerator = class_count / self.rows  # P(class)
        for index_col in range(fields_sum):
            matches = self.data_np_table[:, index_col] == index_np_array[index_col]
            p_denominator *= int(np.count_nonzero(matches)) / self.rows
            p_numerator *= int(np.count_nonzero(matches & in_class)) / class_count
        if p_denominator == 0:
            return 0.0
        return p_numerator / p_denominator

    def classifier(self, index_np_array: np.ndarray) -> list:
        """Return one score per class present in the table.

        Scores are ordered by ascending class code. Returns ``None`` (after
        printing a message) when the number of attribute values does not
        match the number of attribute columns.
        """
        expected = self.data_np_table.shape[1] - 1
        if index_np_array.shape[0] != expected:
            print('NP Array.Size Error~ size for: {}!={}'.format(
                index_np_array.shape[0], expected))
            return None
        return [self._probability_base(class_number, index_np_array)
                for class_number in self._class_codes()]

    def Predict(self, data_describe: str) -> str:
        """Predict the class label for one comma-separated attribute line.

        Prints the per-class scores. Returns the label name, or ``None`` when
        the line cannot be encoded, has the wrong number of attributes, or
        the table holds no rows.
        """
        _words = [w.strip() for w in data_describe.strip().split(',')]
        try:  # Data format's change
            map_encode_list = [self.line_encoder(c_idx, wi)
                               for c_idx, wi in enumerate(_words)]
        except (KeyError, IndexError) as _e:
            print(_e)
            print('Data format wrong --> {}'.format(_words))
            return None
        # calculate all probabilities for all classes
        class_probabilities = self.classifier(np.array(map_encode_list))
        if not class_probabilities:
            return None
        print(class_probabilities)
        # Translate position -> class code -> label; positions follow the
        # sorted class codes, not the insertion order of the label mapping.
        best_code = self._class_codes()[
            class_probabilities.index(max(class_probabilities))]
        code_to_label = {code: label
                         for label, code in self.class_field_mapping.items()}
        return code_to_label[best_code]

    def batchPredict(self, path: str) -> list:
        """Predict a label for every non-blank line of a file.

        A trailing class label, if present, is dropped before prediction.
        Only the last column of a full-width row is treated as the label, so
        attribute values that merely start with a label name (``normal`` vs
        ``no``) are left intact.
        """
        results = []
        n_cols = len(self.fields_dict)
        with open(path, 'r', encoding='utf8') as f:
            for line in f:
                if not line.strip():
                    continue
                try:
                    words = [w.strip() for w in line.strip().split(',')]
                    if len(words) == n_cols and words[-1] in self.class_field_mapping:
                        words = words[:-1]
                    results.append(self.Predict(','.join(words)))
                except Exception as _e:
                    # Best-effort batch: report the failing line and carry on.
                    print(_e)
                    print('--> line data load failed --> {} '.format(line))
        return results
# Attribute declarations: each dict maps one column's category names to the
# integer codes used in the encoded table. The last dict is the class column.
outlook_filed = {'sunny':0, 'rain':1, 'overcast':2}
temperature_filed = {'hot':0, 'mild':1, 'cool':2}
humidity_filed = {'high':0,'normal':1}
wind_filed = {'weak':0, 'strong':1 }
Playtennis_filed={'yes':0, 'no':1 }
# Column order must match the column order of the data file.
fileds = [outlook_filed, temperature_filed, humidity_filed, wind_filed, Playtennis_filed]
BS = Bayesion_Classifier_Builder(fileds)
# The dataset listed earlier in this document is saved as input.txt, so load
# that file (the previous 'input3.txt' did not match the documented name).
BS.load_dataset('input.txt')
BS.class_analysis()
# Query from the problem statement: sunny, cool, high humidity, strong wind.
print(BS.Predict('sunny,cool,high,strong'))
输出: