__author__ = 'HM'

# Load the whitespace-separated training table from data.txt.
# The first line is a header: every column except the last names an
# attribute, and the last column names the class.
data_set_raw = []              # one dict per record: attribute -> value, plus 'class_label'
class_label_pool = set()       # every distinct class label seen in the file
attribute_discrete_pool = {}   # attribute name -> set of distinct values seen

# `with` guarantees the file is closed, even if parsing raises.
with open('data.txt', 'r') as f:
    first_line = f.readline().split()
    attributes = first_line[:-1]
    attr_len = len(attributes)
    classname = first_line[-1]
    for line in f:
        raw_data = line.split()
        if not raw_data:
            # Blank line (e.g. a trailing newline at end of file): there is
            # no class label to read, so skip it instead of crashing.
            continue
        # The label is stored under the fixed key 'class_label' (not under
        # the header's class name) so later code can look it up directly.
        new_record = {'class_label': raw_data[-1]}
        class_label_pool.add(raw_data[-1])
        for i, attribute_name in enumerate(attributes):
            value = raw_data[i]
            new_record[attribute_name] = value
            attribute_discrete_pool.setdefault(attribute_name, set()).add(value)
        data_set_raw.append(new_record)

# Echo the parsed records. A single-argument print() call produces the same
# output on Python 2 and Python 3.
for d in data_set_raw:
    print(d)
def train_classifier(data):
    """Count class frequencies and per-class attribute-value frequencies.

    Reads the module-level ``class_label_pool``, ``attributes`` and
    ``attribute_discrete_pool`` to build zero-filled tables, then tallies
    every record in ``data``. No Laplacian correction is applied, so a
    value never seen together with a class keeps a count of 0.

    Args:
        data: iterable of records; each record is a dict with a
            'class_label' entry and one entry per name in ``attributes``.

    Returns:
        A tuple ``(attr_value_count, class_value_count)`` where
        ``attr_value_count[label][attribute][value]`` is the number of
        records of class ``label`` whose ``attribute`` equals ``value``
        (e.g. ``{'yes': {'credit_rating': {'fair': 10, 'excellent': 30}}}``)
        and ``class_value_count[label]`` is the number of records of that
        class (e.g. ``{'yes': 10, 'no': 20}``).

    Raises:
        KeyError: if a record carries a class label or attribute value
            that is absent from the module-level pools.
    """
    # Zero-filled table: class label -> attribute -> value -> 0.
    attr_value_count = {}
    for label in class_label_pool:
        per_attribute = {}
        for name in attributes:
            per_attribute[name] = dict.fromkeys(attribute_discrete_pool[name], 0)
        attr_value_count[label] = per_attribute
    print(attr_value_count)

    # Zero-filled table: class label -> 0.
    class_value_count = dict.fromkeys(class_label_pool, 0)

    # Tally every record under its own class label.
    for record in data:
        label = record['class_label']
        for name in attributes:
            attr_value_count[label][name][record[name]] += 1
        class_value_count[label] += 1

    return attr_value_count, class_value_count
def predict(data, dataset_len, attr_value_count, class_value_count):
    """Score every class for one record with naive Bayes (no smoothing).

    For each class ``c`` the score is ``P(c) * prod_k P(x_k | c)`` with
    ``P(c) = class_value_count[c] / dataset_len`` and
    ``P(x_k | c) = attr_value_count[c][k][x_k] / class_value_count[c]``.
    Intermediate values are printed as they are computed.

    Args:
        data: dict mapping attribute name -> value for the record to score.
        dataset_len: number of records the counts were collected from.
        attr_value_count: ``{class: {attribute: {value: count}}}``.
        class_value_count: ``{class: count}``.

    Returns:
        dict mapping each class in ``class_value_count`` to its
        (unnormalised) score. A class with no training records, or an
        empty training set, scores 0.0 instead of dividing by zero.

    Raises:
        KeyError: if ``data`` holds an attribute or value that is missing
            from ``attr_value_count``.
    """
    # Single-argument print() calls behave the same on Python 2 and 3.
    print(attr_value_count)
    print(class_value_count)
    p_c_x_table = {}
    # Iterate the classes of the model that was passed in, not a module-level
    # pool, so the function works with any trained model.
    for c in class_value_count:
        class_count = class_value_count[c]
        if not class_count or not dataset_len:
            # P(c) is 0 (or undefined for an empty training set) and
            # P(x | c) would divide by zero: the class cannot be predicted.
            p_c_x_table[c] = 0.0
            continue
        p_c = class_count / float(dataset_len)
        print('pc %s' % (p_c,))
        p_x_c = 1
        for key in data:
            value = data[key]
            value_count = attr_value_count[c][key][value]
            p_x_c *= value_count / float(class_count)
            print('p_x_c %s %s %s' % (p_x_c, value, value_count))
        p_c_x_table[c] = p_x_c * p_c
    print(p_c_x_table)
    # Hand the scores back to the caller; previously they were only printed.
    return p_c_x_table
# Example query: train on the full data set, then score one unlabeled record.
d = dict(age='<=30', income='medium', student='yes', credit_rating='fair')
predict(
    d,
    len(data_set_raw),
    *train_classifier(data_set_raw)
)
dataset (contents of data.txt):
age income student credit_rating buys_compute
<=30 high no fair no
<=30 high no excellent no
31…40 high no fair yes
>40 medium no fair yes
>40 low yes fair yes
>40 low yes excellent no
31…40 low yes excellent yes
<=30 medium no fair no
<=30 low yes fair yes
>40 medium yes fair yes
<=30 medium yes excellent yes
31…40 medium no excellent yes
31…40 high yes fair yes
>40 medium no excellent no
备注:目前表示数据的方式有点麻烦(各种字典嵌套字典……),需要找个方法优化。