OneR算法(分类应用,寻找最佳的单个特征用于分类)
对每个特征,统计其每个取值下各类别的样本数,以出现次数最多的类别作为该取值的预测类别;该取值下不属于这一类别的样本数即为该取值的错误数。把一个特征各个取值的错误数相加,得到该特征的总错误数,选取总错误数(错误率)最低的特征作为唯一的分类准则(One Rule),用于接下来的分类。
//OneR
from collections import defaultdict
from operator import itemgetter
# Arguments: the dataset, the class-label array, the index of the chosen
# feature, and the feature value to evaluate.
def train_feature_value(X, y_true, feature_index, value):
    """Find the majority class among samples whose feature equals ``value``.

    Args:
        X: Iterable of samples; each sample is indexed with
            ``feature_index``.
        y_true: Iterable of class labels, paired with ``X`` via ``zip``.
        feature_index: Index of the feature to inspect in each sample.
        value: Feature value that selects which samples are counted.

    Returns:
        A ``(most_frequent_class, error)`` tuple. ``error`` is the number
        of selected samples whose label differs from
        ``most_frequent_class``. When several classes share the highest
        count, the one encountered first wins. If no sample has the given
        value, ``(None, 0)`` is returned.
    """
    class_counts = defaultdict(int)
    for sample, y in zip(X, y_true):
        if sample[feature_index] == value:
            class_counts[y] += 1

    if not class_counts:
        # No sample carries this value: nothing to predict and nothing to
        # misclassify, so report "no class" with zero errors.
        return None, 0

    # max() returns the first maximal item, which matches the tie-breaking
    # of a stable descending sort over the insertion-ordered counts.
    most_frequent_class, top_count = max(
        class_counts.items(), key=itemgetter(1)
    )
    # Every selected sample outside the majority class is one error.
    error = sum(class_counts.values()) - top_count
    return most_frequent_class, error
def train_on_feature(X, y_true, feature_index):
    """Build the per-value predictors for one feature and total their errors.

    Args:
        X: Dataset that supports 2-D slicing ``X[:, feature_index]`` (as a
            NumPy array does); it is also passed on to
            ``train_feature_value``.
        y_true: Class labels, passed on to ``train_feature_value``.
        feature_index: Index of the feature (column) to train on.

    Returns:
        A ``(predictors, total_error)`` tuple. ``predictors`` maps each
        distinct value of the feature to the most frequent class for that
        value, and ``total_error`` is the sum of the per-value error
        counts.
    """
    values = set(X[:, feature_index])
    predictors = {}
    total_error = 0
    for current_value in values:
        # train_feature_value returns a 2-tuple; unpacking it into three
        # targets would raise ValueError on every call.
        most_frequent_class, error = train_feature_value(
            X, y_true, feature_index, current_value
        )
        predictors[current_value] = most_frequent_class
        total_error += error
    return predictors, total_error