《Python数据挖掘入门与实践》 (*Learning Data Mining with Python*), Robert Layton, 人民邮电出版社 (Posts & Telecom Press)
The OneR algorithm is (a small worked example follows this list):
- For each variable
  - For each value of the variable
    - The prediction based on this variable goes to the most frequent class
    - Compute the error of this prediction
  - Sum the prediction errors for all values of the variable
- Use the variable with the lowest total error
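Before the scikit-learn implementation below, here is a minimal, self-contained sketch of the same idea on a made-up four-sample dataset (toy_X, toy_y and oner_error are illustrative names, not from the book):

# OneR by hand: for each feature, count how many samples fall outside the
# majority class of their feature value, then pick the feature with the fewest errors.
from collections import Counter

toy_X = [[0, 0],
         [0, 1],
         [1, 0],
         [1, 1]]
toy_y = ['A', 'A', 'B', 'B']

def oner_error(feature):
    errors = 0
    for value in set(row[feature] for row in toy_X):
        # classes of the samples that have this feature value
        classes = [y for row, y in zip(toy_X, toy_y) if row[feature] == value]
        most_common_class, count = Counter(classes).most_common(1)[0]
        errors += len(classes) - count  # samples outside the majority class
    return errors

print([oner_error(f) for f in range(2)])  # [0, 2]: feature 0 predicts perfectly, so OneR picks it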
import numpy as np  # NumPy provides array and matrix operations
# Load our dataset
from sklearn.datasets import load_iris  # scikit-learn ships with the Iris classification dataset
#X, y = np.loadtxt("X_classification.txt"), np.loadtxt("y_classification.txt")
dataset = load_iris()  # load the built-in Iris dataset
X = dataset.data  # each row holds one plant's four feature measurements
y = dataset.target  # 0, 1 and 2 encode the three Iris species
print(dataset.DESCR)  # the dataset's description text
n_samples, n_features = X.shape  # shape gives the row and column counts of X
Discretization: a feature value becomes 1 if it is above that attribute's mean, and 0 if it is below.
# Compute the mean for each attribute
attribute_means = X.mean(axis=0)  # mean() with axis=0 averages each column, giving one mean per feature
assert attribute_means.shape == (n_features,)  # assert raises an AssertionError if the expression is false
X_d = np.array(X >= attribute_means, dtype='int')  # compare X against the means and cast the resulting booleans to int (0/1)
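As a quick sanity check (assuming the lines above have run), compare a raw sample with its discretized form:

print(X[0])    # e.g. [5.1 3.5 1.4 0.2]
print(X_d[0])  # 1 where the value is >= that column's mean, otherwise 0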
# Split the discretized dataset X_d into training and testing sets
from sklearn.model_selection import train_test_split  # note: the book imports this from sklearn.cross_validation, which was removed in scikit-learn 0.20
# Set the random state to the same number to get the same results as in the book
random_state = 14  # fixing the random seed makes the split reproducible, so results match the book's
X_train, X_test, y_train, y_test = train_test_split(X_d, y, random_state=random_state)
print("There are {} training samples".format(y_train.shape))
print("There are {} testing samples".format(y_test.shape))
from collections import defaultdict
from operator import itemgetter


def train(X, y_true, feature):
    """Computes the predictors and error for a given feature using the OneR algorithm

    For example, with petal length as the feature, this returns the predicted
    class for petal length above the mean and below the mean, plus the total error.

    Parameters
    ----------
    X: array [n_samples, n_features]
        The two dimensional array that holds the dataset. Each row is a sample,
        each column is a feature.
    y_true: array [n_samples,]
        The one dimensional array that holds the class values. Corresponds to X,
        such that y_true[i] is the class value for sample X[i].
    feature: int
        An integer corresponding to the index of the variable we wish to test.
        0 <= feature < n_features (here the feature indices are 0, 1, 2, 3)

    Returns
    -------
    predictors: dictionary of tuples: (value, prediction)
        For each item in the array, if the variable has a given value, make
        the given prediction.
    error: int
        The number of training samples that this rule predicts incorrectly.
    """
    # Check that feature is a valid index
    n_samples, n_features = X.shape
    assert 0 <= feature < n_features
    # Get all of the unique values this feature takes (here just 0 and 1)
    values = set(X[:, feature])
    # Stores the predictors dictionary that is returned
    predictors = dict()
    errors = []
    for current_value in values:
        most_frequent_class, error = train_feature_value(X, y_true, feature, current_value)
        # When the feature equals current_value, predict most_frequent_class
        predictors[current_value] = most_frequent_class
        errors.append(error)  # append() adds the per-value error to the list
    # Sum the per-value errors (here for values 0 and 1) to get this feature's total error
    total_error = sum(errors)
    return predictors, total_error  # ({value1: class1, value2: class2}, error)


def train_feature_value(X, y_true, feature, value):
    """Given one feature index and one feature value (e.g. petal length above
    the mean), return the most likely class and the number of errors."""
    # Count how many samples belong to class 0, 1 and 2 when the feature has this value
    class_counts = defaultdict(int)
    # zip pairs each sample with its class label
    for sample, y in zip(X, y_true):
        if sample[feature] == value:  # e.g. this sample's petal length is above the mean
            class_counts[y] += 1  # class_counts becomes {class: count, ...}
    # Sort by count (highest first) and take the most frequent class
    sorted_class_counts = sorted(class_counts.items(), key=itemgetter(1), reverse=True)
    most_frequent_class = sorted_class_counts[0][0]  # first element of the first (class, count) tuple
    # The error is the number of samples that have this feature value but do
    # *not* belong to the most frequent class
    error = sum([class_count for class_value, class_count in class_counts.items()
                 if class_value != most_frequent_class])
    return most_frequent_class, error
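To see what train returns, you can run it on a single feature; the exact numbers depend on the split, so this call is purely illustrative:

# Rule and error count for feature index 0 (sepal length, in the Iris ordering).
predictors, total_error = train(X_train, y_train, 0)
print(predictors, total_error)  # {0: majority class below the mean, 1: majority class above}, error count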
# Compute all of the predictors: {feature_index: ({value: class, ...}, error), ...}
all_predictors = {variable: train(X_train, y_train, variable)
                  for variable in range(X_train.shape[1])}  # X_train.shape[1] is the number of features (4)
# Pull out just the error of each feature: {feature_index: error, ...}
errors = {variable: error for variable, (mapping, error) in all_predictors.items()}
# Now choose the best and save that as "model"
# Sort by error and take the feature with the lowest one
best_variable, best_error = sorted(errors.items(), key=itemgetter(1))[0]
print("The best model is based on variable {0} and has error {1:.2f}".format(best_variable, best_error))
# Choose the best model: {'variable': feature_index, 'predictor': {value: class, ...}}
model = {'variable': best_variable,
         'predictor': all_predictors[best_variable][0]}
print(model)
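To compare all four features rather than only the winner, a short loop over all_predictors prints each rule and its error count (a debugging aid, not part of the book's code):

# Show every feature's rule and error, best first.
for variable, (mapping, error) in sorted(all_predictors.items(), key=lambda item: item[1][1]):
    print("feature {}: rule {} -> {} errors".format(variable, mapping, error))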
# Apply the trained model to the test set
def predict(X_test, model):
    variable = model['variable']  # the index of the chosen feature
    predictor = model['predictor']  # the {feature value: class} lookup table
    # For each sample, look up the prediction for its (discretized) value of the chosen feature
    y_predicted = np.array([predictor[int(sample[variable])] for sample in X_test])
    return y_predicted  # an array of predicted classes, e.g. array([0, 2, ...])

y_predicted = predict(X_test, model)
print(y_predicted)
accuracy = np.mean(y_predicted == y_test) * 100  # fraction of correct predictions, expressed as a percentage
print("The test accuracy is {:.1f}%".format(accuracy))
from sklearn.metrics import classification_report  # classification_report builds a text report of the main classification metrics (precision, recall, F1)
print(classification_report(y_test, y_predicted))