机器学习入门第四天

最新推荐文章于 2022-08-27 11:36:12 发布

weixin_43473360

最新推荐文章于 2022-08-27 11:36:12 发布

阅读量402

点赞数 1

分类专栏：机器学习文章标签：机器学习

本文链接：https://blog.csdn.net/weixin_43473360/article/details/89387795

版权

机器学习专栏收录该内容

3 篇文章 0 订阅

订阅专栏

分类报告

sklearn.metrics除了提供混淆矩阵（confusion_matrix）之外，还提供了分类报告相关API（classification_report），可以按类别得到查准率、召回率、f1得分以及样本数量（support）。这样可以方便地分析出模型在哪些类别上的预测效果较差。

import sklearn.metrics as sm
# Build the classification report from the true labels (实际输出)
# and the predicted labels (预测输出), then print it.
cr = sm.classification_report(实际输出, 预测输出)
print(cr)

决策树分类

决策树分类模型会找到与样本特征匹配的叶子节点，然后以投票的方式进行分类。

案例：基于决策树分类算法（下面的代码使用由多棵决策树组成的随机森林分类器），训练模型，预测小汽车（car.txt）等级。需要注意的是每一个特征都需要使用标签编码做预处理。

"""
汽车评估 案例
"""
import numpy as np
import sklearn.preprocessing as sp
import sklearn.ensemble as se
import sklearn.model_selection as ms

# Load the comma-separated car table as strings and transpose it so that
# iterating over `data` yields one column at a time.
data = np.loadtxt('../ml_data/car.txt',
                  delimiter=',', dtype='U10').T
# One LabelEncoder per column; they are kept so that new samples can be
# encoded with the same mapping later on.
encoders = [sp.LabelEncoder() for _ in data]
encoded_columns = [
    enc.fit_transform(column)
    for enc, column in zip(encoders, data)]
# Every column except the last is a feature; the last one is the target.
train_x = np.array(encoded_columns[:-1]).T
train_y = encoded_columns[-1]
# Random forest classifier, evaluated with 4-fold cross-validation
# (weighted f1) before being fitted on the full data set.
model = se.RandomForestClassifier(
    n_estimators=200, max_depth=6, random_state=7)
cv_scores = ms.cross_val_score(
    model, train_x, train_y, cv=4, scoring='f1_weighted')
print(cv_scores.mean())
model.fit(train_x, train_y)
# Hand-made samples: six feature values followed by the expected class.
# Transposed so that each row of `data` is one column of the table.
data = np.array([
    ['high', 'med', '5more', '4', 'big', 'low', 'unacc'],
    ['high', 'high', '4', '4', 'med', 'med', 'acc'],
    ['low', 'low', '2', '4', 'small', 'high', 'good'],
    ['low', 'med', '3', '4', 'med', 'high', 'vgood']]).T
# Re-use the encoders fitted on the training data; the model only
# understands the integer codes those encoders produced.
encoded_columns = [
    encoders[index].transform(column)
    for index, column in enumerate(data)]
test_x = np.array(encoded_columns[:-1]).T
test_y = encoded_columns[-1]
# Predict, then map the integer codes back to the original class names.
pred_test_y = model.predict(test_x)
label_encoder = encoders[-1]  # encoder of the last (target) column
print(label_encoder.inverse_transform(test_y))
print(label_encoder.inverse_transform(pred_test_y))

验证曲线

验证曲线 : 模型性能 = f(超参数)
验证曲线是超参数优选的一种解决方案，通过验证曲线相关API，可以得到相同模型在不同超参数下的模型性能得分，从而得知最优超参数。

import sklearn.model_selection as ms
# param_name and param_range are keyword-only in current scikit-learn
# releases; passing them positionally raises a TypeError there. The
# keyword form also works on older releases.
train_scores, test_scores = ms.validation_curve(
    model,                                    # estimator to evaluate
    输入集, 输出集,                             # inputs, outputs
    param_name='n_estimators',                # hyperparameter to sweep
    param_range=np.array([50, 60, 80, 100]),  # candidate values
    cv=5,                                     # number of folds
)

train_scores的结构：

超参数	CV1	CV2	CV3	CV4	CV5
50	0.973	0.894	0.944	0.924	0.914
100	0.914	0.924	0.934	0.954	0.934
…	…	…	…	…	…

test_scores的结构与上述结构一致。

案例：小汽车案例中使用验证曲线选择超参数。

```python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
"""
汽车评估 案例  验证曲线选择超参数
"""
import numpy as np
import sklearn.preprocessing as sp
import sklearn.ensemble as se
import sklearn.model_selection as ms
import matplotlib.pyplot as mp


# Load the comma-separated car table as strings and transpose it so that
# iterating over `data` yields one column at a time.
data = np.loadtxt('../ml_data/car.txt',
                  delimiter=',', dtype='U10').T
# One LabelEncoder per column, kept for encoding new samples later.
encoders = [sp.LabelEncoder() for _ in data]
encoded_columns = [
    enc.fit_transform(column)
    for enc, column in zip(encoders, data)]
# Every column except the last is a feature; the last one is the target.
train_x = np.array(encoded_columns[:-1]).T
train_y = encoded_columns[-1]


# Random forest with a fixed depth; n_estimators is the hyperparameter
# being swept.
model = se.RandomForestClassifier(
    max_depth=6, random_state=7)
# Candidate values: 50, 100, ..., 500.
n_estimators = np.arange(50, 550, 50)
# param_name / param_range must be passed by keyword: they are
# keyword-only in current scikit-learn releases.
train_scores, test_scores = \
    ms.validation_curve(
        model, train_x, train_y,
        param_name='n_estimators',
        param_range=n_estimators, cv=5)
# Mean score per candidate value (one row per candidate, one column
# per fold).
# NOTE(review): this averages the training-fold scores; test_scores
# holds the held-out scores and is usually the better basis for picking
# a hyperparameter - confirm which one is intended.
train_means = train_scores.mean(axis=1)
for p, s in zip(n_estimators, train_means):
    print(p, '->', s)

mp.figure('N_estimators', facecolor='lightgray')
mp.title('N_estimators', fontsize=16)
mp.xlabel('Estimators', fontsize=14)
mp.ylabel('Scores', fontsize=14)
mp.tick_params(labelsize=12)
mp.grid(linestyle=":")
mp.plot(n_estimators, train_means,
        c='dodgerblue', label='Valid Curve')
mp.legend()
mp.show()


# Random forest with a fixed number of trees; max_depth is the
# hyperparameter being swept.
model = se.RandomForestClassifier(
    n_estimators=200, random_state=7)
# Candidate depths: 1 .. 10.
max_depths = np.arange(1, 11)
# param_name / param_range must be passed by keyword: they are
# keyword-only in current scikit-learn releases.
train_scores, test_scores = \
    ms.validation_curve(
        model, train_x, train_y,
        param_name='max_depth',
        param_range=max_depths, cv=5)
# Mean score per candidate value (one row per candidate, one column
# per fold).
# NOTE(review): this averages the training-fold scores; test_scores
# holds the held-out scores and is usually the better basis for picking
# a hyperparameter - confirm which one is intended.
train_means = train_scores.mean(axis=1)
for p, s in zip(max_depths, train_means):
    print(p, '->', s)

mp.figure('Max_depth', facecolor='lightgray')
mp.title('Max_depth', fontsize=16)
mp.xlabel('Max_depths', fontsize=14)
mp.ylabel('Scores', fontsize=14)
mp.tick_params(labelsize=12)
mp.grid(linestyle=":")
mp.plot(max_depths, train_means,
        c='dodgerblue', label='Valid Curve')
mp.legend()
mp.show()

# Final random forest using the chosen hyperparameters; report the mean
# weighted f1 over 4 folds, then fit on the full training data.
model = se.RandomForestClassifier(
    n_estimators=200, max_depth=8, random_state=7)
cv_scores = ms.cross_val_score(
    model, train_x, train_y, cv=4, scoring='f1_weighted')
print(cv_scores.mean())
model.fit(train_x, train_y)
# Hand-made samples: six feature values followed by the expected class.
# Transposed so that each row of `data` is one column of the table.
data = np.array([
    ['high', 'med', '5more', '4', 'big', 'low', 'unacc'],
    ['high', 'high', '4', '4', 'med', 'med', 'acc'],
    ['low', 'low', '2', '4', 'small', 'high', 'good'],
    ['low', 'med', '3', '4', 'med', 'high', 'vgood']]).T
# Encode with the encoders fitted on the training data so the integer
# codes match what the model was trained on.
encoded_columns = [
    encoders[index].transform(column)
    for index, column in enumerate(data)]
test_x = np.array(encoded_columns[:-1]).T
test_y = encoded_columns[-1]
# Predict, then map the integer codes back to the original class names.
pred_test_y = model.predict(test_x)
label_encoder = encoders[-1]  # encoder of the last (target) column
print(label_encoder.inverse_transform(test_y))
print(label_encoder.inverse_transform(pred_test_y))

学习曲线

学习曲线：模型性能=f(训练集大小)

学习曲线相关API:

import sklearn.model_selection as ms
# Compute the learning curve. train_sizes has to be passed by keyword:
# the parameter that follows the outputs in learning_curve's signature
# is `groups`, so a positional list would not be read as training sizes.
_, train_scores, test_scores = ms.learning_curve(
    model,                        # estimator to evaluate
    输入集, 输出集,                 # inputs, outputs
    train_sizes=[0.9, 0.8, 0.7],  # training-set sizes
    cv=5,                         # number of folds
)

案例：

import numpy as np
import sklearn.preprocessing as sp
import sklearn.ensemble as se
import sklearn.model_selection as ms
import matplotlib.pyplot as mp


# Load the comma-separated car table as strings and transpose it so that
# iterating over `data` yields one column at a time.
data = np.loadtxt('../ml_data/car.txt',
                  delimiter=',', dtype='U10').T
# One LabelEncoder per column, kept alongside the encoded data.
encoders = [sp.LabelEncoder() for _ in data]
encoded_columns = [
    enc.fit_transform(column)
    for enc, column in zip(encoders, data)]
# Every column except the last is a feature; the last one is the target.
train_x = np.array(encoded_columns[:-1]).T
train_y = encoded_columns[-1]


# Random forest whose score is tracked as the training set grows.
model = se.RandomForestClassifier(
    n_estimators=200, max_depth=8, random_state=7)
# Ten training-set sizes, from 10% to 100%.
train_sizes = np.linspace(0.1, 1, 10)
_, train_scores, test_scores = ms.learning_curve(
    model, train_x, train_y,
    train_sizes=train_sizes, cv=5)
# Average the training scores over the folds: one value per size.
train_means = np.mean(train_scores, axis=1)
for fraction, mean_score in zip(train_sizes, train_means):
    print(fraction, '->', mean_score)

# Plot the mean score against the training-set size.
mp.figure('Learning Curve')
mp.title('Learning Curve', fontsize=16)
mp.xlabel('Train Size', fontsize=12)
mp.ylabel('Curve Score', fontsize=12)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.plot(
    train_sizes, train_means, 'o-',
    c='dodgerblue', label='Curve')
mp.legend()
mp.show()

支持向量机(SVM)

支持向量机原理

核心理念：寻求最优分类边界

正确：对大部分样本可以正确的划分类别。

泛化：最大化支持向量间距。

公平：类别与支持向量等距。

简单：线性，直线、平面做类别分割。
基于核函数的升维变换

对于难以分类的一些样本，执行基于核函数的升维变换。增加新的特征，使得低维度空间中的线性不可分问题，变成高维度空间中的线性可分问题。

SVM提供的常用核函数

线性核函数 linear
多项式核函数 poly
径向基核函数 rbf

线性核函数

不通过核函数进行维度提升，仅在原始维度空间中寻求线性分类边界。

基于线性核函数的SVM分类器：

import sklearn.svm as svm
# Linear-kernel support vector classifier, fitted on the training data.
model = svm.SVC(kernel='linear')
model.fit(train_x, train_y)

案例：使用线性核函数的SVM训练multiple2.txt。

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
"""
demo05_svm_linear.py 线性核函数的SVM
"""
import numpy as np
import sklearn.model_selection as ms
import sklearn.svm as svm
import sklearn.metrics as sm
import matplotlib.pyplot as mp

# Load the samples: every column but the last is a feature, the last
# column is the class label.
data = np.loadtxt('../ml_data/multiple2.txt',
                  delimiter=',', dtype='f8')
x, y = data[:, :-1], data[:, -1]

# Hold out 25% of the samples for testing (fixed seed for repeatability).
train_x, test_x, train_y, test_y = ms.train_test_split(
    x, y, test_size=0.25, random_state=5)

# Support vector classifier with a linear kernel.
model = svm.SVC(kernel='linear').fit(train_x, train_y)

# Predict the class of every point on an n x n grid that covers the
# data range (padded by 1 on each side) to visualise the boundary.
n = 500
l, r = x[:, 0].min() - 1, x[:, 0].max() + 1
b, t = x[:, 1].min() - 1, x[:, 1].max() + 1
# The vertical axis must span the second feature's range (b..t); using
# l..r for both axes misplaces the grid whenever the two features have
# different ranges.
grid_x, grid_y = np.meshgrid(
    np.linspace(l, r, n),
    np.linspace(b, t, n))
samples = np.column_stack(
    (grid_x.ravel(), grid_y.ravel()))
grid_z = model.predict(samples)
grid_z = grid_z.reshape(grid_x.shape)
# Predict the held-out test samples and print the classification report.
pred_test_y = model.predict(test_x)
cr = sm.classification_report(
    test_y, pred_test_y)
print(cr)

# Draw the classification boundary.
mp.figure('SVM Linear Classification', facecolor='lightgray')
mp.title('SVM Linear Classification', fontsize=16)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=12)
# Background: the predicted class of every grid point.
mp.pcolormesh(grid_x, grid_y, grid_z,
              cmap='gray')
# Foreground: the test samples, coloured by their actual labels.
mp.scatter(test_x[:, 0], test_x[:, 1],
           c=test_y, cmap='brg', s=80)
mp.show()

多项式核函数

通过多项式函数增加原始样本特征的高次方幂作为新的特征，在新的思考维度中对样本进行分类。

# Polynomial-kernel SVM classifier; degree=3 sets the polynomial degree.
model = svm.SVC(kernel='poly', degree=3)
model.fit(train_x, train_y)

径向基核函数

通过高斯分布函数增加原始样本特征的分布概率。

# Support vector classifier with a radial basis function (RBF) kernel.
# C: regularization parameter; the regularization strength is inversely
#    proportional to C, so a larger C means weaker regularization.
# gamma: RBF kernel coefficient. It is not a standard deviation: a
#    larger gamma gives a narrower kernel.
model = svm.SVC(kernel='rbf', C=600, gamma=0.01)
model.fit(train_x, train_y)

样本类别均衡化

通过类别均衡化，使所占比例较小的样本权重较高，而所占比例较大的样本权重较低。以此平均化不同类别样本对分类模型的贡献，提高模型精度。

# Pass class_weight='balanced' to balance the classes: samples of the
# smaller classes get a higher weight, samples of the larger classes a
# lower one.
model = svm.SVC(kernel='linear', class_weight='balanced')
model.fit(train_x, train_y)

案例：基于径向基核函数并启用样本类别均衡化，对imbalance.txt进行训练

# RBF-kernel support vector classifier with class balancing enabled.
model = svm.SVC(kernel='rbf', C=1000, gamma=0.01,
                class_weight='balanced')
model.fit(train_x, train_y)

置信概率

根据样本与分类边界的距离远近，对其预测类别的可信程度进行量化，离边界越近的样本，置信概率越低，反之，离边界越远的样本，置信概率越高。

获取每个样本的置信概率API：

# probability=True must be given when the SVC is constructed, otherwise
# predict_proba is not available. The kernel has to be a valid name
# ('rbf' here, since gamma is supplied), and the object must be bound to
# the same name that is used for fitting and predicting.
model = svm.SVC(kernel='rbf', C=1, gamma=0.01, probability=True)
model.fit(train_x, train_y)
# Confidence probabilities: one row per sample, one column per class.
置信概率矩阵 = model.predict_proba(test_x)

置信概率矩阵格式：

	类别1	类别2
样本1	0.8	0.2
样本2	0.6	0.4

案例：修改径向基核函数的SVM案例，新增测试样本，输出置信概率。

# Extra probe points: predict their class, print the per-class
# confidence probabilities and draw them on the current figure.
prob_x = np.array([
    [2, 1.5], [8, 9], [4.8, 5.2], [4, 4],
    [2.5, 7], [7.6, 2], [5.4, 5.9]])
pred_prob_y = model.predict(prob_x)
probs = model.predict_proba(prob_x)
print(probs)

mp.scatter(
    prob_x[:, 0], prob_x[:, 1],
    c=pred_prob_y, cmap='jet_r', s=80, marker='D')
# Label every probe point with the probabilities of the first two
# classes, as percentages rounded to two decimals.
label_box = {'boxstyle': 'round,pad=0.6',
             'fc': 'orange', 'alpha': 0.8}
for point, class_probs in zip(prob_x, probs):
    first_pct = round(class_probs[0] * 100, 2)
    second_pct = round(class_probs[1] * 100, 2)
    mp.annotate(
        '{}% {}%'.format(first_pct, second_pct),
        xy=(point[0], point[1]),
        xytext=(12, -12),
        textcoords='offset points',
        fontsize=9,
        bbox=label_box)

网格搜索

获取一个最优超参数的方式可以绘制验证曲线，但是验证曲线只能每次获取一个最优超参数。如果多个超参数有很多排列组合的话，就可以使用网格搜索寻求最优超参数组合。

在网格搜索过程中，针对每一个超参数组合，实例化给定的模型，做cv次交叉验证，将其中平均得分最高的超参数组合作为最佳选择，实例化模型对象。评分方式由scoring参数指定（例如scoring='f1_weighted'）；未指定时使用模型自身的score方法（分类器为准确率）。

网格搜索相关API：

import sklearn.model_selection as ms
# GridSearchCV wraps the original estimator. The search itself only runs
# when fit() is called; after that the object predicts with the best
# parameter combination that was found.
model = ms.GridSearchCV(
    model,           # original estimator
    超参数组合列表,    # every candidate combination, given as a list
    cv=5,            # number of cross-validation folds
)
model.fit(输入集, 输出集)
# Every parameter combination that was tried during the search
model.cv_results_['params']
# Mean test score of each of those combinations
model.cv_results_['mean_test_score']
# Best results of the search
model.best_params_      # best parameter combination
model.best_score_       # best mean score (the attribute is singular)
model.best_estimator_   # estimator configured with the best parameters

案例：修改置信概率案例，基于网格搜索寻找最优超参数。

# Candidate hyperparameters, grouped per kernel; the grid search tries
# every combination inside each group.
param_grid = [
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
    {'kernel': ['poly'], 'C': [1], 'degree': [2, 3]},
    {'kernel': ['rbf'], 'C': [1, 10, 100, 1000],
     'gamma': [1, 0.1, 0.01, 0.001]},
]
# Wrap a default SVC in a 5-fold grid search and run it.
model = ms.GridSearchCV(svm.SVC(), param_grid, cv=5)
model.fit(train_x, train_y)

# Best combination, its score, and the corresponding estimator.
for best_item in (model.best_params_,
                  model.best_score_,
                  model.best_estimator_):
    print(best_item)

# Details of the search: every combination with its mean test score.
search_results = model.cv_results_
for combination, mean_score in zip(
        search_results['params'],
        search_results['mean_test_score']):
    print(combination, '->', mean_score)