Decision-Tree-Based Classification
1. Voting-Based Decisions
Decision-tree-based classification differs from decision-tree-based regression only in how the prediction is produced: by voting rather than by averaging.
...
matched sub-table
... A \
... B |
... A |
... B  > A:2 < B:4
... B |
... B /
x -> B
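As a quick illustration of the voting rule above, here is a minimal sketch (the per-tree predictions are hypothetical) showing how the majority class becomes the forest's prediction:

# Minimal sketch: each tree in the forest casts one vote for a class,
# and the most common class wins.
from collections import Counter

tree_votes = ['A', 'B', 'A', 'B', 'B', 'B']  # hypothetical per-tree outputs
winner, count = Counter(tree_votes).most_common(1)[0]
print(winner, count)  # B 4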
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
import sklearn.preprocessing as sp
import sklearn.ensemble as se
import sklearn.model_selection as ms
data = []
with open('../../data/car.txt', 'r') as f:
    for line in f.readlines():
        data.append(line[:-1].split(','))
data = np.array(data).T
# One label encoder per column: after transposing, the leading rows
# are feature columns and the last row is the class label
encoders, train_x = [], []
for row in range(len(data)):
    encoder = sp.LabelEncoder()
    if row < len(data) - 1:
        train_x.append(
            encoder.fit_transform(data[row]))
    else:
        train_y = encoder.fit_transform(data[row])
    encoders.append(encoder)
train_x = np.array(train_x).T
# Random forest classifier
model = se.RandomForestClassifier(
    max_depth=8, n_estimators=200, random_state=7)
print(ms.cross_val_score(
    model, train_x, train_y, cv=5,
    scoring='f1_weighted').mean())
model.fit(train_x, train_y)
data = [
    ['high', 'med', '5more', '4', 'big', 'low'],
    ['high', 'high', '4', '4', 'med', 'med'],
    ['low', 'low', '2', '2', 'small', 'high'],
    ['low', 'med', '4', '4', 'med', 'high']]
data = np.array(data).T
# Encode the new samples with the encoders fitted on the training data
test_x = []
for row in range(len(data)):
    encoder = encoders[row]
    test_x.append(encoder.transform(data[row]))
test_x = np.array(test_x).T
pred_test_y = model.predict(test_x)
# Map the encoded predictions back to class-label strings
print(encoders[-1].inverse_transform(pred_test_y))
2. Validation Curves
model = ModelClass(hyperparameter sequence)
                  \______________________/
          performance | choice
            validation curve
ms.validation_curve(
    model, inputs, outputs, param_name, param_range,
    cv=number of folds) -> train F1-score matrix, test F1-score matrix
Matrix layout: one row per hyperparameter value, one column per cross-validation fold.
If the test score is high enough, the hyperparameter value is ideal. If the test score is low and the training score is also low, the model is underfitting; conversely, if the test score is low but the training score is high, the model is overfitting.
max_depth    1     2     3     4     5    mean
    4      0.89  0.99  0.76  0.89  0.92  ----> 0.89
    5                                          0.88
    6                                          0.92 *
    7                                          0.83
    8                                          0.79
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
import sklearn.preprocessing as sp
import sklearn.ensemble as se
import sklearn.model_selection as ms
import matplotlib.pyplot as mp
data = []
with open('../../data/car.txt', 'r') as f:
    for line in f.readlines():
        data.append(line[:-1].split(','))
data = np.array(data).T
encoders, train_x = [], []
for row in range(len(data)):
    encoder = sp.LabelEncoder()
    if row < len(data) - 1:
        train_x.append(
            encoder.fit_transform(data[row]))
    else:
        train_y = encoder.fit_transform(data[row])
    encoders.append(encoder)
train_x = np.array(train_x).T
# Validation curve over the n_estimators hyperparameter
model = se.RandomForestClassifier(
    max_depth=8, random_state=7)
n_estimators = np.arange(50, 550, 10)
_, test_scores = ms.validation_curve(
    model, train_x, train_y, param_name='n_estimators',
    param_range=n_estimators, cv=5)
test_means1 = test_scores.mean(axis=1)
print(n_estimators[test_means1.argmax()])
# Validation curve over the max_depth hyperparameter
model = se.RandomForestClassifier(
    n_estimators=490, random_state=7)
max_depth = np.arange(1, 11)
_, test_scores = ms.validation_curve(
    model, train_x, train_y, param_name='max_depth',
    param_range=max_depth, cv=5)
test_means2 = test_scores.mean(axis=1)
print(max_depth[test_means2.argmax()])
mp.figure('n_estimators', facecolor='lightgray')
mp.title('n_estimators', fontsize=20)
mp.xlabel('n_estimators', fontsize=14)
mp.ylabel('F1 Score', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.plot(n_estimators, test_means1, 'o-',
        c='dodgerblue', label='Testing')
mp.legend()
mp.figure('max_depth', facecolor='lightgray')
mp.title('max_depth', fontsize=20)
mp.xlabel('max_depth', fontsize=14)
mp.ylabel('F1 Score', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.plot(max_depth, test_means2, 'o-',
        c='orangered', label='Testing')
mp.legend()
mp.show()
3. Learning Curves
model.fit(training inputs, training outputs)
         \_________________________________/
          performance | size
             learning curve
Learning curves help reveal underfitting and overfitting problems.
ms.learning_curve(
    model, inputs, outputs, train_sizes=training-set size sequence,
    cv=number of folds) -> training-set sample counts,
    train F1-score matrix, test F1-score matrix
Matrix layout: one row per training-set size, one column per cross-validation fold.
If the test score is high enough, the training-set size is ideal. If the test score is low and the training score is also low, the model is underfitting; conversely, if the test score is low but the training score is high, the model is overfitting.
train size   1     2     3     4     5    mean
   10%     0.89  0.99  0.76  0.89  0.92  ----> 0.89
   20%                                         0.88
   30%                                         0.92 *
   40%                                         0.83
   50%                                         0.79
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
import sklearn.preprocessing as sp
import sklearn.ensemble as se
import sklearn.model_selection as ms
import matplotlib.pyplot as mp
data = []
with open('../../data/car.txt', 'r') as f:
    for line in f.readlines():
        data.append(line[:-1].split(','))
data = np.array(data).T
encoders, train_x = [], []
for row in range(len(data)):
    encoder = sp.LabelEncoder()
    if row < len(data) - 1:
        train_x.append(
            encoder.fit_transform(data[row]))
    else:
        train_y = encoder.fit_transform(data[row])
    encoders.append(encoder)
train_x = np.array(train_x).T
# Learning curve over a range of training-set sizes
model = se.RandomForestClassifier(
    max_depth=9, n_estimators=490, random_state=7)
train_sizes = np.linspace(0.1, 1, 10)
train_sizes, train_scores, test_scores = \
    ms.learning_curve(
        model, train_x, train_y,
        train_sizes=train_sizes, cv=5)
train_means = train_scores.mean(axis=1)
test_means = test_scores.mean(axis=1)
mp.figure('Learning Curve', facecolor='lightgray')
mp.title('Learning Curve', fontsize=20)
mp.xlabel('Train Size', fontsize=14)
mp.ylabel('F1 Score', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.plot(train_sizes, train_means, 'o-',
        c='dodgerblue', label='Training')
mp.plot(train_sizes, test_means, 'o-',
        c='orangered', label='Testing')
mp.legend()
mp.show()
Support Vector Machine (SVM) Classification
1. Principles
1) Correctness \
2) Safety      |  SVM classification looks for the optimal
3) Fairness    |  class boundary satisfying these four principles
4) Simplicity  /
Correctness: the training samples are classified correctly.
Safety: the margin between the support vectors is maximized.
Fairness: the boundary is equidistant from the support vectors.
Simplicity: the boundary is a linear hyperplane.
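To make the "support vector" idea concrete, here is a minimal sketch (the toy samples are invented for illustration) that fits a linear SVM and inspects the support vectors that pin down the maximum-margin boundary:

# Minimal sketch with invented toy data: after fitting, a linear SVM
# exposes its support vectors, the samples that alone determine the
# maximum-margin boundary.
import numpy as np
import sklearn.svm as svm

x = np.array([[1, 1], [2, 1], [1, 2],   # class 0
              [4, 4], [5, 4], [4, 5]])  # class 1
y = np.array([0, 0, 0, 1, 1, 1])
model = svm.SVC(kernel='linear')
model.fit(x, y)
print(model.support_vectors_)  # only these samples shape the boundary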
2. Linear Classification
import sklearn.svm as svm
model = svm.SVC(kernel='linear')
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
import sklearn.model_selection as ms
import sklearn.svm as svm
import sklearn.metrics as sm
import matplotlib.pyplot as mp
x, y = [], []
with open('../../data/multiple2.txt', 'r') as f:
    for line in f.readlines():
        data = [float(substr) for substr
                in line.split(',')]
        x.append(data[:-1])
        y.append(data[-1])
x = np.array(x)
y = np.array(y, dtype=int)
# Split into training and test sets
train_x, test_x, train_y, test_y = \
    ms.train_test_split(x, y, test_size=0.25,
                        random_state=5)
# Create a linear SVM classifier
model = svm.SVC(kernel='linear')
# Train the classifier
model.fit(train_x, train_y)
# Horizontal bounds and step of the grid
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
# Vertical bounds and step of the grid
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
# Build a 2-D grid of points
# _ grid_x
# ^ |h| /
# t | * * * *
#   | * * * *-- v
# b | * * * *--
#   +-------->
#   l        r
grid_x = np.meshgrid(np.arange(l, r, h),
                     np.arange(b, t, v))
# Stack each grid point's horizontal and vertical coordinates
# into a two-column sample array
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
# Predict the class of every grid point with the SVM model
flat_y = model.predict(flat_x)
# Reshape the flat predictions back to the grid's 2-D shape
grid_y = flat_y.reshape(grid_x[0].shape)
# Predict classes for the test set
pred_test_y = model.predict(test_x)
# Print the classification report
print(sm.classification_report(test_y, pred_test_y))
# Plot the training samples and the decision boundary
mp.figure('SVM Linear Classification',
          facecolor='lightgray')
mp.title('SVM Linear Classification', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
C0, C1 = y == 0, y == 1
mp.scatter(x[C0][:, 0], x[C0][:, 1],
           c='orangered', s=60)
mp.scatter(x[C1][:, 0], x[C1][:, 1],
           c='limegreen', s=60)
mp.show()
3. Nonlinear Classification
Samples that cannot be separated linearly in the original feature space can be lifted to a higher-dimensional space by a kernel function, in the hope that a linear split becomes possible there.
kernel='poly'  polynomial kernel:
    y = w0 + w1*x + w2*x^2 + ... + wn*x^n
kernel='rbf'   radial basis function kernel, based on the Gaussian (normal) probability density function
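As a concrete illustration (this sketch is mine, not from the original notes), the RBF kernel scores the similarity of two samples as exp(-gamma * ||x1 - x2||^2), so a point is maximally similar to itself and similarity decays with distance:

# Minimal sketch: compute an RBF kernel value by hand.
# k(x1, x2) = exp(-gamma * ||x1 - x2||^2); gamma controls how fast
# similarity decays with squared distance.
import numpy as np

def rbf_kernel(x1, x2, gamma=0.01):
    return np.exp(-gamma * np.sum((x1 - x2) ** 2))

a, b = np.array([2.0, 1.5]), np.array([8.0, 9.0])
print(rbf_kernel(a, a))  # 1.0: identical points are maximally similar
print(rbf_kernel(a, b))  # smaller for distant points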
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
import sklearn.model_selection as ms
import sklearn.svm as svm
import sklearn.metrics as sm
import matplotlib.pyplot as mp
x, y = [], []
with open('../../data/multiple2.txt', 'r') as f:
    for line in f.readlines():
        data = [float(substr) for substr
                in line.split(',')]
        x.append(data[:-1])
        y.append(data[-1])
x = np.array(x)
y = np.array(y, dtype=int)
# Split into training and test sets
train_x, test_x, train_y, test_y = \
    ms.train_test_split(x, y, test_size=0.25,
                        random_state=5)
# Create a polynomial-kernel SVM classifier; degree=3 implicitly
# expands the two features into all monomials up to degree 3:
# x1, x2 ->
#     x1, x2,
#     x1^2, x1x2, x2^2,
#     x1^3, x1^2x2, x1x2^2, x2^3
model = svm.SVC(kernel='poly', degree=3)
# Train the classifier
model.fit(train_x, train_y)
# Horizontal bounds and step of the grid
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
# Vertical bounds and step of the grid
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
# Build a 2-D grid of points
# _ grid_x
# ^ |h| /
# t | * * * *
#   | * * * *-- v
# b | * * * *--
#   +-------->
#   l        r
grid_x = np.meshgrid(np.arange(l, r, h),
                     np.arange(b, t, v))
# Stack each grid point's horizontal and vertical coordinates
# into a two-column sample array
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
# Predict the class of every grid point with the SVM model
flat_y = model.predict(flat_x)
# Reshape the flat predictions back to the grid's 2-D shape
grid_y = flat_y.reshape(grid_x[0].shape)
# Predict classes for the test set
pred_test_y = model.predict(test_x)
# Print the classification report
print(sm.classification_report(test_y, pred_test_y))
# Plot the training samples and the decision boundary
mp.figure('SVM Polynomial Classification',
          facecolor='lightgray')
mp.title('SVM Polynomial Classification',
         fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
C0, C1 = y == 0, y == 1
mp.scatter(x[C0][:, 0], x[C0][:, 1],
           c='orangered', s=60)
mp.scatter(x[C1][:, 0], x[C1][:, 1],
           c='limegreen', s=60)
mp.show()
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
import sklearn.model_selection as ms
import sklearn.svm as svm
import sklearn.metrics as sm
import matplotlib.pyplot as mp
x, y = [], []
with open('../../data/multiple2.txt', 'r') as f:
    for line in f.readlines():
        data = [float(substr) for substr
                in line.split(',')]
        x.append(data[:-1])
        y.append(data[-1])
x = np.array(x)
y = np.array(y, dtype=int)
# Split into training and test sets
train_x, test_x, train_y, test_y = \
    ms.train_test_split(x, y, test_size=0.25,
                        random_state=5)
# Create an RBF-kernel SVM classifier
model = svm.SVC(kernel='rbf', C=600, gamma=0.01)
# Train the classifier
model.fit(train_x, train_y)
# Horizontal bounds and step of the grid
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
# Vertical bounds and step of the grid
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
# Build a 2-D grid of points
# _ grid_x
# ^ |h| /
# t | * * * *
#   | * * * *-- v
# b | * * * *--
#   +-------->
#   l        r
grid_x = np.meshgrid(np.arange(l, r, h),
                     np.arange(b, t, v))
# Stack each grid point's horizontal and vertical coordinates
# into a two-column sample array
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
# Predict the class of every grid point with the SVM model
flat_y = model.predict(flat_x)
# Reshape the flat predictions back to the grid's 2-D shape
grid_y = flat_y.reshape(grid_x[0].shape)
# Predict classes for the test set
pred_test_y = model.predict(test_x)
# Print the classification report
print(sm.classification_report(test_y, pred_test_y))
# Plot the training samples and the decision boundary
mp.figure('SVM RBF Classification',
          facecolor='lightgray')
mp.title('SVM RBF Classification',
         fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
C0, C1 = y == 0, y == 1
mp.scatter(x[C0][:, 0], x[C0][:, 1],
           c='orangered', s=60)
mp.scatter(x[C1][:, 0], x[C1][:, 1],
           c='limegreen', s=60)
mp.show()
For training sets whose class proportions are severely imbalanced, set the class-weight balancing parameter:
class_weight='balanced'
This assigns larger weights to the under-represented classes so their influence is strengthened, improving the classifier's accuracy on them.
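As a sketch of that weighting rule (the 9:1 class split below is hypothetical), scikit-learn documents the 'balanced' heuristic as weight(class) = n_samples / (n_classes * count(class)), so rarer classes receive proportionally larger weights:

# Sketch of the 'balanced' weighting rule documented by scikit-learn:
# weight(class) = n_samples / (n_classes * count(class)).
import numpy as np

y = np.array([0] * 90 + [1] * 10)  # hypothetical 9:1 class imbalance
classes = np.unique(y)
weights = len(y) / (len(classes) * np.bincount(y))
print(dict(zip(classes, weights)))  # class 1 gets a 9x larger weight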
4. Confidence Probabilities
model = svm.SVC(..., probability=True, ...)
model.predict_proba(inputs) -> confidence probability matrix
          class 1  class 2  class 3
sample 1    0.7      0.2      0.1    -> class 1
sample 2    0.3      0.4      0.3    -> class 2
sample 3    0.2      0.3      0.5    -> class 3
...
The closer a sample lies to the class boundary, the more evenly its confidence probabilities spread across the classes and the more ambiguous its classification; conversely, the farther a sample lies from the boundary, the more extreme its probabilities and the clearer its classification.
* (0.9, 0.1) - extreme, clear
* (0.5, 0.4) - even, ambiguous
--------------------------
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
import sklearn.model_selection as ms
import sklearn.svm as svm
import sklearn.metrics as sm
import matplotlib.pyplot as mp
x, y = [], []
with open('../../data/multiple2.txt', 'r') as f:
    for line in f.readlines():
        data = [float(substr) for substr
                in line.split(',')]
        x.append(data[:-1])
        y.append(data[-1])
x = np.array(x)
y = np.array(y, dtype=int)
# Split into training and test sets
train_x, test_x, train_y, test_y = \
    ms.train_test_split(x, y, test_size=0.25,
                        random_state=5)
# Create an RBF-kernel SVM classifier
model = svm.SVC(kernel='rbf', C=600, gamma=0.01,
                probability=True)  # enable confidence probabilities
# Train the classifier
model.fit(train_x, train_y)
# Horizontal bounds and step of the grid
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
# Vertical bounds and step of the grid
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
# Build a 2-D grid of points
# _ grid_x
# ^ |h| /
# t | * * * *
#   | * * * *-- v
# b | * * * *--
#   +-------->
#   l        r
grid_x = np.meshgrid(np.arange(l, r, h),
                     np.arange(b, t, v))
# Stack each grid point's horizontal and vertical coordinates
# into a two-column sample array
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
# Predict the class of every grid point with the SVM model
flat_y = model.predict(flat_x)
# Reshape the flat predictions back to the grid's 2-D shape
grid_y = flat_y.reshape(grid_x[0].shape)
# Predict classes for the test set
pred_test_y = model.predict(test_x)
# Print the classification report
print(sm.classification_report(test_y, pred_test_y))
prob_x = np.array([
    [2, 1.5],
    [8, 9],
    [4.8, 5.2],
    [4, 4],
    [2.5, 7],
    [7.6, 2],
    [5.4, 5.9]])
print(prob_x)
pred_prob_y = model.predict(prob_x)
print(pred_prob_y)
probs = model.predict_proba(prob_x)
print(probs)
# Plot the training samples and the decision boundary
mp.figure('SVM RBF Classification',
          facecolor='lightgray')
mp.title('SVM RBF Classification',
         fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
C0, C1 = y == 0, y == 1
mp.scatter(x[C0][:, 0], x[C0][:, 1],
           c='orangered', s=60)
mp.scatter(x[C1][:, 0], x[C1][:, 1],
           c='limegreen', s=60)
C0, C1 = pred_prob_y == 0, pred_prob_y == 1
mp.scatter(prob_x[C0][:, 0], prob_x[C0][:, 1],
           marker='D', c='dodgerblue', s=50)
mp.scatter(prob_x[C1][:, 0], prob_x[C1][:, 1],
           marker='D', c='deeppink', s=50)
# Annotate each probe sample with its class probabilities
for i in range(len(probs[C0])):
    mp.annotate(
        '{}% {}%'.format(
            round(probs[C0][:, 0][i] * 100, 2),
            round(probs[C0][:, 1][i] * 100, 2)),
        xy=(prob_x[C0][:, 0][i],
            prob_x[C0][:, 1][i]),
        xytext=(12, -12),
        textcoords='offset points',
        horizontalalignment='left',
        verticalalignment='top',
        fontsize=9,
        bbox={'boxstyle': 'round,pad=0.6',
              'fc': 'deepskyblue',
              'alpha': 0.8})
for i in range(len(probs[C1])):
    mp.annotate(
        '{}% {}%'.format(
            round(probs[C1][:, 0][i] * 100, 2),
            round(probs[C1][:, 1][i] * 100, 2)),
        xy=(prob_x[C1][:, 0][i],
            prob_x[C1][:, 1][i]),
        xytext=(12, -12),
        textcoords='offset points',
        horizontalalignment='left',
        verticalalignment='top',
        fontsize=9,
        bbox={'boxstyle': 'round,pad=0.6',
              'fc': 'violet',
              'alpha': 0.8})
mp.show()
5. Grid Search
model = ModelClass(..., hyperparameter 1, hyperparameter 2, ...)
(original sketch: two columns of F1 scores, one per hyperparameter, each with its own maximum marked)
Grid search is a simpler and more powerful model-tuning strategy than validation curves.
param_grid = [{param_name: [values], ...}, ...]
model = ms.GridSearchCV(model, param_grid,
                        cv=number of folds)
model.fit(inputs, outputs)
(1) For every hyperparameter combination in param_grid, run cv-fold cross validation and record the mean F1 score;
(2) configure the model with the combination that achieved the best mean score;
(3) retrain that configured model on the full dataset passed to fit.
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
import sklearn.model_selection as ms
import sklearn.svm as svm
import sklearn.metrics as sm
import matplotlib.pyplot as mp
x, y = [], []
with open('../../data/multiple2.txt', 'r') as f:
    for line in f.readlines():
        data = [float(substr) for substr
                in line.split(',')]
        x.append(data[:-1])
        y.append(data[-1])
x = np.array(x)
y = np.array(y, dtype=int)
# Split into training and test sets
train_x, test_x, train_y, test_y = \
    ms.train_test_split(x, y, test_size=0.25,
                        random_state=5)
# Hyperparameter grid
params = [
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
    {'kernel': ['poly'], 'C': [1], 'degree': [2, 3]},
    {'kernel': ['rbf'], 'C': [1, 10, 100, 1000],
     'gamma': [1, 0.1, 0.01, 0.001]}]
# Grid search
model = ms.GridSearchCV(
    svm.SVC(probability=True), params, cv=5)
model.fit(train_x, train_y)
# Print the best hyperparameter combination and its score
print(model.best_params_, model.best_score_)
print('-' * 42)
# Print every hyperparameter combination and its score
for param, score in zip(
        model.cv_results_['params'],
        model.cv_results_['mean_test_score']):
    print(param, score)
# Horizontal bounds and step of the grid
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
# Vertical bounds and step of the grid
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
# Build a 2-D grid of points
# _ grid_x
# ^ |h| /
# t | * * * *
#   | * * * *-- v
# b | * * * *--
#   +-------->
#   l        r
grid_x = np.meshgrid(np.arange(l, r, h),
                     np.arange(b, t, v))
# Stack each grid point's horizontal and vertical coordinates
# into a two-column sample array
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
# Predict the class of every grid point with the tuned model
flat_y = model.predict(flat_x)
# Reshape the flat predictions back to the grid's 2-D shape
grid_y = flat_y.reshape(grid_x[0].shape)
# Predict classes for the test set
pred_test_y = model.predict(test_x)
# Print the classification report
print(sm.classification_report(test_y, pred_test_y))
prob_x = np.array([
    [2, 1.5],
    [8, 9],
    [4.8, 5.2],
    [4, 4],
    [2.5, 7],
    [7.6, 2],
    [5.4, 5.9]])
print(prob_x)
pred_prob_y = model.predict(prob_x)
print(pred_prob_y)
probs = model.predict_proba(prob_x)
print(probs)
# Plot the training samples and the decision boundary
mp.figure('SVM RBF Classification',
          facecolor='lightgray')
mp.title('SVM RBF Classification',
         fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
C0, C1 = y == 0, y == 1
mp.scatter(x[C0][:, 0], x[C0][:, 1],
           c='orangered', s=60)
mp.scatter(x[C1][:, 0], x[C1][:, 1],
           c='limegreen', s=60)
C0, C1 = pred_prob_y == 0, pred_prob_y == 1
mp.scatter(prob_x[C0][:, 0], prob_x[C0][:, 1],
           marker='D', c='dodgerblue', s=50)
mp.scatter(prob_x[C1][:, 0], prob_x[C1][:, 1],
           marker='D', c='deeppink', s=50)
# Annotate each probe sample with its class probabilities
for i in range(len(probs[C0])):
    mp.annotate(
        '{}% {}%'.format(
            round(probs[C0][:, 0][i] * 100, 2),
            round(probs[C0][:, 1][i] * 100, 2)),
        xy=(prob_x[C0][:, 0][i],
            prob_x[C0][:, 1][i]),
        xytext=(12, -12),
        textcoords='offset points',
        horizontalalignment='left',
        verticalalignment='top',
        fontsize=9,
        bbox={'boxstyle': 'round,pad=0.6',
              'fc': 'deepskyblue',
              'alpha': 0.8})
for i in range(len(probs[C1])):
    mp.annotate(
        '{}% {}%'.format(
            round(probs[C1][:, 0][i] * 100, 2),
            round(probs[C1][:, 1][i] * 100, 2)),
        xy=(prob_x[C1][:, 0][i],
            prob_x[C1][:, 1][i]),
        xytext=(12, -12),
        textcoords='offset points',
        horizontalalignment='left',
        verticalalignment='top',
        fontsize=9,
        bbox={'boxstyle': 'round,pad=0.6',
              'fc': 'violet',
              'alpha': 0.8})
mp.show()
6. Event Prediction
Not every string feature should be encoded with a label encoder. When the strings represent data with continuous meaning and an inherent ordering, their encoded values must preserve that ordering semantics.
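A quick illustration (the values are hypothetical) of why the notes define the custom DigitEncoder below: LabelEncoder orders labels lexicographically, so the digit string '2' would be coded above '10':

# Sketch: LabelEncoder sorts labels lexicographically, which breaks the
# numeric ordering of digit strings ('10' < '2' < '30' as text).
import numpy as np
import sklearn.preprocessing as sp

hours = np.array(['10', '2', '30'])
print(sp.LabelEncoder().fit_transform(hours))  # [0 1 2]: '2' coded above '10'
print(hours.astype(int))                       # [10  2 30]: order preserved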
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
import sklearn.preprocessing as sp
import sklearn.model_selection as ms
import sklearn.svm as svm
# Pass-through encoder for digit strings: keeps numeric order intact
class DigitEncoder():
    def fit_transform(self, x):
        return x.astype(int)
    def transform(self, x):
        return x.astype(int)
    def inverse_transform(self, x):
        return x.astype(str)
data = []
with open('../../data/events.txt', 'r') as f:
    for line in f.readlines():
        data.append(line[:-1].split(','))
# Convert the 2-D list to an array, transpose it, and delete row 1
data = np.delete(np.array(data).T, 1, 0)
encoders, x = [], []
for row in range(len(data)):
    # Digit-string columns keep numeric order; the rest get label-encoded
    if data[row, 0].isdigit():
        encoder = DigitEncoder()
    else:
        encoder = sp.LabelEncoder()
    if row < len(data) - 1:
        x.append(encoder.fit_transform(data[row]))
    else:
        y = encoder.fit_transform(data[row])
    encoders.append(encoder)
x = np.array(x).T
train_x, test_x, train_y, test_y = \
    ms.train_test_split(
        x, y, test_size=0.25, random_state=5)
model = svm.SVC(kernel='rbf', class_weight='balanced')
print(ms.cross_val_score(
    model, train_x, train_y, cv=5,
    scoring='accuracy').mean())
model.fit(train_x, train_y)
pred_test_y = model.predict(test_x)
print((pred_test_y == test_y).sum() / pred_test_y.size)
data = [
    ['Tuesday', '12:30:00', '21', '23']]
data = np.array(data).T
x = []
for row in range(len(data)):
    encoder = encoders[row]
    x.append(encoder.transform(data[row]))
x = np.array(x).T
pred_y = model.predict(x)
print(encoders[-1].inverse_transform(pred_y))
7. Traffic Prediction
1 2 3 4 5 6 7 8 9
------ ------ ------
1 2 3
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
import sklearn.preprocessing as sp
import sklearn.model_selection as ms
import sklearn.svm as svm
import sklearn.metrics as sm
# Pass-through encoder for digit strings: keeps numeric order intact
class DigitEncoder():
    def fit_transform(self, x):
        return x.astype(int)
    def transform(self, x):
        return x.astype(int)
    def inverse_transform(self, x):
        return x.astype(str)
data = []
with open('../../data/traffic.txt', 'r') as f:
    for line in f.readlines():
        data.append(line[:-1].split(','))
# Convert the 2-D list to an array and transpose it
data = np.array(data).T
encoders, x = [], []
for row in range(len(data)):
    if data[row, 0].isdigit():
        encoder = DigitEncoder()
    else:
        encoder = sp.LabelEncoder()
    if row < len(data) - 1:
        x.append(encoder.fit_transform(data[row]))
    else:
        y = encoder.fit_transform(data[row])
    encoders.append(encoder)
x = np.array(x).T
train_x, test_x, train_y, test_y = \
    ms.train_test_split(
        x, y, test_size=0.25, random_state=5)
# Support vector regressor: traffic volume is a continuous target
model = svm.SVR(kernel='rbf', C=10, epsilon=0.2)
model.fit(train_x, train_y)
pred_test_y = model.predict(test_x)
print(sm.r2_score(test_y, pred_test_y))
data = [
    ['Saturday', '13:35', 'San Francisco', 'no']]
data = np.array(data).T
x = []
for row in range(len(data)):
    encoder = encoders[row]
    x.append(encoder.transform(data[row]))
x = np.array(x).T
# predict returns an array; take its first (only) element
pred_y = int(model.predict(x)[0])
print(pred_y)