构造训练数据
import numpy as np
# Positive class: 100 points drawn from a 2-D standard normal distribution
# (mean (0, 0), standard deviation 1), shifted so the cluster is centred
# on (3.5, 3.5). The shift is a single broadcast addition instead of a
# per-row Python loop.
p = np.random.randn(100, 2) + 3.5
# Negative class: another 100 points from the same standard normal
# distribution, left centred on (0, 0).
f = np.random.randn(100, 2)
import pandas as pd
# Both classes share the same two feature columns.
feature_columns = ['x', 'y']
# Wrap the positive samples in a DataFrame and attach the label column z = 1.
df_p = pd.DataFrame(p, columns=feature_columns).assign(z=1)
# Wrap the negative samples in a DataFrame and attach the label column z = 0.
df_f = pd.DataFrame(f, columns=feature_columns).assign(z=0)
# Stack the two classes row-wise into one data set; each part keeps its own
# index labels, so the labels are duplicated in the result.
res = pd.concat([df_p, df_f])
# Notebook display of the combined frame.
res
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
x | y | z | |
---|---|---|---|
0 | 4.250614 | 4.056121 | 1 |
1 | 4.608059 | 3.963256 | 1 |
2 | 3.667928 | 2.844298 | 1 |
3 | 5.110389 | 2.207770 | 1 |
4 | 4.565589 | 2.865835 | 1 |
5 | 4.967936 | 3.428427 | 1 |
6 | 4.164498 | 4.756457 | 1 |
7 | 1.996462 | 3.461555 | 1 |
8 | 3.320537 | 3.236716 | 1 |
9 | 2.552247 | 3.740323 | 1 |
10 | 3.529014 | 4.360995 | 1 |
11 | 3.446757 | 2.899550 | 1 |
12 | 2.765047 | 3.373536 | 1 |
13 | 3.239816 | 2.895096 | 1 |
14 | 2.438422 | 2.251773 | 1 |
15 | 3.001785 | 2.546292 | 1 |
16 | 5.252033 | 5.593779 | 1 |
17 | 3.413621 | 2.757865 | 1 |
18 | 3.624994 | 3.797129 | 1 |
19 | 3.217611 | 3.681506 | 1 |
20 | 4.990572 | 2.256055 | 1 |
21 | 4.350523 | 3.607818 | 1 |
22 | 3.533867 | 3.949800 | 1 |
23 | 3.374815 | 3.882725 | 1 |
24 | 4.112398 | 5.065239 | 1 |
25 | 2.879833 | 3.947735 | 1 |
26 | 1.701558 | 2.832577 | 1 |
27 | 1.784570 | 5.255377 | 1 |
28 | 2.876247 | 1.793252 | 1 |
29 | 4.227331 | 6.178785 | 1 |
... | ... | ... | ... |
70 | -1.081413 | 0.046719 | 0 |
71 | -1.004865 | 0.204234 | 0 |
72 | 0.023450 | 0.453429 | 0 |
73 | -0.460663 | -0.667104 | 0 |
74 | 0.935339 | -1.747852 | 0 |
75 | -0.201228 | 0.347887 | 0 |
76 | 0.987963 | 0.706268 | 0 |
77 | 0.268110 | -1.024068 | 0 |
78 | 0.408360 | 0.661068 | 0 |
79 | 1.501026 | 1.667613 | 0 |
80 | -0.508721 | -1.314594 | 0 |
81 | -0.907388 | -0.119675 | 0 |
82 | 1.227677 | -1.305001 | 0 |
83 | -0.100075 | 0.955962 | 0 |
84 | 2.501123 | -0.224945 | 0 |
85 | 2.691064 | 1.343907 | 0 |
86 | 0.744924 | 0.078018 | 0 |
87 | -0.721247 | -0.296832 | 0 |
88 | -0.602119 | -0.631173 | 0 |
89 | 0.308663 | 1.204604 | 0 |
90 | 0.577042 | 0.367347 | 0 |
91 | 2.394736 | -0.412487 | 0 |
92 | 0.535134 | -0.745468 | 0 |
93 | 0.409373 | -0.259470 | 0 |
94 | 0.404675 | 0.454216 | 0 |
95 | 1.157458 | 1.642951 | 0 |
96 | 0.885934 | -1.503737 | 0 |
97 | 0.363141 | -0.926611 | 0 |
98 | 0.144915 | 0.799192 | 0 |
99 | -0.325018 | -1.283557 | 0 |
200 rows × 3 columns
import matplotlib.pyplot as plt
# Scatter plot of the whole data set: position from the two features,
# colour from the class label.
xs, ys, labels = res['x'], res['y'], res['z']
plt.scatter(xs, ys, c=labels, cmap=plt.cm.Paired)
plt.title('random data')
plt.xlabel('x')
plt.ylabel('y')
plt.show()
# The combined frame carries duplicate index labels after the merge, so
# replace them with a fresh positional index (the old labels are dropped).
res.reset_index(drop=True, inplace=True)
# Rows whose index is a multiple of 4 form the test set ...
is_test_row = res.index % 4 == 0
test = res[is_test_row]
# ... and all remaining rows form the training set.
train = res[~is_test_row]
# Training features (the two coordinates).
X = train[['x', 'y']]
# Training labels.
Z = train['z']
from sklearn import svm
# Support-vector classifier with a linear kernel.
clf = svm.SVC(kernel='linear')
# Fit the classifier on the training features and labels.
clf.fit(X=X, y=Z)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
# Accuracy of the fitted classifier on the training set.
clf.score(X, Z)
0.9933333333333333
# Accuracy of the fitted classifier on the held-out test set.
clf.score(test[['x','y']],test['z'])
1.0
# Plot the training samples, coloured by class label.
plt.scatter(X['x'], X['y'], c=Z, cmap=plt.cm.Paired)
# Current axes; everything below is drawn onto them.
ax = plt.gca()
# Axis limits chosen by the scatter plot; the evaluation grid covers
# exactly this visible area.
xlim = ax.get_xlim()
ylim = ax.get_ylim()
# 300 evenly spaced coordinates along each axis, spanning the axis limits.
xx = np.linspace(xlim[0], xlim[1], 300)
yy = np.linspace(ylim[0], ylim[1], 300)
# 300 x 300 coordinate grid. With 'ij' indexing the first array axis follows
# xx and the second follows yy.
XX, YY = np.meshgrid(xx, yy, indexing='ij')
# Flatten the grid into one (x, y) sample per row.
xy = np.column_stack([XX.ravel(), YY.ravel()])
# clf was fitted on a DataFrame, so pass the grid with the same column names;
# a bare ndarray makes recent scikit-learn versions warn about missing
# feature names.
grid = pd.DataFrame(xy, columns=X.columns)
# Decision-function value of every grid point, reshaped back onto the grid.
height = clf.decision_function(grid).reshape(XX.shape)
# Contour lines at decision values -1, 0 and 1: the separating line (solid)
# and the two margin lines (dashed).
ax.contour(XX, YY, height, colors='k', levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'])
# Circle the support vectors found by the classifier.
ax.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=100, linewidth=1, facecolors='none', edgecolors='k')
plt.show()