利用 scikit-learn 实现 SVM（支持向量机）

最新推荐文章于 2024-06-29 23:21:26 发布

Matrix-yang

最新推荐文章于 2024-06-29 23:21:26 发布

阅读量877

点赞数

分类专栏：机器学习

本文链接：https://blog.csdn.net/qq_21768483/article/details/86495440

版权

机器学习专栏收录该内容

42 篇文章 12 订阅

订阅专栏

构造训练数据

import numpy as np
import pandas as pd

# Positive class: 100 two-dimensional points drawn from a standard normal
# distribution (mean 0, standard deviation 1), then shifted so that the
# cluster is centred at (3.5, 3.5). The shift is one vectorized addition
# applied to both coordinates of every row, instead of a Python-level loop.
p = np.random.randn(100, 2) + 3.5

# Negative class: 100 points from the same distribution, left centred at (0, 0).
f = np.random.randn(100, 2)

# Wrap the positive samples in a DataFrame and attach the class label z = 1.
df_p = pd.DataFrame(p, columns=['x', 'y'])
df_p['z'] = 1

# Wrap the negative samples in a DataFrame and attach the class label z = 0.
df_f = pd.DataFrame(f, columns=['x', 'y'])
df_f['z'] = 0

# Stack both classes row-wise into one DataFrame. The original row labels
# are kept, so the index runs 0..99 twice.
res = pd.concat([df_p, df_f], axis=0)
# Bare expression: shows the frame when this is run as a notebook cell.
res

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

	x	y	z
0	4.250614	4.056121	1
1	4.608059	3.963256	1
2	3.667928	2.844298	1
3	5.110389	2.207770	1
4	4.565589	2.865835	1
5	4.967936	3.428427	1
6	4.164498	4.756457	1
7	1.996462	3.461555	1
8	3.320537	3.236716	1
9	2.552247	3.740323	1
10	3.529014	4.360995	1
11	3.446757	2.899550	1
12	2.765047	3.373536	1
13	3.239816	2.895096	1
14	2.438422	2.251773	1
15	3.001785	2.546292	1
16	5.252033	5.593779	1
17	3.413621	2.757865	1
18	3.624994	3.797129	1
19	3.217611	3.681506	1
20	4.990572	2.256055	1
21	4.350523	3.607818	1
22	3.533867	3.949800	1
23	3.374815	3.882725	1
24	4.112398	5.065239	1
25	2.879833	3.947735	1
26	1.701558	2.832577	1
27	1.784570	5.255377	1
28	2.876247	1.793252	1
29	4.227331	6.178785	1
...	...	...	...
70	-1.081413	0.046719	0
71	-1.004865	0.204234	0
72	0.023450	0.453429	0
73	-0.460663	-0.667104	0
74	0.935339	-1.747852	0
75	-0.201228	0.347887	0
76	0.987963	0.706268	0
77	0.268110	-1.024068	0
78	0.408360	0.661068	0
79	1.501026	1.667613	0
80	-0.508721	-1.314594	0
81	-0.907388	-0.119675	0
82	1.227677	-1.305001	0
83	-0.100075	0.955962	0
84	2.501123	-0.224945	0
85	2.691064	1.343907	0
86	0.744924	0.078018	0
87	-0.721247	-0.296832	0
88	-0.602119	-0.631173	0
89	0.308663	1.204604	0
90	0.577042	0.367347	0
91	2.394736	-0.412487	0
92	0.535134	-0.745468	0
93	0.409373	-0.259470	0
94	0.404675	0.454216	0
95	1.157458	1.642951	0
96	0.885934	-1.503737	0
97	0.363141	-0.926611	0
98	0.144915	0.799192	0
99	-0.325018	-1.283557	0

200 rows × 3 columns

import matplotlib.pyplot as plt

# Scatter plot of every sample in the data set, coloured by its class label z.
plt.scatter(x=res['x'], y=res['y'], c=res['z'], cmap=plt.cm.Paired)
plt.title('random data')
plt.xlabel('x')
plt.ylabel('y')
plt.show()

（图：数据集散点图，正类分布在 (3.5, 3.5) 附近，负类分布在 (0, 0) 附近）

# The concatenated frame has duplicate row labels (0..99 twice), so replace
# them in place with a fresh 0..N-1 index before splitting by row number.
res.reset_index(drop=True, inplace=True)

# Every fourth row (index divisible by 4) goes to the test set;
# all remaining rows form the training set.
is_test_row = res.index % 4 == 0
test = res[is_test_row]
train = res[~is_test_row]

# Training features (the two coordinates) and training labels.
X = train[['x', 'y']]
Z = train['z']

from sklearn import svm
# Create a support vector classifier with a linear kernel
clf = svm.SVC(kernel='linear')
# Fit the classifier on the training features X and training labels Z
clf.fit(X, Z)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

# Accuracy on the training set
clf.score(X, Z)

0.9933333333333333

# Accuracy on the held-out test set
clf.score(test[['x','y']],test['z'])

1.0

# Plot the training samples, coloured by class label.
plt.scatter(X['x'], X['y'], c=Z, cmap=plt.cm.Paired)
# Grab the current axes and their limits so the evaluation grid covers
# exactly the area that is already visible.
ax = plt.gca()
xlim = ax.get_xlim()
ylim = ax.get_ylim()

# 300 evenly spaced values along each axis, spanning the axis limits
# (not the interval 0..1).
xx = np.linspace(xlim[0], xlim[1], 300)
yy = np.linspace(ylim[0], ylim[1], 300)
# Grid coordinates with matrix indexing: XX[i, j] == xx[i], YY[i, j] == yy[j].
XX, YY = np.meshgrid(xx, yy, indexing='ij')
# One (x, y) sample per grid point.
xy = np.vstack([XX.ravel(), YY.ravel()]).T
# The classifier was fitted on a DataFrame, so label the grid with the same
# column names; scikit-learn releases that validate feature names otherwise
# warn about the mismatch.
grid = pd.DataFrame(xy, columns=X.columns)
# Decision-function value at every grid point, reshaped back onto the grid.
height = clf.decision_function(grid).reshape(XX.shape)
# Contour lines at decision values -1, 0 and 1: the separating line (solid)
# and the two margin lines (dashed).
ax.contour(XX, YY, height, colors='k', levels=[-1, 0, 1], alpha=0.5,
           linestyles=['--', '-', '--'])
# Circle the support vectors.
ax.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=100,
           linewidth=1, facecolors='none', edgecolors='k')

plt.show()