题目:
在鸢尾花数据集和 MNIST 手写数字数据库上,分别用线性核和高斯核训练一个 SVM,写出实验对比结果。要求:
(1)使用 70%数据作为训练集,然后对剩余 30%数据测试。
(2)计算分类精确度值。
步骤:
# -*- coding: utf-8 -*-
###鸢尾花线性核、高斯核###
import numpy as np
from sklearn import svm
import random
import csv
from sklearn.metrics import classification_report
def loadDataSet(path='E:/pywork/test/sy-5/iris/iris.csv', test_ratio=0.3):
    """Read a numeric CSV file and randomly split its rows into train/test sets.

    The first five columns of every non-blank line are converted to float.
    ``int(row_count * test_ratio)`` rows are then drawn at random, without
    replacement, into the test set; the remaining rows form the training set.

    Args:
        path: Location of the CSV file. Defaults to the path the script
            originally hard-coded.
        test_ratio: Fraction of the rows moved into the test set.

    Returns:
        A ``(dataMat, testMat)`` tuple of row lists (training rows first).

    Raises:
        IndexError: If a non-blank line has fewer than five columns.
        ValueError: If one of the first five fields is not a number.
    """
    # The context manager closes the file even when a row fails to parse.
    with open(path, newline='') as csv_file:
        rows = [
            [float(line[i]) for i in range(5)]
            for line in csv.reader(csv_file)
            if line  # csv.reader yields [] for blank lines
        ]

    test_count = int(len(rows) * test_ratio)
    testMat = []
    for _ in range(test_count):
        # randrange never returns len(rows), so the index is always valid.
        testMat.append(rows.pop(random.randrange(len(rows))))
    return rows, testMat
def separate(dataset):
    """Split rows into feature vectors and labels.

    Every row is expected to hold its label in the last position; all
    preceding values are treated as features, so the function is not tied
    to a fixed number of feature columns.

    Args:
        dataset: Iterable of rows (sequences) with at least one element.

    Returns:
        A ``(dataMat, labelMat)`` tuple: the list of feature lists and the
        list of labels, in the same order as ``dataset``.

    Raises:
        ValueError: If a row is empty.
    """
    dataMat = []
    labelMat = []
    # One pass, so generators and other one-shot iterables also work.
    for *features, label in dataset:
        dataMat.append(features)
        labelMat.append(label)
    return dataMat, labelMat
def test(X, model):
    """Score ``model`` on labelled rows and print the fraction it gets right.

    Each row of ``X`` carries its expected label in the last position; the
    preceding values are converted to float and used as the feature vector.
    Predictions and labels are compared after truncation to ``int``.

    Args:
        X: Iterable of rows (features followed by the label).
        model: Object exposing ``predict(samples)`` that returns one
            prediction per sample.

    Returns:
        A ``(rightRate, predictions)`` tuple: the accuracy as a float and
        the list of integer predictions. Empty input yields ``(0.0, [])``.
    """
    rows = list(X)
    if not rows:
        # Nothing to score; avoid dividing by zero.
        rightRate = 0.0
        print("the right rate of this test is:%f" % rightRate)
        return rightRate, []

    samples = [[float(value) for value in line[:-1]] for line in rows]
    # One batched call; converting each element avoids int() on an array.
    predictions = [int(label) for label in model.predict(samples)]
    rightCount = sum(
        1 for predicted, line in zip(predictions, rows)
        if predicted == int(line[-1])
    )
    rightRate = float(rightCount) / len(rows)
    print("the right rate of this test is:%f" % rightRate)
    return rightRate, predictions


# The name matches pytest's collection pattern; mark it as a plain helper.
test.__test__ = False
# Split once so both kernels are trained and scored on the same partition.
trainSet, testSet = loadDataSet()
trainMat, trainlabels = separate(trainSet)
testMat, testlabels = separate(testSet)

# Linear kernel: fit, score the held-out rows, build the per-class report.
clf1 = svm.SVC(kernel='linear')
clf1.fit(trainMat, trainlabels)
try1, re1 = test(testSet, clf1)
com1 = classification_report(testlabels, re1)

# Gaussian (RBF) kernel: same procedure.
clf2 = svm.SVC(kernel='rbf')
clf2.fit(trainMat, trainlabels)
try2, re2 = test(testSet, clf2)
com2 = classification_report(testlabels, re2)

# Print the accuracy followed by the report, linear first.
for heading, rate, report in (('linear:', try1, com1), ('rbf', try2, com2)):
    print(heading, rate)
    print(report)
运行结果:
# -*- coding: utf-8 -*-
##############鸢尾花数据集使用SVM线性分类###################
###导入相关的包
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
### Load the data
iris = datasets.load_iris()
# One row per sample; the four columns correspond to iris.feature_names.
X = iris.data
# Class label of each row, taking the values 0, 1 and 2.
Y = iris.target

### Prepare the data
# Keep only classes 0 and 1 and only the first two features.
# NOTE: the slicing below relies on the rows being stored in class order
# (0, 1, 2), as the original notes state for this dataset.
Y1 = Y[Y < 2]   # labels of classes 0 and 1
y1 = len(Y1)    # number of class-0 plus class-1 samples
Y2 = Y[Y < 1]   # labels of class 0
y2 = len(Y2)    # number of class-0 samples
X = X[:y1, :2]

### Plot the raw data (figure 1)
plt.scatter(X[0:y2, 0], X[0:y2, 1], color='red')    # class 0
# Start at y2, not y2 + 1, so the first class-1 sample is drawn as well.
plt.scatter(X[y2:y1, 0], X[y2:y1, 1], color='blue')  # class 1
plt.title("iris-1")
plt.show()

### Standardise the features
standardScaler = StandardScaler()
# Learn the mean and variance of each feature from X ...
standardScaler.fit(X)
# ... and use them to standardise X.
X_standard = standardScaler.transform(X)

# Very large C: a (nearly) hard-margin linear classifier.
svc = LinearSVC(C=1e9)
svc.fit(X_standard, Y1)
###画出决策边界(此时得到图2)
def plot_decision_boundary(model, axis):
    """Fill the plane with the class regions predicted by ``model``.

    A grid with 100 points per unit is laid over the rectangle described
    by ``axis``; every grid point is classified and the result is drawn as
    filled contours on the current matplotlib figure. The grid points are
    also printed.

    Args:
        model: Fitted classifier whose ``predict`` accepts an (n, 2) array.
        axis: ``[x_min, x_max, y_min, y_max]`` bounds of the plotted area.
    """
    from matplotlib.colors import ListedColormap

    # meshgrid takes 1-D coordinate vectors; for axis=[-3, 3, -3, 3] each
    # vector has 600 points (first one -> columns, second one -> rows).
    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)),
        np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)),
    )
    # Flatten both grids and pair them up: one (x, y) row per grid point.
    X_new = np.c_[x0.ravel(), x1.ravel()]
    # predict needs the 2-column point list; reshape back to the grid.
    zz = model.predict(X_new).reshape(x0.shape)

    custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
    # Filled contours have no line width, so no such option is passed.
    plt.contourf(x0, x1, zz, cmap=custom_cmap)
    print(X_new)
#输出的内容
# [[-3. -3. ]
# [-2.98998331 -3. ]
# [-2.97996661 -3. ]
# ...
# [ 2.97996661 3. ]
# [ 2.98998331 3. ]
# [ 3. 3. ]]
# Figure 2: class regions of the large-C model over the standardised data.
plot_decision_boundary(svc, axis=[-3, 3, -3, 3])
for rows, colour in ((slice(0, y2), 'red'), (slice(y2, y1), 'blue')):
    plt.scatter(X_standard[rows, 0], X_standard[rows, 1], color=colour)
plt.title("iris-2")
plt.show()

### A second model, svc2, with a much smaller C (figure 3)
svc2 = LinearSVC(C=0.01)
svc2.fit(X_standard, Y1)
# Show the learned weights and bias of the separating line.
print(svc2.coef_)
print(svc2.intercept_)
plot_decision_boundary(svc2, axis=[-3, 3, -3, 3])
for rows, colour in ((slice(0, y2), 'red'), (slice(y2, y1), 'blue')):
    plt.scatter(X_standard[rows, 0], X_standard[rows, 1], color=colour)
plt.title("iris-3")
plt.show()
###对分好类的内容基础上添加上下边界(此时得到图4)
def plot_svc_decision_boundary(model, axis):
    """Draw the class regions of a linear model together with its margins.

    The plane is filled exactly as a plain decision-boundary plot would be;
    in addition the two lines on which the decision function equals +1 and
    -1 are drawn in black, clipped to the vertical range of ``axis``.

    Args:
        model: Fitted linear classifier exposing ``predict``, ``coef_`` and
            ``intercept_``; only the first row of ``coef_`` is used.
        axis: ``[x_min, x_max, y_min, y_max]`` bounds of the plotted area.
    """
    from matplotlib.colors import ListedColormap

    # 100 grid points per unit; 600 x 600 points for axis=[-3, 3, -3, 3].
    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)),
        np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)),
    )
    # One (x, y) row per grid point, as required by predict.
    X_new = np.c_[x0.ravel(), x1.ravel()]
    zz = model.predict(X_new).reshape(x0.shape)

    custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
    # Filled contours have no line width, so no such option is passed.
    plt.contourf(x0, x1, zz, cmap=custom_cmap)

    w = model.coef_[0]
    b = model.intercept_[0]
    index_x = np.linspace(axis[0], axis[1], 100)

    # Decision function: f(x1, x2) = w[0]*x1 + w[1]*x2 + b
    #   upper margin:  w[0]*x1 + w[1]*x2 + b =  1
    #   lower margin:  w[0]*x1 + w[1]*x2 + b = -1
    # Solved for x2 below; this assumes w[1] != 0 (a non-vertical boundary).
    y_up = (1 - w[0] * index_x - b) / w[1]
    y_down = (-1 - w[0] * index_x - b) / w[1]

    # Keep only the part of each margin line that lies inside the plot.
    up_visible = (y_up <= axis[3]) & (y_up >= axis[2])
    down_visible = (y_down <= axis[3]) & (y_down >= axis[2])
    plt.plot(index_x[up_visible], y_up[up_visible], color="black")
    plt.plot(index_x[down_visible], y_down[down_visible], color="black")
# plot_svc_decision_boundary(svc, axis=[-3, 3, -3, 3])
# plt.scatter(X_standard[0:y2,0], X_standard[0:y2,1],color='red')
# plt.scatter(X_standard[y2:y1,0], X_standard[y2:y1,1],color='blue')
# plt.title("iris-4")
# plt.show()
### Margins of the small-C model svc2 (figure 5)
plot_svc_decision_boundary(svc2, axis=[-3, 3, -3, 3])
# Class 0 occupies rows [0, y2) and class 1 rows [y2, y1).
for rows, colour in ((slice(0, y2), 'red'), (slice(y2, y1), 'blue')):
    plt.scatter(X_standard[rows, 0], X_standard[rows, 1], color=colour)
plt.show()
### Accuracy evaluation on all four features
import pandas as pd
from sklearn.model_selection import train_test_split

feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
# The CSV has no header row (per the original note), so tell pandas not to
# consume the first data row as column names and supply the names directly.
iris_data = pd.read_csv('iris.csv', header=None, names=feature_columns + ['class'])
print(iris_data.head(5))
# Print the summary; as a bare expression its result was silently dropped.
print(iris_data.describe())

all_x = iris_data[feature_columns]
all_y = iris_data['class']
# 70 % training / 30 % testing, with a fixed seed for repeatable results.
(training_x, testing_x, training_y, testing_y) = train_test_split(
    all_x, all_y, test_size=0.3, random_state=1)
def show_accuracy(y_hat, y_train, str):
    """Placeholder for an accuracy report; it currently does nothing.

    Args:
        y_hat: Predicted labels (unused).
        y_train: Reference labels (unused).
        str: Caption passed by the callers (unused). The name shadows the
            builtin but is kept so existing calls keep working.

    Returns:
        None.
    """
    return None
### Gaussian (RBF) kernel
def SVM():
    """Train an RBF-kernel SVC on the training split and print its accuracy.

    Uses the module-level ``training_x``/``training_y`` and
    ``testing_x``/``testing_y`` splits. The accuracy on the training split
    is printed first, then the accuracy on the test split; the predictions
    for each split are also handed to ``show_accuracy``.
    """
    from sklearn import svm

    classifier = svm.SVC(kernel='rbf', gamma=0.1, decision_function_shape='ovo', C=0.8)
    # to_numpy() yields the 1-D label array without the deprecated
    # Series.ravel().
    classifier.fit(training_x, training_y.to_numpy())

    # This is the score on the training split, so label it as such.
    print("高斯核训练集的准确率:", classifier.score(training_x, training_y))
    y_hat = classifier.predict(training_x)
    show_accuracy(y_hat, training_y, '训练集')

    print("测试集的准确率:", classifier.score(testing_x, testing_y))
    y_hat = classifier.predict(testing_x)
    show_accuracy(y_hat, testing_y, '测试集')


SVM()
### Linear kernel
def SVM():
    """Train a linear-kernel SVC on the training split and print its accuracy.

    Uses the module-level ``training_x``/``training_y`` and
    ``testing_x``/``testing_y`` splits. The accuracy on the training split
    is printed first, then the accuracy on the test split; the predictions
    for each split are also handed to ``show_accuracy``.
    """
    from sklearn import svm

    # gamma is not passed: it is a coefficient of the rbf/poly/sigmoid
    # kernels only and has no effect on the linear kernel.
    classifier = svm.SVC(kernel='linear', decision_function_shape='ovo', C=0.8)
    # to_numpy() yields the 1-D label array without the deprecated
    # Series.ravel().
    classifier.fit(training_x, training_y.to_numpy())

    print("线性核训练集的准确率:", classifier.score(training_x, training_y))
    y_hat = classifier.predict(training_x)
    show_accuracy(y_hat, training_y, '训练集')

    print("测试集的准确率:", classifier.score(testing_x, testing_y))
    y_hat = classifier.predict(testing_x)
    show_accuracy(y_hat, testing_y, '测试集')


SVM()
运行结果:
# -*- coding: utf-8 -*-
#######################mnist手写数据识别###############################
import numpy as np
from sklearn import svm
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
### Linear kernel
if __name__ == '__main__':
    # Load the digits data via load_digits.
    mnist = load_digits()
    # train_test_split shuffles and splits the samples: 70 % train, 30 % test.
    train_x, test_x, train_y, test_y = train_test_split(
        mnist.data,
        mnist.target,
        test_size=0.3,
        random_state=40,
    )
    # Fit the classifier on the training part.
    model = svm.SVC(kernel='linear')
    model.fit(train_x, train_y)
    # Predict the held-out part and print the share of correct answers.
    z = model.predict(test_x)
    correct = np.sum(z == test_y)
    print('线性核准确率:', correct / z.size)
### Gaussian (RBF) kernel
if __name__ == '__main__':
    # Same data and same seeded 70/30 split as used for the linear kernel.
    mnist = load_digits()
    train_x, test_x, train_y, test_y = train_test_split(
        mnist.data,
        mnist.target,
        test_size=0.3,
        random_state=40,
    )
    model = svm.SVC(kernel='rbf')
    model.fit(train_x, train_y)
    # Predict the held-out part and print the share of correct answers.
    z = model.predict(test_x)
    correct = np.sum(z == test_y)
    print('高斯核准确率:', correct / z.size)
运行结果: