数据集
使用UCI Machine Learning Repository的Iris数据集,下载的文件iris.data内容如图:
每一行有五项,前四项依次为花萼长度、花萼宽度、花瓣长度、花瓣宽度(单位:厘米),第五项代表鸢尾花的种类,有“Iris-setosa”,“Iris-versicolor”,“Iris-virginica”共三类。
实现代码(Python)
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import confusion_matrix
# Load the Iris file. It is read with header=None, so pandas labels the
# columns 0-4: columns 0-3 hold the four measurements, column 4 the species.
data = pd.read_csv("iris.data", header=None)
data.dropna(inplace=True)  # drop incomplete rows, modifying `data` in place

# Encode the species names as integer class labels. One mapping rewrites the
# whole column at once instead of assigning integers into a string-typed
# column label by label. A name missing from the mapping becomes NaN, which
# makes the later integer cast fail loudly rather than pass silently.
data[4] = data[4].map({
    'Iris-setosa': 0,
    'Iris-versicolor': 1,
    'Iris-virginica': 2,
})

# Standardize every feature column: subtract its mean and divide by its
# standard deviation (z-score), all four columns in one vectorized step.
# NOTE(review): the statistics are computed over all rows, i.e. before the
# train/test split performed later in this script, so held-out rows
# influence the scaling applied to the training rows.
feature_cols = [0, 1, 2, 3]
features = data[feature_cols]
data[feature_cols] = (features - features.mean()) / features.std()
# Separate the inputs (first four columns) from the targets (fifth column,
# cast to int), then hold out a 0.333333 fraction of the rows for testing.
# The split is not seeded, so the partition differs from run to run.
total_y = data[data.columns[4]].astype(int)
total_X = data[data.columns[:4]]
split_parts = train_test_split(total_X, total_y, test_size=0.333333)
X_train, X_test, y_train, y_test = split_parts
# Candidate hyper-parameters, chosen by rule of thumb: 12 values each,
# evenly spaced in the base-2 exponent.
C_range = np.logspace(start=-5, stop=15, num=12, base=2)  # 2^-5 .. 2^15
gamma_range = np.logspace(start=-15, stop=3, num=12, base=2)  # 2^-15 .. 2^3
param_grid = dict(kernel=['rbf'], C=C_range, gamma=gamma_range)

# Base estimator: RBF-kernel SVM with a one-vs-one decision function shape
# for the multi-class problem.
svm = SVC(decision_function_shape='ovo', kernel='rbf')

# Grid search over every (C, gamma) pair with cv=3 cross-validation;
# n_jobs=-1 requests parallel execution.
grid = GridSearchCV(estimator=svm, param_grid=param_grid, cv=3, n_jobs=-1)
# Run the grid search on the training split and report the winning
# parameter combination together with its cross-validation score.
grid.fit(X_train, y_train)
best_params_msg = "最好参数:%s" % (grid.best_params_,)
best_score_msg = "最好识别率:%s" % (grid.best_score_,)
print(best_params_msg)
print(best_score_msg)

# Score the fitted search object on the held-out test split.
score = grid.score(X_test, y_test)
test_msg = "测试集的识别率:%s" % (score,)
print(test_msg)

# Predict labels for every row (training and test rows alike) and print them.
res = grid.predict(total_X)
prediction_msg = "整个数据集的预测结果:\n%s" % (res,)
print(prediction_msg)

# Confusion matrix of true labels against predictions over the whole dataset.
cm = confusion_matrix(total_y, res)
cm_msg = "混淆矩阵:\n%s" % (cm,)
print(cm_msg)