A Male/Female Student Classifier Based on the SVM Algorithm
Task: design a male/female student classifier with an SVM. Six features are used: height, weight, shoe size, 50 m sprint time, vital capacity, and whether the student likes sports. Requirements: design and test the classifier with the software packages provided by the platform, try different kernel functions, and evaluate the performance metrics with cross-validation (including SE, SP, ACC and AUC, where AUC is computed with the platform's software package).
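Before the full script, the minimal sketch below shows one way to compare several SVM kernels by cross-validated accuracy using scikit-learn's cross_val_score. It is only an illustration: the placeholder arrays X and y, the 4-fold setting, and the C/gamma values are assumptions, and the complete solution further down runs the k-fold loop manually so that SE, SP, ACC and AUC can be reported per fold.

# A minimal sketch (assumptions: X holds the six normalized features,
# y holds the 0/1 gender labels): compare kernels by cross-validated accuracy.
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

def compare_kernels(X, y, kernels=('linear', 'rbf', 'poly', 'sigmoid')):
    for kernel in kernels:
        clf = SVC(C=10, gamma=0.1, kernel=kernel)
        scores = cross_val_score(clf, X, y, cv=4, scoring='accuracy')
        print('%-8s mean ACC = %.3f (std %.3f)' % (kernel, scores.mean(), scores.std()))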
Dataset
Please download the complete dataset from the GitHub repository.
# The data are stored in the file people.csv
# Gender: 0 = female, 1 = male; likes sports: 0 = no, 1 = yes
# gender, height, weight, shoe size, 50 m time, vital capacity, likes sports
0,164,47,38,9,2500,1
0,160,46,38,9,2500,1
0,165,60,39,7.4,2400,1
0,168,44,38,9,4000,0
0,167,49,38,6.9,3800,1
0,175,50,38,9,3800,0
0,172,43,36,7.9,2400,1
0,158,49,35,7.9,2400,1
0,162,46,36,9.5,2800,0
0,158,50,37,11,2500,0
0,165,48,36,8,2500,1
0,172,57,38,9.25,2800,0
...
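As a quick sanity check, the sample above can be loaded with pandas as sketched below. The column names are descriptive labels added only for readability (the CSV itself has no header row), and the path matches the one used in the script.

# Quick look at the data (a sketch; the column names are added for readability
# only, because people.csv has no header row)
import pandas as pd

columns = ['gender', 'height', 'weight', 'shoe_size', 'sprint_50m',
           'vital_capacity', 'likes_sports']
df = pd.read_csv('SVM算法/dataset/people.csv', header=None, names=columns)
print(df.describe())
print(df['gender'].value_counts())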
Code Implementation
# -*- encoding: utf-8 -*-
'''
@File    : svm.py
@Time    : 2021/02/04 08:57:33
@Author  : Wang Jiaqing
@Contact : wangjiaqingll@foxmail.com
'''
# import the required packages
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, confusion_matrix  # for the ROC curve, AUC and confusion matrix

def maxminnorm(array):
    # column-wise min-max normalization of the feature matrix to [0, 1]
    maxcols = array.max(axis=0)
    mincols = array.min(axis=0)
    data_shape = array.shape
    data_rows = data_shape[0]
    data_cols = data_shape[1]
    t = np.empty((data_rows, data_cols))
    for i in range(data_cols):
        t[:, i] = (array[:, i] - mincols[i]) / (maxcols[i] - mincols[i])
    return t
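
# Note: an equivalent normalization is available in scikit-learn. The two lines
# below are only an illustrative alternative (a sketch, not used by svmfun),
# assuming the feature matrix is already in a 2-D NumPy array called `dataset`:
#   from sklearn.preprocessing import MinMaxScaler
#   dataset = MinMaxScaler().fit_transform(dataset)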

def svmfun():
    # read and preprocess the dataset
    data = pd.read_csv('SVM算法/dataset/people.csv', header=None, sep=',')
    data = np.array(data)
    np.random.shuffle(data)
    labels = data[..., 0]  # extract the labels (column 0: gender)
    dataset = data[..., 1:7]  # extract the six features
    dataset = maxminnorm(dataset)  # normalize the features
    # hold-out split; note that these variables are immediately re-assigned
    # inside the 4-fold loop below
    x_train, x_test, y_train, y_test = train_test_split(dataset,
                                                        labels,
                                                        test_size=0.2)
    i = 0
    KF = KFold(n_splits=4)  # 4-fold cross-validation
    for train_index, test_index in KF.split(dataset):
        i += 1
        print("=" * 25, "The %d train:" % (i), "=" * 25)
        x_train, x_test = dataset[train_index], dataset[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        # kernel parameters roughly determined by grid search
        # (see the GridSearchCV sketch after this script):
        # linear:  C=10, gamma=0.1
        # rbf:     C=10, gamma=0.1
        # sigmoid: C=10, gamma=0.1
        # poly:    C=10, gamma=10
        clf = SVC(C=10,
                  kernel='linear',  # kernel function, change manually to test the others
                  degree=3,
                  gamma=0.1,
                  coef0=0.0,
                  shrinking=True,
                  probability=False,
                  tol=1e-3,
                  cache_size=200,
                  class_weight=None,
                  verbose=False,
                  max_iter=-1,
                  decision_function_shape='ovr',
                  random_state=None)
        clf.fit(x_train, y_train)
        y_predict = clf.predict(x_test)
        print('AUC')
        print(roc_auc_score(y_test, y_predict))
        print('ACC')
        print(accuracy_score(y_test, y_predict))
        # ROC curve from the hard 0/1 predictions; scores from
        # clf.decision_function(x_test) would give a smoother curve
        fpr, tpr, threshold = roc_curve(y_test, y_predict)
        roc_auc = auc(fpr, tpr)  # compute the AUC value
        # sklearn orders the confusion matrix by label value (0, 1), so
        # ravel() yields TN, FP, FN, TP with "male" (1) as the positive class
        TN, FP, FN, TP = confusion_matrix(y_test, y_predict).ravel()
        SE = TP / (TP + FN)  # sensitivity: recall of the positive class
        SP = TN / (TN + FP)  # specificity: recall of the negative class
        print("SE:", SE)
        print("SP:", SP)
        # plot the ROC curve: FPR on the x-axis, TPR on the y-axis
        lw = 2
        plt.figure(figsize=(10, 10))
        plt.plot(fpr,
                 tpr,
                 color='darkorange',
                 lw=lw,
                 label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Linear-SVM')
        plt.legend(loc="lower right")
        plt.show()


svmfun()
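The C and gamma values quoted in the comments inside svmfun were said to come from a rough grid search. The sketch below shows how such a search could be run with scikit-learn's GridSearchCV; the candidate grid and the accuracy scoring are illustrative assumptions, not the exact settings used to obtain those values.

# A sketch of the kind of grid search mentioned in the comments above; the
# candidate C/gamma values are illustrative, not the exact grid that was used.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

def tune_svm(X, y, kernel='rbf'):
    param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [0.01, 0.1, 1, 10]}
    search = GridSearchCV(SVC(kernel=kernel), param_grid, cv=4, scoring='accuracy')
    search.fit(X, y)
    print('best params:', search.best_params_, 'best CV accuracy:', search.best_score_)
    return search.best_estimator_

For example, tune_svm(dataset, labels) could be called once after the normalization step, and the resulting parameters plugged into the SVC constructor in svmfun.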