A Male/Female Student Classifier Based on the SVM Algorithm
Task: design a male/female student classifier with an SVM. Six features are used: height, weight, shoe size, 50 m sprint time, vital capacity, and whether the student likes sports. Requirements: design and test the classifier with the software packages provided by the platform, try different kernel functions, and evaluate the performance metrics with cross-validation (including SE, SP, ACC and AUC, where AUC is computed with the platform's software package).
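Before the full script, the minimal sketch below shows one way to compare several SVM kernels by cross-validated accuracy using scikit-learn's cross_val_score. It is only an illustration: the placeholder arrays X and y, the 4-fold setting, and the C/gamma values are assumptions, and the complete solution further down runs the k-fold loop manually so that SE, SP, ACC and AUC can be reported per fold.

# A minimal sketch (assumptions: X holds the six normalized features,
# y holds the 0/1 gender labels): compare kernels by cross-validated accuracy.
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

def compare_kernels(X, y, kernels=('linear', 'rbf', 'poly', 'sigmoid')):
    for kernel in kernels:
        clf = SVC(C=10, gamma=0.1, kernel=kernel)
        scores = cross_val_score(clf, X, y, cv=4, scoring='accuracy')
        print('%-8s mean ACC = %.3f (std %.3f)' % (kernel, scores.mean(), scores.std()))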
Dataset
Please download the complete dataset from the GitHub repository.
# The data are stored in the file people.csv
# Gender: 0 = female, 1 = male; likes sports: 0 = no, 1 = yes
# gender, height, weight, shoe size, 50 m time, vital capacity, likes sports
0,164,47,38,9,2500,1
0,160,46,38,9,2500,1
0,165,60,39,7.4,2400,1
0,168,44,38,9,4000,0
0,167,49,38,6.9,3800,1
0,175,50,38,9,3800,0
0,172,43,36,7.9,2400,1
0,158,49,35,7.9,2400,1
0,162,46,36,9.5,2800,0
0,158,50,37,11,2500,0
0,165,48,36,8,2500,1
0,172,57,38,9.25,2800,0
...
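As a quick sanity check, the sample above can be loaded with pandas as sketched below. The column names are descriptive labels added only for readability (the CSV itself has no header row), and the path matches the one used in the script.

# Quick look at the data (a sketch; the column names are added for readability
# only, because people.csv has no header row)
import pandas as pd

columns = ['gender', 'height', 'weight', 'shoe_size', 'sprint_50m',
           'vital_capacity', 'likes_sports']
df = pd.read_csv('SVM算法/dataset/people.csv', header=None, names=columns)
print(df.describe())
print(df['gender'].value_counts())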
Code Implementation
# -*- encoding: utf-8 -*-
'''
@File    : svm.py
@Time    : 2021/02/04 08:57:33
@Author  : Wang Jiaqing
@Contact : wangjiaqingll@foxmail.com
'''
# import the required packages
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, confusion_matrix  # for the ROC curve, AUC and confusion matrix

def maxminnorm(array):
    # column-wise min-max normalization of the feature matrix to [0, 1]
    maxcols = array.max(axis=0)
    mincols = array.min(axis=0)
    data_shape = array.shape
    data_rows = data_shape[0]
    data_cols = data_shape[1]
    t = np.empty((data_rows, data_cols))
    for i in range(data_cols):
        t[:, i] = (array[:, i] - mincols[i]) / (maxcols[i] - mincols[i])
    return t
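
# Note: an equivalent normalization is available in scikit-learn. The two lines
# below are only an illustrative alternative (a sketch, not used by svmfun),
# assuming the feature matrix is already in a 2-D NumPy array called `dataset`:
#   from sklearn.preprocessing import MinMaxScaler
#   dataset = MinMaxScaler().fit_transform(dataset)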

def svmfun():
    # read and preprocess the dataset
    data = pd.read_csv('SVM算法/dataset/people.csv', header=None, sep=',')
    data = np.array(data)
    np.random.shuffle(data)
    labels = data[..., 0]  # extract the labels (column 0: gender)
    dataset = data[..., 1:7]  # extract the six features
    dataset = maxminnorm(dataset)  # normalize the features
    # hold-out split; note that these variables are immediately re-assigned
    # inside the 4-fold loop below
    x_train, x_test, y_train, y_test = train_test_split(dataset,
                                                        labels,
                                                        test_size=0.2)
    i = 0
    KF = KFold(n_splits=4)  # 4-fold cross-validation
    for train_index, test_index in KF.split(dataset):
        i += 1
        print("=" * 25, "The %d train:" % (i), "=" * 25)
        x_train, x_test = dataset[train_index], dataset[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        # kernel parameters roughly determined by grid search
        # (see the GridSearchCV sketch after this script):
        # linear:  C=10, gamma=0.1
        # rbf:     C=10, gamma=0.1
        # sigmoid: C=10, gamma=0.1
        # poly:    C=10, gamma=10
        clf = SVC(C=10,
                  kernel='linear',  # kernel function, change manually to test the others
                  degree=3,
                  gamma=0.1,
                  coef0=0.0,
                  shrinking=True,
                  probability=False,
                  tol=1e-3,
                  cache_size=200,
                  class_weight=None,
                  verbose=False,
                  max_iter=-1,
                  decision_function_shape='ovr',
                  random_state=None)
        clf.fit(x_train, y_train)
        y_predict = clf.predict(x_test)
        print('AUC')
        print(roc_auc_score(y_test, y_predict))
        print('ACC')
        print(accuracy_score(y_test, y_predict))
        # ROC curve from the hard 0/1 predictions; scores from
        # clf.decision_function(x_test) would give a smoother curve
        fpr, tpr, threshold = roc_curve(y_test, y_predict)
        roc_auc = auc(fpr, tpr)  # compute the AUC value
        # sklearn orders the confusion matrix by label value (0, 1), so
        # ravel() yields TN, FP, FN, TP with "male" (1) as the positive class
        TN, FP, FN, TP = confusion_matrix(y_test, y_predict).ravel()
        SE = TP / (TP + FN)  # sensitivity: recall of the positive class
        SP = TN / (TN + FP)  # specificity: recall of the negative class
        print("SE:", SE)
        print("SP:", SP)
        # plot the ROC curve: FPR on the x-axis, TPR on the y-axis
        lw = 2
        plt.figure(figsize=(10, 10))
        plt.plot(fpr,
                 tpr,
                 color='darkorange',
                 lw=lw,
                 label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Linear-SVM')
        plt.legend(loc="lower right")
        plt.show()


svmfun()
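The C and gamma values quoted in the comments inside svmfun were said to come from a rough grid search. The sketch below shows how such a search could be run with scikit-learn's GridSearchCV; the candidate grid and the accuracy scoring are illustrative assumptions, not the exact settings used to obtain those values.

# A sketch of the kind of grid search mentioned in the comments above; the
# candidate C/gamma values are illustrative, not the exact grid that was used.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

def tune_svm(X, y, kernel='rbf'):
    param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [0.01, 0.1, 1, 10]}
    search = GridSearchCV(SVC(kernel=kernel), param_grid, cv=4, scoring='accuracy')
    search.fit(X, y)
    print('best params:', search.best_params_, 'best CV accuracy:', search.best_score_)
    return search.best_estimator_

For example, tune_svm(dataset, labels) could be called once after the normalization step, and the resulting parameters plugged into the SVC constructor in svmfun.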