数据挖掘与机器学习作业_04 支持向量机

支持向量机

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import pandas as pd
import numpy as np
# 导入自己写的工具类
from my_tools import *
# 忽略warning
import warnings
from sklearn.decomposition import PCA
warnings.filterwarnings("ignore")
jibing_res = pd.read_excel("./jibing_feature_res_final.xlsx")
jibing = pd.read_excel("./jibing_feature_final.xlsx")

在SVM中连续型变量应尽可能归一化

11到60列是连续型的变量,要归一化
jibing.iloc[:,11:62].head()
血红蛋白红细胞压积血小板计数血小板压积总蛋白g/L白蛋白g/L球蛋白g/L白球比ALT丙氨酸氨基转移酶碱性磷酸酶...腺苷脱氨酶ADA果糖胺肌酸激酶α-L-盐藻糖苷酶乳酸淀粉酶同型半胱氨酸总铁结合力血型
0120.036.6307.00.27673.841.732.11.37102...10.01.3248.012.01.949.09.912.343.53
1131.038.5207.00.19167.944.523.41.93073...10.01.6777.016.01.481.09.216.955.50
2128.038.586.00.09357.336.221.11.72478...15.01.8678.022.01.989.09.97.051.40
3146.045.4190.00.17481.147.733.41.41869...16.01.6892.012.01.469.09.315.853.00
4135.040.3102.00.11479.252.227.01.92469...13.01.6058.014.01.7153.08.113.245.90

5 rows × 49 columns

jibing.index = range(jibing.shape[0])

归一化

jibing = guiyihua(jibing)
jibing.head()
左右是否外伤症状持续时间明显夜间痛年龄高血压高血脂2型糖尿病吸烟与否饮酒与否...腺苷脱氨酶ADA果糖胺肌酸激酶α-L-盐藻糖苷酶乳酸淀粉酶同型半胱氨酸总铁结合力血型
000300.11036010000.0...0.2644710.0013300.0040040.0751880.1395350.1442020.0480570.2356980.2804433
111200.10360410000.0...0.2644710.0033540.0065230.1052630.1007750.2811300.0409000.3409610.5018450
210410.08783800000.0...0.4243680.0044530.0066100.1503760.1395350.3153620.0480570.1144160.4261990
310300.09909900000.0...0.4563480.0034120.0078260.0751880.1007750.2297820.0419220.3157890.4557200
401300.10135100000.0...0.3604090.0029500.0048730.0902260.1240310.5892170.0296520.2562930.3247230

5 rows × 60 columns

标准化

jibing = biaozhunhua(jibing)
jibing.head()
左右是否外伤症状持续时间明显夜间痛年龄高血压高血脂2型糖尿病吸烟与否饮酒与否...腺苷脱氨酶ADA果糖胺肌酸激酶α-L-盐藻糖苷酶乳酸淀粉酶同型半胱氨酸总铁结合力血型
000300.4028641000-0.448892...-0.396787-0.160764-0.176406-1.2411220.269307-0.755958-0.420427-0.880622-1.2260993
111200.1802581000-0.448892...-0.396787-0.079732-0.098498-0.773740-0.3907230.608493-0.538745-0.1325860.0887610
21041-0.3391560000-0.448892...1.055008-0.035743-0.095811-0.0726670.2693070.949606-0.420427-1.742489-0.3604830
310300.0318540000-0.448892...1.345367-0.077417-0.058200-1.241122-0.3907230.096824-0.521842-0.311464-0.1851680
401300.1060560000-0.448892...0.474290-0.095938-0.149541-1.0074310.0052953.678509-0.724673-0.734267-0.9631270

5 rows × 60 columns

使用SVC进行训练

from time import time
import datetime
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, recall_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

使用不同核函数

  • 支持向量机可以通过核函数来将数据映射到更高维度,从而使模型更有效地拟合数据。

  • ploy:多项式核

  • rbf:径向基函数

  • linear:线性核函数

  • sigmoid:核函数

根据结果,有两种需要探索的核函数,sigmoid,linear

smote = SMOTE(sampling_strategy=1, random_state=42)
Xtrain,Xtest,Ytrain,Ytest = train_test_split(jibing,jibing_res,test_size=0.3,random_state=42)
Xtrain, Ytrain = smote.fit_resample(Xtrain,Ytrain)
for kernel in ["poly","rbf",'linear',"sigmoid"]:
    clf = SVC(kernel=kernel,random_state=42)
    clf.fit(Xtrain, Ytrain)
    y_pre = clf.predict(Xtest)
    metrics_ = res_metrics(Ytest,y_pre,"核函数为: {}".format(kernel))
####################核函数为: poly####################
+--------------------+--------------------+--------------------+
|     precision      |       recall       |         f1         |
+--------------------+--------------------+--------------------+
| 0.8056570140920862 | 0.3103448275862069 | 0.4480843629362087 |
+--------------------+--------------------+--------------------+
####################核函数为: rbf#####################
+--------------------+--------------------+---------------------+
|     precision      |       recall       |          f1         |
+--------------------+--------------------+---------------------+
| 0.8051937984496125 | 0.1724137931034483 | 0.28401276375836354 |
+--------------------+--------------------+---------------------+
###################核函数为: linear###################
+--------------------+---------------------+--------------------+
|     precision      |        recall       |         f1         |
+--------------------+---------------------+--------------------+
| 0.8011947349110254 | 0.39655172413793105 | 0.5305215491957693 |
+--------------------+---------------------+--------------------+
##################核函数为: sigmoid###################
+--------------------+--------------------+--------------------+
|     precision      |       recall       |         f1         |
+--------------------+--------------------+--------------------+
| 0.8102625272331155 | 0.6206896551724138 | 0.7029187625012112 |
+--------------------+--------------------+--------------------+

对linear 和 sigmoid 进行调参

首先是 linear ,SVM 默认的核函数就是linear

特征筛选

SelectKBest:

根据指定的统计检验方法选择出最优的 k 个特征。

SMOTE:

使用类似插值的方法扩充少数类。

mutual_info_classif:

衡量两个随机变量间相关性的一种方法,互信息的数值越大,则两个随机变量之间的相关性越强。

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from imblearn.over_sampling import SMOTE
f1_list = []
best_k = -1
best_score = -1
set_font()
for i in range(1,60):
#     sampler = RandomOverSampler(sampling_strategy=0.2, random_state=42)
    smote = SMOTE(sampling_strategy=1, random_state=42)
    selector = SelectKBest(mutual_info_classif, k=i)
    jibing_ = selector.fit_transform(jibing, jibing_res)
    Xtrain,Xtest,Ytrain,Ytest = train_test_split(jibing_,jibing_res,test_size=0.3,random_state=42)
#     Xtrain, Ytrain = sampler.fit_resample(Xtrain,Ytrain)
    Xtrain, Ytrain = smote.fit_resample(Xtrain,Ytrain)
    clf = SVC(random_state=42)
    clf.fit(Xtrain, Ytrain)
    y_pre = clf.predict(Xtest)
    metrics_ = res_metrics(Ytest,y_pre,"调参")
    f1_list.append(metrics_["f1-score"])
    if best_score < metrics_["f1-score"]:
        best_k = i
        best_score = metrics_["f1-score"]
zhexiantu(range(1,60),f1_list,"f1 - 特征筛选")

请添加图片描述

线性的核函数无法完美地拟合高维的数据,从而产生了下降趋势。

best_k
3

实际上,这里所做的是牺牲多数样本的准确性

尽可能将少数样本找出来

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy=1, random_state=42)
selector = SelectKBest(mutual_info_classif, k=3)
jibing_ = selector.fit_transform(jibing, jibing_res)

Xtrain,Xtest,Ytrain,Ytest = train_test_split(jibing_,jibing_res,test_size=0.3,random_state=42)
#     Xtrain, Ytrain = sampler.fit_resample(Xtrain,Ytrain)
Xtrain, Ytrain = smote.fit_resample(Xtrain,Ytrain)

clf = SVC(random_state=42)
clf.fit(Xtrain, Ytrain)
y_pre = clf.predict(Xtest)
metrics_ = res_metrics(Ytest,y_pre,"find_best")
# f1_list.append(metrics_["f1-score"])
####################find_best#####################
+--------------------+--------------------+-------------------+
|     precision      |       recall       |         f1        |
+--------------------+--------------------+-------------------+
| 0.7883562091503269 | 0.5344827586206896 | 0.637058344527622 |
+--------------------+--------------------+-------------------+

PCA 没多大作用

f1_list = []
from sklearn.manifold import TSNE
for i in range(1,3):
    clf = SVC(random_state=42)
    pca = PCA(n_components=i,random_state=42)
    Xtrain_ = pca.fit_transform(Xtrain,Ytrain)
    clf.fit(Xtrain_, Ytrain)
    Xtest_ = pca.fit_transform(Xtest)
    y_pre = clf.predict(Xtest_)
    metrics_ = res_metrics(Ytest,y_pre,"调参")
    f1_list.append(metrics_["f1-score"])
zhexiantu(range(1,3),f1_list,"f1 - PCA")

TSNE效果很好

请添加图片描述

f1_list = []
from sklearn.manifold import TSNE
for i in range(1,3):
    clf = SVC(random_state=42)
    tsne = TSNE(n_components=i,random_state=42)
    Xtrain_ = tsne.fit_transform(Xtrain,Ytrain)
    clf.fit(Xtrain_, Ytrain)
    Xtest_ = tsne.fit_transform(Xtest)
    y_pre = clf.predict(Xtest_)
    metrics_ = res_metrics(Ytest,y_pre,"调参")
    f1_list.append(metrics_["f1-score"])
zhexiantu(range(1,3),f1_list,"tsne - F1")

请添加图片描述

clf = SVC(random_state=42)
tsne = TSNE(n_components=2,random_state=42)
Xtrain_ = tsne.fit_transform(Xtrain,Ytrain)
clf.fit(Xtrain_, Ytrain)
Xtest_ = tsne.fit_transform(Xtest)
y_pre = clf.predict(Xtest_)
metrics_ = res_metrics(Ytest,y_pre,"TSNE-F1")
f1_list.append(metrics_["f1-score"])
#####################TSNE-F1######################
+--------------------+--------------------+--------------------+
|     precision      |       recall       |         f1         |
+--------------------+--------------------+--------------------+
| 0.7909002130681818 | 0.7413793103448276 | 0.7653395422396697 |
+--------------------+--------------------+--------------------+

综上,选择TSNE进行降到2维时,linear 的效果最好

f1-score 为0.765

训练

要调的参数

  • 惩罚系数 C
  • 核函数

接下来要对 sigmoid 核函数进行调参

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from imblearn.over_sampling import SMOTE
f1_list = []
set_font()
smote = SMOTE(sampling_strategy=1, random_state=42)
for i in range(1,60):
    selector = SelectKBest(mutual_info_classif, k=i)
    jibing_ = selector.fit_transform(jibing, jibing_res)
    Xtrain,Xtest,Ytrain,Ytest = train_test_split(jibing_,jibing_res,test_size=0.3,random_state=42)
    Xtrain, Ytrain = smote.fit_resample(Xtrain,Ytrain)    
    clf = SVC(kernel="sigmoid",random_state=42)
    clf.fit(Xtrain, Ytrain)
    y_pre = clf.predict(Xtest)
    metrics_ = res_metrics(Ytest,y_pre,"调参")
    f1_list.append(metrics_["f1-score"])
zhexiantu(range(1,60),f1_list,"f1 - 特征筛选")

请添加图片描述

f1_list = []
set_font()
for i in range(50,60):
    smote = SMOTE(sampling_strategy=1, random_state=42)
    selector = SelectKBest(mutual_info_classif, k=i)
    jibing_ = selector.fit_transform(jibing, jibing_res)
    Xtrain,Xtest,Ytrain,Ytest = train_test_split(jibing_,jibing_res,test_size=0.3,random_state=42)
    Xtrain, Ytrain = smote.fit_resample(Xtrain,Ytrain)
    clf = SVC(kernel="sigmoid",random_state=42)
    clf.fit(Xtrain, Ytrain)
    y_pre = clf.predict(Xtest)
    metrics_ = res_metrics(Ytest,y_pre,"调参")
    f1_list.append(metrics_["f1-score"])
zhexiantu(range(50,60),f1_list,"f1 - 特征筛选")

请添加图片描述

确定了选择前53个特征

smote = SMOTE(sampling_strategy=1, random_state=42)
selector = SelectKBest(mutual_info_classif, k=53)
jibing_ = selector.fit_transform(jibing, jibing_res)
Xtrain,Xtest,Ytrain,Ytest = train_test_split(jibing_,jibing_res,test_size=0.3,random_state=42)
Xtrain, Ytrain = smote.fit_resample(Xtrain,Ytrain)
clf = SVC(kernel="sigmoid",random_state=42)
clf.fit(Xtrain, Ytrain)
y_pre = clf.predict(Xtest)
metrics_ = res_metrics(Ytest,y_pre,"f1-select")
f1_list.append(metrics_["f1-score"])
####################f1-select#####################
+--------------------+--------------------+--------------------+
|     precision      |       recall       |         f1         |
+--------------------+--------------------+--------------------+
| 0.8095399875039051 | 0.6724137931034483 | 0.7346327001417388 |
+--------------------+--------------------+--------------------+

降维-PCA

f1_list = []
for i in range(1,53):
    clf = SVC(kernel="sigmoid",random_state=42)
    pca = PCA(n_components=i,random_state=42)
    Xtrain_ = pca.fit_transform(Xtrain,Ytrain)
    clf.fit(Xtrain_, Ytrain)
    Xtest_ = pca.fit_transform(Xtest)
    y_pre = clf.predict(Xtest_)
    metrics_ = res_metrics(Ytest,y_pre,"调参")
    f1_list.append(metrics_["f1-score"])
zhexiantu(range(1,53),f1_list,"f1 - PCA")

请添加图片描述

f1_list = []
for i in range(1,10):
    clf = SVC(kernel="sigmoid",random_state=42)
    pca = PCA(n_components=i,random_state=42)
    Xtrain_ = pca.fit_transform(Xtrain,Ytrain)
    clf.fit(Xtrain_, Ytrain)
    Xtest_ = pca.fit_transform(Xtest)
    y_pre = clf.predict(Xtest_)
    metrics_ = res_metrics(Ytest,y_pre,"调参")
    f1_list.append(metrics_["f1-score"])
zhexiantu(range(1,10),f1_list,"f1 - PCA")

请添加图片描述

TSNE

f1_list = []
from sklearn.manifold import TSNE
for i in range(1,4):
    clf = SVC(kernel="sigmoid",random_state=42)
    tsne = TSNE(n_components=i,random_state=42)
    Xtrain_ = tsne.fit_transform(Xtrain,Ytrain)
    clf.fit(Xtrain_, Ytrain)
    Xtest_ = tsne.fit_transform(Xtest)
    y_pre = clf.predict(Xtest_)
    metrics_ = res_metrics(Ytest,y_pre,"调参")
    f1_list.append(metrics_["f1-score"])
zhexiantu(range(1,4),f1_list,"tsne - TSNE")

请添加图片描述

最终确定使用PCA降到6维

clf = SVC(kernel="sigmoid",random_state=42)
pca = PCA(n_components=6,random_state=42)
Xtrain_ = pca.fit_transform(Xtrain,Ytrain)
clf.fit(Xtrain_, Ytrain)
Xtest_ = pca.fit_transform(Xtest)
y_pre = clf.predict(Xtest_)
metrics_ = res_metrics(Ytest,y_pre,"PCA-sigmoid")
f1_list.append(metrics_["f1-score"])
###################PCA-sigmoid####################
+--------------------+--------------------+--------------------+
|     precision      |       recall       |         f1         |
+--------------------+--------------------+--------------------+
| 0.8232222222222222 | 0.7413793103448276 | 0.7801602013904365 |
+--------------------+--------------------+--------------------+

寻找C的范围

f1_list = []
for c_ in np.linspace(0.1,100,300):
    clf = SVC(kernel='sigmoid',C=c_)
    clf = clf.fit(Xtrain_,Ytrain)
    y_pre = clf.predict(Xtest_)
    metrics_ = res_metrics(Ytest,y_pre,"调参")
    f1_list.append(metrics_["f1-score"])
zhexiantu(np.linspace(0.1,100,300),f1_list,"C - F1")

请添加图片描述

C的取值越小,拟合程度越好,但也不能太小

最佳值大约在5到6之间。

f1_list = []
for c_ in np.linspace(5,6,10):
    clf = SVC(kernel='sigmoid',C=c_,random_state=42)
    clf = clf.fit(Xtrain_,Ytrain)
    y_pre = clf.predict(Xtest_)
    metrics_ = res_metrics(Ytest,y_pre,"调参")
    f1_list.append(metrics_["f1-score"])
zhexiantu(np.linspace(5,6,10),f1_list,"C - F1")

请添加图片描述

C = 5.6

clf = SVC(kernel='sigmoid',C=5.6,random_state=42)
clf = clf.fit(Xtrain_,Ytrain)
y_pre = clf.predict(Xtest_)
metrics_ = res_metrics(Ytest,y_pre,"SVM-F1-Final")
###################SVM-F1-Final###################
+--------------------+--------------------+--------------------+
|     precision      |       recall       |         f1         |
+--------------------+--------------------+--------------------+
| 0.8350202848153668 | 0.7758620689655172 | 0.8043549105674161 |
+--------------------+--------------------+--------------------+

通过与 linear 核函数对比发现

支持向量机的 f1-score 最高0.804

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值