Data Mining and Machine Learning Assignment 05: Logistic Regression

Logistic Regression

Import packages

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from imblearn.over_sampling import RandomOverSampler
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from my_tools import *
import warnings
warnings.filterwarnings("ignore")
# Load the feature table and the label column from Excel
jibing_res = pd.read_excel("./jibing_feature_res_final.xlsx")   # labels (结果)
jibing = pd.read_excel("./jibing_feature_final.xlsx")           # feature matrix
jibing.head()
[Output: the first 5 rows of the feature DataFrame, 5 rows × 60 columns — columns run from 左右, 是否外伤, 症状持续时间, 明显夜间痛, 年龄, 高血压, 高血脂, 2型糖尿病, 吸烟与否, 饮酒与否, … through 腺苷脱氨酶ADA, 果糖胺, 肌酸激酶, α-L-盐藻糖苷酶, 乳酸, 淀粉酶, 同型半胱氨酸, 总铁结合力, 血型.]

jibing_res.head()
   结果
0    0
1    1
2    1
3    0
4    1

Normalization

jibing = guiyihua(jibing)
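The guiyihua helper comes from my_tools and is not shown in the post. A minimal sketch of what a min-max normalization helper like this could look like (the implementation below is an assumption; judging from the later output, the real helper appears to leave some coded/categorical columns untouched):

from sklearn.preprocessing import MinMaxScaler
import pandas as pd

def guiyihua(df):
    # Assumed behaviour: scale every column into [0, 1] with min-max scaling
    scaler = MinMaxScaler()
    return pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)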

Standardization

jibing = biaozhunhua(jibing)
jibing.iloc[0]
左右               0.000000
是否外伤             0.000000
症状持续时间           3.000000
明显夜间痛            0.000000
年龄               0.402864
高血压              1.000000
高血脂              0.000000
2型糖尿病            0.000000
吸烟与否             0.000000
饮酒与否            -0.448892
红细胞计数*10^12/L   -0.111242
血红蛋白            -1.262287
红细胞压积           -0.628449
血小板计数            1.836626
血小板压积           -0.016066
总蛋白g/L           0.117665
白蛋白g/L          -0.783686
球蛋白g/L           0.892589
白球比             -1.141215
ALT丙氨酸氨基转移酶     -0.955624
碱性磷酸酶            0.577122
谷氨酸转肽酶          -0.458009
AST:ALT          1.972187
总胆红素            -0.567388
直接胆红素            0.058454
间接胆红素           -0.700329
钾                1.331665
钠               -0.154827
氯               -0.203053
钙               -1.011273
磷               -0.094543
镁                1.419808
葡萄糖             -0.813153
肌酐               0.219459
尿素               0.950509
尿酸              -0.222815
甘油三酯             0.111053
总胆固醇             0.102856
H高密度胆固醇          0.085759
L低密度胆固醇          0.101836
载脂蛋白A1          -0.047968
载脂蛋白B            0.763163
载脂蛋白E mg/l      -0.397325
aPoB/aPoA1       0.073904
脂蛋白小a           -0.059640
乳酸脱氢酶LDH        -1.057407
β-2微球蛋白          1.273672
胆碱酯酶            -1.187449
前白蛋白mg/l         0.070510
总胆汁酸            -0.415554
腺苷脱氨酶ADA        -0.396787
果糖胺             -0.160764
肌酸激酶            -0.176406
α-L-盐藻糖苷酶       -1.241122
乳酸               0.269307
淀粉酶             -0.755958
同型半胱氨酸          -0.420427
铁               -0.880622
总铁结合力           -1.226099
血型               3.000000
Name: 0, dtype: float64
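Like guiyihua, the biaozhunhua helper lives in my_tools and is not shown. Judging from the output above, continuous columns end up with roughly zero-mean, unit-variance values while coded columns such as 症状持续时间 and 血型 keep their integer codes, so the real helper presumably standardizes only the continuous columns. A minimal sketch of the idea (assumed implementation):

from sklearn.preprocessing import StandardScaler
import pandas as pd

def biaozhunhua(df):
    # Assumed behaviour: z-score standardization (zero mean, unit variance) per column
    scaler = StandardScaler()
    return pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)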

To address the class-imbalance problem

we use SMOTE to augment the data.

SMOTE oversamples the minority class by interpolating between existing minority samples.
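Concretely, SMOTE picks a minority-class sample x, one of its nearest minority neighbours x_nn, and creates a synthetic point x_new = x + λ·(x_nn − x) with λ drawn from [0, 1]. A small self-contained demo on toy data (not the assignment's dataset) showing how the class counts change:

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

# Toy imbalanced dataset: roughly 90% class 0, 10% class 1
X, y = make_classification(n_samples=1000, n_features=10, weights=[0.9, 0.1], random_state=42)
print(Counter(y))            # e.g. Counter({0: 897, 1: 103})

# sampling_strategy=1 oversamples the minority class until the ratio is 1:1
X_res, y_res = SMOTE(sampling_strategy=1, random_state=42).fit_resample(X, y)
print(Counter(y_res))        # both classes now have the same count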

from sklearn.feature_selection import SelectKBest
from imblearn.over_sampling import SMOTE
f1_list = []
set_font()
# Try keeping the k most informative features (by mutual information) for k = 1..59
for i in range(1,60):
    smote = SMOTE(sampling_strategy=1, random_state=42)     # oversample the minority class to a 1:1 ratio
    selector = SelectKBest(mutual_info_classif, k=i)
    jibing_ = selector.fit_transform(jibing, jibing_res)
    Xtrain,Xtest,Ytrain,Ytest = train_test_split(jibing_,jibing_res,test_size=0.3,random_state=42)
    Xtrain, Ytrain = smote.fit_resample(Xtrain,Ytrain)       # only the training split is resampled
    clf = LogisticRegression(random_state=42)
    clf.fit(Xtrain, Ytrain)
    y_pre = clf.predict(Xtest)
    metrics_ = res_metrics(Ytest,y_pre,"调参")
    f1_list.append(metrics_["f1-score"])
zhexiantu(range(1,60),f1_list,"f1 - 特征筛选")
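The helpers set_font, res_metrics, and zhexiantu also come from my_tools. The real res_metrics prints a PrettyTable-style report (see the tables further down); the sketch below only reproduces the interface the loop relies on, namely that res_metrics returns a dict with an "f1-score" key and zhexiantu draws a line chart (assumed implementations):

import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score

def set_font():
    # Make matplotlib render the Chinese labels and titles
    plt.rcParams["font.sans-serif"] = ["SimHei"]
    plt.rcParams["axes.unicode_minus"] = False

def res_metrics(y_true, y_pred, title):
    # Compute precision / recall / f1 and return them; the original also prints a table
    metrics = {
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1-score": f1_score(y_true, y_pred),
    }
    print(f"#### {title} ####\n{metrics}")
    return metrics

def zhexiantu(x, y, title):
    # Simple line chart of y against x
    plt.plot(list(x), y, marker="o")
    plt.title(title)
    plt.show()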

[Figure: line chart "f1 - 特征筛选" — f1-score vs. number of selected features, k = 1…59]

The highest f1-score appears for k in the 1–10 range, so we zoom in on that interval.

# Repeat the search on the narrower range k = 5..10
f1_list=[]
for i in range(5,11):
    smote = SMOTE(sampling_strategy=1, random_state=42)
    selector = SelectKBest(mutual_info_classif, k=i)
    jibing_ = selector.fit_transform(jibing, jibing_res)
    Xtrain,Xtest,Ytrain,Ytest = train_test_split(jibing_,jibing_res,test_size=0.3,random_state=42)
    Xtrain, Ytrain = smote.fit_resample(Xtrain,Ytrain)
    clf = LogisticRegression(random_state=42)
    clf.fit(Xtrain, Ytrain)
    y_pre = clf.predict(Xtest)
    metrics_ = res_metrics(Ytest,y_pre,"调参")
    f1_list.append(metrics_["f1-score"])
zhexiantu(range(5,11),f1_list,"f1 - 特征筛选")

[Figure: line chart "f1 - 特征筛选" — f1-score vs. k for k = 5…10]

The best result comes from the top 6 features, so we train with k = 6.

# Final run: keep the 6 best features, oversample the training split, then fit
smote = SMOTE(sampling_strategy=1, random_state=42)
selector = SelectKBest(mutual_info_classif, k=6)
jibing_ = selector.fit_transform(jibing, jibing_res)
Xtrain,Xtest,Ytrain,Ytest = train_test_split(jibing_,jibing_res,test_size=0.3,random_state=42)

Xtrain, Ytrain = smote.fit_resample(Xtrain,Ytrain)
clf = LogisticRegression(random_state=42)
clf.fit(Xtrain, Ytrain)

y_pre = clf.predict(Xtest)
metrics_ = res_metrics(Ytest,y_pre,"f1 - k=6")
#####################f1 - k=6#####################
+--------------------+--------------------+--------------------+
|     precision      |       recall       |         f1         |
+--------------------+--------------------+--------------------+
| 0.8265539116030419 | 0.6724137931034483 | 0.7415586728925839 |
+--------------------+--------------------+--------------------+
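The post does not list which 6 features survived the selection; if one wanted to check, SelectKBest exposes get_support() (a follow-up step, not part of the original run):

# Names of the 6 columns kept by SelectKBest (not shown in the original post)
selected_cols = jibing.columns[selector.get_support()]
print(list(selected_cols))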

Trying dimensionality reduction

PCA

f1_list = []
from sklearn.decomposition import PCA
# Project the selected features onto 1..5 principal components
for i in range(1,6):
    clf = LogisticRegression(random_state=42)
    pca = PCA(n_components=i,random_state=42)
    Xtrain_ = pca.fit_transform(Xtrain)    # fit PCA on the training set only
    clf.fit(Xtrain_, Ytrain)
    Xtest_ = pca.transform(Xtest)          # reuse the fitted PCA instead of refitting on the test set
    y_pre = clf.predict(Xtest_)
    metrics_ = res_metrics(Ytest,y_pre,"调参")
    f1_list.append(metrics_["f1-score"])
zhexiantu(range(1,6),f1_list,"f1 - PCA")
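Another way to judge how many components are worth keeping is the explained variance ratio; this was not done in the original run, but it would look roughly like:

# Fraction of variance captured by each principal component (illustrative follow-up)
pca_full = PCA(random_state=42).fit(Xtrain)
print(pca_full.explained_variance_ratio_)
print(pca_full.explained_variance_ratio_.cumsum())   # cumulative share kept by the first n components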

[Figure: line chart "f1 - PCA" — f1-score vs. number of principal components, 1…5]

t-SNE

f1_list = []
from sklearn.manifold import TSNE
# Embed the selected features into 1..3 dimensions with t-SNE
for i in range(1,4):
    clf = LogisticRegression(random_state=42)
    tsne = TSNE(n_components=i,random_state=42)
    Xtrain_ = tsne.fit_transform(Xtrain)
    clf.fit(Xtrain_, Ytrain)
    # t-SNE has no transform() for new data, so the test set gets its own,
    # independent embedding -- the train and test spaces are not aligned
    Xtest_ = tsne.fit_transform(Xtest)
    y_pre = clf.predict(Xtest_)
    metrics_ = res_metrics(Ytest,y_pre,"调参")
    f1_list.append(metrics_["f1-score"])
zhexiantu(range(1,4),f1_list,"tsne - F1")

[Figure: line chart "tsne - F1" — f1-score vs. t-SNE embedding dimension, 1…3]

Neither projection improves the score (t-SNE in particular cannot map unseen samples into the training embedding), so we abandon dimensionality reduction and use the k = 6 feature selection directly.

smote = SMOTE(sampling_strategy=1, random_state=42)
selector = SelectKBest(mutual_info_classif, k=6)
jibing_ = selector.fit_transform(jibing, jibing_res)
Xtrain,Xtest,Ytrain,Ytest = train_test_split(jibing_,jibing_res,test_size=0.3,random_state=42)
Xtrain, Ytrain = smote.fit_resample(Xtrain,Ytrain)
# train and fit the model
clf = LogisticRegression(random_state=42)
clf.fit(Xtrain, Ytrain)
y_pre = clf.predict(Xtest)
metrics_ = res_metrics(Ytest,y_pre,"逻辑回归-特征筛选")
####################逻辑回归-特征筛选#####################
+--------------------+--------------------+--------------------+
|     precision      |       recall       |         f1         |
+--------------------+--------------------+--------------------+
| 0.8263447971781305 | 0.7068965517241379 | 0.7619678246723789 |
+--------------------+--------------------+--------------------+

The best f1-score achieved is about 0.76.
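GridSearchCV and cross_val_score are imported at the top but never used. A natural next step, sketched here as an assumption rather than part of the original experiment, would be to tune the regularization of the final k = 6 model:

# Hypothetical follow-up: tune C and the penalty of the final model with 5-fold CV on f1
param_grid = {
    "C": [0.01, 0.1, 1, 10, 100],
    "penalty": ["l1", "l2"],
    "solver": ["liblinear"],   # liblinear supports both l1 and l2 penalties
}
grid = GridSearchCV(LogisticRegression(random_state=42), param_grid, scoring="f1", cv=5)
grid.fit(Xtrain, Ytrain)
print(grid.best_params_)
res_metrics(Ytest, grid.best_estimator_.predict(Xtest), "逻辑回归-GridSearchCV")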

