数据挖掘与机器学习作业_03 特征筛选

特征筛选

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import numpy as np
import scipy
import sklearn
import re
import sys
import random
# 方差
from sklearn.feature_selection import VarianceThreshold
# 相关性过滤
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# 皮尔逊相关系数
from scipy.stats import pearsonr
from collections import OrderedDict
# 使用决策树筛选特征
from sklearn import tree
# 互信息法
from sklearn.feature_selection import mutual_info_classif as MIC
from sklearn.feature_selection import SelectKBest,chi2
from my_tools import *
import warnings
warnings.filterwarnings("ignore")

读取数据

# Load the dataset from the Excel workbook into a DataFrame.
jibing = pd.read_excel(io="./jibing_yuchuli_final.xlsx")
jibing.shape
(1598, 63)

消去方差为 0 的属性

# threshold=0.0 is the default of VarianceThreshold: only columns whose
# variance is exactly zero (constant columns) are removed.
selector = VarianceThreshold(threshold=0.0)
# Fit on the full table, then build the matrix without the rejected columns.
# NOTE(review): jibing_var0 is only inspected for its shape in the visible
# code; later steps keep working on `jibing` itself.
jibing_var0 = selector.fit(jibing).transform(jibing)
jibing_var0.shape
(1598, 63)
jibing.head()
左右是否外伤症状持续时间明显夜间痛性别年龄高血压高血脂2型糖尿病吸烟与否...果糖胺肌酸激酶α-L-盐藻糖苷酶乳酸淀粉酶同型半胱氨酸总铁结合力血型结果
000300651000...1.3248.012.01.949.09.912.343.530
111200621000...1.6777.016.01.481.09.216.955.501
210410550000...1.8678.022.01.989.09.97.051.401
310300600000...1.6892.012.01.469.09.315.853.000
401300610000...1.6058.014.01.7153.08.113.245.901

5 rows × 63 columns

相关性分析

皮尔逊相关系数
# Pearson correlation between every feature column and the label "结果".
# `col` keeps the feature names in DataFrame column order; later cells index
# into it by position.
col = jibing.columns.tolist()
col.remove("结果")
# dict_ maps feature name -> |r|, feature_ls keeps the same names in order.
dict_ = dict()
feature_ls = []
for col_ in col:
    # pearsonr returns the correlation coefficient and its p-value.
    corr, p = pearsonr(jibing["结果"], jibing[col_])
    # A p-value BELOW 0.05 is the usual bar for statistical significance.
    # Features with p > 0.05 are therefore the ones whose linear correlation
    # with the label is NOT significant; they are collected here so their
    # |r| can be inspected as removal candidates.
    if p > 0.05:
        feature_ls.append(col_)
        dict_[col_] = abs(corr)
len(dict_)
59
feature_ls
['左右',
 '症状持续时间',
 '明显夜间痛',
 '性别',
 '年龄',
 '高血压',
 '高血脂',
 '2型糖尿病',
 '吸烟与否',
 '饮酒与否',
 '红细胞计数*10^12/L',
 '血红蛋白',
 '红细胞压积',
 '血小板计数',
 '血小板压积',
 '总蛋白g/L',
 '白蛋白g/L',
 '球蛋白g/L',
 '白球比',
 'ALT丙氨酸氨基转移酶',
 'AST天门冬氨酸氨基转移酶',
 '碱性磷酸酶',
 '谷氨酸转肽酶',
 'AST:ALT',
 '总胆红素',
 '直接胆红素',
 '间接胆红素',
 '钾',
 '钠',
 '氯',
 '钙',
 '磷',
 '镁',
 '葡萄糖',
 '尿素',
 '尿酸',
 '甘油三酯',
 '总胆固醇',
 'H高密度胆固醇',
 'L低密度胆固醇',
 '载脂蛋白A1',
 '载脂蛋白B',
 '载脂蛋白E mg/l',
 'aPoB/aPoA1',
 '脂蛋白小a',
 '乳酸脱氢酶LDH',
 'β-2微球蛋白',
 '胆碱酯酶',
 '前白蛋白mg/l',
 '总胆汁酸',
 '腺苷脱氨酶ADA',
 '果糖胺',
 '肌酸激酶',
 'α-L-盐藻糖苷酶',
 '淀粉酶',
 '同型半胱氨酸',
 '铁',
 '总铁结合力',
 '血型']
# Set the font (set_font comes from my_tools; presumably it selects a font
# that can render the Chinese labels below -- confirm against my_tools).
set_font()
# Plot |r| for every entry of dict_, page_size bars per figure.
# The pages are derived from the real number of entries, so no entry is
# skipped: fixed slices [0:19], [20:39], [40:59] would leave out the
# entries at positions 19 and 39.
page_size = 20
names = list(dict_.keys())
scores = list(dict_.values())
for start in range(0, len(names), page_size):
    end = start + page_size
    show_x = names[start:end]
    show_y = scores[start:end]
    plt.xlabel('属性名')
    plt.ylabel('相关系数')
    # Draw the bar chart for this page.
    plt.bar(show_x, show_y)
    # Rotate the feature names so they do not overlap.
    plt.xticks(rotation=90)
    plt.title("相关性系数")
    # Render the figure.
    plt.show()

请添加图片描述

请添加图片描述

请添加图片描述

使用决策树筛选特征

# Train a decision tree: every column except the last is a feature, the last
# column is the label. random_state pins the tree so the run is repeatable.
tree_features = jibing.iloc[:, :-1]
tree_labels = jibing.iloc[:, -1]
clf = tree.DecisionTreeClassifier(random_state=30).fit(tree_features, tree_labels)

获取特征的重要性

根据决策树的 feature_importances_(各特征带来的归一化不纯度下降量,即基尼重要性)衡量特征的重要程度
# Importance scores of the fitted tree, read from clf.feature_importances_.
# Later code indexes this array and `col` with the same positions, i.e. it
# relies on one score per training column, in training-column order.
tree_importance = clf.feature_importances_
tree_importance
array([0.00642058, 0.00428039, 0.00499379, 0.02252402, 0.        ,
       0.01654977, 0.01911454, 0.        , 0.        , 0.        ,
       0.        , 0.03454957, 0.00667023, 0.0113456 , 0.02181291,
       0.008653  , 0.0320874 , 0.06006682, 0.02659235, 0.02146013,
       0.00285359, 0.        , 0.01941168, 0.01444173, 0.02098024,
       0.00456575, 0.02118258, 0.00256741, 0.01527166, 0.01516202,
       0.02008489, 0.00494003, 0.        , 0.01235312, 0.03628761,
       0.02015343, 0.02685477, 0.03634301, 0.01141437, 0.03151978,
       0.01307189, 0.00508406, 0.01698537, 0.00808379, 0.01866136,
       0.02006391, 0.0165685 , 0.03874059, 0.01265598, 0.02484254,
       0.01521916, 0.0041057 , 0.01748961, 0.0339876 , 0.00445256,
       0.01425327, 0.02496806, 0.01092751, 0.01736746, 0.05684635,
       0.01211596, 0.        ])

由于特征数量较少,这里将皮尔逊相关系数和决策树特征重要性结合起来考虑:只有决策树重要性为 0、且与结果的相关系数绝对值小于 0.01 的特征才会被删除

# Features that will be removed later are collected in drop_list.
drop_list = []
# Positions of the features whose tree importance is exactly zero.
indexes = [
    position
    for position, importance in enumerate(tree_importance)
    if importance == 0
]
indexes
[4, 7, 8, 9, 10, 21, 32, 61]
# A zero-importance feature is dropped only if its |r| with the label is also
# below 0.01.
for index in indexes:
    feature_name = col[index]
    # dict_ only holds features whose Pearson p-value exceeded 0.05. A feature
    # missing from it has no stored |r|, so it is kept rather than looked up
    # directly, which would raise KeyError.
    weak_corr = dict_.get(feature_name)
    if weak_corr is not None and weak_corr < 0.01:
        drop_list.append(feature_name)
drop_list
['性别', 'AST天门冬氨酸氨基转移酶']

将无关属性去掉

jibing.shape
(1598, 63)
type(jibing)
pandas.core.frame.DataFrame
# Remove every column listed in drop_list from the DataFrame, in place.
jibing.drop(columns=drop_list, inplace=True)
jibing.shape
(1598, 61)

特征和结果分开

# The label is taken by position: the last column of the DataFrame.
jibing_res = jibing.iloc[:,-1]
jibing_res.head()
0    0
1    1
2    1
3    0
4    1
Name: 结果, dtype: int64
# Keep every column except the last one as the feature matrix.
# NOTE(review): this rebinds `jibing`, so running the statement again strips
# one more column each time.
jibing = jibing.iloc[:,:-1]
jibing.head()
左右是否外伤症状持续时间明显夜间痛年龄高血压高血脂2型糖尿病吸烟与否饮酒与否...腺苷脱氨酶ADA果糖胺肌酸激酶α-L-盐藻糖苷酶乳酸淀粉酶同型半胱氨酸总铁结合力血型
000306510000...10.01.3248.012.01.949.09.912.343.53
111206210000...10.01.6777.016.01.481.09.216.955.50
210415500000...15.01.8678.022.01.989.09.97.051.40
310306000000...16.01.6892.012.01.469.09.315.853.00
401306100000...13.01.6058.014.01.7153.08.113.245.90

5 rows × 60 columns

jibing
左右是否外伤症状持续时间明显夜间痛年龄高血压高血脂2型糖尿病吸烟与否饮酒与否...腺苷脱氨酶ADA果糖胺肌酸激酶α-L-盐藻糖苷酶乳酸淀粉酶同型半胱氨酸总铁结合力血型
000306510000...10.01.3248.012.01.949.09.912.343.53
111206210000...10.01.6777.016.01.481.09.216.955.50
210415500000...15.01.8678.022.01.989.09.97.051.40
310306000000...16.01.6892.012.01.469.09.315.853.00
401306100000...13.01.6058.014.01.7153.08.113.245.90
..................................................................
159300417610000...12.01.6160.05.01.560.07.921.550.13
159411315810000...16.01.9296.022.01.189.010.316.355.23
159501316800000...11.01.8484.021.02.343.013.117.346.70
159600316200000...19.02.3076.036.01.1125.018.824.051.50
159700405300000...14.01.5477.011.01.379.011.011.554.00

1598 rows × 60 columns

保存

# Write the feature matrix and the label to separate workbooks, without the
# row index.
for frame, target_path in (
    (jibing, "./jibing_feature_final.xlsx"),
    (jibing_res, "./jibing_feature_res_final.xlsx"),
):
    frame.to_excel(target_path, index=False)

这只是初步筛选,在具体的算法中还会进一步筛选

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值