# day 8.6 逻辑回归 - 特征工程 (Logistic Regression — Feature Engineering)

# 方法1.自己分析
# PCA 和 SVD 一般用不上

# 方法2. 统计方法可以使用,但没必要

# todo: 高效的嵌入法

from sklearn.linear_model import LogisticRegression as LR
from sklearn.datasets import load_breast_cancer
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel

# Load the Wisconsin breast-cancer dataset: 569 samples, 30 numeric features,
# binary target (see the shape check below).
data = load_breast_cancer()
# print(data.data.shape)  # (569, 30)
# Instantiate the baseline model.
# liblinear supports both L1/L2 penalties; C=0.9 gives mild regularization,
# random_state=420 keeps the liblinear runs reproducible.
LR_ = LR(solver="liblinear", C=0.9, random_state=420)
# Baseline 10-fold CV accuracy on ALL 30 features:
# print(cross_val_score(LR_, data.data, data.target, cv=10).mean())
# 0.9490601503759398

# todo 降维

# 筛选特征的阈值 threshold=float,
## norm_order=1 L1范数
# X_embedded = SelectFromModel(LR_, norm_order=1).fit_transform(data.data, data.target)
## print(X_embedded.shape)  # (569, 9)
## print(cross_val_score(LR_, X_embedded, data.target, cv=10).mean())
## 0.9368107769423559
## 虽然特征被减少到了个位数,模型效果却没有下降太多,如果我们要求不高,在这里其实就可以停下来了

# TODO: hyperparameter tuning
# Approach 1: plot a learning curve over SelectFromModel's `threshold`.
# Score accumulators shared by the (commented-out) experiments below:
# `fullx` — CV scores on the full 30-feature set,
# `fsx`  — CV scores on the embedded-selected feature subset.
fullx = list()
fsx = list()
# # 取对模型贡献系数的绝对值,再取最大值
# threshold = np.linspace(0, abs(LR_.fit(data.data, data.target).coef_).max(), 20)
# k = 0
# for i in threshold:
#     X_embedded = SelectFromModel(LR_, threshold=i).fit_transform(data.data, data.target)
#     fullx.append(cross_val_score(LR_, data.data, data.target, cv=5).mean())
#     fsx.append(cross_val_score(LR_, X_embedded, data.target, cv=5).mean())
#     print((threshold[k], X_embedded.shape[1]))
#     k += 1
# plt.figure(figsize=(20, 5))
# plt.plot(threshold, fullx, label="full")
# plt.plot(threshold, fsx, label="feature selection")
# plt.xticks(threshold)
# plt.legend()
# plt.show()
# 结论 这种方法是无效的,这里发现,threshold越大,被删除的特征越多,模型效果越来越差

# todo  第二种调整方法
# C = np.arange(0.01, 10.01, 0.5)
# for i in C:
#     LR_ = LR(solver="liblinear", C=i, random_state=420)
#     fullx.append(cross_val_score(LR_, data.data, data.target, cv=10).mean())
#     X_embedded = SelectFromModel(LR_, norm_order=1).fit_transform(data.data,data.target)
#     fsx.append(cross_val_score(LR_, X_embedded, data.target, cv=10).mean())
# print(max(fsx), C[fsx.index(max(fsx))])    #  0.9561090225563911 7.01
# plt.figure(figsize=(20,5))
# plt.plot(C,fullx,label='full')
# plt.plot(C,fsx,label="feature selection")
# plt.xticks()
# plt.legend()
# plt.show()
# 结论,在0.9561090225563911 7.01 最好

#  todo 进一步精细学习曲线
# C = np.arange(6.05, 7.05, 0.005)
# for i in C:
#     LR_ = LR(solver="liblinear", C=i, random_state=420)
#     fullx.append(cross_val_score(LR_, data.data, data.target, cv=10).mean())
#     X_embedded = SelectFromModel(LR_, norm_order=1).fit_transform(data.data, data.target)
#     fsx.append(cross_val_score(LR_, X_embedded, data.target, cv=10).mean())
# print(max(fsx), C[fsx.index(max(fsx))])      # 0.9561090225563911 6.069999999999999
# plt.figure(figsize=(20, 5))
# plt.plot(C, fullx, label='full')
# plt.plot(C, fsx, label="feature selection")
# plt.xticks()
# plt.legend()
# plt.show()


# TODO: conclusion
# Final model with C tuned via the learning curves above (best ≈ 6.07).
LR_2 = LR(solver="liblinear", C=6.069999999999999, random_state=420)
# BUG FIX: embed the *tuned* estimator (LR_2) in the selector, not the stale
# baseline LR_ (C=0.9). The tuning loops above fit SelectFromModel with the
# same C that was being cross-validated — that is how the reported best score
# (0.9561090225563911 at C≈6.07) was obtained — so the final pipeline must
# select features with LR_2 as well to reproduce it.
X_embedded = SelectFromModel(LR_2, norm_order=1).fit_transform(data.data, data.target)
# 10-fold CV accuracy of the tuned model on the L1-selected feature subset.
result = cross_val_score(LR_2, X_embedded, data.target, cv=10).mean()
print(result)  # mismatched-selector version printed 0.9526002506265664
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值