day 8.6 Logistic Regression - Feature Engineering (embedded feature selection)

# Method 1: select features by hand, based on domain knowledge.
# PCA and SVD are generally not useful here (the transformed components lose interpretability).
# Method 2: statistical filter methods would work, but are unnecessary.
# todo: efficient embedded feature selection
from sklearn.linear_model import LogisticRegression as LR
from sklearn.datasets import load_breast_cancer
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel

data = load_breast_cancer()
# print(data.data.shape)  # (569, 30)

# Instantiate the model
LR_ = LR(solver="liblinear", C=0.9, random_state=420)
# print(cross_val_score(LR_, data.data, data.target, cv=10).mean())  # 0.9490601503759398

# todo: dimensionality reduction
# threshold (a float) is the importance cutoff for keeping a feature;
# norm_order=1 means feature importance is measured by the L1 norm of coef_.
# X_embedded = SelectFromModel(LR_, norm_order=1).fit_transform(data.data, data.target)
# print(X_embedded.shape)  # (569, 9)
# print(cross_val_score(LR_, X_embedded, data.target, cv=10).mean())  # 0.9368107769423559
# Although the feature count drops to single digits, performance barely decreases.
# If our requirements are modest, we could stop right here.

# todo: parameter tuning
# Approach 1: plot a learning curve over threshold
# (re-initialize fullx and fsx before running each experiment below)
fullx = []
fsx = []

# # Take the absolute values of the fitted coefficients; their maximum bounds the threshold range.
# threshold = np.linspace(0, abs(LR_.fit(data.data, data.target).coef_).max(), 20)
# k = 0
# for i in threshold:
#     X_embedded = SelectFromModel(LR_, threshold=i).fit_transform(data.data, data.target)
#     fullx.append(cross_val_score(LR_, data.data, data.target, cv=5).mean())
#     fsx.append(cross_val_score(LR_, X_embedded, data.target, cv=5).mean())
#     print((threshold[k], X_embedded.shape[1]))
#     k += 1
# plt.figure(figsize=(20, 5))
# plt.plot(threshold, fullx, label="full")
# plt.plot(threshold, fsx, label="feature selection")
# plt.xticks(threshold)
# plt.legend()
# plt.show()
# Conclusion: this approach is ineffective. The larger the threshold, the more features
# are dropped and the worse the model performs.

# todo: Approach 2: plot a learning curve over C instead
# C = np.arange(0.01, 10.01, 0.5)
# for i in C:
#     LR_ = LR(solver="liblinear", C=i, random_state=420)
#     fullx.append(cross_val_score(LR_, data.data, data.target, cv=10).mean())
#     X_embedded = SelectFromModel(LR_, norm_order=1).fit_transform(data.data, data.target)
#     fsx.append(cross_val_score(LR_, X_embedded, data.target, cv=10).mean())
# print(max(fsx), C[fsx.index(max(fsx))])  # 0.9561090225563911 7.01
# plt.figure(figsize=(20, 5))
# plt.plot(C, fullx, label="full")
# plt.plot(C, fsx, label="feature selection")
# plt.xticks()
# plt.legend()
# plt.show()
# Conclusion: the best score, 0.9561090225563911, occurs at C = 7.01.

# todo: refine the learning curve around the best C
# C = np.arange(6.05, 7.05, 0.005)
# for i in C:
#     LR_ = LR(solver="liblinear", C=i, random_state=420)
#     fullx.append(cross_val_score(LR_, data.data, data.target, cv=10).mean())
#     X_embedded = SelectFromModel(LR_, norm_order=1).fit_transform(data.data, data.target)
#     fsx.append(cross_val_score(LR_, X_embedded, data.target, cv=10).mean())
# print(max(fsx), C[fsx.index(max(fsx))])  # 0.9561090225563911 6.069999999999999
# plt.figure(figsize=(20, 5))
# plt.plot(C, fullx, label="full")
# plt.plot(C, fsx, label="feature selection")
# plt.xticks()
# plt.legend()
# plt.show()

# todo: conclusion
# Note: the selection below still uses LR_ (C=0.9); only the evaluation uses the tuned C.
LR_2 = LR(solver="liblinear", C=6.069999999999999, random_state=420)
X_embedded = SelectFromModel(LR_, norm_order=1).fit_transform(data.data, data.target)
result = cross_val_score(LR_2, X_embedded, data.target, cv=10).mean()
print(result)  # 0.9526002506265664
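
# Added sketch (not in the original notes): it can be useful to see which of the
# 30 features the embedded selector actually keeps. get_support() is the standard
# SelectFromModel accessor for the boolean selection mask; this assumes the script
# above has already run.
selector = SelectFromModel(LR_, norm_order=1).fit(data.data, data.target)
mask = selector.get_support()      # boolean mask over the 30 original columns
print(data.feature_names[mask])    # names of the surviving features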
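
# Optional variant (an added sketch, not from the original notes): the conclusion block
# selects features with the untuned LR_ (C=0.9) and only evaluates with LR_2. If the
# tuned estimator should also drive the selection, the comparison would look like this:
# X_embedded2 = SelectFromModel(LR_2, norm_order=1).fit_transform(data.data, data.target)
# print(cross_val_score(LR_2, X_embedded2, data.target, cv=10).mean())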