Gitee仓库地址:特征筛选LASSO回归封装好的代码、数据集和结果
README
LassoFeatureSelector_main
这个是主函数文件,在实例化LassoFeatureSelector类时,需要传入下面这些参数:
- input_train_data_path:输入训练集的路径
- input_test_data_path:输入测试集的路径
- output_train_path:输出训练集的路径
- output_test_path:输出测试集的路径
- upper_limit_alpha:正则化系数搜索范围上限
- lower_limit_alpha:正则化系数搜索范围下限
- iterations:LASSO回归的最大迭代次数(代码中同时用作正则化系数搜索区间内等间距取点的个数)
- cv:选择最佳正则化系数时的交叉验证折数(即k折交叉验证中的k)
实例化后调用总运行函数即可:lasso_selector.run_all()
LassoFeatureSelector
这个是封装好的类,主要实现以下几个功能:
- 计算特征筛选前后的方差膨胀因子,输出并导出
- 绘制岭迹图并导出
- 以MSE为损失函数进行LASSO回归
- k折交叉验证进行最佳正则化系数的搜索
- 导出特征筛选后的训练集和测试集
- 无论输入的文件格式是xlsx文件还是csv文件,类都能读取
数据集
数据集来自网络入侵检测领域的经典数据集:NSLKDD
预处理好的数据集和导出的训练集测试集可以在百度网盘下载:
链接:https://pan.baidu.com/s/125SniuPOWFkrB4fONtIPQw?pwd=fgin 提取码:fgin
原始数据集见官网下载:
导出的文件
- LASSO系数矩阵.xlsx
- LASSO回归岭迹图.png
- 原始训练集的方差膨胀因子.xlsx
- LASSO回归后训练集的方差膨胀因子.xlsx
- NSLKDD_train_LASSO.xlsx
- NSLKDD_test_LASSO.xlsx
LASSO回归参数说明
若需要调整LASSO回归的参数,需要到LassoFeatureSelector文件的lasso_regression函数中进行修改
lassoreg = Lasso(alpha=alpha, max_iter=self.iterations, fit_intercept=True,precompute=False, copy_X=False,tol=0.0001, warm_start=False,positive=False,selection='cyclic')
alpha=alpha: 正则化强度,控制稀疏性
fit_intercept=True: 拟合截距
precompute=False: 是否预计算 Gram 矩阵,通常设置为 False
copy_X: 是否在拟合前复制输入数据。为 True 时复制一份再拟合;为 False 时输入数据可能在拟合过程中被覆盖,建议设置为 True
max_iter=self.iterations: 最大迭代次数,控制算法运行的最大迭代次数
tol=0.0001: 收敛的容忍度,指定算法收敛的阈值
warm_start=False: 如果为 True,则复用上一次调用 fit 得到的解作为本次拟合的初始值;否则清除上一次的解,重新开始拟合
positive=False: 如果为 True,则要求系数为正
selection='cyclic': 指定系数更新的策略。'cyclic' 表示按循环顺序逐个更新系数
封装好的类
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from matplotlib import rcParams
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import Lasso, LassoCV
from sklearn.metrics import mean_squared_error
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings
import os
# Print DataFrames in full: no column or row truncation.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)
# Font used by matplotlib for Chinese labels; unicode_minus is disabled alongside it
# so that minus signs on the axes are drawn with a plain hyphen.
rcParams.update({
    'font.sans-serif': ['Microsoft YaHei'],
    'axes.unicode_minus': False,
})
# Suppress scikit-learn convergence warnings only; other warnings stay visible.
warnings.filterwarnings("ignore", category=ConvergenceWarning)
class LassoFeatureSelector:
    """LASSO-based feature selection for a train/test pair of tabular data sets.

    Both tables are expected to hold the features in every column except the
    last one, and the regression target in the last column.  The pipeline
    (see ``run_all``) sweeps a grid of regularisation strengths, plots the
    coefficient paths, picks the best alpha by cross-validation, keeps the
    features with a non-zero coefficient and exports the reduced data sets
    together with variance-inflation-factor (VIF) reports.
    """

    # File extensions that can be read and written (compared case-insensitively).
    _SUPPORTED_EXTENSIONS = ('.xlsx', '.csv')

    def __init__(self, input_train_data_path, input_test_data_path, output_train_path, output_test_path,
                 upper_limit_alpha, lower_limit_alpha, iterations, cv):
        """Store the configuration; no file is touched here.

        Args:
            input_train_data_path: path of the training table (.xlsx or .csv).
            input_test_data_path: path of the test table (.xlsx or .csv).
            output_train_path: where the reduced training table is written.
            output_test_path: where the reduced test table is written.
            upper_limit_alpha: upper end of the alpha search interval.
            lower_limit_alpha: lower end of the alpha search interval.
            iterations: ``max_iter`` of every LASSO fit; also used as the
                number of evenly spaced alpha values in the search interval.
            cv: number of cross-validation folds used to pick the best alpha.
        """
        self.input_train_data_path = input_train_data_path
        self.input_test_data_path = input_test_data_path
        self.output_train_path = output_train_path
        self.output_test_path = output_test_path
        self.upper_limit_alpha = upper_limit_alpha
        self.lower_limit_alpha = lower_limit_alpha
        self.iterations = iterations
        self.cv = cv

    @classmethod
    def _table_extension(cls, path):
        """Return the lower-cased extension of *path*, or raise ValueError if unsupported."""
        extension = os.path.splitext(path)[1].lower()
        if extension not in cls._SUPPORTED_EXTENSIONS:
            raise ValueError(
                f"Unsupported file type {extension!r} for {path!r}; "
                f"expected one of {cls._SUPPORTED_EXTENSIONS}"
            )
        return extension

    @classmethod
    def _read_table(cls, path):
        """Read an .xlsx or .csv file into a DataFrame."""
        if cls._table_extension(path) == '.xlsx':
            return pd.read_excel(path)
        return pd.read_csv(path)

    @classmethod
    def _write_table(cls, frame, path):
        """Write *frame* to an .xlsx or .csv file without the index column."""
        if cls._table_extension(path) == '.xlsx':
            frame.to_excel(path, index=False)
        else:
            frame.to_csv(path, index=False)

    def _feature_names(self):
        """Return the labels of every training column except the last (target) one."""
        return list(self.input_train_data.columns[:-1])

    def load_data(self):
        """Load the training and test tables into ``input_train_data`` / ``input_test_data``.

        Raises:
            ValueError: if either input path is neither .xlsx nor .csv.
        """
        self.input_train_data = self._read_table(self.input_train_data_path)
        self.input_test_data = self._read_table(self.input_test_data_path)

    def lasso_regression(self, train, test, alpha):
        """Fit one LASSO model on *train* and score it on *test*.

        Returns:
            A flat list ``[alpha, mse, feature_count, coef_0, coef_1, ...]`` where
            ``feature_count`` is the number of non-zero coefficients.
        """
        # copy_X=True: the same training frame is refitted for every alpha, so the
        # estimator must not be allowed to overwrite it in place.
        lassoreg = Lasso(alpha=alpha, max_iter=self.iterations, fit_intercept=True, precompute=False,
                         copy_X=True, tol=0.0001, warm_start=False, positive=False, selection='cyclic')
        lassoreg.fit(train.iloc[:, 0:-1], train.iloc[:, -1])
        feature_count = np.sum(lassoreg.coef_ != 0)
        y_pred = lassoreg.predict(test.iloc[:, 0:-1])
        mse = mean_squared_error(test.iloc[:, -1], y_pred)
        return [alpha, mse, feature_count, *lassoreg.coef_]

    def matrix_lasso(self):
        """Build the alpha-by-coefficient matrix and export it to 'LASSO系数矩阵.xlsx'.

        The training table is split 80/20 (fixed seed); each alpha of the grid is
        fitted on the larger part and scored on the smaller one.  The result is
        kept in ``coef_matrix_lasso`` (one row per alpha).
        """
        self.alpha_lasso = np.linspace(self.lower_limit_alpha, self.upper_limit_alpha, self.iterations)
        col = ["alpha", "mse", "feature_count"] + self._feature_names()
        ind = ["alpha_%.4g" % alpha for alpha in self.alpha_lasso]
        # Hold out part of the training set for scoring each alpha.
        input_train_1, input_train_2 = train_test_split(self.input_train_data, test_size=0.2, random_state=42)
        rows = [self.lasso_regression(input_train_1, input_train_2, alpha) for alpha in self.alpha_lasso]
        # Building the frame in one step keeps numeric dtypes instead of object columns.
        self.coef_matrix_lasso = pd.DataFrame(rows, index=ind, columns=col)
        self.coef_matrix_lasso.to_excel(r'LASSO系数矩阵.xlsx', index=True)

    def plot_lasso_path(self):
        """Plot every feature's coefficient against alpha, save the figure and show it."""
        feature_names = self._feature_names()
        feature_total = len(feature_names)
        plt.figure(figsize=(14, 6.8))
        for position, name in enumerate(feature_names):
            plt.plot(self.coef_matrix_lasso["alpha"],
                     self.coef_matrix_lasso[name],
                     color=plt.cm.Set1(position / feature_total),
                     label=name)
        plt.legend(loc="upper right", ncol=2, prop={'size': 7})
        plt.xlabel("正则化系数", fontsize=14)
        plt.ylabel("回归系数", fontsize=14)
        # No extension given: matplotlib appends its default output format.
        plt.savefig(r'LASSO回归岭迹图', dpi=600)
        plt.show()

    def select_best_alpha(self):
        """Pick the best alpha on the grid by cross-validation and store it in ``lasso_best_alpha``."""
        alpha_choose = np.linspace(self.lower_limit_alpha, self.upper_limit_alpha, self.iterations)
        lasso_cv = LassoCV(alphas=alpha_choose, cv=self.cv, max_iter=self.iterations)
        lasso_cv.fit(self.input_train_data.iloc[:, 0:-1], self.input_train_data.iloc[:, -1])
        self.lasso_best_alpha = lasso_cv.alpha_
        print(f"选择的最佳正则化系数: {self.lasso_best_alpha}")

    def calculate_vif(self, data):
        """Return a two-column DataFrame with the VIF of every column of *data*."""
        # NOTE(review): no intercept column is added before computing the VIFs;
        # confirm against the statsmodels documentation whether one is expected.
        values = data.values  # materialise once instead of once per column
        vif = pd.DataFrame()
        vif['特征'] = data.columns
        vif['方差膨胀因子'] = [variance_inflation_factor(values, i) for i in range(data.shape[1])]
        return vif

    def fit_lasso_model(self):
        """Fit LASSO with the best alpha on the full training set.

        Stores the model in ``lasso_model`` and the labels of the features with a
        non-zero coefficient in ``selected_features``.  Requires
        ``select_best_alpha`` to have run first.
        """
        features = self.input_train_data.iloc[:, 0:-1]
        self.lasso_model = Lasso(alpha=self.lasso_best_alpha, fit_intercept=True,
                                 max_iter=self.iterations, random_state=42, selection='cyclic')
        self.lasso_model.fit(features, self.input_train_data.iloc[:, -1])
        self.selected_features = features.columns[self.lasso_model.coef_ != 0]

    def _vif_before_and_after(self):
        """Return the VIF tables of all training features and of the selected ones."""
        features = self.input_train_data.iloc[:, 0:-1]
        return self.calculate_vif(features), self.calculate_vif(features[self.selected_features])

    def save_vif(self):
        """Print the VIF tables before/after selection and export both to .xlsx files."""
        vif_before, vif_after = self._vif_before_and_after()
        print("原始数据的方差膨胀因子:\n", vif_before)
        print("筛选特征后的方差膨胀因子:\n", vif_after)
        vif_before.to_excel(r'原始训练集的方差膨胀因子.xlsx', index=False)
        vif_after.to_excel(r'LASSO回归后训练集的方差膨胀因子.xlsx', index=False)

    def save_selected_data(self):
        """Export the training and test sets reduced to the selected features plus the target.

        Raises:
            ValueError: if either output path is neither .xlsx nor .csv; both
                paths are checked before anything is written.
        """
        for path in (self.output_train_path, self.output_test_path):
            self._table_extension(path)
        kept = list(self.selected_features)
        selected_data_train = self.input_train_data[kept + [self.input_train_data.columns[-1]]]
        selected_data_test = self.input_test_data[kept + [self.input_test_data.columns[-1]]]
        self._write_table(selected_data_train, self.output_train_path)
        self._write_table(selected_data_test, self.output_test_path)

    def calculate_avg_vif(self):
        """Print the mean VIF before and after feature selection."""
        vif_before, vif_after = self._vif_before_and_after()
        avg_vif_before = vif_before['方差膨胀因子'].mean()
        avg_vif_after = vif_after['方差膨胀因子'].mean()
        print(f"特征筛选前的平均方差膨胀因子: {avg_vif_before}")
        print(f"特征筛选后的平均方差膨胀因子: {avg_vif_after}")

    def run_all(self):
        """Run the whole pipeline, from loading the data to reporting the average VIF.

        Raises:
            ValueError: if any input or output path has an unsupported extension.
        """
        # Fail fast on a bad output path instead of after the expensive fits.
        for path in (self.output_train_path, self.output_test_path):
            self._table_extension(path)
        self.load_data()
        self.matrix_lasso()
        self.plot_lasso_path()
        self.select_best_alpha()
        self.fit_lasso_model()
        self.save_vif()
        self.save_selected_data()
        self.calculate_avg_vif()
主函数文件(LassoFeatureSelector_main,调用封装好的类)
# Entry script: configure the paths and search range, then run the whole
# LASSO feature-selection pipeline.
from LassoFeatureSelector import LassoFeatureSelector

# --- Configuration ---
input_train_data_path = r'D:\Gitee\NSLKDD_train.xlsx'  # path of the input training set
input_test_data_path = r'D:\Gitee\NSLKDD_test.xlsx'  # path of the input test set
output_train_path = r'D:\Gitee\NSLKDD_train_LASSO.xlsx'  # path of the exported training set
output_test_path = r'D:\Gitee\NSLKDD_test_LASSO.xlsx'  # path of the exported test set
# The two bounds were swapped in the original (upper 0.001 < lower 0.0012).
# The searched interval is the same; the alpha grid now runs in ascending order.
Upper_limit_alpha = 0.0012  # upper bound of the regularisation search range
Lower_limit_alpha = 0.001  # lower bound of the regularisation search range
iterations = 2000  # max iterations per LASSO fit (also the number of alpha grid points)
cv = 10  # number of cross-validation folds used to pick the best alpha

# --- Run ---
if __name__ == "__main__":
    lasso_selector = LassoFeatureSelector(input_train_data_path, input_test_data_path,
                                          output_train_path, output_test_path,
                                          Upper_limit_alpha, Lower_limit_alpha, iterations, cv)
    lasso_selector.run_all()