First, a note on how I understand multiple stepwise regression: its essential core is the principle of least squares; this implementation drives all of the fitting through the statsmodels formula API (smf).

# encoding: utf-8
"""
Purpose:     multiple stepwise regression
Description: a Python implementation of stepwise regression
Author:      CHEN_C_W (草木陈)
Date:        Friday, April 12, 2019, early morning
Location:    Hangzhou
Reference:   https://www.cnblogs.com/lantingg/p/9535010.html
"""
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from statsmodels.formula import api as smf
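# Illustration (a minimal sketch, not part of the module): smf.ols builds an
# OLS model from an R-style formula string, and the fitted results object
# exposes every criterion the class below compares. For a hypothetical
# DataFrame `demo` with columns y, x1 and x2:
#     res = smf.ols("y ~ x1 + x2", demo).fit()
#     res.bic, res.aic, res.ssr, res.rsquared, res.rsquared_adj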
class FeatureSelection():
    def stepwise(self, df, response, intercept=True, normalize=False, criterion='bic',
                 f_pvalue_enter=.05, p_value_enter=.05, direction='backward', show_step=True,
                 criterion_enter=None, criterion_remove=None, max_iter=200, **kw):
        """
        Stepwise regression.

        Parameters
        ----------
        df : DataFrame
            Data frame used for the analysis; must contain the response column.
        response : str
            Name of the response variable of the regression.
        intercept : bool, default True
            Whether the model includes an intercept term.
        normalize : bool, default False
            Whether to standardize the data first; if True, the intercept is forced to zero.
        criterion : str, default 'bic'
            Criterion that drives the stepwise search; one of
            'bic', 'aic', 'ssr', 'rsquared', 'rsquared_adj'.
        f_pvalue_enter : float, default .05
            F-test p-value threshold for adding or removing a variable when criterion='ssr'.
        p_value_enter : float, default .05
            Coefficient p-value threshold for removing a variable when direction='both'.
        direction : str, default 'backward'
            Direction of the stepwise search; one of 'backward', 'forward', 'both'.
        show_step : bool, default True
            Whether to print the stepwise process.
        criterion_enter : float, default None
            Criterion threshold for adding a variable when direction='both' or 'forward'.
        criterion_remove : float, default None
            Criterion threshold for removing a variable when direction='backward'.
        max_iter : int, default 200
            Maximum number of iterations.
        """
        criterion_list = ['bic', 'aic', 'ssr', 'rsquared', 'rsquared_adj']
        if criterion not in criterion_list:
            raise ValueError('Invalid criterion; must be one of: %s' % criterion_list)

        direction_list = ['backward', 'forward', 'both']
        if direction not in direction_list:
            raise ValueError('Invalid direction; must be one of: %s' % direction_list)

        # Default entry thresholds per criterion
        p_enter = {'bic': 0.0, 'aic': 0.0, 'ssr': 0.05, 'rsquared': 0.05, 'rsquared_adj': -0.05}
        if criterion_enter:  # override the entry threshold for the chosen criterion
            p_enter[criterion] = criterion_enter

        # Default removal thresholds per criterion
        p_remove = {'bic': 0.01, 'aic': 0.01, 'ssr': 0.1, 'rsquared': 0.05, 'rsquared_adj': -0.05}
        if criterion_remove:  # override the removal threshold for the chosen criterion
            p_remove[criterion] = criterion_remove

        if normalize:  # standardize the data if requested
            intercept = False  # the intercept is forced to zero
            df_std = StandardScaler().fit_transform(df)
            df = pd.DataFrame(df_std, columns=df.columns, index=df.index)

        """ forward """
        if direction == 'forward':
            remaining = list(df.columns)  # pool of candidate predictors
            remaining.remove(response)
            selected = []  # variables selected into the model so far
            # Initialize the current score and the best new score
            if intercept:  # with or without an intercept
                formula = "{} ~ {} + 1".format(response, remaining[0])
            else:
                formula = "{} ~ {} - 1".format(response, remaining[0])
            result = smf.ols(formula, df).fit()  # OLS fit
            current_score = getattr(result, criterion)
            best_new_score = getattr(result, criterion)

            if show_step:
                print('\nstepwise starting:\n')
            iter_times = 0
            # Loop while candidates remain and the current score keeps being updated
            while remaining and (current_score == best_new_score) and (iter_times < max_iter):
                scores_with_candidates = []  # scores of this pass, one record per candidate
                for candidate in remaining:  # try adding each remaining variable in turn
                    if intercept:
                        formula = "{} ~ {} + 1".format(response, ' + '.join(selected + [candidate]))
                    else:
                        formula = "{} ~ {} - 1".format(response, ' + '.join(selected + [candidate]))
                    result = smf.ols(formula, df).fit()  # OLS fit
                    fvalue = result.fvalue
                    f_pvalue = result.f_pvalue
                    score = getattr(result, criterion)
                    scores_with_candidates.append((score, candidate, fvalue, f_pvalue))

                if criterion == 'ssr':  # this criterion is minimized
                    scores_with_candidates.sort(reverse=True)  # descending, so pop() yields the minimum
                    best_new_score, best_candidate, best_new_fvalue, best_new_f_pvalue = scores_with_candidates.pop()
                    if ((current_score - best_new_score) > p_enter[criterion]) and (
                            best_new_f_pvalue < f_pvalue_enter):  # the improvement exceeds the entry threshold
                        remaining.remove(best_candidate)
                        selected.append(best_candidate)
                        current_score = best_new_score
                        if show_step:
                            print('Adding %s, SSR = %.3f, Fstat = %.3f, FpValue = %.3e' %
                                  (best_candidate, best_new_score, best_new_fvalue, best_new_f_pvalue))
                    elif (current_score - best_new_score) >= 0 and (
                            best_new_f_pvalue < f_pvalue_enter) and iter_times == 0:  # any improvement counts on the first pass
                        remaining.remove(best_candidate)
                        selected.append(best_candidate)
                        current_score = best_new_score
                        if show_step:
                            print('Adding %s, %s = %.3f' % (best_candidate, criterion, best_new_score))
                    elif (best_new_f_pvalue < f_pvalue_enter) and iter_times == 0:  # first pass: seed the model with the first candidate
                        seed = remaining.pop(0)
                        selected.append(seed)
                        if show_step:
                            print('Adding %s, %s = %.3f' % (seed, criterion, best_new_score))
                elif criterion in ['bic', 'aic']:  # these criteria are minimized
                    scores_with_candidates.sort(reverse=True)  # descending, so pop() yields the minimum
                    best_new_score, best_candidate, best_new_fvalue, best_new_f_pvalue = scores_with_candidates.pop()
                    if (current_score - best_new_score) > p_enter[criterion]:  # the improvement exceeds the entry threshold
                        remaining.remove(best_candidate)
                        selected.append(best_candidate)
                        current_score = best_new_score
                        if show_step:
                            print('Adding %s, %s = %.3f' % (best_candidate, criterion, best_new_score))
                    elif (current_score - best_new_score) >= 0 and iter_times == 0:  # any improvement counts on the first pass
                        remaining.remove(best_candidate)
                        selected.append(best_candidate)
                        current_score = best_new_score
                        if show_step:
                            print('Adding %s, %s = %.3f' % (best_candidate, criterion, best_new_score))
                    elif iter_times == 0:  # first pass: seed the model with the first candidate
                        seed = remaining.pop(0)
                        selected.append(seed)
                        if show_step:
                            print('Adding %s, %s = %.3f' % (seed, criterion, best_new_score))
                else:  # rsquared / rsquared_adj are maximized
                    scores_with_candidates.sort()  # ascending, so pop() yields the maximum
                    best_new_score, best_candidate, best_new_fvalue, best_new_f_pvalue = scores_with_candidates.pop()
                    if (best_new_score - current_score) > p_enter[criterion]:
                        remaining.remove(best_candidate)
                        selected.append(best_candidate)
                        current_score = best_new_score
                        if show_step:
                            print('Adding %s, %s = %.3f' % (best_candidate, criterion, best_new_score))
                    elif (best_new_score - current_score) >= 0 and iter_times == 0:  # any improvement counts on the first pass
                        remaining.remove(best_candidate)
                        selected.append(best_candidate)
                        current_score = best_new_score
                        if show_step:
                            print('Adding %s, %s = %.3f' % (best_candidate, criterion, best_new_score))
                    elif iter_times == 0:  # first pass: seed the model with the first candidate
                        seed = remaining.pop(0)
                        selected.append(seed)
                        if show_step:
                            print('Adding %s, %s = %.3f' % (seed, criterion, best_new_score))
                iter_times += 1

            if intercept:
                formula = "{} ~ {} + 1".format(response, ' + '.join(selected))
            else:
                formula = "{} ~ {} - 1".format(response, ' + '.join(selected))
            self.stepwise_model = smf.ols(formula, df).fit()  # fit the final model
            if show_step:
                print('\nLinear regression model:', '\n  ', self.stepwise_model.model.formula)
                print('\n', self.stepwise_model.summary())

        """ backward """
        if direction == 'backward':
            remaining, selected = set(df.columns), set(df.columns)  # start from the full model
            remaining.remove(response)
            selected.remove(response)
            # Initialize the current score and the worst new score
            if intercept:  # with or without an intercept
                formula = "{} ~ {} + 1".format(response, ' + '.join(selected))
            else:
                formula = "{} ~ {} - 1".format(response, ' + '.join(selected))
            result = smf.ols(formula, df).fit()  # OLS fit
            current_score = getattr(result, criterion)
            worst_new_score = getattr(result, criterion)

            if show_step:
                print('\nstepwise starting:\n')
            iter_times = 0
            # Loop while candidates remain and the current score keeps being updated
            while remaining and (current_score == worst_new_score) and (iter_times < max_iter):
                scores_with_eliminations = []  # scores of this pass, one record per elimination
                for elimination in remaining:  # try removing each remaining variable in turn
                    if intercept:
                        formula = "{} ~ {} + 1".format(response, ' + '.join(selected - {elimination}))
                    else:
                        formula = "{} ~ {} - 1".format(response, ' + '.join(selected - {elimination}))
                    result = smf.ols(formula, df).fit()  # OLS fit
                    fvalue = result.fvalue
                    f_pvalue = result.f_pvalue
                    score = getattr(result, criterion)
                    scores_with_eliminations.append((score, elimination, fvalue, f_pvalue))

                if criterion == 'ssr':  # this criterion is minimized
                    scores_with_eliminations.sort(reverse=False)  # ascending, so pop() yields the maximum
                    worst_new_score, worst_elimination, worst_new_fvalue, worst_new_f_pvalue = scores_with_eliminations.pop()
                    if ((worst_new_score - current_score) < p_remove[criterion]) and (
                            worst_new_f_pvalue < f_pvalue_enter):  # removal does not degrade the score beyond the threshold
                        remaining.remove(worst_elimination)
                        selected.remove(worst_elimination)
                        current_score = worst_new_score
                        if show_step:
                            print('Removing %s, SSR = %.3f, Fstat = %.3f, FpValue = %.3e' %
                                  (worst_elimination, worst_new_score, worst_new_fvalue, worst_new_f_pvalue))
                elif criterion in ['bic', 'aic']:  # these criteria are minimized
                    scores_with_eliminations.sort(reverse=False)  # ascending, so pop() yields the maximum
                    worst_new_score, worst_elimination, worst_new_fvalue, worst_new_f_pvalue = scores_with_eliminations.pop()
                    if (worst_new_score - current_score) < p_remove[criterion]:  # the score change is not significant
                        remaining.remove(worst_elimination)
                        selected.remove(worst_elimination)
                        current_score = worst_new_score
                        if show_step:
                            print('Removing %s, %s = %.3f' % (worst_elimination, criterion, worst_new_score))
                else:  # rsquared / rsquared_adj are maximized
                    scores_with_eliminations.sort(reverse=True)  # descending, so pop() yields the minimum
                    worst_new_score, worst_elimination, worst_new_fvalue, worst_new_f_pvalue = scores_with_eliminations.pop()
                    if (current_score - worst_new_score) < p_remove[criterion]:
                        remaining.remove(worst_elimination)
                        selected.remove(worst_elimination)
                        current_score = worst_new_score
                        if show_step:
                            print('Removing %s, %s = %.3f' % (worst_elimination, criterion, worst_new_score))
                iter_times += 1

            if intercept:
                formula = "{} ~ {} + 1".format(response, ' + '.join(selected))
            else:
                formula = "{} ~ {} - 1".format(response, ' + '.join(selected))
            self.stepwise_model = smf.ols(formula, df).fit()  # fit the final model
            if show_step:
                print('\nLinear regression model:', '\n  ', self.stepwise_model.model.formula)
                print('\n', self.stepwise_model.summary())

        """ both """
        if direction == 'both':
            remaining = list(df.columns)  # pool of candidate predictors
            remaining.remove(response)
            selected = []  # variables selected into the model so far
            # Initialize the current score and the best new score
            if intercept:  # with or without an intercept
                formula = "{} ~ {} + 1".format(response, remaining[0])
            else:
                formula = "{} ~ {} - 1".format(response, remaining[0])
            result = smf.ols(formula, df).fit()  # OLS fit
            current_score = getattr(result, criterion)
            best_new_score = getattr(result, criterion)

            if show_step:
                print('\nstepwise starting:\n')
            iter_times = 0
            # Loop while candidates remain and the current score keeps being updated
            while remaining and (current_score == best_new_score) and (iter_times < max_iter):
                scores_with_candidates = []  # scores of this pass, one record per candidate
                for candidate in remaining:  # try adding each remaining variable in turn
                    if intercept:
                        formula = "{} ~ {} + 1".format(response, ' + '.join(selected + [candidate]))
                    else:
                        formula = "{} ~ {} - 1".format(response, ' + '.join(selected + [candidate]))
                    result = smf.ols(formula, df).fit()  # OLS fit
                    fvalue = result.fvalue
                    f_pvalue = result.f_pvalue
                    score = getattr(result, criterion)
                    scores_with_candidates.append((score, candidate, fvalue, f_pvalue))

                if criterion == 'ssr':  # this criterion is minimized
                    scores_with_candidates.sort(reverse=True)  # descending, so pop() yields the minimum
                    best_new_score, best_candidate, best_new_fvalue, best_new_f_pvalue = scores_with_candidates.pop()
                    if ((current_score - best_new_score) > p_enter[criterion]) and (
                            best_new_f_pvalue < f_pvalue_enter):  # the improvement exceeds the entry threshold
                        remaining.remove(best_candidate)
                        selected.append(best_candidate)
                        current_score = best_new_score
                        if show_step:
                            print('Adding %s, SSR = %.3f, Fstat = %.3f, FpValue = %.3e' %
                                  (best_candidate, best_new_score, best_new_fvalue, best_new_f_pvalue))
                    elif (current_score - best_new_score) >= 0 and (
                            best_new_f_pvalue < f_pvalue_enter) and iter_times == 0:  # any improvement counts on the first pass
                        remaining.remove(best_candidate)
                        selected.append(best_candidate)
                        current_score = best_new_score
                        if show_step:
                            print('Adding %s, %s = %.3f' % (best_candidate, criterion, best_new_score))
                    elif (best_new_f_pvalue < f_pvalue_enter) and iter_times == 0:  # first pass: seed the model with the first candidate
                        seed = remaining.pop(0)
                        selected.append(seed)
                        if show_step:
                            print('Adding %s, %s = %.3f' % (seed, criterion, best_new_score))
                elif criterion in ['bic', 'aic']:  # these criteria are minimized
                    scores_with_candidates.sort(reverse=True)  # descending, so pop() yields the minimum
                    best_new_score, best_candidate, best_new_fvalue, best_new_f_pvalue = scores_with_candidates.pop()
                    if (current_score - best_new_score) > p_enter[criterion]:  # the improvement exceeds the entry threshold
                        remaining.remove(best_candidate)
                        selected.append(best_candidate)
                        current_score = best_new_score
                        if show_step:
                            print('Adding %s, %s = %.3f' % (best_candidate, criterion, best_new_score))
                    elif (current_score - best_new_score) >= 0 and iter_times == 0:  # any improvement counts on the first pass
                        remaining.remove(best_candidate)
                        selected.append(best_candidate)
                        current_score = best_new_score
                        if show_step:
                            print('Adding %s, %s = %.3f' % (best_candidate, criterion, best_new_score))
                    elif iter_times == 0:  # first pass: seed the model with the first candidate
                        seed = remaining.pop(0)
                        selected.append(seed)
                        if show_step:
                            print('Adding %s, %s = %.3f' % (seed, criterion, best_new_score))
                else:  # rsquared / rsquared_adj are maximized
                    scores_with_candidates.sort()  # ascending, so pop() yields the maximum
                    best_new_score, best_candidate, best_new_fvalue, best_new_f_pvalue = scores_with_candidates.pop()
                    if (best_new_score - current_score) > p_enter[criterion]:  # the improvement exceeds the entry threshold
                        remaining.remove(best_candidate)
                        selected.append(best_candidate)
                        current_score = best_new_score
                        if show_step:
                            print('Adding %s, %s = %.3f' % (best_candidate, criterion, best_new_score))
                    elif (best_new_score - current_score) >= 0 and iter_times == 0:  # any improvement counts on the first pass
                        remaining.remove(best_candidate)
                        selected.append(best_candidate)
                        current_score = best_new_score
                        if show_step:
                            print('Adding %s, %s = %.3f' % (best_candidate, criterion, best_new_score))
                    elif iter_times == 0:  # first pass: seed the model with the first candidate
                        seed = remaining.pop(0)
                        selected.append(seed)
                        if show_step:
                            print('Adding %s, %s = %.3f' % (seed, criterion, best_new_score))

                if intercept:
                    formula = "{} ~ {} + 1".format(response, ' + '.join(selected))
                else:
                    formula = "{} ~ {} - 1".format(response, ' + '.join(selected))
                result = smf.ols(formula, df).fit()  # refit on the current selection
                if iter_times >= 1:  # from the second pass on, check the p-values of the selected variables
                    pvalues = result.pvalues.drop('Intercept', errors='ignore')  # the intercept itself is never removed
                    if pvalues.max() > p_value_enter:
                        var_removed = pvalues.idxmax()
                        p_value_removed = pvalues.max()
                        selected.remove(var_removed)
                        if show_step:
                            print('Removing %s, Pvalue = %.3f' % (var_removed, p_value_removed))
                iter_times += 1

            if intercept:
                formula = "{} ~ {} + 1".format(response, ' + '.join(selected))
            else:
                formula = "{} ~ {} - 1".format(response, ' + '.join(selected))
            self.stepwise_model = smf.ols(formula, df).fit()  # fit the final model
            if show_step:
                print('\nLinear regression model:', '\n  ', self.stepwise_model.model.formula)
                print('\n', self.stepwise_model.summary())

        # Variables selected by the final model
        if intercept:
            self.stepwise_feat_selected_ = list(self.stepwise_model.params.index[1:])
        else:
            self.stepwise_feat_selected_ = list(self.stepwise_model.params.index)
        return self
def main():
    print('entering main ...')
    # url = "http://data.princeton.edu/wws509/datasets/salary.dat"
    # data = pd.read_csv(url, sep='\\s+')
    data = np.random.rand(200, 4)
    df = pd.DataFrame({'A': data[:, 0], 'B': data[:, 1], 'C': data[:, 2], 'D': data[:, 3]})
    print('test data shape:', df.shape)
    print(df.head())
    # criterion='ssr' so that ill-fitting features get removed
    FeatureSelection().stepwise(df=df, response='A', max_iter=200, criterion='ssr')


if __name__ == "__main__":
    main()
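As a usage sketch (assuming a DataFrame df shaped like the one built in main()), the fitted model and the surviving features can be read back from the object the method returns:

fs = FeatureSelection().stepwise(df=df, response='A', criterion='ssr', show_step=False)
print(fs.stepwise_feat_selected_)  # names of the features kept by the search
print(fs.stepwise_model.params)    # coefficients of the final OLS fit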
Note: the tests in this post run on randomly generated data, so most of the stepwise results do not pass the significance tests.
The code was developed under Python 3.6; the packages it depends on may behave differently across versions, so use it with care.