多元回归比一元回归优越性

最新推荐文章于 2022-07-22 15:59:34 发布

weixin_34206899

最新推荐文章于 2022-07-22 15:59:34 发布

阅读量712

点赞数

文章标签：人工智能 python

sklearn实战-乳腺癌细胞数据挖掘(博客主亲自录制视频教程)

https://study.163.com/course/introduction.htm?courseId=1005269003&utm_campaign=commission&utm_source=cp-400000000398149&utm_medium=share

一个看似不相关的变量，也可能对结果有显著影响

多元回归可以分析独立变量与因变量是否显著相关。但解释能力不如因子分析

因子分析对变量相关性解释能力更强

正态分布检验OK

三组数据呈现正态分布，可以用回归检测

# -*- coding: utf-8 -*-
'''
Author：Toby
QQ：231469242，all rights reserved, no commercial use
normality_check.py
正态性检验脚本
  
'''
  
import scipy
from scipy.stats import f
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
# additional packages
from statsmodels.stats.diagnostic import lillifors
  
#对一列数据进行正态分布测试
def check_normality(testData):
    """Test a single sample for normality, choosing the test by sample size.

    Test selection (n = len(testData)):
      * 20 < n < 50   -> ``stats.normaltest``
      * n <= 20       -> ``stats.shapiro`` (Shapiro-Wilk)
      * 50 <= n <= 300 -> ``lillifors`` (imported from statsmodels)
      * n > 300       -> ``stats.kstest`` against a normal distribution whose
                         location and scale are fitted to the sample

    The chosen test, its p-value and the verdict are printed, followed by a
    separator line.

    Parameters:
        testData: sequence of numbers forming one sample.

    Returns:
        bool: False when the p-value is below 0.05 (normality rejected),
        True otherwise.
    """
    print("one group normality check begin:")
    sample_size = len(testData)

    # The p-value is computed before anything else is printed, so a failing
    # test raises without emitting a misleading "use ..." line.
    if 20 < sample_size < 50:
        label = "use normaltest"
        p_value = stats.normaltest(testData)[1]
    elif sample_size < 50:
        # Reached only for n <= 20: small samples go to Shapiro-Wilk.
        label = "use shapiro:"
        p_value = stats.shapiro(testData)[1]
    elif sample_size <= 300:
        label = "use lillifors:"
        p_value = lillifors(testData)[1]
    else:
        label = "use kstest:"
        # kstest(data, 'norm') alone compares against the STANDARD normal
        # N(0, 1), which rejects any sample not centred on 0 with unit
        # variance. Compare against a normal fitted to the sample instead.
        # NOTE: estimating loc/scale from the same data makes the KS p-value
        # conservative (the bias the Lilliefors correction addresses).
        fitted_loc_scale = stats.norm.fit(testData)
        p_value = stats.kstest(testData, 'norm', args=fitted_loc_scale)[1]

    # Plain Python bool; a NaN p-value is not "< 0.05" and so counts as normal,
    # exactly as in the per-branch comparisons this replaces.
    is_normal = not bool(p_value < 0.05)

    print(label)
    print("p value:", p_value)
    if is_normal:
        print("data are normal distributed")
    else:
        print("data are not normal distributed")
    # End-of-check separator (previously unreachable: every branch returned first)
    print("-" * 100)
    return is_normal
  
#对所有样本组进行正态性检验
def NormalTest(list_groups):
    """Run the normality check on every sample group.

    Groups are checked in order and checking stops at the first group that
    fails, so later groups are not tested (or printed) after a failure.

    Parameters:
        list_groups: iterable of samples, each passed to ``check_normality``.

    Returns:
        bool: False as soon as one group is not normally distributed,
        True when every group passes (previously this fell off the end and
        returned None, giving callers an inconsistent result).
    """
    for group in list_groups:
        if not check_normality(group):
            return False
    return True
              
 

# Three sample groups of eight observations each
group1=[5,2,4,2.5,3,3.5,2.5,3]
group2=[1.5,2,1.5,2.5,3.3,2.3,4.2,2.5]
group3=[96,90,95,92,95,94,94,94]
list_groups=[group1,group2,group3]
# All observations concatenated into one flat list (not used below)
list_total=group1+group2+group3
# Run the normality check on every sample group
NormalTest(list_groups)

由下图可见，自变量x1和x2之间基本不相关，调整R平方为0.19

x1和y的调整R平方为0.59，只存在较弱的关系

x2和y的调整R平方为-0.19，即没有线性关系

但x1和x2共同对y做回归时，调整R平方达到0.886，关系非常强

且x1和x2与y结合后，残差服从正态分布，AIC和BIC值很小，

prob (F-statistic)=0.00187,小于0.05，说明回归方程显著

参数t检验显著，x1和x2的t分数P值分别为0.001和0.01,小于0.05，否定H0，表示x1和x2显著，说明此模型拟合度很好

这说明一个看似不相关的变量，也可能对结果有显著影响

# -*- coding: utf-8 -*-
"""
Created on Tue Jul 18 09:37:15 2017

@author: toby
"""



# Import standard packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import datasets, linear_model
from matplotlib.font_manager import FontProperties 
font_set = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=15)  
# additional packages
import sys
import os
sys.path.append(os.path.join('..', '..', 'Utilities'))

# Optional dependency: use the ISP_mystyle helper when it can be imported,
# otherwise define a minimal stand-in with the same name.
try:
    # Import formatting commands if directory "Utilities" is available
    from ISP_mystyle import showData 

except ImportError:
    # Ensure correct performance otherwise
    def showData(*options):
        """Fallback: ignore all arguments and simply show the current figures."""
        plt.show()
        return

# additional packages ...
# ... for the 3d plot ...
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm

# ... and for the statistic
from statsmodels.formula.api import ols
#生成组合
from itertools import combinations

# Sample data: two predictors (x1, x2) and the response (y), eight observations each
x1=[5,2,4,2.5,3,3.5,2.5,3]
x2=[1.5,2,1.5,2.5,3.3,2.3,4.2,2.5]
y=[96,90,95,92,95,94,94,94]


# List of independent-variable columns
list_x=[x1,x2]

#绘制多元回归三维图
def Draw_multilinear():
    """Fit ``y ~ x1 + x2`` on the module-level data and plot the fitted plane.

    Reads the module-level lists ``x1``, ``x2`` and ``y``, fits an ordinary
    least squares model, then draws a 3D surface of the fitted plane evaluated
    on a fixed [-5, 5] x [-5, 5] grid with standard-normal noise added. The
    adjusted R-squared of the fit is shown in the title. The figure is handed
    to ``showData``.

    Side effect: reseeds NumPy's global random generator so the noise is
    reproducible.
    """
    df = pd.DataFrame({'x1': x1, 'x2': x2, 'y': y})
    # --- >>> START stats <<< ---
    # Fit the model
    model = ols("y~x1+x2", df).fit()
    # Look coefficients up by label: positional [] on a label-indexed Series
    # is deprecated in pandas.
    param_intercept = model.params['Intercept']
    param_x1 = model.params['x1']
    param_x2 = model.params['x2']
    rSquared_adj = model.rsquared_adj

    # Generate the evaluation grid, then compute a value for every grid point
    x = np.linspace(-5, 5, 101)
    (X, Y) = np.meshgrid(x, x)

    # To get reproducible values, a fixed seed is provided
    np.random.seed(987654321)
    noise = np.random.randn(np.shape(X)[0], np.shape(X)[1])
    Z = param_intercept + param_x1*X + param_x2*Y + noise

    # Plotting
    # Set the color
    myCmap = cm.GnBu_r
    # If you want a colormap from seaborn use:
    #from matplotlib.colors import ListedColormap
    #myCmap = ListedColormap(sns.color_palette("Blues", 20))

    # Plot the figure
    fig = plt.figure("multi")
    # fig.gca(projection='3d') no longer works: Matplotlib removed keyword
    # arguments from gca(); add_subplot is the supported way to get 3D axes.
    ax = fig.add_subplot(111, projection='3d')
    surf = ax.plot_surface(X, Y, Z, cmap=myCmap, rstride=2, cstride=2,
        linewidth=0, antialiased=False)
    ax.view_init(20, -120)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
    ax.set_title("multilinear with adj_Rsquare %f"%(rSquared_adj))
    fig.colorbar(surf, shrink=0.6)

    outFile = '3dSurface.png'
    showData(outFile)

    
#检查独立变量之间共线性关系
def Two_dependentVariables_compare(x1,x2):
    """Rate how strongly two variables are linearly related.

    Regresses ``x2`` on ``x1`` with OLS, prints the adjusted R-squared and a
    verbal rating, and reports whether the relation is strong.

    Returns:
        True when the adjusted R-squared is at least 0.8 ("high relation");
        False for "middle relation" (0.6 up to 0.8) and "low relation"
        (below 0.6).
    """
    pair_frame = pd.DataFrame({'x': x1, 'y': x2})
    adj_r2 = ols("y~x", pair_frame).fit().rsquared_adj
    print("rSquared_adj", adj_r2)

    if adj_r2 >= 0.8:
        print("high relation")
        return True
    if adj_r2 >= 0.6:
        # Already known to be below 0.8 here
        print("middle relation")
        return False
    if adj_r2 < 0.6:
        print("low relation")
        return False
    # A NaN score matches none of the comparisons above and falls through,
    # so the function returns None in that case.

#比较所有参数，观察是否存在多重共线
def All_dependentVariables_compare(list_x):  
    """Check every pair of variables for a strong linear relation.

    Each unordered pair drawn from ``list_x`` is passed to
    ``Two_dependentVariables_compare``. Returns True (after printing a
    warning) when at least one pair is rated strong, otherwise False.
    """
    # Evaluate every pair before deciding, so each pair's report is printed
    # even when an earlier pair was already flagged.
    pair_flags = [
        Two_dependentVariables_compare(first, second)
        for first, second in combinations(list_x, 2)
    ]
    if True not in pair_flags:
        return False
    print("there is multicorrelation exist in dependent variables")
    return True
    
        
#回归方程，支持哑变量（虚拟变量）
def regressionModel(x1,x2,y):
    """Fit the two-predictor linear model ``y ~ x1 + x2`` and report it.

    Prints the statsmodels summary of the fit (coefficients, p-values,
    confidence intervals, etc.) and returns the parameter array held by the
    underlying results object -- presumably ordered intercept, x1, x2; confirm
    against the printed summary.
    """
    columns = {'x1': x1, 'x2': x2, 'y': y}
    fitted = ols("y~x1+x2", pd.DataFrame(columns)).fit()

    # Show the full regression report
    report = fitted.summary()
    print(report)

    return fitted._results.params

    
# Function to show the results of the linear fit model
def Draw_linear_line(X_parameters,Y_parameters,figname,x1Name,x2Name):
    """Scatter-plot one variable against another with its fitted line.

    Parameters:
        X_parameters: values plotted on the horizontal axis (the predictor).
        Y_parameters: values plotted on the vertical axis (the response).
        figname: figure name; a distinct name gives each call its own figure.
        x1Name, x2Name: variable names inserted into the plot title.

    The title also carries the adjusted R-squared of an OLS fit of
    ``Y_parameters`` on ``X_parameters``. The figure is shown before returning.
    """
    # Select (or create) the figure for this plot
    plt.figure(figname)

    # Adjusted R-squared for the title
    frame = pd.DataFrame({'x': X_parameters, 'y': Y_parameters})
    adj_r2 = ols("y~x", frame).fit().rsquared_adj

    # Wrap every observation in its own single-item row for the regressor
    feature_rows = [[value] for value in X_parameters]

    # Fit the regression line and draw it over the raw points
    regressor = linear_model.LinearRegression()
    regressor.fit(feature_rows, Y_parameters)
    plt.scatter(feature_rows, Y_parameters, color='blue', label="real value")
    fitted_line = regressor.predict(feature_rows)
    plt.plot(feature_rows, fitted_line, color='red', linewidth=4, label="prediction line")

    plt.title("linear regression %s and %s with adj_rSquare:%f" % (x1Name, x2Name, adj_r2))
    plt.xlabel('x', fontproperties=font_set)
    plt.ylabel('y', fontproperties=font_set)
    # Empty tick tuples hide the tick marks on both axes
    plt.xticks(())
    plt.yticks(())
    plt.legend()
    plt.show()
    

#绘制多元回归三维图
# Draw the 3D plot of the multiple regression
Draw_multilinear()  
# Compare all independent variables pairwise to look for multicollinearity
All_dependentVariables_compare(list_x)              
# Pairwise single-variable fits: x1 vs x2, x1 vs y, x2 vs y
Draw_linear_line(x1,x2,"fig1","x1","x2")
Draw_linear_line(x1,y,"fig4","x1","y")
Draw_linear_line(x2,y,"fig5","x2","y")
# Fit and print the full two-predictor regression
regressionModel(x1,x2,y)

    
    
'''
训练数据
x1=[2,6,8,3,2,7,9,8,4,6]
x2=[1,0,1,0,1,1,0,0,1,1]
y=[2900,3000,4800,1800,2900,4900,4200,4800,4400,4500]

x=[89,66,78,111,44,77,80,66,109,76]
y=[4,1,3,6,1,3,3,2,5,3]
z=[7,5.4,6.6,7.4,4.8,6.4,7,5.6,7.3,6.4]

x1=[89,66,78,111,44,77,80,66,109,76]
x2=[4,1,3,6,1,3,3,2,5,3]
x3=[3.84,3.19,3.78,3.89,3.57,3.57,3.03,3.51,3.54,3.25]
y=[7,5.4,6.6,7.4,4.8,6.4,7,5.6,7.3,6.4]

   
'''

python风控评分卡建模和风控常识(博客主亲自录制视频教程)

https://study.163.com/course/introduction.htm?courseId=1005214003&utm_campaign=commission&utm_source=cp-400000000398149&utm_medium=share

weixin_34206899

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
多元回归比一元回归优越性

sklearn实战-乳腺癌细胞数据挖掘(博客主亲自录制视频教程)https://study.163.com/course/introduction.htm?courseId=1005269003&amp;utm_campaign=commission&amp;utm_source=cp-400000000398149&amp;utm_medium=share 貌似一个不相关的变量，可能...
复制链接

扫一扫