logistic regression在R中实现案例

r 同时被 2 个专栏收录
14 篇文章 0 订阅
6 篇文章 0 订阅
rm(list = ls())
install.packages("AER")
library(AER)
library(survival)
library(ROCR)
data(Affairs,papackage = "AER")
mydata <- Affairs
summary(mydata)

   affairs          gender         age         yearsmarried    children  religiousness     education       occupation   
 Min.   : 0.000   female:315   Min.   :17.50   Min.   : 0.125   no :171   Min.   :1.000   Min.   : 9.00   Min.   :1.000  
 1st Qu.: 0.000   male  :286   1st Qu.:27.00   1st Qu.: 4.000   yes:430   1st Qu.:2.000   1st Qu.:14.00   1st Qu.:3.000  
 Median : 0.000                Median :32.00   Median : 7.000             Median :3.000   Median :16.00   Median :5.000  
 Mean   : 1.456                Mean   :32.49   Mean   : 8.178             Mean   :3.116   Mean   :16.17   Mean   :4.195  
 3rd Qu.: 0.000                3rd Qu.:37.00   3rd Qu.:15.000             3rd Qu.:4.000   3rd Qu.:18.00   3rd Qu.:6.000  
 Max.   :12.000                Max.   :57.00   Max.   :15.000             Max.   :5.000   Max.   :20.00   Max.   :7.000  
     rating     
 Min.   :1.000  
 1st Qu.:3.000  
 Median :4.000  
 Mean   :3.932  
 3rd Qu.:5.000  
 Max.   :5.000  

table(mydata$affairs)

  0   1   2   3   7  12 
451  34  17  19  42  38 

mydata$affairs[mydata$affairs>0]<- 1
table(mydata$affairs)
  0   1 
451 150 
mydata$affairs <- factor(mydata$affairs,levels = c(0,1),labels = c("NO","YES"))
table(mydata$affairs)
NO YES 
451 150 
model <-glm(affairs~gender + age +yearsmarried + children +religiousness+
              education +occupation +rating,data = mydata,family = binomial())
summary(model)$coefficients
                 Estimate Std. Error    z value     Pr(>|z|)
(Intercept)    1.37725816 0.88775566  1.5513933 1.208075e-01
gendermale     0.28028665 0.23909454  1.1722838 2.410831e-01
age           -0.04425502 0.01824811 -2.4251835 1.530065e-02
yearsmarried   0.09477302 0.03221445  2.9419413 3.261617e-03
childrenyes    0.39767213 0.29150776  1.3641905 1.725076e-01
religiousness -0.32472063 0.08975394 -3.6178985 2.970049e-04
education      0.02105086 0.05051032  0.4167636 6.768513e-01
occupation     0.03091971 0.07177644  0.4307780 6.666298e-01
rating        -0.46845426 0.09090644 -5.1531469 2.561511e-07

#coef 为回归系数

coef <- summary(model)$coefficients[,1]
se <- summary(model)$coefficients[,2]
pvalue <- summary(model)$coefficients[,4]

#第一种方法计算OR及可信区间

results <- cbind(exp(coef),exp(coef-1.96*se),exp(coef + 1.96*se),pvalue)
dimnames(results)[[2]] <- c("OR","LL","UL","P value")
results
                     OR        LL         UL      P value
(Intercept)   3.9640180 0.6957653 22.5843967 1.208075e-01
gendermale    1.3235091 0.8283341  2.1146979 2.410831e-01
age           0.9567099 0.9230967  0.9915472 1.530065e-02
yearsmarried  1.0994093 1.0321383  1.1710647 3.261617e-03
childrenyes   1.4883560 0.8405632  2.6353799 1.725076e-01
religiousness 0.7227292 0.6061436  0.8617389 2.970049e-04
education     1.0212740 0.9250113  1.1275545 6.768513e-01
occupation    1.0314027 0.8960473  1.1872047 6.666298e-01
rating        0.6259691 0.5238076  0.7480559 2.561511e-07

#第二种方法计算OR及可信区间

exp(cbind("OR" = coef(model),confint(model)))
Waiting for profiling to be done...
                     OR     2.5 %     97.5 %
(Intercept)   3.9640180 0.7013467 22.9148790
gendermale    1.3235091 0.8294178  2.1209988
age           0.9567099 0.9223032  0.9908864
yearsmarried  1.0994093 1.0326203  1.1718726
childrenyes   1.4883560 0.8451473  2.6584834
religiousness 0.7227292 0.6049441  0.8605325
education     1.0212740 0.9254481  1.1284991
occupation    1.0314027 0.8964089  1.1884105
rating        0.6259691 0.5227302  0.7470069

#逐步回归

step(model,direction = "backward")
Start:  AIC=627.51
affairs ~ gender + age + yearsmarried + children + religiousness + 
    education + occupation + rating

                Df Deviance    AIC
- education      1   609.68 625.68
- occupation     1   609.70 625.70
- gender         1   610.89 626.89
- children       1   611.40 627.40
<none>               609.51 627.51
- age            1   615.67 631.67
- yearsmarried   1   618.34 634.34
- religiousness  1   622.92 638.92
- rating         1   636.75 652.75

Step:  AIC=625.68
affairs ~ gender + age + yearsmarried + children + religiousness + 
    occupation + rating

                Df Deviance    AIC
- occupation     1   610.15 624.15
- gender         1   611.29 625.29
- children       1   611.62 625.62
<none>               609.68 625.68
- age            1   615.78 629.78
- yearsmarried   1   618.46 632.46
- religiousness  1   623.27 637.27
- rating         1   636.93 650.93

Step:  AIC=624.15
affairs ~ gender + age + yearsmarried + children + religiousness + 
    rating

                Df Deviance    AIC
- children       1   611.86 623.86
<none>               610.15 624.15
- gender         1   613.41 625.41
- age            1   616.00 628.00
- yearsmarried   1   619.07 631.07
- religiousness  1   623.98 635.98
- rating         1   637.23 649.23

Step:  AIC=623.86
affairs ~ gender + age + yearsmarried + religiousness + rating

                Df Deviance    AIC
<none>               611.86 623.86
- gender         1   615.36 625.36
- age            1   618.05 628.05
- religiousness  1   625.57 635.57
- yearsmarried   1   626.23 636.23
- rating         1   639.93 649.93

Call:  glm(formula = affairs ~ gender + age + yearsmarried + religiousness + 
    rating, family = binomial(), data = mydata)

Coefficients:
  (Intercept)     gendermale            age   yearsmarried  religiousness  
      1.94760        0.38612       -0.04393        0.11133       -0.32714  
       rating  
     -0.46721  

Degrees of Freedom: 600 Total (i.e. Null);  595 Residual
Null Deviance:	    675.4 
Residual Deviance: 611.9 	AIC: 623.9

#AIC越小,模型拟合越高,接下来根据显著影响的因素,列入回归模型

model2 <-glm(formula = affairs~ gender + age +yearsmarried +religiousness+rating,
             family = binomial(),data = mydata)
summary(model2)$coefficients
                Estimate Std. Error   z value     Pr(>|z|)
(Intercept)    1.94760307 0.61233521  3.180616 1.469624e-03
gendermale     0.38612217 0.20702802  1.865072 6.217131e-02
age           -0.04392545 0.01806068 -2.432104 1.501138e-02
yearsmarried   0.11132715 0.02982799  3.732304 1.897360e-04
religiousness -0.32714238 0.08947345 -3.656307 2.558752e-04
rating        -0.46721157 0.08928317 -5.232919 1.668543e-07

#对新模型和旧模型进行检验

anova(model,model2)
Analysis of Deviance Table

Model 1: affairs ~ gender + age + yearsmarried + children + religiousness + 
    education + occupation + rating
Model 2: affairs ~ gender + age + yearsmarried + religiousness + rating
  Resid. Df Resid. Dev Df Deviance
1       592     609.51            
2       595     611.86 -3  -2.3486

#预测,用testdata尝试预测

testdata <- data.frame(rating = rep(c(1,2,3,4,5),2),
                       gender =rep(c("male","female"),each = 5),
                       age =mean(mydata$age),
                       yearsmarried = mean(mydata$yearsmarried),
                       religiousness = mean(mydata$religiousness))
testdata$prob <- predict(model2,newdata = testdata,type = "response")
testdata
   rating gender      age yearsmarried religiousness      prob
1       1   male 32.48752     8.177696      3.116473 0.5818455
2       2   male 32.48752     8.177696      3.116473 0.4658389
3       3   male 32.48752     8.177696      3.116473 0.3534133
4       4   male 32.48752     8.177696      3.116473 0.2551596
5       5   male 32.48752     8.177696      3.116473 0.1767546
6       1 female 32.48752     8.177696      3.116473 0.4860616
7       2 female 32.48752     8.177696      3.116473 0.3721557
8       3 female 32.48752     8.177696      3.116473 0.2708743
9       4 female 32.48752     8.177696      3.116473 0.1888649
10      5 female 32.48752     8.177696      3.116473 0.1273479

暂时就到这里,prob就是预测模型中因素发生的可能性百分比。
#对affairs这个数据集进行增加预测值与实际结果的值

Affairs$yn_affairs[Affairs$affairs == 0] <- 0
Affairs$yn_affairs[Affairs$affairs > 0] <- 1
Affairs$yn_affairs <- factor(Affairs$yn_affairs,
                             levels = c(0,1),
                             labels = c("No","Yes"))

#使用pROC绘制AUC曲线

pred <- predict(model2,newdata = Affairs[,-c(1,10)],type = "response")
modelroc <- roc(Affairs[,10],pred)
plot(modelroc, print.auc = TRUE,auc.polygon = TRUE,grild = c(0.1,0.2),
     grid.col =c("green","red"),max.auc.polygon =TRUE,
     auc.polygon.col = "skyblue",print.three = TRUE)

结果如下,AUC 的值即 ROC 曲线与 X 轴围城的面积,大于0.5既可以认为模型的精确度较好。
在这里插入图片描述

  • 0
    点赞
  • 0
    评论
  • 0
    收藏
  • 一键三连
    一键三连
  • 扫一扫,分享海报

相关推荐
©️2020 CSDN 皮肤主题: 数字20 设计师:CSDN官方博客 返回首页
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、C币套餐、付费专栏及课程。

余额充值