library(foreign)
df<-read.spss("meph.sav ",to.data.frame=TRUE)
attach(df)
contrasts(df$RACE1)
levels(df$RACE1) <- list("WHITE"=c("others","white"),"BLACK"="Black","NATIVE"="Native","ASIAN"="Asian")
contrasts(df$RACE1)
2. 挑出因子变量合并为一个数据集
names(df)
df1 <- subset(df,select=c(GENDER,RACE1,REGION1,EDUC1,PHSTAT1,MNHPOOR,ANYLIMIT,INCOME1,insure,poisexp))
3. 查看各变量的因子赋值
detach(df)
attach(df1)
box <- list(0)
for (i in 1:ncol(df1)){
box[[i]]<-contrasts(df1[,i])
}
box
4. 调整GENDER和RACE1的参考值
contrasts(df1$GENDER)
contrasts(df1$RACE1)
df1$GENDER <- relevel(df1$GENDER,ref="male")
df1$RACE1 <- relevel(df1$RACE1,ref="ASIAN")
5. 计算percentage of data
prop.table(table(df1$RACE1))
box1 <- list(0)
for (i in 1:ncol(df1)){
box1[[i]]<-prop.table(table(df1[,i]))
}
box1
6. 计算在每一个level上正的有多少
PPE <-function(it){
name <- levels(it)
level <- nlevels(it)
m <- matrix(0,nrow=2,ncol=level)
for(i in 1:level){
prob <- sum(it==name[i]&df1$poisexp=="health expenditure is positive")/sum(it==name[i])
m[,i] <- rbind(name[i],prob)
}
return(m)
}
for (i in 1:length(df1)){
print(PPE(df1[,i]))
}
7. 导出数据
write.csv(df1,"meph.csv")
8. 比较两个模型的适用性
fit1 <- glm()
fit1 <- glm(poisexp~ GENDER + RACE1 + REGION1 + EDUC1 + PHSTAT1 + MNHPOOR + ANYLIMIT + INCOME1 + insure,
family=binomial(link="logit"), data=df1)
fit2 <- glm(poisexp~GENDER+RACE1+REGION1+EDUC1+PHSTAT1+ANYLIMIT+INCOME1+insure,
family=binomial(link="logit"), data=df1)
anova(fit1,fit2)
9. 预测正确率
df1$prob <- predict(fit2,newdata=df1,type="response")
contrasts(df1$poisexp)
df1$ppE <- ifelse(df1$prob>0.5,"health expenditure is positive","otherwise")
df1$Right <- ifelse(df1$ppE==df1$poisexp,1,0)
prop.table(table(df1$Right))