[R] 后半学期总结-CSDN博客

本文链接：https://blog.csdn.net/m0_74331272/article/details/138300116
本文探讨了使用R语言中的多种统计和机器学习方法，如交叉表分析、卡方检验、CramérsV、Fisherstest、逻辑斯谛回归等，对社会调查数据进行性别、专业选择、奖学金授予、就业状况等多方面的关联性和模型性能评估，包括伪R²、混淆矩阵和ROC曲线，以优化预测模型的准确性和解释性。
摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >
#List
library(gmodels)
library(sjPlot)
library(dplyr)
library(sjstats)

CrossTable(Survey_GE_class_choice_1$Followed_GEC, expected = TRUE, prop.r = TRUE, prop.c = TRUE, prop.t = TRUE, prop.chisq = TRUE)
cat("History of China:", 0.339,"\nSocieties inequalities：",0.322,"\nMedian:" ,0.339 )

chisq.test(table(Survey_GE_class_choice_1$Major, Survey_GE_class_choice_1$Followed_GEC))
#the p-value: we can say 99.9% confidence that I can reject the H0(Zero Hypothesis)(Absence of correlation)

crosstable_statistics(Survey_GE_class_choice_1,x1=Followed_GEC,x2=Major,statistics = ("cramer"))
# the cramer's V(0-1)0.1-0.2 weak,...>0.3 strong correlation

sjt.xtab(Survey_GE_class_choice_1$Followed_GEC,Survey_GE_class_choice_1$Major)

# 4. Calculate Cramer's V for magnitude of correlation

#1.	Using the fisher test figure out if a statistical correlation exist between grante on social criteria and gender
fisher_test_grant_gender <- fisher.test(table(Survey_GE_class_choice_1$Granted_social_criteria, Survey_GE_class_choice_1$Gender))

# Output the result
print(fisher_test_grant_gender)



millemium_2015_study <- millemium_2015_study %>% mutate_if(is.character, as.factor)
millemium_2015_study$help <- as.factor(millemium_2015_study$help)

#5.	Using the package GDAtools, find the PEM of each cells of the tables showing the relation between help and class
y= table(millemium_2015_study$class, millemium_2015_study$help)
pem(y)
pem.table(millemium_2015_study$class, millemium_2015_study$help,na.rm=T)


library(LogisticDx)
library(jtools)
library(regclass)
library(DescTools)
library(stats)
library(dplyr)

#glm2 ->glm -> built model
#jtools -> export_summs -> display the model -> AOC ROC
#LogisticDx -> gof -> AUC ->70%, 80% good
#DescTools -> PseudoR2 -> goodness and fit
#0.2-0.4 very good
#regclass -> confusion_matrix

titanic.train$age2 =titanic.train$Age^2

#1.	Create a model including all the possible independent variable 
#(pclass, sex, age, sibsp, parch, fare, embarked). 
#This model will be name survive_full
survive_full<- glm(Survived ~Pclass+Sex+Age+SibSp+Parch+Fare+Embarked, data = titanic.train, family = "binomial")

#logictitanic1<- glm(Survived ~ Pclass, data = titanic.train, family = "binomial")
#logictitanic2<- glm(Survived ~ Pclass+Sex, data = titanic.train, family = "binomial")

#2.	Create a model including Pclass, Sex, Age, SibSp, Embarked. This model will be named survive_intermediate
survive_intermediate<- glm (Survived ~Pclass+Sex+Age+SibSp+Embarked, data = titanic.train,family = "binomial")

#3.	Create a model including the independent variable: pclass, sex, age, embarked. 
#This model will be name survive_reduced
survive_reduced<- glm (Survived ~Pclass+Sex+Age+Embarked, data = titanic.train,family = "binomial")

levels(titanic.train$Sex)
#4.	Produce a table summarizing the results of the three using the export_summs function
export_summs(survive_full,survive_intermediate,survive_reduced)

#5.	What are is the pseudo R² of those different model
PseudoR2(survive_full)
PseudoR2(survive_intermediate)
PseudoR2(survive_reduced)

#6.	According to the AIC which model is the best?

cat("the second, as 800.96 < 803.67 <810.97 ")

#7.	Produce the ROC curve of the three models. 
#According to the ROC, which one is the best (need to specify g=8)
gof(survive_full,g=8)
gof(survive_intermediate,g=8)
gof(survive_reduced,g=8)


#1.	Compute the Mac Fadden R2 for the three model and analyze their goodness
PseudoR2(survive_full,which = "all")
PseudoR2(survive_intermediate,which = "all")
PseudoR2(survive_reduced,which = "all")



#2.
confusion_matrix(survive_full)
confusion_matrix(survive_intermediate)
confusion_matrix(survive_reduced)


survive_intermediate_2<- glm (Survived ~Pclass+Sex+age2+SibSp+Embarked, data = titanic.train,family = "binomial")
confusion_matrix(survive_intermediate_2)

#优秀的模型：通常具有非常低的假阴性率和假阳性率（False Positive Rate, FPR），大约低于 10%。这意味着模型能正确识别大多数的正例和负例。

#好的模型：假阴性率和假阳性率可能在 10% 到 20% 之间。模型表现良好，但在某些情况下可能会错过一些正例或错误地将负例识别为正例。

#可接受的模型：假阴性率和假阳性率可能在 20% 到 30% 之间。这样的模型仍然可用，但其错误率较高，可能需要进一步的优化。

#改进空间大的模型：如果假阴性率和假阳性率超过 30%，则通常认为模型有很大的改进空间。这表明模型在很大程度上未能准确预测正例和负例。

#Tesy\t are not biased? How to verify?

library(car)
vif(survive_intermediate)
titanic.train$Logage = log(titanic.train$Age)
survive_intermediate_logage<- glm (Survived ~Pclass+Sex+Age+SibSp+Embarked+Logage, data = titanic.train,family = "binomial")
export_summs(survive_intermediate_logage)
#the age is already logged
#find out the problem and change it into the categratical variable


dx(survive_intermediate ,byCov = FALSE)
#no value with dBhat above 1
plot(survive_intermediate)

library(haven)
library(nnet)
#Visualize
library(stargazer)
library(mlogit)
#R^2 checks
library(DescTools)

#The percentages fo ? of different prediction
library(summarytools)

#1.	The variable corresponding to working status is named wrstat_2. 
#For this variable you will first create two possible reference level (full-time and part-time)
GSS_employment_status_simplified_V1$ref_level_full_time<-relevel(GSS_employment_status_simplified_V1$wrkstat_2,ref="working fulltime")
#You can also work with factor
levels(GSS_employment_status_simplified_V1$wrkstat_2)

GSS_employment_status_simplified_V1$ref_level_full_time <- factor(GSS_employment_status_simplified_V1$wrkstat_2,levels = c("working fulltime","keeping house","not working","working parttime"))
levels(GSS_employment_status_simplified_V1$ref_level_full_time)

GSS_employment_status_simplified_V1$ref_level_part_time<-relevel(GSS_employment_status_simplified_V1$wrkstat_2,ref="working parttime")
levels(GSS_employment_status_simplified_V1$ref_level_part_time)
#2.	While taking full-time as reference level, 
#create a first model which will include the 
#variables, sexnow, age, degree). 
#Visualize the results of this model using the stargazer package and comment the result.
Model_1<-multinom(data=GSS_employment_status_simplified_V1,GSS_employment_status_simplified_V1$ref_level_full_time~sexnow+age+degree)
stargazer(Model_1,type="text")
#When we move from male to woman, the chance at the odd to "not working" rather than "Full-time",increase by (exp(0.823)-1)*100,%
cat((exp(0.823)-1)*100,"%")



df2 = subset(GSS_employment_status_simplified_V1,select = c("ref_level_full_time","sexnow","age","degree","race","childs_2"))
df2<-na.omit(df2)
#Drop those elements that lack info of these variables
Model_1_bis<-multinom(data=df2,ref_level_full_time~sexnow+age+degree+race+childs_2)
stargazer(Model_1_bis,type="text")


df1 =subset(gfk_cleaned_na_omit,mclassic_ref_nei=="Like or like a lot")
df2 =subset(gfk_cleaned_na_omit,mclassic_ref_nei=="Dislike or dislike a lot")
dfC =rbind(df1,df2)export_summs(modelC)

modelC = glm(data = dfC, mclassic_ref_nei~hhincome_cat+political_cap+educ,family = "binomial")
confusion_matrix(modelC)
export_summs(modelC)






library(summarytools)
ctable <- table(df2$ref_level_full_time,predict(Model_1_bis))
show(ctable)
ctable(df2$ref_level_full_time,predict(Model_1_bis))
#ctable <- table(GSS_na_omit$wrkstat_2,predict(Model_1_bis))
PseudoR2(Model_1_bis)
#"working fulltime" 实际类别中，模型预测为 "working fulltime" 的有676例（占该类别总数的98.1%）。
#"keeping house" 实际类别中，模型预测为 "keeping house" 的有25例（占该类别总数的17.6%），而大多数（82.4%）被错误地预测为 "working fulltime"。
#"not working" 和"working parttime" 实际类别中，模型预测完全错误，