可能不是最方便的方法,但是自己学习过程总结的。如果有更好的方法欢迎各位大佬们补充!!!
以生存分析的材料为例:
1. 变量处理
按照亚组分析的变量创建一个新的分组变量,将连续型变量按照事先设定的阈值调整为分类变量。
#将连续型变量分类处理
#年龄
mydata$Age.group <- ifelse(mydata$Age<=60,1,
ifelse(mydata$Age>60,2,NA))
mydata$Age.group <- factor(mydata$Age.group,levels = c(1,2),labels = c("<=60",">60"))
table(mydata$Age.group)
#Towsend指数
summary(mydata$Townsend_deprivation_index)
mydata$Townsend_deprivation_index.group <- ifelse(mydata$Townsend_deprivation_index<(-2.3),1,
ifelse(mydata$Townsend_deprivation_index>=(-2.3),2,NA))
mydata$Townsend_deprivation_index.group <- factor(mydata$Townsend_deprivation_index.group,levels = c(1,2),labels = c("low","high"))
table(mydata$Townsend_deprivation_index.group)
#久坐时间
summary(mydata$Sedentary_hours)
mydata$Sedentary_hours.group <- ifelse(mydata$Sedentary_hours<4.5,1,
ifelse(mydata$Sedentary_hours>=4.5,2,NA))
mydata$Sedentary_hours.group <- factor(mydata$Sedentary_hours.group,levels = c(1,2),labels = c("low","high"))
table(mydata$Sedentary_hours.group)
例如,我将上述的变量,年龄,Towsend指数和久坐时间进行的分类处理,并对其进行了因子化
2. 产生新的数据框
赋值到一个新的数据框,便于保留原始数据,挑选出自己需要的列,简化数据表格
library(dplyr)
names(mydata)
df <- mydata%>%
select(patientID,Age,Townsend_deprivation_index,activity,Sedentary_hours,Total_sugar,Energy,Fat,Fish,red_meat,
vegefruit_sum,Insulin_user,Lipid_lowering_drugs_user,Aspirin_user,futime,fustat,SSBs_category,ASBs_category,PJs_category,
Sex,Age.group,Ethnic,smoking_status,Alcohol_intake_frequency,BMI.group,Townsend_deprivation_index.group,
Sedentary_hours.group,sleep_duration_group,physical_activity_group,diabetes,Antihypertensive_drugs_user)
str(df)
3. 亚组分析的表格绘制(重点)
a.收集HR,95%CI 和p value值
创建一个表头,用于收纳需要收集的数据,因为自变量有四个水平,以1作为对照,那么就有3个比较的水平,所以设置3个HR
a <- c("HR1","5%CI1","95%CI1","P1","HR2","5%CI2","95%CI2","P2","HR3","5%CI3","95%CI3","P3")
建立cox回归结果
a1<-summary(coxph(Surv(futime,fustat==1)~SSBs_category+Age+Ethnic+Townsend_deprivation_index+
smoking_status+Alcohol_intake_frequency+activity+Sedentary_hours+
Total_sugar+Energy+Fat+Fish+red_meat+vegefruit_sum+
Insulin_user+Antihypertensive_drugs_user+Lipid_lowering_drugs_user+Aspirin_user,
subset =Sex =="Male",data=df))
利用森林图格式整理的方法提取cox回归中的HR,然后用rbind连接
a <- rbind(a,c(round(a1$coefficients[1,2],2),round(a1$conf.int[1,3],2),round(a1$conf.int[1,4],2),round(a1$coefficients[1,5],3),
round(a1$coefficients[2,2],2),round(a1$conf.int[2,3],2),round(a1$conf.int[2,4],2),round(a1$coefficients[2,5],3),
round(a1$coefficients[3,2],2),round(a1$conf.int[3,3],2),round(a1$conf.int[3,4],2),round(a1$coefficients[3,5],3)))
a1$con.int提取HR,95%CI, a1$coefficients 提取P value
再次重复上述过程!!!!!只需要复制粘贴,修改subset内容就行
a1 <-summary(coxph(Surv(futime,fustat==1)~SSBs_category+Age+Ethnic+Townsend_deprivation_index+
smoking_status+Alcohol_intake_frequency+activity+Sedentary_hours+
Total_sugar+Energy+Fat+Fish+red_meat+vegefruit_sum+
Insulin_user+Antihypertensive_drugs_user+Lipid_lowering_drugs_user+Aspirin_user,
subset =Sex =="Female",data=df))
a <- rbind(a,c(round(a1$coefficients[1,2],2),round(a1$conf.int[1,3],2),round(a1$conf.int[1,4],2),round(a1$coefficients[1,5],3),
round(a1$coefficients[2,2],2),round(a1$conf.int[2,3],2),round(a1$conf.int[2,4],2),round(a1$coefficients[2,5],3),
round(a1$coefficients[3,2],2),round(a1$conf.int[3,3],2),round(a1$conf.int[3,4],2),round(a1$coefficients[3,5],3)))
b.给数据设置行名
rownames(a) <- c("characteristic",
"male","female","<60 years",">=65 years",
"white","non-white","smoking_yes","smoking_no","<3 times/week",">=3times/week","<25kg/m2",">=25kg/m2","low","high",
"sedentary_low","sedentary_high","sleep_low","sleep_high","activity_low","activity_high",
"diabetes_yes","diabetes_no","hypertension_yes","hypertension_no")
a <- data.frame(a)#设置为数据框
c.统计各亚组的样本数n
详情可以参考实用技巧——绘制森林图-CSDN博客中的“总结每个变量的病例数”部分。
library(tableone)
names(df)
myVars <- c("Sex","Age.group","Ethnic","smoking_status","Alcohol_intake_frequency","BMI.group","Townsend_deprivation_index.group",
"Sedentary_hours.group","sleep_duration_group","physical_activity_group","diabetes","Antihypertensive_drugs_user")
catVars <-c("Sex","Age.group","Ethnic","smoking_status","Alcohol_intake_frequency","BMI.group","Townsend_deprivation_index.group",
"Sedentary_hours.group","sleep_duration_group","physical_activity_group","diabetes","Antihypertensive_drugs_user")
table1 <- print(CreateTableOne(vars=myVars,
data=df,
factorVars=catVars),
catDigits=2,contDigits=2,showAllLevels=TRUE)
table1 <- data.frame(table1)
a$n <- table1$Overall
table1的样式,然后将值赋值给a
d.统计亚组中发生的事件数events
#亚组中发生的事件数
table2 <- print(CreateTableOne(vars=myVars,
data=df,
strata="fustat",
factorVars=catVars),
catDigits=2,contDigits=2,showAllLevels=TRUE)
table2 <- data.frame(table2)
a$event <- table2[,3]
e.创建ref组和HR(95%CI)组合
#新建一列作为对照组
a$reference <- round(c(rep(1.00,25)),2)
a$HR.95CI1 <- paste0(a$X1,"(",a$X2,"-",a$X3,")");a
a$HR.95CI2 <- paste0(a$X5,"(",a$X6,"-",a$X7,")");a
a$HR.95CI3 <- paste0(a$X9,"(",a$X10,"-",a$X11,")");a
f.将为行名设置为第一列,设置行名
a <- tibble::rownames_to_column(a,var = "Characteristics")
ncol(a)
colnames(a) <- c("characteristic","HR1","5%CI1","95%CI1","P1","HR2","5%CI2","95%CI2","P2","HR3","5%CI3","95%CI3","P3",
"n","event","0/day","0-1/day","1-2/day",">2/day")
a[,1] <- c("characteristic","male","female","<60 years",">=65 years","white","non-white","yes","no","<3 times/week",">=3times/week",
"<25kg/m2",">=25kg/m2","low","high","low","high","<7 h/day","≥7 h/day","low","high","yes","no","yes","no")
a <- a[-1,]
g.每个亚组变量上设置一个空行,插入变量名称
a <- rbind(c("Sex",rep(NA,18)),
a[c(1:2),],
c("Age",rep(NA,18)),
a[c(3:4),],
c("Ethnicity",rep(NA,18)),
a[c(5:6),],
c("smoking status",rep(NA,18)),
a[c(7:8),],
c("alcohol intake",rep(NA,18)),
a[c(9:10),],
c("BMI",rep(NA,18)),
a[c(11:12),],
c("deprivation index",rep(NA,18)),
a[c(13:14),],
c("sedentary hours",rep(NA,18)),
a[c(15:16),],
c("Sleep duration",rep(NA,18)),
a[c(17:18),],
c("physical activity",rep(NA,18)),
a[c(19:20),],
c("Diabetes",rep(NA,18)),
a[c(21:22),],
c("hypertension",rep(NA,18)),
a[c(23:24),])
subgroup1 <-a