统计缺失个数
na_count<-function(data,x){
sum(data[,x]==""|is.na(data[,x]))
}
eg:
na_count(Data_Tmp,"hypertension_before")
插补
centralImputation<-function (data,xx)
{
for (i in c(xx)) {
if (any(idx <- which(data[, i]==""|is.na(data[, i]))))
data[idx,i]<-data[sample(1:length(data[,i]),length(idx)),i]
}
data
}
数据框加一列
Training_Final1s$age_less_20<-with(Training_Final1s,ifelse(Training_Final1s$年龄.岁.<age_20,1,0))
transform(airquality, new = -Ozone, Temp = (Temp-32)/1.8)
筛选
data2<-subset(data1,select=-c(城市等级,城市大小,医院等级,医院性质,areas))
subset(df, subset = price > 5)
dplyr改变数据类型
# convert all factor to character
dat %>% mutate(across(where(is.factor), as.character))
# apply function (change encoding) to all character columns
dat %>% mutate(across(where(is.character),
function(x){iconv(x, to = "ASCII//TRANSLIT")}))
# subsitute all NA in numeric columns
dat %>% mutate(across(where(is.numeric), function(x) tidyr::replace_na(x, 0)))
批量统计数据占比
#例如我们有原始数据:data,想统计不同饮食生活习惯(具体变量见xiguan)下的人维生素A和维生素E的差异。因变量:data$VA_CA(严重缺乏、一般缺乏、轻微缺乏、正常、过量5个取值,如果是数值型变量可以使用cut()分割成分类变量)和data$VE_CA(4个取值)
xiguan<-c("Appetite_ca","Sleep_ca","Activity_ca","Vitamin supplementation","Staple Food","Meat","Eggs","Vegetables","Fruits","Liver_ca","Milk_ca")
xunhuan<-xiguan
aaa<-list()
for ( i in 1:length(xunhuan)){
data3<-data[!is.na(data[,xunhuan[i]])&data[,xunhuan[i]]!="",]#删除饮食生活习惯中的缺失值
prop_matrix_va<-t(prop.table(table(data3$VA_CA,data3[,xunhuan[i]]),2))
mean_va<-round(tapply(data3$VA,data3[,xunhuan[i]],mean,na.rm=T),2)#均值
sd_va<-round(tapply(data3$VA,data3[,xunhuan[i]],sd,na.rm=T),2)#标准差
va_pp_severe<-paste(round(prop_matrix_va[,"severe deficiency"]*100,2),"%",sep="")
va_pp_moderate<-paste(round(prop_matrix_va[,"moderate deficiency"]*100,2),"%",sep="")
va_pp_mild<-paste(round(prop_matrix_va[,"mild deficiency"]*100,2),"%",sep="")
va_pp_normal<-paste(round(prop_matrix_va[,"normal"]*100,2),"%",sep="")
va_pp_upper<-paste(round(prop_matrix_va[,"upper normal"]*100,2),"%",sep="")
va_pp<-cbind(va_pp_severe,va_pp_moderate,va_pp_mild,va_pp_normal,va_pp_upper,mean_va,sd_va)
colnames(va_pp)<-c("<0.1","[0.1,0.2)","[0.2,0.3)","[0.3,0.7]",">0.7","Mean","SD")
rownames(va_pp)<-rownames(prop_matrix_va)
#ve table
prop_matrix_ve<-t(prop.table(table(data3$VE_CA,data3[,xunhuan[i]]),2))
mean_ve<-round(tapply(data3$VE,data3[,xunhuan[i]],mean,na.rm=T),2)
sd_ve<-round(tapply(data3$VE,data3[,xunhuan[i]],sd,na.rm=T),2)
ve_pp_severe<-paste(round(prop_matrix_ve[,"severe deficiency"]*100,2),"%",sep="")
ve_pp_moderate<-paste(round(prop_matrix_ve[,"moderate deficiency"]*100,2),"%",sep="")
ve_pp_normal<-paste(round(prop_matrix_ve[,"normal"]*100,2),"%",sep="")
ve_pp_upper<-paste(round(prop_matrix_ve[,"upper normal"]*100,2),"%",sep="")
ve_pp<-cbind(ve_pp_severe,ve_pp_moderate,ve_pp_normal,ve_pp_upper,mean_ve,sd_ve)
colnames(ve_pp)<-c("<5","[5,7)","[7,20]",">20","Mean","SD")
rownames(ve_pp)<-rownames(prop_matrix_ve)
aaa[[i]]<-cbind(va_pp,ve_pp)
}
df <- do.call("rbind", aaa)
write.csv(df,file="df.csv")
#另外可尝试tableone::CreateTableOne()
#stargazer()
最后的效果是这样的(左边自行加上各个变量名):
one hot procedure
for(i in 1:ncol(Data_Tmp))
{
if(class(Data_Tmp[,i])=="character"){
Data_Tmp[,i]<-as.factor(Data_Tmp[,i])
}
}
NN <- sum(sapply(Data_Tmp, FUN = class) != "factor")
NRT <- dim(Data_Tmp)[1]
NR_Tmp <- dim(Data_Tmp)[2]
Matrix_Nu <- matrix(0, nrow = NRT, ncol = NN)
colnames(Matrix_Nu) <- names(Data_Tmp)[sapply(Data_Tmp, FUN = class) != "factor"]
#Matrix_Nu:数值变量
J <- 1
for(i in 1:NR_Tmp)
{
if(!is.factor(Data_Tmp[,i]))
{
Matrix_Nu[,J] <- Data_Tmp[,i]
J <- J+1
}
}
NC <- sum(sapply(Data_Tmp, FUN = class) == "factor")
Matrix_Ca <- matrix(factor(0), nrow = NRT, ncol = NC)
colnames(Matrix_Ca) <- names(Data_Tmp)[sapply(Data_Tmp, FUN = class) == "factor"]
J <- 1
for(i in 1:NR_Tmp)
{
if(is.factor(Data_Tmp[,i]))
{
QQbq <- Data_Tmp[,i]
levels(QQbq) <- c(levels(QQbq), "NA")
QQbq[is.na(QQbq)] <- "NA"
Matrix_Ca[,J] <- QQbq
J <- J+1
}
}
DF_Ca <- as.data.frame(Matrix_Ca)
for(i in 1:ncol(DF_Ca)){
DF_Ca[,i]<-as.factor(DF_Ca[,i])
}
DF_Ca<-DF_Ca[,as.data.frame(DF_Ca[1,] %>% c %>% sapply(FUN = nlevels))[,1]!=1]#delete the factor which levels==1
#DF_Ca<-
(DF_Ca[1,] %>% c %>% sapply(FUN = nlevels))
#sapply(DF_Ca, FUN = levels)
xnam <- names(DF_Ca)
fmla <- as.formula(paste("~ ", paste(xnam, collapse = "+")) )
Ca_M <- model.matrix(fmla, DF_Ca)
#sapply(Matrix_Ca, FUN = levels)
#Matrix_Nu[is.na(Matrix_Nu)] <- 0
Training_Final <- cbind(Matrix_Nu, Ca_M)
DF_Ca %>% as_tibble %>% apply(MARGIN = 2, FUN = levels)
Training_Finals <- cbind(scale(Matrix_Nu), Ca_M)
Training_Finals %>% glimpse
DF_Ca[1,] %>% c %>% sapply(FUN = nlevels)
head(Matrix_Nu)
Training_Final1s<-as.data.frame(Training_Finals)