目录
数据读取
# Read the two input CSV files with fread(); the first line of each file is
# used as the header row.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change the meaning of `header = T`.
df1 <- fread("E:/Research/df1.csv", header = TRUE)
df2 <- fread("E:/Research/df2.csv", header = TRUE)
选择行列
# Four alternative ways to select a single column. Each line is an independent
# example that overwrites `df`, so run only the form you need.
# Bare column name in j: returns a plain vector.
df <- df[, Variable]
# list() (same as .()) in j: returns a one-column data.table.
df <- df[, list(Variable)]
# Character name with with = FALSE: returns a one-column data.table.
df <- df[, "Variable", with = FALSE]
# [[ ]] extraction: returns a plain vector, same result as the first form.
df <- df[["Variable"]]
# In the `with = FALSE` and `[[ ]]` forms a column position can be used in place
# of the column name.
# NOTE(review): a bare number inside j / .() is not a column lookup in the same
# way -- verify before relying on it.
数据清洗
缺失值处理
缺失值删除
# Keep only the rows whose ID is not missing.
# Missingness is tested with is.na(); comparing against the string "NA" only
# checks for that literal text and yields NA (not TRUE/FALSE) for real missing
# values.
df <- df[!is.na(ID)]
缺失值替换
# Columns whose missing entries are replaced by 0.
SparseVariables <- c("Variable1", "Variable2", "Variable3")
# set() updates `df` by reference: for each listed column, the rows where the
# value is NA are overwritten with 0.
for (col in SparseVariables) {
  set(df, i = which(is.na(df[[col]])), j = col, value = 0)
}
转换数据类型
# Columns to be converted to numeric.
NumericVariables <- c("Var1", "Var2")
# .SDcols restricts .SD to the listed columns; each one is passed through
# as.numeric() and written back under the same name by reference.
df[
  ,
  (NumericVariables) := lapply(.SD, as.numeric),
  .SDcols = NumericVariables
]
生成新的列
对所有行生成
# Add or overwrite columns for every row; `:=` modifies `df` by reference.
# Single column.
df[, Var1 := Var2 + Var3]
# Several columns at once, functional form of `:=`.
df[, `:=`(
  Var1 = Var2 + Var3,
  Var4 = Var5 + Var6
)]
# Several columns at once, character vector of names on the left-hand side.
# NOTE(review): this form overwrites Var5 in place, whereas the functional form
# above creates Var4 -- confirm which target column is intended.
df[, c("Var1", "Var5") := list(Var2 + Var3, Var5 + Var6)]
对满足条件的行生成
# Set Var1_flag to 1 only on the rows where Var1 equals "0"; the assignment is
# made by reference and touches no other rows.
df[Var1 == "0", Var1_flag := 1]
筛选
按照列的值去筛选
# Keep the rows of `df` where Var1 equals "1" and store them as `df2`.
df2 <- df[Var1 == "1"]
合并两个表
# Join df_1 and df_2 on ID. nomatch = 0 drops rows of df_2 that have no
# matching ID in df_1, so only IDs present in both tables are kept.
df_final <- df_1[
  df_2,
  on = "ID",
  nomatch = 0
]
长数据和宽数据的转换
宽数据转换为长数据
# Reshape wide to long: ID is kept as the identifier, Var1..Var3 are stacked
# into rows, with the source column name stored in `Var` and the value stored
# in `Var_value`.
# The closing quote of the value.name string is restored here; the original
# line had an unterminated string literal and could not be parsed.
df_long <- melt(
  df_wide,
  id.vars = "ID",
  measure.vars = c("Var1", "Var2", "Var3"),
  variable.name = "Var",
  value.name = "Var_value"
)
长数据转换为宽数据
# Reshape long to wide: one row per ID + Var1 + Var2 combination, one output
# column per distinct value of Var3. Cells are filled from the ValueName
# column, and multiple entries landing in the same cell are combined with sum.
df_wide <- dcast(
  df_long,
  ID + Var1 + Var2 ~ Var3,
  value.var = "ValueName",
  fun.aggregate = sum
)
排序
# Set `label` as the key of df_cluster; setkey() sorts the table by that column
# by reference, so no reassignment is needed.
setkey(df_cluster, label)
分组统计
# Per-ID mean absolute difference between Po and Pr.
# The first step adds the `diff` column to df3 by reference (df3 itself is
# modified); the chained step then averages `diff` within each ID, ignoring NAs.
# Native data.table chaining ([...][...]) is used instead of a magrittr pipe,
# so no extra package is required.
df_PPEG <- df3[, `:=`(diff = abs(Po - Pr))][
  , .(PPGE = mean(diff, na.rm = TRUE)), by = ID
]

# Standard deviation of glucose for each ID and Day (NAs ignored), then the
# mean of those per-day values for each ID.
df_SDGB <- df2[
  , .(SDGB = sd(glucose, na.rm = TRUE)), by = c("ID", "Day")
][
  , .(meanSDGB = mean(SDGB, na.rm = TRUE)), by = ID
]

# Number of rows in each group.
df[, .N, by = group_name]