CatBoost in R (with sample weights & feature weights, SHAP values, and imbalanced-data handling)

Reading the data

#lab-code-to-name lookup table (used later to label the importances)
XMDZ<-read.csv(file='C:/Users/13771/Desktop/XMDZ.csv', header=FALSE,sep=",")
#combine the four tumor datasets
jzx<-read.csv(file='C:/Users/13771/Desktop/2甲状腺肿瘤.csv', row.names = 1,header=TRUE, sep=",")
rx<-read.csv(file='C:/Users/13771/Desktop/3乳腺恶性肿瘤.csv', row.names = 1,header=TRUE, sep=",")
yx<-read.csv(file='C:/Users/13771/Desktop/胰腺肿瘤.csv',row.names = 1, header=TRUE,sep=",")
sg<-read.csv(file='C:/Users/13771/Desktop/食管癌.csv',row.names = 1, header=TRUE, sep=",")
data<-rbind(jzx,rx,yx,sg)

Converting column types

#identify each column's type and map it to a colClasses value
library(do)
cdv1<-sapply(data, typeof)
table(cdv1)
cdv<-Replace(cdv1,pattern=c('double:numeric','character:factor'
                            ,'logical:factor','integer:numeric'))
table(cdv)
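
If you would rather not depend on the do package, the same typeof-to-colClasses mapping can be done in base R. A minimal sketch (type_map is a helper name introduced here, not from the original code):

#base-R equivalent of the do::Replace mapping above
type_map <- c(double = "numeric", character = "factor",
              logical = "factor", integer = "numeric")
cdv <- type_map[cdv1]
names(cdv) <- names(cdv1)   #keep the column names so colClasses matches by name
table(cdv)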

Re-reading the data with the specified column classes

#re-read with the mapped colClasses; na.strings='0' treats zeros in the source files as missing
JZX<-read.csv(file='C:/Users/13771/Desktop/2甲状腺肿瘤.csv',row.names = 1, na.strings='0',header=TRUE,colClasses =cdv, sep=",")
RX<-read.csv(file='C:/Users/13771/Desktop/3乳腺恶性肿瘤.csv',row.names = 1,na.strings='0', header=TRUE, colClasses =cdv, sep=",")
YX<-read.csv(file='C:/Users/13771/Desktop/胰腺肿瘤.csv', row.names = 1,na.strings='0',header=TRUE,colClasses =cdv, sep=",")
SG<-read.csv(file='C:/Users/13771/Desktop/食管癌.csv',row.names = 1,na.strings='0', header=TRUE,colClasses =cdv,  sep=",")
data<-rbind(JZX,RX,YX,SG)
#optionally drop free-text columns
#data<-subset(data,select=-c(DIAGNOSTIC,CHECKEROPINION,SAMPLETYPE))

Defining the predictor and label columns

#define the label: rows whose TYPE mentions 胰腺 (pancreas) become 1, all others 0
data$TYPE = gsub(".*胰腺.*" , "1", data$TYPE )
data$TYPE[which(data$TYPE !=1)] <- 0
table(data$TYPE)
#convert the label from character to integer
typeof(data$TYPE)
data$TYPE<-as.integer(data$TYPE)
typeof(data$TYPE)
 

Dimensionality reduction & missing-value imputation (CatBoost has a built-in parameter for handling missing values, so the imputation step is optional.)

#dimension reduction: drop near-zero-variance columns
library(caret)
zerovar=nearZeroVar(data)
data=data[,-zerovar]

#rank features with a regularized random forest (TYPE is the label column)
set.seed(100)
rrfMod <- train(as.factor(TYPE) ~ ., data=data, method="RRF")
imp <- varImp(rrfMod, scale=FALSE)

#impute missing values with bagged trees
imputation_bag <- preProcess(data,method = 'bagImpute')
data <- predict(imputation_bag, data)

Sampling without replacement to reduce the computational load

#sample 7100 rows without replacement, then split 5000/2100 into train/test
set.seed(100)
sample_data <- data[sample(1:nrow(data),7100,replace = FALSE),]
train_data<-sample_data[1:5000,]
test_data<-sample_data[5001:7100,]
table(test_data$TYPE)
table(train_data$TYPE)
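
A purely random split can leave the two subsets with different class ratios. Since caret is already loaded, a stratified alternative with createDataPartition keeps the TYPE proportions similar in both subsets; a sketch, not part of the original pipeline:

#stratified train/test split preserving the TYPE class ratio
set.seed(100)
idx <- createDataPartition(factor(sample_data$TYPE), p = 5000/7100, list = FALSE)
train_data <- sample_data[idx, ]
test_data  <- sample_data[-idx, ]
table(train_data$TYPE)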

#L0480: carbohydrate antigen 19-9 (Siemens); L0580: amylase; L0043: total bilirubin
#give rows where the key marker is present (non-missing) a higher sample weight

jhgjh<-subset(train_data,select=c(L0480,L0580,L0043))   #peek at the three key columns
s_weight<-rep(1,nrow(train_data))
num <- 1
while(num <= nrow(train_data)){
  if(!is.na(train_data$L0480[num])   ##&(!is.na(train_data$L0043[num]))
     ){
    s_weight[num]=20
  }
  else{
    s_weight[num]=1
  }
  num = num + 1
}
table(s_weight)
sum(is.na(data$L0580))
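
The same rule in one vectorized line, which is the more idiomatic R form:

#vectorized equivalent of the weighting loop
s_weight <- ifelse(is.na(train_data$L0480), 1, 20)
table(s_weight)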

#L0480: carbohydrate antigen 19-9 (Siemens); L0580: amylase; L0043: total bilirubin
#give these three features a higher feature weight

lev<-c("L0480","L0580","L0043")

#find the columns whose names match the key features
f_weight<-rep(1,ncol(train_data))
num <- 1
while(num <= ncol(train_data)){
  if(colnames(train_data)[num] %in% lev){
    f_weight[num]=4
  }
  else{
    f_weight[num]=1
  }
  num = num + 1
}
table(f_weight)
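
One caveat before using f_weight: it is built over every column of train_data, but the pool below is created with column 2 (the label) dropped, so the matching entry must be dropped too for feature_weights to line up with the pool's features:

#drop the label column's entry so the weights align with the pool's feature columns
f_weight_pool <- f_weight[-2]
length(f_weight_pool) == ncol(train_data[,-c(2)])   #should be TRUE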

Normalizing the data; CatBoost's trees are insensitive to feature scale, so this step can be skipped

#min-max scale every numeric column (columns 1 and 2, including the TYPE label, are skipped)
num <- 3
while(num <= ncol(data)){
  if(is.numeric(data[,num])){
    data[,num]<-
      (data[,num]-min(data[,num],na.rm = TRUE))/(max(data[,num],na.rm = TRUE)-min(data[,num],na.rm = TRUE))
  }
  num = num + 1
}
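
The same normalization can be written without the explicit loop; a vectorized sketch that scales every numeric column except the first two:

#vectorized min-max scaling of the numeric columns
num_cols <- sapply(data, is.numeric)
num_cols[1:2] <- FALSE   #leave the first two columns (incl. TYPE) untouched
data[num_cols] <- lapply(data[num_cols], function(x)
  (x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE)))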

#train_data<-train_data[order(-train_data$L0618,train_data$L0621,train_data$L0444),]
#train_data[4500:5000,L0618]

Building the CatBoost pools

#load the CatBoost pools: column 2 is the label, s_weight supplies the per-row weights
#(factor columns in a data.frame are treated as categorical features automatically)
library(catboost)
train_pool <- catboost.load_pool(data=train_data[,-c(2)], label=train_data[,c(2)],weight=s_weight)
test_pool <- catboost.load_pool(data=test_data[,-c(2)], label=test_data[,c(2)])
#help(catboost.load_pool)

Handling the imbalanced dataset

#class 1 (pancreatic tumor) is the minority class: weight it by n0/n1
rescaling<-sum(train_data$TYPE == 1)/sum(train_data$TYPE == 0)
c_weight<-c(1,1/rescaling)   #weights for classes 0 and 1
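
CatBoost can express the same correction as a single parameter: scale_pos_weight multiplies the weight of the positive class, so setting it to n0/n1 is equivalent to the class_weights vector above (an untested alternative in this pipeline):

#equivalent single-parameter form: scale_pos_weight = n0/n1
scale_pos <- sum(train_data$TYPE == 0) / sum(train_data$TYPE == 1)
#then pass scale_pos_weight = scale_pos in fit_params instead of class_weights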

Setting the training parameters

#set params & train; one model per disease
fit_params_yx <- list(iterations = 588,   #number of trees (tuned)
                      thread_count = 10,
                      class_weights = c_weight,   #class weights for the imbalanced labels
                      #feature_weights = f_weight,   #per-feature weights
                      loss_function = 'Logloss',   #loss function
                      #ignored_features = c("L1563","L1564","L1505","L1508","L1502","L1504",
                      #                     "L1503","L5004","L1174"),   #skip unimportant columns
                      border_count = 32,
                      depth =5,   #tree depth
                      #leaf_estimation_iterations=2,   #gradient steps per leaf
                      #use_best_model=TRUE,   #keep the best iteration on the eval set
                      learning_rate = 0.03,
                      #eval_metric='Accuracy',   #evaluation metric
                      #random_seed=42,
                      one_hot_max_size=4,
                      l2_leaf_reg = 3.5,
                      nan_mode = 'Min',   #how missing values are treated
                      #train_dir = 'train_dir',
                      logging_level = 'Silent'   #suppress the training log
)
model_yx<-catboost.train(train_pool,test_pool,fit_params_yx)
#help(catboost.train)
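
Because a test pool is passed to catboost.train, the commented-out use_best_model option can be paired with CatBoost's overfitting detector so the tree count of 588 does not have to be tuned by hand. A sketch using the standard od_type/od_wait parameters (untested in this pipeline):

#optional: early stopping on the eval pool instead of a fixed iteration count
fit_params_es <- modifyList(fit_params_yx,
                            list(iterations = 2000,        #generous upper bound
                                 use_best_model = TRUE,    #keep the best iteration
                                 od_type = 'Iter',         #stop when no improvement...
                                 od_wait = 50))            #...for 50 consecutive rounds
model_es <- catboost.train(train_pool, test_pool, fit_params_es)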

Making predictions

#two kinds of prediction: probabilities and hard class labels
prediction1 <- catboost.predict(model_yx, test_pool, prediction_type = 'Probability')
cat("Sample predictions: ", sample(prediction1, 5), "\n")
labels1 <- catboost.predict(model_yx, test_pool, prediction_type = 'Class')
cat("Sample labels: ", sample(labels1, 5), "\n")
#help(catboost.predict)
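
On a class-imbalanced test set, accuracy alone can look deceptively good, so AUC from the probability predictions is a useful extra check. A minimal sketch using the pROC package (not part of the original pipeline):

#AUC on the held-out set
library(pROC)
roc_obj <- roc(test_data$TYPE, prediction1)
auc(roc_obj)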

Computing prediction accuracy on the test set (with optional threshold moving)

#accuracy with optional threshold moving for the skewed prior
rescaling<-sum(train_data$TYPE == 1)/sum(train_data$TYPE == 0)
rescaling<-rescaling/(1+rescaling)   #odds n1/n0 converted to the prior P(TYPE=1)
calc_accuracy <- function(prediction, expected) {
  #labels <- ifelse(prediction > rescaling, 1, 0)   #moved threshold: predict 1 above the prior
  labels <- ifelse(prediction > 0.5, 1, 0)   #default 0.5 threshold (matches the Class predictions)
  accuracy <- sum(labels == expected) / length(labels)
  return(accuracy)
}
# ??works properly only for Logloss

accuracy <- calc_accuracy(prediction1, test_data[,c(2)])
cat("\nAccuracy: ", accuracy, "\n")

#prediction<-0.5*(prediction1+1/(1+rescaling))
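
A confusion matrix makes the single accuracy number easier to interpret with skewed classes:

#confusion matrix of hard predictions vs. the truth
table(predicted = labels1, actual = test_data$TYPE)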

Saving and loading the model

#save the trained model to disk
catboost.save_model(model_yx,
                    "D:/model_yx")
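
The matching load call reads the model back from the same path:

#load the model back from disk
model_yx <- catboost.load_model("D:/model_yx")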

Extracting feature importances

#feature importance from the trained model
#help(catboost.get_feature_importance)
im<-catboost.get_feature_importance(model_yx, train_pool)
im<-data.frame(im)
library(dplyr)
im <- tibble::rownames_to_column(im, "VALUE")   #add_rownames is deprecated in dplyr
im<-im[order(-im$im),]
im<-im[1:20,]
im<-merge(im,XMDZ,by.x="VALUE",by.y="V1")   #attach readable names from the lookup table
im<-im[order(-im$im),]
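
A quick bar plot of the merged top-20 table; this assumes XMDZ's V2 column holds the human-readable test name (header=FALSE gave its columns the default V1/V2 names):

#plot the top-20 importances with readable names
barplot(rev(im$im), names.arg = rev(im$V2), horiz = TRUE, las = 1,
        cex.names = 0.6, main = "Top 20 feature importances")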

Extracting SHAP values

#SHAP values: one column per feature plus a final expected (base) value column
im_shap<-catboost.get_feature_importance(model_yx, test_pool,type='ShapValues')
im_shap<-data.frame(im_shap)
names(im_shap)<-c(colnames(test_data[,-c(2)]),"BASE")
row.names(im_shap)<-rownames(test_data[,-c(2)])
im_shap_top<- im_shap[ ,colnames(im_shap) %in% im$VALUE]   #restrict to the top-20 features
table(im_shap$L1600)
#cat("\nTree count: ", model_yx$tree_count, "\n")
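
Global importance can also be recovered from the SHAP matrix as the mean absolute SHAP value per feature, which should roughly agree with the ranking above (BASE is the expected-value column added when the matrix was named):

#mean |SHAP| per feature as a global importance measure
shap_mean <- sort(colMeans(abs(im_shap[, names(im_shap) != "BASE"])),
                  decreasing = TRUE)
head(shap_mean, 20)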
