# Read in the data (读入数据)
# L-name contrast table: maps lab-test codes to readable names.  Read with
# header=FALSE so columns come in as V1, V2, ...; V1 (the code) is used as a
# merge key in the feature-importance section below.
XMDZ<-read.csv(file='C:/Users/13771/Desktop/XMDZ.csv', header=FALSE,sep=",")
# Data union: load the four tumour cohorts (thyroid, breast, pancreatic,
# oesophageal); the first CSV column supplies the row names.
jzx<-read.csv(file='C:/Users/13771/Desktop/2甲状腺肿瘤.csv', row.names = 1,header=TRUE, sep=",")
rx<-read.csv(file='C:/Users/13771/Desktop/3乳腺恶性肿瘤.csv', row.names = 1,header=TRUE, sep=",")
yx<-read.csv(file='C:/Users/13771/Desktop/胰腺肿瘤.csv',row.names = 1, header=TRUE,sep=",")
sg<-read.csv(file='C:/Users/13771/Desktop/食管癌.csv',row.names = 1, header=TRUE, sep=",")
# Stack the cohorts row-wise; rbind() requires all four frames to share the
# same column names.
data<-rbind(jzx,rx,yx,sg)
# Convert column types (列类型转换)
#identify column type
library(do)
# Storage type of each column: "double", "integer", "character" or "logical".
cdv1<-sapply(data, typeof)
table(cdv1)
# do::Replace maps each storage type to a read.csv colClasses value —
# presumably 'old:new' pairs (double/integer -> numeric,
# character/logical -> factor); verify against the do-package docs.
cdv<-Replace(cdv1,pattern=c('double:numeric','character:factor'
,'logical:factor','integer:numeric'))
table(cdv)
# Re-read the data with the specified column classes (以规定的列类型重新读入)
#read data with colclass
# Re-read each cohort with the colClasses vector derived above; the literal
# string '0' is treated as missing (na.strings='0').
# NOTE(review): cdv was computed on `data` AFTER the row-name column was
# dropped, but colClasses applies to every column in the file including the
# row-name one — confirm the vector lengths actually line up.
JZX<-read.csv(file='C:/Users/13771/Desktop/2甲状腺肿瘤.csv',row.names = 1, na.strings='0',header=TRUE,colClasses =cdv, sep=",")
RX<-read.csv(file='C:/Users/13771/Desktop/3乳腺恶性肿瘤.csv',row.names = 1,na.strings='0', header=TRUE, colClasses =cdv, sep=",")
YX<-read.csv(file='C:/Users/13771/Desktop/胰腺肿瘤.csv', row.names = 1,na.strings='0',header=TRUE,colClasses =cdv, sep=",")
SG<-read.csv(file='C:/Users/13771/Desktop/食管癌.csv',row.names = 1,na.strings='0', header=TRUE,colClasses =cdv, sep=",")
data<-rbind(JZX,RX,YX,SG)
#delete column
#data<-subset(data,select=-c(DIAGNOSTIC,CHECKEROPINION,SAMPLETYPE))
# Define predictor and label columns (定义自变量列和因变量列)
#define label
# Binary label: TYPE = 1 when the diagnosis text contains "胰腺" (pancreas),
# 0 otherwise.  grepl() is vectorized, so no per-element pass is needed.
# The original gsub'd matches to "1", then compared the character vector
# against the number 1 (relying on implicit coercion) and finally ran an
# elementwise sapply(as.integer); the net result is identical.
data$TYPE <- as.integer(grepl("胰腺", data$TYPE))
table(data$TYPE)
#transform label type
# Label is already integer here; just report its storage type.
table(typeof(data$TYPE))
# Dimensionality reduction & missing-value imputation — optional, since catboost has its own built-in NA handling (降维&填补缺失值)
#dimension reduction
library(caret)
# Drop near-zero-variance predictors.  The original referenced an undefined
# `data1`, jammed two statements onto one line
# (`data2=data1[,-zerovar]set.seed(100)` — a syntax error), and then
# overwrote `data` with the varImp() result, destroying the dataset before
# the imputation step below.
zerovar <- nearZeroVar(data)
if (length(zerovar) > 0) {
  data <- data[, -zerovar]
}
set.seed(100)
# Regularized random forest to rank feature importance.
# NOTE(review): the original formula was `Class ~ .`, but the label column
# in this script is TYPE — assumed to be a copy-paste slip; confirm.
rrfMod <- train(TYPE ~ ., data = data, method = "RRF")
importance_rrf <- varImp(rrfMod, scale = F)  # kept separate from `data`
#bagging tree: impute missing values via bagged trees
imputation_bag <- preProcess(data, method = 'bagImpute')
data <- predict(imputation_bag, data)
# Sample without replacement to reduce the computational load (采取不放回方式进行抽样,减轻计算压力)
#train data sampling(no replace)
library(sampling)
#help("sample")
# Draw 7100 distinct rows (no replacement), then split 5000 train / 2100
# test.  sample(n, k) is the idiomatic form of sample(1:n, k) and avoids
# the 1:nrow() footgun when a frame is empty.
sample_data <- data[sample(nrow(data), 7100, replace = FALSE), ]
#test_data <- data[sample(1:nrow(data),2102,replace = FALSE),]
train_data <- sample_data[1:5000, ]
test_data <- sample_data[5001:7100, ]
table(test_data$TYPE)
table(train_data$TYPE)
#l0480, --*糖类抗原19-9(西门子)  (CA 19-9, Siemens assay)
#l0580, --*淀粉酶  (amylase)
#l0043, --*总胆红素  (total bilirubin)
# Per-row sample weights: rows where the key marker L0480 is present get
# weight 20, all others weight 1.
# Bug fix: the original while-loop compared against `ctn`, whose definition
# was commented out, so it errored at runtime.  Per the section comment
# ("rows where the key field is non-empty get extra weight") the intent is
# a non-missing check, implemented here with the vectorized is.na().
# NOTE(review): confirm L0480 alone is the intended criterion — the
# commented-out alternative also checked L0043.
s_weight <- ifelse(is.na(train_data$L0480), 1, 20)
table(s_weight)
sum(is.na(data$L0580))
#l0480, --*糖类抗原19-9(西门子)  (CA 19-9, Siemens assay)
#l0580, --*淀粉酶  (amylase)
#l0043, --*总胆红素  (total bilirubin)
# Per-feature weights: key biomarker columns get weight 4, all others 1.
lev <- c("L0480", "L0580", "L0043")
#lev<-c("L0580")
# Vectorized replacement for the original column-index while-loop; %in%
# never returns NA, so every column gets a definite weight.
f_weight <- ifelse(colnames(train_data) %in% lev, 4, 1)
#table(train_data$L0580)
table(f_weight)
# Data standardization — optional, since catboost normalizes internally (数据标准化处理)
#data$TYPE=as.data.frame(lapply(JZX$DIAGNOSTIC,as.integer))
#Standardized data
# Min-max scale every numeric column to [0, 1], skipping the first two
# columns (presumably id/label — confirm).  NAs are ignored when computing
# the range.  range() is computed once per column (the original called
# min() twice), and the loop is guarded so a frame with fewer than three
# columns is a no-op.
if (ncol(data) >= 3) {
  for (j in 3:ncol(data)) {
    if (is.numeric(data[[j]])) {
      rng <- range(data[[j]], na.rm = TRUE)
      # NOTE(review): a constant column gives rng[1] == rng[2] and divides
      # by zero (NaN) — same as the original; confirm that is acceptable.
      data[[j]] <- (data[[j]] - rng[1]) / (rng[2] - rng[1])
    }
  }
}
#train_data<-train_data[order(-train_data$L0618,train_data$L0621,train_data$L0444),]
#train_data[4500:5000,L0618]
# Build the CatBoost pool datasets (加载pool数据集)
#Load the CatBoost dataset
# Column 2 is used as the label (assumed to be TYPE — confirm) and is
# excluded from the features.  Only the training pool carries the per-row
# weights (s_weight) computed above.
train_pool <- catboost.load_pool(data=train_data[,-c(2)], label=train_data[,c(2)],weight=s_weight)
test_pool <- catboost.load_pool(data=test_data[,-c(2)], label=test_data[,c(2)])
#help(catboost.load_pool)
# Handle the imbalanced dataset (针对不平衡数据集的处理)
# Class weights for the imbalanced label: class 0 keeps weight 1, class 1
# is up-weighted by the negative/positive count ratio.
n_pos <- sum(train_data$TYPE == 1)
n_neg <- sum(train_data$TYPE == 0)
rescaling <- n_pos / n_neg
#1/rescaling
c_weight <- c(1, 1 / rescaling)
# Parameter settings (参数设置)
#set params&model&predict by diseases
fit_params_yx <- list(iterations = 588, # optimal number of trees
thread_count = 10,
class_weights = c_weight,# per-class weights (imbalance handling)
#feature_weights = f_weight,# per-feature weights
loss_function = 'Logloss', # loss function
#ignored_features = c("L1563","L1564","L1505","L1508","L1502","L1504",
# "L1503","L5004","L1174"), # ignore unimportant columns
border_count = 32,
depth =5, # tree depth
#leaf_estimation_iterations=2,# leaf-estimation iterations
#use_best_model=TRUE,# best-model selection
learning_rate = 0.03,
#eval_metric='Accuracy'# accuracy metric
#random_seed=42,
one_hot_max_size=4,
l2_leaf_reg = 3.5,
nan_mode = 'Min',# missing-value handling mode
#train_dir = 'train_dir',
logging_level = 'Silent' # suppress training log output
)
# Train on train_pool, using test_pool for evaluation.
model_yx<-catboost.train(train_pool,test_pool,fit_params_yx)
#help(catboost.train)
# Prediction results (预测结果)
#2 type of prediction
# Predicted probability of the positive class for each test row.
prediction1 <- catboost.predict(model_yx, test_pool, prediction_type = 'Probability')
cat("Sample predictions: ", sample(prediction1, 5), "\n")
# Hard class predictions for the same rows.
labels1 <- catboost.predict(model_yx, test_pool, prediction_type = 'Class')
cat("Sample labels: ", sample(labels1, 5), "\n")
# Commented out: a bare help() call pops interactive documentation on every
# run (every other help() call in this script is already commented).
#help(catboost.predict)
# Output prediction accuracy (输出训练集预测准确度 — note: computed on test_data below)
#output accuracy&threshold moving
# Positive-class prior among training rows:
# (pos/neg) / (1 + pos/neg) = pos / (pos + neg).
# NOTE(review): this value is only referenced from a commented-out threshold
# line in calc_accuracy below — confirm threshold moving is still intended.
rescaling<-sum(train_data$TYPE == 1)/sum(train_data$TYPE == 0)
rescaling<-rescaling/(1+rescaling)
# Fraction of thresholded predictions that match the expected labels.
#
# prediction: numeric vector of positive-class probabilities.
# expected:   true 0/1 labels, same length as prediction.
# Returns the accuracy in [0, 1].
#
# Bug fix: the original ignored its `prediction` argument entirely and read
# the global `labels1`, so the comparison silently used whatever that
# global held.  The 0.5 threshold (previously commented out) is restored;
# swap in `rescaling` for threshold moving if desired.
calc_accuracy <- function(prediction, expected) {
  #labels <- ifelse(prediction > rescaling, 1, 0)
  labels <- ifelse(prediction > 0.5, 1, 0)
  sum(labels == expected) / length(labels)
}
# Bug fix: the original fused the assignment into a comment
# ("# ??works properly only for Loglossaccuracy <- calc_accuracy(...)"),
# so `accuracy` was never computed and the cat() below errored.
# ??works properly only for Logloss
accuracy <- calc_accuracy(prediction1, test_data[, c(2)])
cat("\nAccuracy: ", accuracy, "\n")
#prediction<-0.5*(prediction1+1/(1+rescaling))
# Save and load the model (模型加载与保存)
#load model&save model
# Persist the trained model to disk in CatBoost's binary format.
catboost.save_model(model_yx,
"D:/model_yx")
getwd()
# Get feature importances (获取各指标重要性)
#get_feature_importance
#help(catboost.get_feature_importance)
# Per-feature importance of the trained model, evaluated on the training
# pool.
im<-catboost.get_feature_importance(model_yx, train_pool)
im<-data.frame(im)
library(dplyr)
# Move row names (feature codes) into a VALUE column.
# NOTE(review): add_rownames() is deprecated in dplyr — consider
# tibble::rownames_to_column(); kept as-is here.
im <- add_rownames(im, "VALUE")
# Keep the 20 most important features, attach readable names from the XMDZ
# lookup (V1 = lab-test code), and re-sort by importance after the merge.
im<-im[order(-im$im),]
im<-im[1:20,]
im<-merge(im,XMDZ,by.x="VALUE",by.y="V1")
im<-im[order(-im$im),]
# Get SHAP values (获取SHAP值)
#get shap importance
# SHAP values on the test pool.
# Bug fix: the original called catboost.get_feature_importance(model_rx, ...)
# but no `model_rx` exists in this script — the trained model is `model_yx`.
im_shap <- catboost.get_feature_importance(model_yx, test_pool, type = 'ShapValues')
im_shap <- data.frame(im_shap)
# NOTE(review): CatBoost's ShapValues output may carry one extra bias
# column beyond the feature columns — confirm these name/rowname
# assignments line up.
names(im_shap) <- colnames(test_data[, -c(2)])
row.names(im_shap) <- rownames(test_data[, -c(2)])
# Restrict to the top-20 important features.  Bug fix: the original
# assigned this to a typo'd name (`im_sharp`) that was never used again.
im_shap_top <- im_shap[, colnames(im_shap) %in% im$VALUE]
table(im_shap$L1600)
#cat("\nTree count: ", model$tree_count, "\n")