# Read in the data (读入数据)
# L-name contrast table: maps lab-test codes to readable names.  Read with
# header=FALSE so columns come in as V1, V2, ...; V1 (the code) is used as a
# merge key in the feature-importance section below.
XMDZ<-read.csv(file='C:/Users/13771/Desktop/XMDZ.csv', header=FALSE,sep=",")
# Data union: load the four tumour cohorts (thyroid, breast, pancreatic,
# oesophageal); the first CSV column supplies the row names.
jzx<-read.csv(file='C:/Users/13771/Desktop/2甲状腺肿瘤.csv', row.names = 1,header=TRUE, sep=",")
rx<-read.csv(file='C:/Users/13771/Desktop/3乳腺恶性肿瘤.csv', row.names = 1,header=TRUE, sep=",")
yx<-read.csv(file='C:/Users/13771/Desktop/胰腺肿瘤.csv',row.names = 1, header=TRUE,sep=",")
sg<-read.csv(file='C:/Users/13771/Desktop/食管癌.csv',row.names = 1, header=TRUE, sep=",")
# Stack the cohorts row-wise; rbind() requires all four frames to share the
# same column names.
data<-rbind(jzx,rx,yx,sg)
# Convert column types (列类型转换)
#identify column type
library(do)
# Storage type of each column: "double", "integer", "character" or "logical".
cdv1<-sapply(data, typeof)
table(cdv1)
# do::Replace maps each storage type to a read.csv colClasses value —
# presumably 'old:new' pairs (double/integer -> numeric,
# character/logical -> factor); verify against the do-package docs.
cdv<-Replace(cdv1,pattern=c('double:numeric','character:factor'
,'logical:factor','integer:numeric'))
table(cdv)
# Re-read the data with the specified column classes (以规定的列类型重新读入)
#read data with colclass
# Re-read each cohort with the colClasses vector derived above; the literal
# string '0' is treated as missing (na.strings='0').
# NOTE(review): cdv was computed on `data` AFTER the row-name column was
# dropped, but colClasses applies to every column in the file including the
# row-name one — confirm the vector lengths actually line up.
JZX<-read.csv(file='C:/Users/13771/Desktop/2甲状腺肿瘤.csv',row.names = 1, na.strings='0',header=TRUE,colClasses =cdv, sep=",")
RX<-read.csv(file='C:/Users/13771/Desktop/3乳腺恶性肿瘤.csv',row.names = 1,na.strings='0', header=TRUE, colClasses =cdv, sep=",")
YX<-read.csv(file='C:/Users/13771/Desktop/胰腺肿瘤.csv', row.names = 1,na.strings='0',header=TRUE,colClasses =cdv, sep=",")
SG<-read.csv(file='C:/Users/13771/Desktop/食管癌.csv',row.names = 1,na.strings='0', header=TRUE,colClasses =cdv, sep=",")
data<-rbind(JZX,RX,YX,SG)
#delete column
#data<-subset(data,select=-c(DIAGNOSTIC,CHECKEROPINION,SAMPLETYPE))
# Define predictor and label columns (定义自变量列和因变量列)
#define label
# Binary label: TYPE = 1 when the diagnosis text contains "胰腺" (pancreas),
# 0 otherwise.  grepl() is vectorized, so no per-element pass is needed.
# The original gsub'd matches to "1", then compared the character vector
# against the number 1 (relying on implicit coercion) and finally ran an
# elementwise sapply(as.integer); the net result is identical.
data$TYPE <- as.integer(grepl("胰腺", data$TYPE))
table(data$TYPE)
#transform label type
# Label is already integer here; just report its storage type.
table(typeof(data$TYPE))
# Dimensionality reduction & missing-value imputation — optional, since catboost has its own built-in NA handling (降维&填补缺失值)
#dimension reduction
library(caret)
# Drop near-zero-variance predictors.  The original referenced an undefined
# `data1`, jammed two statements onto one line
# (`data2=data1[,-zerovar]set.seed(100)` — a syntax error), and then
# overwrote `data` with the varImp() result, destroying the dataset before
# the imputation step below.
zerovar <- nearZeroVar(data)
if (length(zerovar) > 0) {
  data <- data[, -zerovar]
}
set.seed(100)
# Regularized random forest to rank feature importance.
# NOTE(review): the original formula was `Class ~ .`, but the label column
# in this script is TYPE — assumed to be a copy-paste slip; confirm.
rrfMod <- train(TYPE ~ ., data = data, method = "RRF")
importance_rrf <- varImp(rrfMod, scale = F)  # kept separate from `data`
#bagging tree: impute missing values via bagged trees
imputation_bag <- preProcess(data, method = 'bagImpute')
data <- predict(imputation_bag, data)
# Sample without replacement to reduce the computational load (采取不放回方式进行抽样,减轻计算压力)
#train data sampling(no replace)
library(sampling)
#help("sample")
# Draw 7100 distinct rows (no replacement), then split 5000 train / 2100
# test.  sample(n, k) is the idiomatic form of sample(1:n, k) and avoids
# the 1:nrow() footgun when a frame is empty.
sample_data <- data[sample(nrow(data), 7100, replace = FALSE), ]
#test_data <- data[sample(1:nrow(data),2102,replace = FALSE),]
train_data <- sample_data[1:5000, ]
test_data <- sample_data[5001:7100, ]
table(test_data$TYPE)
table(train_data$TYPE)
#l0480, --*糖类抗原19-9(西门子)  (CA 19-9, Siemens assay)
#l0580, --*淀粉酶  (amylase)
#l0043, --*总胆红素  (total bilirubin)
# Per-row sample weights: rows where the key marker L0480 is present get
# weight 20, all others weight 1.
# Bug fix: the original while-loop compared against `ctn`, whose definition
# was commented out, so it errored at runtime.  Per the section comment
# ("rows where the key field is non-empty get extra weight") the intent is
# a non-missing check, implemented here with the vectorized is.na().
# NOTE(review): confirm L0480 alone is the intended criterion — the
# commented-out alternative also checked L0043.
s_weight <- ifelse(is.na(train_data$L0480), 1, 20)
table(s_weight)
sum(is.na(data$L0580))
#l0480, --*糖类抗原19-9(西门子)  (CA 19-9, Siemens assay)
#l0580, --*淀粉酶  (amylase)
#l0043, --*总胆红素  (total bilirubin)
# Per-feature weights: key biomarker columns get weight 4, all others 1.
lev <- c("L0480", "L0580", "L0043")
#lev<-c("L0580")
# Vectorized replacement for the original column-index while-loop; %in%
# never returns NA, so every column gets a definite weight.
f_weight <- ifelse(colnames(train_data) %in% lev, 4, 1)
#table(train_data$L0580)
table(f_weight)
# Data standardization — optional, since catboost normalizes internally (数据标准化处理)
#data$TYPE=as.data.frame(lapply(JZX$DIAGNOSTIC,as.integer))
#Standardized data
# Min-max scale every numeric column to [0, 1], skipping the first two
# columns (presumably id/label — confirm).  NAs are ignored when computing
# the range.  range() is computed once per column (the original called
# min() twice), and the loop is guarded so a frame with fewer than three
# columns is a no-op.
if (ncol(data) >= 3) {
  for (j in 3:ncol(data)) {
    if (is.numeric(data[[j]])) {
      rng <- range(data[[j]], na.rm = TRUE)
      # NOTE(review): a constant column gives rng[1] == rng[2] and divides
      # by zero (NaN) — same as the original; confirm that is acceptable.
      data[[j]] <- (data[[j]] - rng[1]) / (rng[2] - rng[1])
    }
  }
}
#train_data<-train_data[order(-train_data$L0618,train_data$L0621,train_data$L0444),]
#train_data[4500:5000,L0618]
# Build the CatBoost pool datasets (加载pool数据集)
#Load the CatBoost dataset
# Column 2 is used as the label (assumed to be TYPE — confirm) and is
# excluded from the features.  Only the training pool carries the per-row
# weights (s_weight) computed above.
train_pool <- catboost.load_pool(data=train_data[,-c(2)], label=train_data[,c(2)],weight=s_weight)
test_pool <- catboost.load_pool(data=test_data[,-c(2)], label=test_data[,c(2)])
#help(catboost.load_pool)
# Handle the imbalanced dataset (针对不平衡数据集的处理)
# Class weights for the imbalanced label: class 0 keeps weight 1, class 1
# is up-weighted by the negative/positive count ratio.
n_pos <- sum(train_data$TYPE == 1)
n_neg <- sum(train_data$TYPE == 0)
rescaling <- n_pos / n_neg
#1/rescaling
c_weight <- c(1, 1 / rescaling)
# Parameter settings (参数设置)
#set params&model&predict by diseases
fit_params_yx <- list(iterations = 588, # optimal number of trees
thread_count = 10,
class_weights = c_weight,# per-class weights (imbalance handling)
#feature_weights = f_weight,# per-feature weights
loss_function = 'Logloss', # loss function
#ignored_features = c("L1563","L1564","L1505","L1508","L1502","L1504",
# "L1503","L5004","L1174"), # ignore unimportant columns
border_count = 32,
depth =5, # tree depth
#leaf_estimation_iterations=2,# leaf-estimation iterations
#use_best_model=TRUE,# best-model selection
learning_rate = 0.03,
#eval_metric='Accuracy'# accuracy metric
#random_seed=42,
one_hot_max_size=4,
l2_leaf_reg = 3.5,
nan_mode = 'Min',# missing-value handling mode
#train_dir = 'train_dir',
logging_level = 'Silent' # suppress training log output
)
# Train on train_pool, using test_pool for evaluation.
model_yx<-catboost.train(train_pool,test_pool,fit_params_yx)
#help(catboost.train)
# Prediction results (预测结果)
#2 type of prediction
# Predicted probability of the positive class for each test row.
prediction1 <- catboost.predict(model_yx, test_pool, prediction_type = 'Probability')
cat("Sample predictions: ", sample(prediction1, 5), "\n")
# Hard class predictions for the same rows.
labels1 <- catboost.predict(model_yx, test_pool, prediction_type = 'Class')
cat("Sample labels: ", sample(labels1, 5), "\n")
# Commented out: a bare help() call pops interactive documentation on every
# run (every other help() call in this script is already commented).
#help(catboost.predict)
# Output prediction accuracy (输出训练集预测准确度 — note: computed on test_data below)
#output accuracy&threshold moving
# Positive-class prior among training rows:
# (pos/neg) / (1 + pos/neg) = pos / (pos + neg).
# NOTE(review): this value is only referenced from a commented-out threshold
# line in calc_accuracy below — confirm threshold moving is still intended.
rescaling<-sum(train_data$TYPE == 1)/sum(train_data$TYPE == 0)
rescaling<-rescaling/(1+rescaling)
# Fraction of thresholded predictions that match the expected labels.
#
# prediction: numeric vector of positive-class probabilities.
# expected:   true 0/1 labels, same length as prediction.
# Returns the accuracy in [0, 1].
#
# Bug fix: the original ignored its `prediction` argument entirely and read
# the global `labels1`, so the comparison silently used whatever that
# global held.  The 0.5 threshold (previously commented out) is restored;
# swap in `rescaling` for threshold moving if desired.
calc_accuracy <- function(prediction, expected) {
  #labels <- ifelse(prediction > rescaling, 1, 0)
  labels <- ifelse(prediction > 0.5, 1, 0)
  sum(labels == expected) / length(labels)
}
# Bug fix: the original fused the assignment into a comment
# ("# ??works properly only for Loglossaccuracy <- calc_accuracy(...)"),
# so `accuracy` was never computed and the cat() below errored.
# ??works properly only for Logloss
accuracy <- calc_accuracy(prediction1, test_data[, c(2)])
cat("\nAccuracy: ", accuracy, "\n")
#prediction<-0.5*(prediction1+1/(1+rescaling))
# Save and load the model (模型加载与保存)
#load model&save model
# Persist the trained model to disk in CatBoost's binary format.
catboost.save_model(model_yx,
"D:/model_yx")
getwd()
# Get feature importances (获取各指标重要性)
#get_feature_importance
#help(catboost.get_feature_importance)
# Per-feature importance of the trained model, evaluated on the training
# pool.
im<-catboost.get_feature_importance(model_yx, train_pool)
im<-data.frame(im)
library(dplyr)
# Move row names (feature codes) into a VALUE column.
# NOTE(review): add_rownames() is deprecated in dplyr — consider
# tibble::rownames_to_column(); kept as-is here.
im <- add_rownames(im, "VALUE")
# Keep the 20 most important features, attach readable names from the XMDZ
# lookup (V1 = lab-test code), and re-sort by importance after the merge.
im<-im[order(-im$im),]
im<-im[1:20,]
im<-merge(im,XMDZ,by.x="VALUE",by.y="V1")
im<-im[order(-im$im),]
# Get SHAP values (获取SHAP值)
#get shap importance
# SHAP values on the test pool.
# Bug fix: the original called catboost.get_feature_importance(model_rx, ...)
# but no `model_rx` exists in this script — the trained model is `model_yx`.
im_shap <- catboost.get_feature_importance(model_yx, test_pool, type = 'ShapValues')
im_shap <- data.frame(im_shap)
# NOTE(review): CatBoost's ShapValues output may carry one extra bias
# column beyond the feature columns — confirm these name/rowname
# assignments line up.
names(im_shap) <- colnames(test_data[, -c(2)])
row.names(im_shap) <- rownames(test_data[, -c(2)])
# Restrict to the top-20 important features.  Bug fix: the original
# assigned this to a typo'd name (`im_sharp`) that was never used again.
im_shap_top <- im_shap[, colnames(im_shap) %in% im$VALUE]
table(im_shap$L1600)
#cat("\nTree count: ", model$tree_count, "\n")