# Source article:
# http://www.itnose.net/detail/6185647.html
#---------------------------------------------------#
#程序说明:类别不平衡问题处理
#---------------------------------------------------#
# Load the data file and the attribute-names file, strip colons and periods
# from the attribute names, and use them as the column names of the data.
hyper <- read.csv(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/hypothyroid.data",
  header = FALSE
)
# The names file is read as tab-separated text and only its first column is
# kept. NOTE: the variable `names` shadows base::names() as a value; calls
# such as names(x) still resolve to the base function.
names <- read.csv(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/hypothyroid.names",
  header = FALSE,
  sep = "\t"
)[[1]]
names <- gsub(pattern = ":|[.]", replacement = "", x = names)
colnames(hyper) <- names
# Rename the first column (originally "hypothyroid, negative") to "target"
# and encode it as 0 for "negative" and 1 for every other value.
colnames(hyper)[1] <- "target"
colnames(hyper)
hyper$target <- ifelse(hyper$target == "negative", 0, 1)
# Inspect the counts and proportions of the two classes.
table(hyper$target)
prop.table(table(hyper$target))
head(hyper, 2)
# The non-numeric columns hold coded text values; convert them to numbers so
# they can be used for modelling.
# Select character columns as well as factors: since R 4.0 read.csv() returns
# character columns by default, so testing is.factor() alone selects nothing
# and the columns would never be converted.
ind <- vapply(
  hyper,
  function(col) is.factor(col) || is.character(col),
  logical(1)
)
hyper[ind] <- lapply(hyper[ind], function(col) {
  # Coded values and their numeric replacement; "?" marks a missing value.
  codes <- c(
    "?" = NA_character_,
    "f" = "0", "t" = "1",
    "n" = "0", "y" = "1",
    "M" = "0", "F" = "1"
  )
  col <- as.character(col)
  known <- col %in% names(codes)
  col[known] <- unname(codes[col[known]])
  # Remaining entries are numeric strings; anything else becomes NA with a
  # coercion warning.
  as.numeric(col)
})
# Replace missing values with the mean of the observed values.
#
# x: an atomic vector or a data frame.
# Returns `x` with every NA replaced by the mean of the non-missing values.
# For a data frame the mean is computed separately for each column; without
# this branch the mean would be pooled over every cell of the data frame and
# the same value written into all columns.
replaceNAWithMean <- function(x) {
  if (is.data.frame(x)) {
    # `x[] <-` keeps the data frame structure while replacing each column.
    x[] <- lapply(x, replaceNAWithMean)
    return(x)
  }
  replace(x, is.na(x), mean(x[!is.na(x)]))
}
# Impute column by column so that each column is filled with its own mean
# rather than one mean pooled over the whole data frame.
hyper[] <- lapply(hyper, replaceNAWithMean)
# Modelling
# Use createDataPartition() from the caret package to split the data randomly
# into two halves.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
set.seed(1234)
# `times` is spelled out in full instead of relying on partial argument
# matching of `time`.
splitIndex <- createDataPartition(
  hyper$target,
  times = 1,
  p = 0.5,
  list = FALSE
)
trainSplit <- hyper[splitIndex, ]
testSplit <- hyper[-splitIndex, ]
# Class proportions in the training half.
prop.table(table(trainSplit$target))
# Fit a "treebag" model from the caret package on the training half with
# 5-fold cross-validation, then predict on the test half.
ctrl <- trainControl(method = "cv", number = 5)
tbmodel <- train(
  target ~ .,
  data = trainSplit,
  method = "treebag",
  trControl = ctrl
)
# Every column except the response is used as a predictor.
predictors <- Filter(function(nm) nm != "target", names(trainSplit))
pred <- predict(tbmodel$finalModel, testSplit[, predictors])
# Evaluate the model: compute the AUC with roc() from the pROC package and
# plot the ROC curve.
library(pROC)
auc <- roc(response = testSplit$target, predictor = pred)
print(auc)
plot(
  auc,
  ylim = c(0, 1),
  print.thres = TRUE,
  main = paste("AUC", round(auc$auc[[1]], 2))
)
# Horizontal reference lines at 1 and 0.
abline(h = 1, col = "blue", lwd = 2)
abline(h = 0, col = "red", lwd = 2)
# Balance the training set with SMOTE.
library(DMwR)
# The response is converted to a factor before calling SMOTE().
trainSplit$target <- as.factor(trainSplit$target)
trainSplit <- SMOTE(
  target ~ .,
  trainSplit,
  perc.over = 100,
  perc.under = 200
)
# Convert back through character so the labels "0"/"1" become 0/1 again.
# as.numeric() applied directly to a factor returns the integer level codes
# (1/2), which would no longer match the 0/1 encoding of testSplit$target.
trainSplit$target <- as.numeric(as.character(trainSplit$target))
# Check the class proportions again to confirm that the negative and positive
# classes are now balanced.
prop.table(table(trainSplit$target))
# Fit the treebag model again, this time on the rebalanced training set, and
# evaluate it on the untouched test half.
tbmodel <- train(
  target ~ .,
  data = trainSplit,
  method = "treebag",
  trControl = ctrl
)
# Every column except the response is used as a predictor.
predictors <- Filter(function(nm) nm != "target", names(trainSplit))
pred <- predict(tbmodel$finalModel, testSplit[, predictors])
auc <- roc(response = testSplit$target, predictor = pred)
print(auc)
plot(
  auc,
  ylim = c(0, 1),
  print.thres = TRUE,
  main = paste("AUC", round(auc$auc[[1]], 2))
)
# Horizontal reference lines at 1 and 0.
abline(h = 1, col = "blue", lwd = 2)
abline(h = 0, col = "red", lwd = 2)