Taking advantage of some free time to organize the model code from my graduation thesis. The main algorithms used are XGBoost, SVM, ensemble learning, LR (logistic regression), RF (random forest), NN (neural network), and k-nearest neighbors.
- The initial credit risk assessment indicator system has too many indicators, and the resulting information redundancy makes overfitting likely, so the set is pruned here using XGBoost's feature importance ranking. XGBoost was chosen because its regularization term keeps model complexity in check while speeding up training and improving runtime efficiency.
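As a hedged illustration of that point: the script below trains with default settings, but XGBoost's complexity-control parameters could be set explicitly along these lines (the values are illustrative, not tuned):
params <- list(objective = "binary:logistic",
               eta = 0.3,      # learning rate
               max_depth = 6,  # cap on tree depth
               lambda = 1,     # L2 penalty on leaf weights
               alpha = 0,      # L1 penalty on leaf weights
               gamma = 0.1)    # minimum loss reduction required to split further
# would be used as xgboost(data = dtr, params = params, nrounds = 5), with dtr built below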
library(caret)
library(readxl) # read_excel() below comes from readxl
input39 <- read_excel("C:/Users/Libby/OneDrive/Data for thesis/定稿/input39.xlsx")
sum(is.na(input39))
input39[is.na(input39)]<-0 # fill missing values with 0
scainput<-scale(input39[,2:41]) # standardize
input<-data.frame(input39[,1],scainput)
# Rebalancing: a skewed class distribution in binary classification strongly distorts the classifier, so rebalance the data with SMOTE
library(DMwR)
table(input$Y)
Y<-as.factor(input$Y)
finput<-data.frame(Y,input[,2:41])
binput<-SMOTE(Y~.,finput,perc.over = 200,perc.under = 160)
table(binput$Y)
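A note on the SMOTE settings (per the DMwR documentation): perc.over = 200 creates 2 synthetic cases per original minority case, and perc.under = 160 samples 1.6 majority cases per synthetic minority case, so the classes end up roughly balanced (about 3 : 3.2 minority to majority). A quick sanity check:
prop.table(table(binput$Y)) # class proportions after SMOTE, should be near 0.5/0.5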
#xgboost
library(xgboost)
library(Matrix)
library(Ckmeans.1d.dp)
set.seed(100)
sam<-sample(nrow(binput),nrow(binput)*0.7) # 70/30 train-test split
xtr<-binput[sam,]
xte<-binput[-sam,]
xdtr<-data.matrix(xtr[,-1]) # training-set explanatory-variable matrix
ydtr<-as.numeric(as.character(xtr[,1])) # response variable, converted from factor back to numeric 0/1 for xgb.DMatrix
dtr<-xgb.DMatrix(data = xdtr,label=ydtr)
xdte<-data.matrix(xte[,-1])
ydte<-as.numeric(as.character(xte[,1])) # same factor-to-0/1 conversion for the test set
dte<-xgb.DMatrix(data = xdte,label=ydte)
xgb<-xgboost(data = dtr,nrounds = 5,objective = "binary:logistic") # binary objective, so predict() returns probabilities
importance<-xgb.importance(model = xgb)
xgb.plot.importance(importance)
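nrounds = 5 is fixed by hand above; a minimal sketch of letting xgboost's built-in cross-validation pick it instead (the nfold and early_stopping_rounds values are illustrative choices):
cv <- xgb.cv(data = dtr, nrounds = 100, nfold = 5,
             objective = "binary:logistic",
             early_stopping_rounds = 10, verbose = 0)
cv$best_iteration # candidate value for nrounds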
inpnames<-inputchinese[,-1] # swap in Chinese variable names for plotting (inputchinese is a name table loaded elsewhere)
inp<-xgb.importance(names(inpnames),model = xgb)
xgb.plot.importance(inp)
pre_xgb<-round(predict(xgb,newdata = dte)) # round predicted probabilities at 0.5 into class labels
table(pre_xgb,xte$Y)
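The index vector below was read off the importance plot by hand; a hedged alternative is to derive the kept features from the importance table directly (the 0.01 Gain cutoff is illustrative):
keep <- importance$Feature[importance$Gain > 0.01] # features clearing the cutoff
# xdtr1 <- data.matrix(xtr[, keep]) would then replace the hand-typed indices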
xdtr1<-data.matrix(xtr[,-c(1,6,26,35,17,7,14,13,29,3,5,19,40,10,27,32,30,31,34,4,9,16,18,23,33,38)]) # drop the unimportant variables
dtr1<-xgb.DMatrix(data = xdtr1,label=ydtr)
xdte1<-data.matrix(xte[,-c(1,6,26,35,17,7,14,13,29,3,5,19,40,10,27,32,30,31,34,4,9,16,18,23,33,38)])
dte1<-xgb.DMatrix(data = xdte1,label=ydte)
xgb1<-xgboost(data = dtr1,nrounds = 5,objective = "binary:logistic")
importance1<-xgb.importance(model = xgb1)
xgb.plot.importance(importance1)
pre_xgb1<-round(predict(xgb1,newdata = dte1))
table(pre_xgb1,xte$Y) # compare classification results after dropping the variables
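To make that comparison concrete, accuracy can be read off the two confusion tables (a minimal sketch reusing the predictions above):
acc_full <- sum(diag(table(pre_xgb, xte$Y)))/length(ydte)    # full indicator set
acc_pruned <- sum(diag(table(pre_xgb1, xte$Y)))/length(ydte) # pruned set
c(acc_full, acc_pruned)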
#SVM kernel selection
library(e1071)
library(pROC)
library(ggplot2)
library(foreach)
library(iterators)
library(parallel)
library(doParallel)
library(rpart)
library(rpart.plot)
# new dataset after removing the unimportant variables
svminput<-binput[,-c(6,26,35,17,7,14,13,29,3,5,19,40,10,27,32,30,31,34,4,9,16,18,23,33,38)]
names(svminput)[2:16]<-paste0("X",1:15)
set.seed(100)
sam<-sample(nrow(svminput),nrow(svminput)*0.7)
traindata<-svminput[sam,]
testdata<-svminput[-sam,]
# linear kernel
tsline<-tune.svm(Y~.,data = traindata,kernel="linear",cost = 2^(0:4))
summary(tsline)
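The cost = 1 below is presumably read off summary(tsline); the tuned values can also be pulled from the tune object directly (standard e1071 accessors):
tsline$best.parameters # grid point with the lowest cross-validation error
tsline$best.model      # SVM already refit on the training data at those parameters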
svmline<-svm(Y~.,data = traindata,kernel="linear",cost=1)
preline<-predict(svmline,testdata[,-1])
tableline<-table(preline,testdata$Y)
confusionMatrix(tableline)
lineroc<-roc(testdata$Y,factor(preline,ordered = T))
plot(lineroc,print.auc=T,auc.polygon=T,grid=c(0.2,0.2),grid.col=c("grey","grey"),max.auc.polygon=T,auc.polygon.col="lightgrey",print.thres=T,main="ROC curve: linear kernel")
# polynomial kernel
tspoly<-tune.svm(Y~.,data = traindata,kernel="polynomial",gamma = 2^(-2:2),cost = 2^(-1:4),degree = 2^(0:3))
summary(tspoly)
svmpoly<-svm(Y~.,data = traindata,kernel="polynomial",gamma = 0.5,cost = 4,degree =4)
prepoly<-predict(svmpoly,testdata[,-1])