前言
这是kaggle上关于Credit Risk的一个建模流程,其中非常有重要参考价值的点在于其衍生变量构造这个板块,值得我们借鉴。
#数据下载地址:https://www.kaggle.com/c/home-credit-default-risk/data
###########建模流程############
#加载需要使用的包
library(tidyverse)
library(xgboost)
library(magrittr)
set.seed(0)
-----------------------------------------------------
getwd() -----#获取当前工作路径
setwd('C:/Users/lenovo/Desktop/HOME credic fraud') -----#设定当前路径为工作路径
cat("Loading data...\n")
tr <- read_csv("application_train.csv") ---#加载train data
te <- read_csv("application_test.csv") ---#加载test data
dim(tr) #train维度
head(tr) #前5行
names(tr) # 列名
dim(te) #test 维度
#管道函数---将数据character/factor,转换成integer类型
bureau <- read_csv("bureau.csv") %>%
mutate_if(is.character, funs(factor(.) %>% as.integer()))
dim(bureau)
head(bureau)
str(bureau)
#factor vars to integer vars
#.meanings all data
cred_card_bal <- read_csv("credit_card_balance.csv") %>%
mutate_if(is.character, funs(factor(.) %>% as.integer()))
pos_cash_bal <- read_csv("POS_CASH_balance.csv") %>%
mutate_if(is.character, funs(factor(.) %>% as.integer()))
prev <- read_csv("previous_application.csv") %>%
mutate_if(is.character, funs(factor(.) %>% as.integer()))
#---------------------------
cat("Preprocessing...\n")
#筛选查看相关数据
subset(bureau, SK_ID_CURR=='100001')
View(subset(avg_bureau, SK_ID_CURR=='100001'))
#使用%$%把左侧的程序的数据集A传递右侧程序的B函数,同时传递数据集A的属性名,
#作为B函数的内部变量方便对A数据集进行处理,最后完成数据计算
avg_bureau <- bureau %>%
group_by(SK_ID_CURR) %>%
summarise_all(funs(mean), na.rm = TRUE) %>%
mutate(buro_count = bureau %>%
group_by(SK_ID_CURR) %>%
count() %$% n)
avg_cred_card_bal <- cred_card_bal %>%
group_by(SK_ID_CURR) %>%
summarise_all(funs(mean), na.rm = TRUE) %>%
mutate(card_count = cred_card_bal %>%
group_by(SK_ID_CURR) %>%
count() %$% n)
avg_pos_cash_bal <- pos_cash_bal %>%
group_by(SK_ID_CURR) %>%
summarise_all(funs(mean), na.rm = TRUE) %>%
mutate(pos_count = pos_cash_bal %>%
group_by(SK_ID_PREV, SK_ID_CURR) %>%
group_by(SK_ID_CURR) %>%
count() %$% n)
avg_prev <- prev %>%
group_by(SK_ID_CURR) %>%
summarise_all(funs(mean), na.rm = TRUE) %>%
mutate(nb_app = prev %>%
group_by(SK_ID_CURR) %>%
count() %$% n)
tri <- 1:nrow(tr)
y <- tr$TARGET
tr_te <- tr %>%
select(-TARGET) %>%
bind_rows(te) %>%
left_join(avg_bureau, by = "SK_ID_CURR") %>%
left_join(avg_cred_card_bal, by = "SK_ID_CURR") %>%
left_join(avg_pos_cash_bal, by = "SK_ID_CURR") %>%
left_join(avg_prev, by = "SK_ID_CURR") %>%
mutate_if(is.character, funs(factor(.) %>% as.integer())) %>%
data.matrix()
rm(tr, te, prev, avg_prev, bureau, avg_bureau, cred_card_bal,
avg_cred_card_bal, pos_cash_bal, avg_pos_cash_bal)
gc()
#---------------------------
cat("Preparing data...\n")
#加载xgboost模块
library(xgboost)
dtest <- xgb.DMatrix(data = tr_te[-tri, ])
tr_te <- tr_te[tri, ]
tri <- caret::createDataPartition(y, p = 0.9, list = F) %>% c()
dtrain <- xgb.DMatrix(data = tr_te[tri, ], label = y[tri])
dval <- xgb.DMatrix(data = tr_te[-tri, ], label = y[-tri])
cols <- colnames(tr_te)
rm(tr_te, y, tri); gc()
#---------------------------
cat("Training model...\n")
###设定 xgboost 相关参数
p <- list(objective = "binary:logistic",
booster = "gbtree",
eval_metric = "auc",
nthread = 8,
eta = 0.025,
max_depth = 6,
min_child_weight = 19,
gamma = 0,
subsample = 0.8,
colsample_bytree = 0.632,
alpha = 0,
lambda = 0.05,
nrounds = 2000)
m_xgb <- xgb.train(p, dtrain, p$nrounds, list(val = dval), print_every_n = 50, early_stopping_rounds = 200)
xgb.importance(cols, model=m_xgb) %>%
xgb.plot.importance(top_n = 30)
#---------------------------
read_csv("sample_submission.csv") %>%
mutate(SK_ID_CURR = as.integer(SK_ID_CURR),
TARGET = predict(m_xgb, dtest)) %>%
write_csv(paste0("tidy_xgb_", round(m_xgb$best_score, 4), ".csv"))
#######################################################
#version2
#加载parallel包
library(parallel)
#detectCores函数可以告诉你你的CPU可使用的核数
clnum<-detectCores()
#设置参与并行的CPU核数目,这里我们使用了所有的CPU核,也就是我们刚才得到的clnum,具体到这个案例,clnum=4
cl <- makeCluster(getOption("cl.cores", clnum));
memory.limit(size=20480)
#---------------------------
getwd()
setwd('C:/Users/lenovo/Desktop/HOME credic fraud')
cat("Loading data...\n")
library(tidyverse)
library(xgboost)
library(magrittr)
set.seed(5)
#---------------------------
cat("Loading data...\n")
bbalance <- read_csv("bureau_balance.csv")
bureau <- read_csv("bureau.csv")
cc_balance <- read_csv("credit_card_balance.csv")
payments <- read_csv("installments_payments.csv")
pc_balance <- read_csv("POS_CASH_balance.csv")
prev <- read_csv("previous_application.csv")
tr <- read_csv("application_train.csv")
te <- read_csv("application_test.csv")
#---------------------------
cat("Preprocessing...\n")
fn <- funs(mean, sd, min, max, sum, n_distinct, .args = list(na.rm = TRUE))
#View(subset(bbalance, SK_ID_BUREAU=='5715448'))
#bbalance[which(bbalance$SK_ID_BUREAU=="5715448"),]
#View(subset(sum_bbalance, SK_ID_BUREAU=='5715448'))
sum_bbalance <- bbalance %>%
mutate_if(is.character, funs(factor(.) %>% as.integer)) %>%
group_by(SK_ID_BUREAU) %>%
summarise_all(fn)
rm(bbalance); gc()
sum_bureau <- bureau %>%
left_join(sum_bbalance, by = "SK_ID_BUREAU") %>%
select(-SK_ID_BUREAU) %>%
mutate_if(is.character, funs(factor(.) %>% as.integer)) %>%
group_by(SK_ID_CURR) %>%
summarise_all(fn)
rm(bureau, sum_bbalance); gc()
sum_cc_balance <- cc_balance %>%
select(-SK_ID_PREV) %>%
mutate_if(is.character, funs(factor(.) %>% as.integer)) %>%
group_by(SK_ID_CURR) %>%
summarise_all(fn)
rm(cc_balance); gc()
sum_payments <- payments %>%
select(-SK_ID_PREV) %>%
mutate(PAYMENT_PERC = AMT_PAYMENT / AMT_INSTALMENT,
PAYMENT_DIFF = AMT_INSTALMENT - AMT_PAYMENT,
DPD = DAYS_ENTRY_PAYMENT - DAYS_INSTALMENT,
DBD = DAYS_INSTALMENT - DAYS_ENTRY_PAYMENT,
DPD = ifelse(DPD > 0, DPD, 0),
DBD = ifelse(DBD > 0, DBD, 0)) %>%
group_by(SK_ID_CURR) %>%
summarise_all(fn)
rm(payments); gc()
sum_pc_balance <- pc_balance %>%
select(-SK_ID_PREV) %>%
mutate_if(is.character, funs(factor(.) %>% as.integer)) %>%
group_by(SK_ID_CURR) %>%
summarise_all(fn)
rm(pc_balance); gc()
sum_prev <- prev %>%
select(-SK_ID_PREV) %>%
mutate_if(is.character, funs(factor(.) %>% as.integer)) %>%
mutate(DAYS_FIRST_DRAWING = ifelse(DAYS_FIRST_DRAWING == 365243, NA, DAYS_FIRST_DRAWING),
DAYS_FIRST_DUE = ifelse(DAYS_FIRST_DUE == 365243, NA, DAYS_FIRST_DUE),
DAYS_LAST_DUE_1ST_VERSION = ifelse(DAYS_LAST_DUE_1ST_VERSION == 365243, NA, DAYS_LAST_DUE_1ST_VERSION),
DAYS_LAST_DUE = ifelse(DAYS_LAST_DUE == 365243, NA, DAYS_LAST_DUE),
DAYS_TERMINATION = ifelse(DAYS_TERMINATION == 365243, NA, DAYS_TERMINATION),
APP_CREDIT_PERC = AMT_APPLICATION / AMT_CREDIT) %>%
group_by(SK_ID_CURR) %>%
summarise_all(fn)
rm(prev); gc()
tri <- 1:nrow(tr)
y <- tr$TARGET
tr_te <- tr %>%
select(-TARGET) %>%
bind_rows(te) %>%
left_join(sum_bureau, by = "SK_ID_CURR") %>%
left_join(sum_cc_balance, by = "SK_ID_CURR") %>%
left_join(sum_payments, by = "SK_ID_CURR") %>%
left_join(sum_pc_balance, by = "SK_ID_CURR") %>%
left_join(sum_prev, by = "SK_ID_CURR") %>%
select(-SK_ID_CURR) %>%
mutate_if(is.character, funs(factor(.) %>% as.integer)) %>%
mutate(na = apply(., 1, function(x) sum(is.na(x))),
DAYS_EMPLOYED = ifelse(DAYS_EMPLOYED == 365243, NA, DAYS_EMPLOYED),
DAYS_EMPLOYED_PERC = sqrt(DAYS_EMPLOYED / DAYS_BIRTH),
INCOME_CREDIT_PERC = AMT_INCOME_TOTAL / AMT_CREDIT,
INCOME_PER_PERSON = log1p(AMT_INCOME_TOTAL / CNT_FAM_MEMBERS),
ANNUITY_INCOME_PERC = sqrt(AMT_ANNUITY / (1 + AMT_INCOME_TOTAL)),
LOAN_INCOME_RATIO = AMT_CREDIT / AMT_INCOME_TOTAL,
ANNUITY_LENGTH = AMT_CREDIT / AMT_ANNUITY,
CHILDREN_RATIO = CNT_CHILDREN / CNT_FAM_MEMBERS,
CREDIT_TO_GOODS_RATIO = AMT_CREDIT / AMT_GOODS_PRICE,
INC_PER_CHLD = AMT_INCOME_TOTAL / (1 + CNT_CHILDREN),
SOURCES_PROD = EXT_SOURCE_1 * EXT_SOURCE_2 * EXT_SOURCE_3,
CAR_TO_BIRTH_RATIO = OWN_CAR_AGE / DAYS_BIRTH,
CAR_TO_EMPLOY_RATIO = OWN_CAR_AGE / DAYS_EMPLOYED,
PHONE_TO_BIRTH_RATIO = DAYS_LAST_PHONE_CHANGE / DAYS_BIRTH,
PHONE_TO_EMPLOY_RATIO = DAYS_LAST_PHONE_CHANGE / DAYS_EMPLOYED)
docs <- str_subset(names(tr), "FLAG_DOC")
live <- str_subset(names(tr), "(?!NFLAG_)(?!FLAG_DOC)(?!_FLAG_)FLAG_")
inc_by_org <- tr_te %>%
group_by(ORGANIZATION_TYPE) %>%
summarise(m = median(AMT_INCOME_TOTAL)) %$%
setNames(as.list(m), ORGANIZATION_TYPE)
rm(tr, te, fn, sum_bureau, sum_cc_balance,
sum_payments, sum_pc_balance, sum_prev); gc()
tr_te %<>%
mutate(DOC_IND_KURT = apply(tr_te[, docs], 1, moments::kurtosis),
LIVE_IND_SUM = apply(tr_te[, live], 1, sum),
NEW_INC_BY_ORG = recode(tr_te$ORGANIZATION_TYPE, !!!inc_by_org),
NEW_EXT_SOURCES_MEAN = apply(tr_te[, c("EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3")], 1, mean),
NEW_SCORES_STD = apply(tr_te[, c("EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3")], 1, sd))%>%
mutate_all(funs(ifelse(is.nan(.), NA, .))) %>% #缺失值判断
mutate_all(funs(ifelse(is.infinite(.), NA, .))) %>% #inf判断
data.matrix()
#---------------------------
cat("Preparing data...\n")
dtest <- xgb.DMatrix(data = tr_te[-tri, ])
tr_te <- tr_te[tri, ]
tri <- caret::createDataPartition(y, p = 0.9, list = F) %>% c()
dtrain <- xgb.DMatrix(data = tr_te[tri, ], label = y[tri])
dval <- xgb.DMatrix(data = tr_te[-tri, ], label = y[-tri])
cols <- colnames(tr_te)
rm(tr_te, y, tri); gc()
#---------------------------
cat("Training model...\n")
p <- list(objective = "binary:logistic",
booster = "gbtree",
eval_metric = "auc",
nthread = 4,
eta = 0.05,
max_depth = 6,
min_child_weight = 30,
gamma = 0,
subsample = 0.85,
colsample_bytree = 0.7,
colsample_bylevel = 0.632,
alpha = 0,
lambda = 0,
nrounds = 2000)
set.seed(5)
m_xgb <- xgb.train(p, dtrain, p$nrounds, list(val = dval), print_every_n = 50, early_stopping_rounds = 300)
xgb.importance(cols, model=m_xgb) %>%
xgb.plot.importance(top_n = 30)
#---------------------------
read_csv("sample_submission.csv") %>%
mutate(SK_ID_CURR = as.integer(SK_ID_CURR),
TARGET = predict(m_xgb, dtest)) %>%
write_csv(paste0("tidy_xgb_", round(m_xgb$best_score, 5), ".csv"))
参考文章:原文链接