在实验室搬砖之后,继续我们的kaggle数据分析之旅,这次数据也是答主在kaggle上选择的比较火的一份关于人力资源的数据集,关注点在于员工离职的分析和预测,依然还是从数据读取,数据预处理,EDA和机器学习建模这几个部分开始进行,***使用集成学习中比较火的random forest算法来预测离职情况。
数据读取
setwd("E:/kaggle/human resource")
library(data.table)
library(plotly)
library(corrplot)
library(randomForest)
library(pROC)
library(tidyverse)
library(caret)
hr
glimpse(hr)
sapply(hr,function(x){sum(is.na(x))})
————————————————————————————————————————————————————————————————————————————————————
Observations: 14,999
Variables: 10
$ satisfaction_level 0.38, 0.80, 0.11, 0.72, 0.37, 0.41, 0.10, 0.92, 0.89, 0.42, 0.45, 0.11, 0.84, 0.41, 0.36, 0.38, 0.45, 0.78, 0.45, 0.76, 0.11, 0.3...
$ last_evaluation 0.53, 0.86, 0.88, 0.87, 0.52, 0.50, 0.77, 0.85, 1.00, 0.53, 0.54, 0.81, 0.92, 0.55, 0.56, 0.54, 0.47, 0.99, 0.51, 0.89, 0.83, 0.5...
$ number_project 2, 5, 7, 5, 2, 2, 6, 5, 5, 2, 2, 6, 4, 2, 2, 2, 2, 4, 2, 5, 6, 2, 6, 2, 2, 5, 4, 2, 2, 2, 6, 2, 2, 2, 4, 6, 2, 2, 6, 2, 5, 2, 2, ...
$ average_montly_hours 157, 262, 272, 223, 159, 153, 247, 259, 224, 142, 135, 305, 234, 148, 137, 143, 160, 255, 160, 262, 282, 147, 304, 139, 158, 242,...
$ time_spend_company 3, 6, 4, 5, 3, 3, 4, 5, 5, 3, 3, 4, 5, 3, 3, 3, 3, 6, 3, 5, 4, 3, 4, 3, 3, 5, 5, 3, 3, 3, 4, 3, 3, 3, 6, 4, 3, 3, 4, 3, 5, 3, 3, ...
$ Work_accident 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...<