应用的知识
最优分段
主成分分析
贝叶斯网络
套袋算法
Spearman相关系数
数据重命名、连接、聚合等等处理
code
#加载所需的包
library(data.table)
library(dplyr)
library(psych)
library(caret)
library(smbinning)
# Set the working directory that contains the input CSV files.
setwd("D:\\R\\wokingdiretory\\work\\4_29")

# Read the four input tables (ST and ZC, for 2015 and 2016).
# NOTE(review): ST2015 is read with sep = "" (whitespace-delimited) while the
# other three files use sep = "," -- confirm this difference is intentional.
ST2015 <- read.csv("xx.csv", sep = "", fileEncoding = "utf-8")
ST2016 <- read.csv("xxx.csv", sep = ",", fileEncoding = "utf-8")
ZC2015 <- read.csv("xxxx.csv", sep = ",", fileEncoding = "utf-8")
ZC2016 <- read.csv("xxxxx.csv", sep = ",", fileEncoding = "utf-8")

# Inspect every table: first rows, dimensions, and column structure.
head(ST2015)
dim(ST2015)
str(ST2015)

head(ST2016)
dim(ST2016)
str(ST2016)

head(ZC2015)
dim(ZC2015)
str(ZC2015)

head(ZC2016)
dim(ZC2016)
str(ZC2016)
################################### Data integration ###################################
# Convert every input to a data.table so `:=` can add the label column.
ST2015 <- as.data.table(ST2015)
ST2016 <- as.data.table(ST2016)
ZC2015 <- as.data.table(ZC2015)
ZC2016 <- as.data.table(ZC2016)

# Tag each row with the table it came from ("ST" or "ZC").
ST2015 <- ST2015[, ClassName := "ST"]
ZC2015 <- ZC2015[, ClassName := "ZC"]
ST2016 <- ST2016[, ClassName := "ST"]
ZC2016 <- ZC2016[, ClassName := "ZC"]

# Stack the tables: first within each year, then across years.
dt_2015 <- rbind(ST2015, ZC2015)
dt_2016 <- rbind(ST2016, ZC2016)
data <- rbind(dt_2015, dt_2016)

# Rename the columns: split every name on "_" and discard the odd-positioned
# pieces, then re-append "ClassName" (which contains no "_").
# NOTE(review): this only lines up when every original column name contains
# exactly one "_"; names with zero or several underscores would misalign the
# result -- confirm against the input headers.
total_name <- names(data)
split_name <- unlist(strsplit(names(data), "_"))
dis_list <- seq(1, length(split_name), 2)
names(data) <- c(split_name[-dis_list], "ClassName")
data <- as.data.frame(data)

# Coerce columns 4 .. (last - 1) to numeric.
# setdiff() yields an empty index set when there are fewer than 5 columns,
# whereas 4:(ncol(data) - 1) would count backwards.
feature_cols <- setdiff(seq_len(ncol(data) - 1L), seq_len(3L))
for (i in feature_cols) {
  column <- data[[i]]
  # as.numeric() on a factor returns its integer level codes, not the values
  # it displays, so factors are converted through character first.
  if (is.factor(column)) {
    column <- as.character(column)
  }
  data[[i]] <- as.numeric(column)
}
str(data)
# Percentage of missing values in every column of data.
# colSums() over is.na() computes all columns at once instead of growing a
# vector inside a 1:ncol(data) loop.
num_na <- unname(colSums(is.na(data))) / nrow(data) * 100

# Positions of the columns with more than 25% missing values.
xuhao <- which(num_na > 25)
names(data)[xuhao]

# Negative indexing with an empty index vector selects zero columns, so only
# drop when at least one column was flagged. drop = FALSE keeps data a
# data.frame even when a single column survives.
if (length(xuhao) > 0) {
  data <- data[, -xuhao, drop = FALSE]
}
dim(data)
################################ Spearman correlation test
# Recode the class label in place ("ZC" -> 1, "ST" -> 0), then make the
# column numeric.
data$ClassName[which(data$ClassName == "ZC")] <- 1
data$ClassName[which(data$ClassName == "ST")] <- 0
data$ClassName <- as.numeric(data$ClassName)

# Spearman correlation of every column from 4 to (last - 1) against the last
# column (the recoded ClassName).
spear_result <- corr.test(
  data[4:(ncol(data) - 1)],
  data[ncol(data)],
  method = "spearman"
)
spear_result$r
spear_result$p

# Keep the variables whose absolute correlation with the label exceeds 0.1.
save_var_list <- which(abs(spear_result$r) > 0.1)

# Build the result frames with explicit row names. Indexing the one-column
# matrices with `[rows, ]` drops dimensions, and with exactly one selected
# variable the variable name would be lost.
selected_vars <- rownames(spear_result$r)[save_var_list]
if (is.null(selected_vars) || anyDuplicated(selected_vars) > 0) {
  # Fall back to automatic row names when the names cannot serve as row names.
  selected_vars <- NULL
}
temp_dt_r <- data.frame(
  temp_dt_r = unname(spear_result$r[save_var_list, 1]),
  row.names = selected_vars
)
temp_dt_p <- data.frame(
  temp_dt_p = unname(spear_result$p[save_var_list, 1]),
  row.names = selected_vars
)

# One row per selected variable: correlation and p-value side by side.
temp_dt <- data.frame(temp_dt_r, temp_dt_p)
temp_name <- row.names(temp_dt)
temp_dt &l