# 加载相关包并应用
install.packages("xlsx")
install.packages("ggplot2")
install.packages("gridExtra")
install.packages("dplyr")
library(xlsx)
library(ggplot2)
library(gridExtra)
library(dplyr)
# 1.读取数据bankloan
# 数据预处理:调整数据类型,将年龄、工龄分组
# 年龄分组:每隔10岁为一组;
# 工龄分组的分割点:0, 1, 3, 5, 10, 15, 20, 30, 40
# Load the first worksheet of the bankloan workbook.
# NOTE: the path is machine-specific; point it at your local copy of the file.
bankloan <- read.xlsx(
  file = "E:\\bankloan.xlsx",
  sheetIndex = 1
)

# Bin age into 10-year groups. Intervals are closed on the left
# (`right = FALSE`) so the labels "0-9", "10-19", ... describe the bins
# exactly, and the breaks run to 60 so customers aged 50 and over are not
# silently turned into NA.
bankloan$age_group <- cut(
  bankloan$年龄,
  breaks = seq(0, 60, by = 10),
  right = FALSE,
  labels = c("0-9", "10-19", "20-29", "30-39", "40-49", "50-59")
)

# Bin seniority at the cut points 0, 1, 3, 5, 10, 15, 20, 30, 40.
# `include.lowest = TRUE` keeps customers with 0 years of seniority in the
# first bin; with the default they fall outside (0, 1] and become NA.
bankloan$seniority_group <- cut(
  bankloan$工龄,
  breaks = c(0, 1, 3, 5, 10, 15, 20, 30, 40),
  include.lowest = TRUE
)
# 注:请将上面 read.xlsx 中的文件路径改为自己电脑中 bankloan.xlsx 所在的路径。
# 2.绘制违约与不违约客户的年龄的条形图
# (1表示违约,0表示不违约)
# Bar chart of age by default status (1 = default, 0 = no default).
# The layers have to be chained with `+`: without it `p1` is only the empty
# ggplot canvas and the geom_bar()/labs() calls are evaluated and discarded.
p1 <- ggplot(data = bankloan, mapping = aes(x = 违约, y = 年龄)) +
  geom_bar(
    stat = "identity",
    color = "black",
    fill = "steelblue"
  ) +
  labs(x = "")
# Same bar chart with the default-status categories ordered by descending
# age, value labels, and a dashed red reference line at the overall mean age.
p2 <- ggplot(
  data = bankloan,
  mapping = aes(x = reorder(违约, -年龄), y = 年龄)
) +
  geom_bar(stat = "identity", color = "black", fill = "steelblue") +
  labs(x = "") +
  # Inherit x/y from the plot mapping so the labels sit on the same
  # (reordered, discrete) positions as the bars. Re-mapping x to the raw
  # numeric column would place them on different axis positions.
  # `vjust` is a constant, so it is set outside aes().
  geom_text(mapping = aes(label = 年龄), vjust = -0.2) +
  geom_hline(
    yintercept = mean(bankloan$年龄, na.rm = TRUE),
    color = "red",
    lty = "dashed"
  )

# Show the plain and the annotated chart side by side.
grid.arrange(p1, p2, ncol = 2)
# 3.绘制不同年龄、教育和工龄的客户收入的直方图
# Small helper frame holding the age-group boundaries; the two-element
# `default` vector is recycled across the six rows.
new_bankloan <- data.frame(
  age_group = c(0, 10, 20, 30, 40, 50),
  default = c("0", "1")
)

# Count histogram of the age-group values.
p3 <- ggplot(data = new_bankloan, mapping = aes(x = age_group)) +
  geom_histogram(color = "black", fill = "steelblue", binwidth = 5)

# Grid of x values and the matching normal density (same mean/sd as the
# data) used for the dashed reference curve below.
# `length.out` is spelled out instead of relying on partial matching.
X <- seq(
  from = min(new_bankloan$age_group),
  to = max(new_bankloan$age_group),
  length.out = 10000
)
X_dnorm <- dnorm(
  x = X,
  mean = mean(new_bankloan$age_group),
  sd = sd(new_bankloan$age_group)
)

# Density-scaled histogram with a kernel density estimate (red) and the
# fitted normal curve (black, dashed). after_stat(density) replaces the
# deprecated `..density..` notation.
p4 <- ggplot(
  data = new_bankloan,
  mapping = aes(x = age_group, y = after_stat(density))
) +
  geom_histogram(color = "black", fill = "steelblue", binwidth = 5) +
  geom_density(color = "red", lwd = 1) +
  geom_line(
    mapping = aes(x = X, y = X_dnorm),
    data = data.frame(X, X_dnorm),
    color = "black",
    lwd = 1,
    lty = 2
  )
plot(p4)
# Count histogram of education level.
# Education is a small integer code, so one bin per code (binwidth = 1) is
# used; a bin width of 5 would collapse the distribution into one or two bars.
p5 <- ggplot(data = bankloan, mapping = aes(x = 受教育程度)) +
  geom_histogram(color = "black", fill = "steelblue", binwidth = 1)

# Grid of x values and the matching normal density (same mean/sd as the
# data) used for the dashed reference curve below.
# `length.out` is spelled out instead of relying on partial matching.
X <- seq(
  from = min(bankloan$受教育程度),
  to = max(bankloan$受教育程度),
  length.out = 10000
)
X_dnorm <- dnorm(
  x = X,
  mean = mean(bankloan$受教育程度),
  sd = sd(bankloan$受教育程度)
)

# Density-scaled histogram with a kernel density estimate (red) and the
# fitted normal curve (black, dashed). after_stat(density) replaces the
# deprecated `..density..` notation.
p6 <- ggplot(
  data = bankloan,
  mapping = aes(x = 受教育程度, y = after_stat(density))
) +
  geom_histogram(color = "black", fill = "steelblue", binwidth = 1) +
  geom_density(color = "red", lwd = 1) +
  geom_line(
    mapping = aes(x = X, y = X_dnorm),
    data = data.frame(X, X_dnorm),
    color = "black",
    lwd = 1,
    lty = 2
  )
plot(p6)
# Count histogram of seniority in 5-year bins.
p7 <- ggplot(data = bankloan, mapping = aes(x = 工龄)) +
  geom_histogram(color = "black", fill = "steelblue", binwidth = 5)

# Grid of x values and the matching normal density (same mean/sd as the
# data) used for the dashed reference curve below.
# `length.out` is spelled out instead of relying on partial matching.
X <- seq(
  from = min(bankloan$工龄),
  to = max(bankloan$工龄),
  length.out = 10000
)
X_dnorm <- dnorm(
  x = X,
  mean = mean(bankloan$工龄),
  sd = sd(bankloan$工龄)
)

# Density-scaled histogram with a kernel density estimate (red) and the
# fitted normal curve (black, dashed). after_stat(density) replaces the
# deprecated `..density..` notation.
p8 <- ggplot(
  data = bankloan,
  mapping = aes(x = 工龄, y = after_stat(density))
) +
  geom_histogram(color = "black", fill = "steelblue", binwidth = 5) +
  geom_density(color = "red", lwd = 1) +
  geom_line(
    mapping = aes(x = X, y = X_dnorm),
    data = data.frame(X, X_dnorm),
    color = "black",
    lwd = 1,
    lty = 2
  )
plot(p8)
# 4.绘制不同年龄、教育和工龄的客户收入的核密度图
# Income against age: semi-transparent scatter with 2-D kernel density
# contours on top. `h` is the bandwidth for the x and y directions and `n`
# the number of grid points per direction.
p9 <- ggplot(data = bankloan, mapping = aes(x = 年龄, y = 收入)) +
  geom_point(alpha = 0.3, color = "steelblue") +
  geom_density_2d(
    color = "black",
    lwd = 1,
    h = c(0.4, 0.6),
    n = 300
  ) +
  # "none" is the supported way to suppress a legend; passing FALSE is
  # deprecated.
  guides(color = "none")
plot(p9)
# Income against education level: semi-transparent scatter with 2-D kernel
# density contours on top. `h` is the bandwidth for the x and y directions
# and `n` the number of grid points per direction.
p10 <- ggplot(data = bankloan, mapping = aes(x = 受教育程度, y = 收入)) +
  geom_point(alpha = 0.3, color = "steelblue") +
  geom_density_2d(
    color = "black",
    lwd = 1,
    h = c(0.4, 0.6),
    n = 300
  ) +
  # "none" is the supported way to suppress a legend; passing FALSE is
  # deprecated.
  guides(color = "none")
plot(p10)
# Income against seniority: semi-transparent scatter with 2-D kernel density
# contours on top. `h` is the bandwidth for the x and y directions and `n`
# the number of grid points per direction.
p11 <- ggplot(data = bankloan, mapping = aes(x = 工龄, y = 收入)) +
  geom_point(alpha = 0.3, color = "steelblue") +
  geom_density_2d(
    color = "black",
    lwd = 1,
    h = c(0.4, 0.6),
    n = 300
  ) +
  # "none" is the supported way to suppress a legend; passing FALSE is
  # deprecated.
  guides(color = "none")
plot(p11)
# 5.绘制不同年龄、教育和工龄的客户收入的箱线图
# Box plot of income for every distinct age, with the per-age mean drawn as
# a black point and outliers drawn as red triangles.
p12 <- ggplot(
  data = bankloan,
  mapping = aes(x = factor(年龄), y = 收入)
) +
  geom_boxplot(
    fill = "gray",
    color = "steelblue",
    outlier.fill = "red",
    outlier.color = "red",
    outlier.shape = 24
  ) +
  # `fun` replaces the deprecated `fun.y` argument.
  stat_summary(fun = "mean", geom = "point", colour = "black") +
  # The x axis shows age, so it is labelled as age (it was previously
  # labelled as seniority).
  labs(x = "年龄")
plot(p12)
# Box plot of income for every education level, with the per-level mean
# drawn as a black point and outliers drawn as red triangles.
p13 <- ggplot(
  data = bankloan,
  mapping = aes(x = factor(受教育程度), y = 收入)
) +
  geom_boxplot(
    fill = "gray",
    color = "steelblue",
    outlier.fill = "red",
    outlier.color = "red",
    outlier.shape = 24
  ) +
  # `fun` replaces the deprecated `fun.y` argument.
  stat_summary(fun = "mean", geom = "point", colour = "black") +
  labs(x = "受教育程度")
plot(p13)
# Box plot of income for every distinct seniority value, with the per-value
# mean drawn as a black point and outliers drawn as red triangles.
p14 <- ggplot(
  data = bankloan,
  mapping = aes(x = factor(工龄), y = 收入)
) +
  geom_boxplot(
    fill = "gray",
    color = "steelblue",
    outlier.fill = "red",
    outlier.color = "red",
    outlier.shape = 24
  ) +
  # `fun` replaces the deprecated `fun.y` argument.
  stat_summary(fun = "mean", geom = "point", colour = "black") +
  labs(x = "工龄")
plot(p14)
# 6.根据客户的年龄、教育和工龄对客户收入分组,绘制不同年龄、教育和工龄的客户收入的小提琴图
# Age in 10-year groups. Intervals are closed on the left (`right = FALSE`)
# so the labels "0-9", "10-19", ... describe the bins exactly, and the breaks
# run to 60 so customers aged 50 and over are not turned into NA.
bankloan$age_group <- cut(
  bankloan$年龄,
  breaks = seq(0, 60, by = 10),
  right = FALSE,
  labels = c("0-9", "10-19", "20-29", "30-39", "40-49", "50-59")
)

# Seniority at the cut points 0, 1, 3, 5, 10, 15, 20, 30, 40.
# `include.lowest = TRUE` keeps customers with 0 years of seniority in the
# first bin; with the default they fall outside (0, 1] and become NA.
bankloan$seniority_group <- cut(
  bankloan$工龄,
  breaks = c(0, 1, 3, 5, 10, 15, 20, 30, 40),
  include.lowest = TRUE
)

# Education level is already a discrete code, so it is used as a factor
# directly. Cutting at c(1, 2, 3, 4) dropped code 1 (outside (1, 2]) and
# every code above 4 to NA.
bankloan$education_group <- factor(bankloan$受教育程度)
# Violin plot of income per age group (violin area scaled by group size),
# overlaid with a narrow box plot and an orange diamond at the group mean.
p15 <- ggplot(
  data = bankloan,
  mapping = aes(x = age_group, y = 收入)
) +
  geom_violin(fill = "steelblue", scale = "count") +
  geom_boxplot(width = 0.2, outlier.color = "red") +
  # `fun` replaces the deprecated `fun.y` argument.
  stat_summary(
    fun = "mean",
    geom = "point",
    size = 3,
    shape = 18,
    colour = "orange"
  ) +
  labs(x = "年龄")
plot(p15)
# Violin plot of income per education group (violin area scaled by group
# size), overlaid with a narrow box plot and an orange diamond at the group
# mean.
p16 <- ggplot(
  data = bankloan,
  mapping = aes(x = education_group, y = 收入)
) +
  geom_violin(fill = "steelblue", scale = "count") +
  geom_boxplot(width = 0.2, outlier.color = "red") +
  # `fun` replaces the deprecated `fun.y` argument.
  stat_summary(
    fun = "mean",
    geom = "point",
    size = 3,
    shape = 18,
    colour = "orange"
  ) +
  labs(x = "教育")
plot(p16)
# Violin plot of income per seniority group (violin area scaled by group
# size), overlaid with a narrow box plot and an orange diamond at the group
# mean.
p17 <- ggplot(
  data = bankloan,
  mapping = aes(x = seniority_group, y = 收入)
) +
  geom_violin(fill = "steelblue", scale = "count") +
  geom_boxplot(width = 0.2, outlier.color = "red") +
  # `fun` replaces the deprecated `fun.y` argument.
  stat_summary(
    fun = "mean",
    geom = "point",
    size = 3,
    shape = 18,
    colour = "orange"
  ) +
  labs(x = "工龄")
plot(p17)
# 7.绘制不同年龄、教育和工龄下客户的收入与负债的散点图
# Income against debt ratio, with colour and point shape both encoding the
# age group.
p18 <- ggplot(
  data = bankloan,
  mapping = aes(
    x = 收入,
    y = 负债率,
    color = age_group,
    shape = age_group
  )
) +
  geom_point() +
  # An empty string is not a valid colour, so concrete colours are supplied.
  # The vector is unnamed on purpose: values are then assigned to the factor
  # levels in order, whatever the level labels are. Swap in your own palette
  # as needed.
  scale_color_manual(
    values = c(
      "steelblue", "orange", "forestgreen",
      "firebrick", "purple", "gray40"
    )
  ) +
  scale_shape_manual(values = 15:20) +
  theme(
    legend.title = element_blank(),
    legend.background = element_blank()
  ) +
  guides(color = guide_legend(nrow = 1))
plot(p18)
# Income against debt ratio, with colour and point shape both encoding the
# education group.
p19 <- ggplot(
  data = bankloan,
  mapping = aes(
    x = 收入,
    y = 负债率,
    color = education_group,
    # Shape follows the same variable as colour. It was previously mapped to
    # age_group, for which the four shapes supplied here were too few.
    shape = education_group
  )
) +
  geom_point() +
  # An empty string is not a valid colour, so concrete colours are supplied.
  # The vector is unnamed on purpose: values are assigned to the factor
  # levels in order. One spare entry is included in case the data holds a
  # fifth education code -- adjust to the actual number of levels.
  scale_color_manual(
    values = c("steelblue", "orange", "forestgreen", "firebrick", "purple")
  ) +
  # The trailing `+` matters: without it the theme()/guides() calls below
  # are never attached to the plot.
  scale_shape_manual(values = 15:19) +
  theme(
    legend.title = element_blank(),
    legend.background = element_blank()
  ) +
  guides(color = guide_legend(nrow = 1))
plot(p19)
# Income against debt ratio, with colour and point shape both encoding the
# seniority group. This is the third plot of the age / education / seniority
# series; it previously repeated the age grouping and carried a meaningless
# `age_group = age_group` aesthetic.
p20 <- ggplot(
  data = bankloan,
  mapping = aes(
    x = 收入,
    y = 负债率,
    color = seniority_group,
    shape = seniority_group
  )
) +
  geom_point() +
  # An empty string is not a valid colour, so concrete colours are supplied.
  # The vector is unnamed on purpose: values are assigned to the factor
  # levels in order. Eight entries cover the eight seniority bins produced
  # by the nine cut points; swap in your own palette as needed.
  scale_color_manual(
    values = c(
      "steelblue", "orange", "forestgreen", "firebrick",
      "purple", "gray40", "goldenrod", "turquoise4"
    )
  ) +
  # The trailing `+` matters: without it the theme()/guides() calls below
  # are never attached to the plot.
  scale_shape_manual(values = c(15:20, 3, 4)) +
  theme(
    legend.title = element_blank(),
    legend.background = element_blank()
  ) +
  guides(color = guide_legend(nrow = 1))
plot(p20)
# 注:其中scale_color_manual函数中的颜色可根据需要自定义。