R语言手册(第三站 探索性数据分析)
标签: R语言
1.读入churn数据集
# Load the churn data set; character columns are read in as factors.
churn <- read.csv(file = "C:/.../churn.txt", stringsAsFactors = TRUE)
# Show the first 10 records
head(churn, 10)
# Summarise the churn variable (level counts, because Churn is a factor)
sum.churn <- summary(churn$Churn)
sum.churn
# Proportion of churners: the mean of a logical vector is the share of TRUEs
prop.churn <- mean(churn$Churn == "True")
prop.churn
2.流失变量的条形图
# Bar chart of the churner / non-churner counts held in sum.churn.
barplot(sum.churn,
        ylim = c(0, 3000),
        main = "Bar Graph of Churners and Non-Churners",
        col = "lightblue")
# Draw a solid black frame around the plot region (line type is `lty`).
box(which = "plot", lty = "solid", col = "black")
输出:
3.为客户流失和国际套餐的计数建表
# Two-way contingency table: rows are Churn, columns are International Plan.
counts <- table(churn$Churn,
                churn$Int.l.Plan,
                dnn = c("Churn", "International Plan"))
counts
输出:
4.叠加柱状图
# Stacked bar chart: one bar per international-plan level (columns of counts),
# split by churn status (rows of counts).
barplot(counts,
        legend.text = rownames(counts),
        col = c("blue", "red"),
        ylim = c(0, 3300),
        ylab = "Count",
        xlab = "International Plan",
        main = "Comparison Bar Chart: Churn Proportions by International Plan")
# Draw a solid black frame around the plot region (line type is `lty`).
box(which = "plot", lty = "solid", col = "black")
输出:
5.创建两个变量的汇总表
# Append row and column totals to the churn-by-plan contingency table.
sumtable <- addmargins(A = counts, FUN = sum)
sumtable
6.创建分行比例表
输出:
# Row percentages: within each row of counts (a churn level), the share of
# each international-plan level, rounded to two decimals of a percent.
row.margin <- round(prop.table(counts, margin = 1), 4) * 100
row.margin
7.创建分列的比例表
输出:
# Column percentages: within each column of counts (an international-plan
# level), the share of each churn level, rounded to two decimals of a percent.
col.margin <- round(prop.table(counts, margin = 2), 4) * 100
col.margin
输出:
8.带有图例的聚类条形图
# Clustered (side-by-side) bar chart: one group per churn level, one bar per
# international-plan level within each group.
barplot(t(counts),
        col = c("blue", "green"),
        ylim = c(0, 3300),
        ylab = "Counts",
        xlab = "Churn",
        main = "International Plan Count by Churn",
        beside = TRUE)
# With beside = TRUE the bars are coloured by the rows of t(counts), i.e. the
# international-plan levels, so the legend labels must be colnames(counts).
legend("topright",
       legend = colnames(counts),
       col = c("blue", "green"),
       pch = 15,
       title = "Intl Plan")
# Draw a solid black frame around the plot region (line type is `lty`).
box(which = "plot", lty = "solid", col = "black")
输出:
9.客户服务呼叫的非覆盖直方图
# Plain (non-overlaid) histogram of the number of customer service calls.
hist(churn$CustServ.Calls,
     xlim = c(0, 10),
     col = "lightblue",
     ylab = "Count",
     xlab = "Customer Service Calls",
     main = "Histogram of Customer Service Calls")
输出:
10.下载并安装R包ggplot2
install.packages("ggplot2")
# Pick any CRAN mirror
# (see example image)
# Open the new package
library(ggplot2)
11.覆盖条形图
# Overlaid bar chart: customer service calls on the x axis, bars stacked by
# churn status. Columns are referenced through `data = churn` rather than
# `churn$...` inside aes().
# NOTE(review): with position = "stack" the y axis shows counts, yet the axis
# label says "Percent" - confirm the intended label.
ggplot() +
  geom_bar(data = churn,
           aes(x = factor(CustServ.Calls),
               fill = factor(Churn)),
           position = "stack") +
  scale_x_discrete("Customer Service Calls") +
  scale_y_continuous("Percent") +
  guides(fill = guide_legend(title = "Churn")) +
  scale_fill_manual(values = c("blue", "red"))
输出:
# Normalised overlaid bar chart: position = "fill" rescales every bar to the
# same height, so the y axis shows the share of each churn level per number of
# customer service calls.
ggplot() +
  geom_bar(data = churn,
           aes(x = factor(CustServ.Calls),
               fill = factor(Churn)),
           position = "fill") +
  scale_x_discrete("Customer Service Calls") +
  scale_y_continuous("Percent") +
  guides(fill = guide_legend(title = "Churn")) +
  scale_fill_manual(values = c("blue", "red"))
输出:
12.国际电话量的两样本t-检验
# Split the data by churn status
churn.false <- subset(churn, Churn == "False")
churn.true <- subset(churn, Churn == "True")
# Run the two-sample t-test on international calls (non-churners vs churners)
t.test(churn.false$Intl.Calls, churn.true$Intl.Calls)
输出:
13.傍晚使用时长和白天使用时长的散点图,将客户流失着色
# Scatterplot of evening minutes (x) against day minutes (y); churners are
# drawn in red, non-churners in blue.
plot(churn$Eve.Mins,
     churn$Day.Mins,
     xlim = c(0, 400),
     ylim = c(0, 400),
     xlab = "Evening Minutes",
     ylab = "Day Minutes",
     main = "Scatterplot of Day and Evening Minutes by Churn",
     # `==` compares; a single `=` here would be parsed as a named argument.
     col = ifelse(churn$Churn == "True", "red", "blue"))
legend("topright",
       c("True", "False"),
       col = c("red", "blue"),
       pch = 1,
       title = "Churn")
输出:
14.白天使用时长和客户服务电话量的散点图,将客户流失着色
# Scatterplot of day minutes (x) against customer service calls (y); churners
# are red with plotting symbol 16, non-churners blue with plotting symbol 20.
plot(churn$Day.Mins,
     churn$CustServ.Calls,
     xlim = c(0, 400),
     xlab = "Day Minutes",
     ylab = "Customer Service Calls",
     main = "Scatterplot of Day Minutes and Customer Service Calls by Churn",
     # `==` compares; a single `=` here would be parsed as a named argument.
     col = ifelse(churn$Churn == "True", "red", "blue"),
     pch = ifelse(churn$Churn == "True", 16, 20))
legend("topright",
       c("True", "False"),
       col = c("red", "blue"),
       pch = c(16, 20),
       title = "Churn")
输出:
15.散点图矩阵
# Scatterplot matrix of the three daytime variables. The one-sided formula is
# required: without the leading `~` the three columns would simply be added
# together into a single vector.
pairs(~ Day.Mins + Day.Calls + Day.Charge, data = churn)
输出:
16.白天费用和白天使用时长的回归分析
# Simple linear regression of day charge on day minutes.
fit <- lm(Day.Charge ~ Day.Mins, data = churn)
summary(fit)
输出:
16.相关值和p值
# Bind the three daytime variables into a matrix; the columns are named so
# that the correlation matrix printed below is labelled.
days <- cbind(Day.Mins = churn$Day.Mins,
              Day.Calls = churn$Day.Calls,
              Day.Charge = churn$Day.Charge)
# Pairwise correlation tests
MinsCallsTest <- cor.test(churn$Day.Mins, churn$Day.Calls)
MinsChargeTest <- cor.test(churn$Day.Mins, churn$Day.Charge)
CallsChargeTest <- cor.test(churn$Day.Calls, churn$Day.Charge)
# Correlation matrix, followed by the p-value of each pairwise test
round(cor(days), 4)
MinsCallsTest$p.value
MinsChargeTest$p.value
CallsChargeTest$p.value
输出:
17.相关值和p值以矩阵形式展现
# Collect the variables of interest (columns named so the output is labelled)
corrdata <- cbind(Account.Length = churn$Account.Length,
                  VMail.Message = churn$VMail.Message,
                  Day.Mins = churn$Day.Mins,
                  Day.Calls = churn$Day.Calls,
                  CustServ.Calls = churn$CustServ.Calls)
# Declare the p-value matrix (square, one row/column per variable)
n.vars <- ncol(corrdata)
corrpvalues <- matrix(0, nrow = n.vars, ncol = n.vars,
                      dimnames = list(colnames(corrdata), colnames(corrdata)))
# Fill the matrix with the p-values of the pairwise correlation tests.
# Only pairs with i < j are tested; each result is mirrored into [j, i] and
# the diagonal keeps its initial 0.
for (i in seq_len(n.vars - 1)) {
  for (j in (i + 1):n.vars) {
    corrpvalues[i, j] <-
      corrpvalues[j, i] <-
      round(cor.test(corrdata[, i], corrdata[, j])$p.value, 4)
  }
}
round(cor(corrdata), 4)
corrpvalues
输出: