导入数据集
将movie、MPAA、competition、star、genre、TechEffect按字符型读入,其他变量按数值型读入。
# Training set: the first six columns are read as character and the last four
# as numeric; the columns listed below are then converted to factors.
# Each column is converted exactly once, in a single mutate() call.
learn <-
  read.csv("E:\\RHome\\movie_learning.csv",
           colClasses = c(rep("character", 6),
                          rep("numeric", 4))) %>%
  mutate(
    MPAA = as.factor(MPAA),
    competition = as.factor(competition),
    star = as.factor(star),
    genre = as.factor(genre),
    TechEffect = as.factor(TechEffect),
    GrossCat = as.factor(GrossCat),
    GrossCat2 = as.factor(GrossCat2)
  )
将不是哑变量形式的定类自变量转换成因子型变量。
# Test set: same column classes as the training file (six character columns
# followed by four numeric ones); the listed columns become factors.
test_raw <- read.csv(
  "E:\\RHome\\movie_test.csv",
  colClasses = c(rep("character", 6), rep("numeric", 4))
)
test <- test_raw %>%
  mutate(
    MPAA = as.factor(MPAA),
    competition = as.factor(competition),
    star = as.factor(star),
    genre = as.factor(genre),
    TechEffect = as.factor(TechEffect),
    GrossCat = as.factor(GrossCat),
    GrossCat2 = as.factor(GrossCat2)
  )
在屏幕上查看movie_learning和movie_test数据集中各个变量的基本情况
str(learn)
str(test)
可以看出,movie_learning中genre变量的类型为“Factor w/ 8 levels”,而movie_test中genre变量的类型为“Factor w/ 6 levels”。下面需要修改这个变量,使它们在movie_learning和movie_test数据集的因子水平一样。
table(learn$genre)
table(test$genre)
通过查看两个数据集中genre变量的频数分布可以看出:
学习数据集movie_learning中genre的取值"Action"和"Docum"在测试数据集movie_test中没有出现。修改test中的genre变量的因子水平。
使用factor()函数将test中的genre变量重新设为因子型变量,使其因子水平与movie_learning中genre变量的因子水平一致(具体代码见后文源代码部分)。
对二值因变量GrossCat2建立决策树模型
通过相关系数筛选与二值因变量GrossCat2相关度最高的自变量
# Numeric codes for the categorical predictors (same numbering as the
# original element-by-element loops). Labels that are not listed, and missing
# values, become NA instead of raising an error inside `if`.
level_codes <- list(
  MPAA = c(G = 1, PG = 2, PG13 = 3, R = 4),
  competition = c(High = 1, Medium = 2, Low = 3),
  star = c(A = 1, B = 2, C = 3),
  TechEffect = c(High = 1, Medium = 2, Low = 3)
)
for (col in names(level_codes)) {
  raw_labels <- as.character(insurance[[col]])
  insurance[[col]] <- unname(level_codes[[col]][raw_labels])
}

# Keep only the columns that are numeric after recoding; text columns cannot
# be correlated and would only add all-NA rows/columns plus coercion warnings.
insurance <- insurance[vapply(insurance, is.numeric, logical(1))]
cor(insurance)
由相关系数矩阵得出:与GrossCat2相关度最高的两个自变量为sequel和screens。
建立模型
model<- rpart(GrossCat2 ~ sequel+screens,learn)
查看模型结果
summary(model)
决策树可视化
rpart.plot(model)
列出对应规则
asRules(model)
使用fancyRpartPlot展现更美观的决策树
fancyRpartPlot(model)
查看交叉验证结果
model$cptable
查看交叉验证结果图
plotcp(model)
根据交叉验证结果,找出估计误差最小时的cp值,并据此对模型进行剪枝,得到新模型。
xerr <-model$cptable[,"xerror"]
minxerr <- which.min(xerr)
选择交叉验证的估计误差最小时对应的cp
mincp <-model$cptable[minxerr, "CP"]
model.prune <- prune(model,cp=mincp)
新模型
fancyRpartPlot(model.prune)
进行预测
# Predict with the pruned tree (model.prune) rather than the unpruned fit, so
# the cp chosen by cross-validation is actually applied.
pred <- predict(model.prune, test, type = "class")
存为数据框
yucess= data.frame(test$GrossCat2,pred)
对多值因变量GrossCat建立决策树模型
决策树模型
model<- rpart(GrossCat ~ MPAA+competition+star+sequel+TechEffect+screens,learn)
查看模型结果
summary(model)
决策树可视化
rpart.plot(model)
列出对应规则
asRules(model)
使用fancyRpartPlot展现更美观的决策树
fancyRpartPlot(model)
查看交叉验证结果
model$cptable
查看交叉验证结果图
plotcp(model)
根据交叉验证结果,找出估计误差最小时的cp值,并据此对模型进行剪枝,得到新模型。
xerr <-model$cptable[,"xerror"]
minxerr <- which.min(xerr)
选择交叉验证的估计误差最小时对应的cp
mincp <-model$cptable[minxerr, "CP"]
model.prune <- prune(model,cp=mincp)
新模型
fancyRpartPlot(model.prune)
进行预测
# Predict with the pruned tree (model.prune) rather than the unpruned fit, so
# the cp chosen by cross-validation is actually applied.
pred <- predict(model.prune, test, type = "class")
存为数据框
yucess= data.frame(test$GrossCat,pred)
#
源代码:
library(magrittr)
library(dplyr)
# Raw copy of the training file, later recoded to numeric values for the
# correlation matrix. stringsAsFactors = FALSE keeps the categorical columns
# as character vectors on R < 4.0 too, so they can be compared with and
# overwritten by plain values during recoding.
insurance <- read.csv(
  file = "E:\\RHome\\movie_learning.csv",
  header = TRUE,
  fileEncoding = "utf-8",
  stringsAsFactors = FALSE
)
# Training set: the first six columns are read as character and the last four
# as numeric; the columns listed below are then converted to factors.
# Each column is converted exactly once, in a single mutate() call.
learn <-
  read.csv("E:\\RHome\\movie_learning.csv",
           colClasses = c(rep("character", 6),
                          rep("numeric", 4))) %>%
  mutate(
    MPAA = as.factor(MPAA),
    competition = as.factor(competition),
    star = as.factor(star),
    genre = as.factor(genre),
    TechEffect = as.factor(TechEffect),
    GrossCat = as.factor(GrossCat),
    GrossCat2 = as.factor(GrossCat2)
  )
# Test set: same column classes as the training file (six character columns
# followed by four numeric ones).
# TechEffect is converted to a factor as well, so the test set has the same
# factor columns as `learn`; it is a predictor in the GrossCat tree.
test <-
  read.csv("E:\\RHome\\movie_test.csv",
           colClasses = c(rep("character", 6),
                          rep("numeric", 4))) %>%
  mutate(
    MPAA = as.factor(MPAA),
    competition = as.factor(competition),
    star = as.factor(star),
    genre = as.factor(genre),
    TechEffect = as.factor(TechEffect),
    GrossCat = as.factor(GrossCat),
    GrossCat2 = as.factor(GrossCat2)
  )
# Inspect the structure of both data sets and compare the genre frequencies.
str(learn)
str(test)
table(learn$genre)
table(test$genre)

# Give test$genre the same level set as learn$genre. The levels are taken
# from `learn` itself rather than from a hand-typed list, so a misspelt or
# newly added genre cannot silently turn into NA.
test <- test %>%
  mutate(genre = factor(genre, levels = levels(learn$genre)))
library(rpart)
library(rpart.plot)
library(rattle)
#-------------------------------------------------------------------------
# Decision tree for GrossCat2
# Predictor screening: recode the categorical predictors as numeric codes and
# inspect the correlation matrix.

# Numeric codes for the categorical predictors (same numbering as the
# original element-by-element loops). Labels that are not listed, and missing
# values, become NA instead of raising an error inside `if`.
level_codes <- list(
  MPAA = c(G = 1, PG = 2, PG13 = 3, R = 4),
  competition = c(High = 1, Medium = 2, Low = 3),
  star = c(A = 1, B = 2, C = 3),
  TechEffect = c(High = 1, Medium = 2, Low = 3)
)
for (col in names(level_codes)) {
  raw_labels <- as.character(insurance[[col]])
  insurance[[col]] <- unname(level_codes[[col]][raw_labels])
}

# Keep only the columns that are numeric after recoding; text columns cannot
# be correlated and would only add all-NA rows/columns plus coercion warnings.
insurance <- insurance[vapply(insurance, is.numeric, logical(1))]

# Per the original analysis, sequel and screens show the highest correlation
# and are therefore used as the predictors of the GrossCat2 tree.
cor(insurance)
# Fit a tree for the factor response GrossCat2 on sequel and screens.
model <- rpart(GrossCat2 ~ sequel + screens, data = learn)

# Text summary, two plots of the tree, and the rule listing.
summary(model)
rpart.plot(model)
asRules(model)
fancyRpartPlot(model)

# Complexity-parameter table with cross-validated error, and its plot.
model$cptable
plotcp(model)

# Prune at the cp whose cross-validated error (xerror) is smallest.
xerr <- model$cptable[, "xerror"]
minxerr <- which.min(xerr)
mincp <- model$cptable[minxerr, "CP"]
model.prune <- prune(model, cp = mincp)
fancyRpartPlot(model.prune)

# Predict with the pruned tree (the original predicted with the unpruned fit,
# leaving the pruning step without effect) and store the observed and
# predicted classes side by side.
pred <- predict(model.prune, test, type = "class")
yucess <- data.frame(test$GrossCat2, pred)
#-------------------------------------------------------------------------
# Decision tree for GrossCat
# Fit a tree for the factor response GrossCat on six predictors.
model <- rpart(
  GrossCat ~ MPAA + competition + star + sequel + TechEffect + screens,
  data = learn
)

# Text summary, two plots of the tree, and the rule listing.
summary(model)
rpart.plot(model)
asRules(model)
fancyRpartPlot(model)

# Complexity-parameter table with cross-validated error, and its plot.
model$cptable
plotcp(model)

# Prune at the cp whose cross-validated error (xerror) is smallest.
xerr <- model$cptable[, "xerror"]
minxerr <- which.min(xerr)
mincp <- model$cptable[minxerr, "CP"]
model.prune <- prune(model, cp = mincp)
fancyRpartPlot(model.prune)

# Predict with the pruned tree (the original predicted with the unpruned fit,
# leaving the pruning step without effect) and store the observed and
# predicted classes side by side.
pred <- predict(model.prune, test, type = "class")
yucess <- data.frame(test$GrossCat, pred)