R语言建立决策树模型(movie数据集)

该博客介绍了如何使用R语言对电影数据进行预处理,包括将定类变量转换为因子型,并确保训练集和测试集的一致性。接着,建立了针对二值和多值因变量的决策树模型,进行了交叉验证,优化了模型并进行预测。最后展示了决策树的可视化和规则总结。
摘要由CSDN通过智能技术生成

导入数据集

将movie、MPAA、competition、star、genre、TechEffect按字符型读入,其他变量按数值型读入。

# Read the learning set: the first six columns (movie, MPAA, competition,
# star, genre, TechEffect) as character and the remaining four as numeric,
# then convert the categorical predictors and both responses to factors.
# TechEffect used to be converted twice; one conversion is enough.
learn <-
  read.csv("E:\\RHome\\movie_learning.csv",
           colClasses = c(rep("character", 6),
                          rep("numeric", 4))) %>%
  mutate(MPAA = as.factor(MPAA),
         competition = as.factor(competition),
         star = as.factor(star),
         genre = as.factor(genre),
         TechEffect = as.factor(TechEffect),
         GrossCat = as.factor(GrossCat),
         GrossCat2 = as.factor(GrossCat2))

将不是哑变量形式的定类自变量转换成因子型变量。

# Read the test set with the same column types as the learning set and
# convert the same categorical columns to factors in a single mutate() call.
test <-
  read.csv("E:\\RHome\\movie_test.csv",
           colClasses = c(rep("character", 6),
                          rep("numeric", 4))) %>%
  mutate(MPAA = as.factor(MPAA),
         competition = as.factor(competition),
         star = as.factor(star),
         genre = as.factor(genre),
         TechEffect = as.factor(TechEffect),
         GrossCat = as.factor(GrossCat),
         GrossCat2 = as.factor(GrossCat2))

在屏幕上查看movie_learning和movie_test数据集中各个变量的基本情况

str(learn)

str(test)

 

可以看出,movie_learning中genre变量的类型为“Factor w/ 8 levels”,而movie_test中genre变量的类型为“Factor w/ 6 levels”。下面需要修改这个变量,使它们在movie_learning和movie_test数据集的因子水平一样

table(learn$genre)

table(test$genre)

 

通过查看两个数据集中genre变量的频数分布可以看出:

学习数据集movie_learning中genre的取值"Action"和"Docum"在测试数据集movie_test中没有出现。修改test中的genre变量的因子水平。

使用factor()函数将genre变量设为因子型变量,因子水平与movie_learning中genre变量的因子水平一样(对应代码见文末源代码中 test %>% mutate(genre = factor(genre, levels = ...)) 一步)。

对二值因变量GrossCat2建立决策树模型

筛选与二值因变量GrossCat2相关度最高的自变量

# Recode the categorical predictors as numbers so that a correlation matrix
# can be computed. Each label is replaced by its code through a vectorised
# lookup; labels missing from the table (and NA) are left untouched instead of
# aborting the script, which the element-wise `if` loops did on NA.
level_codes <- list(
  MPAA = c(G = 1, PG = 2, PG13 = 3, R = 4),
  competition = c(High = 1, Medium = 2, Low = 3),
  star = c(A = 1, B = 2, C = 3),
  TechEffect = c(High = 1, Medium = 2, Low = 3)
)
for (col in names(level_codes)) {
  codes <- level_codes[[col]]
  # as.character() also covers the case where the column was read as a factor.
  raw <- as.character(insurance[[col]])
  known <- raw %in% names(codes)
  raw[known] <- codes[raw[known]]
  insurance[[col]] <- raw
}

# Coerce every column to numeric; values that are not numbers become NA
# (with a coercion warning), so their rows/columns in the matrix are NA too.
insurance <- as.data.frame(lapply(insurance, as.numeric))

cor(insurance)

由相关系数矩阵可以看出,与GrossCat2相关度最高的两个自变量为sequel和screens。

建立模型

model<- rpart(GrossCat2 ~ sequel+screens,learn)

查看模型结果

summary(model)

 

决策树可视化 

rpart.plot(model)

 

列出对应规则

asRules(model)

 

使用fancyRpartPlot展现更美观的决策树

fancyRpartPlot(model)   

 

查看交叉验证结果

model$cptable   

 

查看交叉验证结果图

plotcp(model) 

 

根据交叉验证结果,找出估计误差最小时的cp值,并重新建立模型。

xerr <-model$cptable[,"xerror"]

minxerr <- which.min(xerr)

选择交叉验证的估计误差最小时对应的cp

mincp <-model$cptable[minxerr, "CP"]

model.prune <- prune(model,cp=mincp)

新模型

fancyRpartPlot(model.prune)

 

进行预测

# Predict with the pruned tree so the cp chosen by cross-validation is used.
pred <- predict(model.prune, test, type = "class")

存为数据框

yucess= data.frame(test$GrossCat2,pred)

 

对多值因变量GrossCat建立决策树模型

决策树模型

model<- rpart(GrossCat ~ MPAA+competition+star+sequel+TechEffect+screens,learn)

查看模型结果

summary(model)

 

决策树可视化  

rpart.plot(model)

列出对应规则

asRules(model)

 

使用fancyRpartPlot展现更美观的决策树

fancyRpartPlot(model) 

 

查看交叉验证结果

model$cptable 

 

查看交叉验证结果图

plotcp(model)   

 

根据交叉验证结果,找出估计误差最小时的cp值,并重新建立模型。

xerr <-model$cptable[,"xerror"]

minxerr <- which.min(xerr)

选择交叉验证的估计误差最小时对应的cp

mincp <-model$cptable[minxerr, "CP"]

model.prune <- prune(model,cp=mincp)

新模型

fancyRpartPlot(model.prune)

 

进行预测

# Predict with the pruned tree so the cp chosen by cross-validation is used.
pred <- predict(model.prune, test, type = "class")

存为数据框

yucess= data.frame(test$GrossCat,pred)

#

 

源代码:

library(magrittr)
library(dplyr)
# Raw copy of the learning set, used further down to build a correlation
# matrix. stringsAsFactors = FALSE keeps text columns as plain character
# vectors on R < 4.0 as well, so the label recoding below sees strings.
insurance <- read.csv(file = "E:\\RHome\\movie_learning.csv", header = TRUE,
                      fileEncoding = "utf-8", stringsAsFactors = FALSE)

# Read the learning set: the first six columns (movie, MPAA, competition,
# star, genre, TechEffect) as character and the remaining four as numeric,
# then convert the categorical predictors and both responses to factors.
# TechEffect used to be converted twice; one conversion is enough.
learn <-
  read.csv("E:\\RHome\\movie_learning.csv",
           colClasses = c(rep("character", 6),
                          rep("numeric", 4))) %>%
  mutate(MPAA = as.factor(MPAA),
         competition = as.factor(competition),
         star = as.factor(star),
         genre = as.factor(genre),
         TechEffect = as.factor(TechEffect),
         GrossCat = as.factor(GrossCat),
         GrossCat2 = as.factor(GrossCat2))


# Read the test set with the same column types as the learning set and apply
# the same factor conversions. TechEffect is converted here as well: it was
# missing, which left it as character in test while it is a factor in learn.
test <-
  read.csv("E:\\RHome\\movie_test.csv",
           colClasses = c(rep("character", 6),
                          rep("numeric", 4))) %>%
  mutate(MPAA = as.factor(MPAA),
         competition = as.factor(competition),
         star = as.factor(star),
         genre = as.factor(genre),
         TechEffect = as.factor(TechEffect),
         GrossCat = as.factor(GrossCat),
         GrossCat2 = as.factor(GrossCat2))


# Show the structure (column types, factor levels) of both data sets.
str(object = learn)
str(object = test)


# Frequency tables of genre, to compare the levels present in each set.
table(learn[["genre"]])
table(test[["genre"]])

# Give test$genre the same factor levels as learn$genre. The levels are taken
# from learn rather than retyped by hand, so the two sets cannot drift apart
# and a misspelled label cannot silently turn test values into NA.
test <- test %>%
  mutate(genre = factor(genre, levels = levels(learn$genre)))

library(rpart)
library(rpart.plot)
library(rattle)
# -------------------------------------------------------------------------
# Decision tree for the binary response GrossCat2
#
# Step 1: recode the categorical predictors as numbers so that a correlation
# matrix can be computed. Each label is replaced by its code through a
# vectorised lookup; labels missing from the table (and NA) are left untouched
# instead of aborting the script, which the element-wise `if` loops did on NA.
level_codes <- list(
  MPAA = c(G = 1, PG = 2, PG13 = 3, R = 4),
  competition = c(High = 1, Medium = 2, Low = 3),
  star = c(A = 1, B = 2, C = 3),
  TechEffect = c(High = 1, Medium = 2, Low = 3)
)
for (col in names(level_codes)) {
  codes <- level_codes[[col]]
  # as.character() also covers the case where the column was read as a factor.
  raw <- as.character(insurance[[col]])
  known <- raw %in% names(codes)
  raw[known] <- codes[raw[known]]
  insurance[[col]] <- raw
}

# Coerce every column to numeric; values that are not numbers become NA
# (with a coercion warning), so their rows/columns in the matrix are NA too.
insurance <- as.data.frame(lapply(insurance, as.numeric))

# Author's finding from this matrix: the two most strongly correlated
# predictors are sequel and screens.
cor(insurance)
# Fit a tree for the binary response GrossCat2 on the two predictors that were
# shortlisted from the correlation matrix.
model <- rpart(GrossCat2 ~ sequel + screens, data = learn)

# Text summary, basic plot, extracted rules and the nicer rattle plot.
summary(model)
rpart.plot(model)
asRules(model)
fancyRpartPlot(model)

# Complexity table and its plot, both based on rpart's cross-validation.
model$cptable
plotcp(model)

# Pick the cp with the smallest cross-validated error and prune to it.
xerr <- model$cptable[, "xerror"]
minxerr <- which.min(xerr)
mincp <- model$cptable[minxerr, "CP"]
model.prune <- prune(model, cp = mincp)

fancyRpartPlot(model.prune)

# Predict with the pruned tree; the original call used the unpruned `model`,
# so the pruning step above had no effect on the predictions.
pred <- predict(model.prune, test, type = "class")

# Observed vs. predicted class, side by side.
yucess <- data.frame(test$GrossCat2, pred)

# -------------------------------------------------------------------------
# Decision tree for the multi-class response GrossCat

model <- rpart(
  GrossCat ~ MPAA + competition + star + sequel + TechEffect + screens,
  data = learn
)

# Text summary, basic plot, extracted rules and the nicer rattle plot.
summary(model)
rpart.plot(model)
asRules(model)
fancyRpartPlot(model)

# Complexity table and its plot, both based on rpart's cross-validation.
model$cptable
plotcp(model)

# Pick the cp with the smallest cross-validated error and prune to it.
xerr <- model$cptable[, "xerror"]
minxerr <- which.min(xerr)
mincp <- model$cptable[minxerr, "CP"]
model.prune <- prune(model, cp = mincp)

fancyRpartPlot(model.prune)

# Predict with the pruned tree; the original call used the unpruned `model`,
# so the pruning step above had no effect on the predictions.
pred <- predict(model.prune, test, type = "class")

# Observed vs. predicted class, side by side.
yucess <- data.frame(test$GrossCat, pred)


评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

城南望余雪

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值