R语言随机森林模型回归randomForest

小白菜本菜1111

于 2022-05-20 11:09:15 发布

阅读量1.8k

点赞数

分类专栏：R语言　文章标签：随机森林 算法 机器学习 r语言

本文链接：https://blog.csdn.net/qq_44795935/article/details/124879547

版权

R语言专栏收录该内容

1 篇文章 0 订阅

订阅专栏

基于《The influence of the neighbourhood environment on peer-to-peer accommodations: A random forest regression analysis》一文，原文链接：https://doi.org/10.1016/j.jhtm.2022.02.028

Multiple linear regression and Random forest regression

# Run with R 4.0.5 (in RStudio)

# -*- coding: UTF-8 -*-

# This script uses all room types ("All room") as the example. The same code applies to the other Airbnb listing types (entire home/apt, private room, shared room); those versions are omitted to avoid redundancy.

## Loading packages and data------------------------------------------------------------

library(randomForest)
library(pheatmap)

library(extrafont)

library(corrplot)

library(car)

# Switch to the project folder so the relative file name below resolves there.
setwd("C:/Users/Desktop/Airbnb")

# Load the comma-separated Airbnb data set into a data frame.
Data_Airbnb <- read.csv(file = "Airbnb_data.csv", sep = ",")

## Multiple linear regression---------------------------------------------------------------

# Fit an ordinary least-squares model of Airbnb on the twelve neighbourhood
# predictors. The predictors are written as formula terms joined by `+`:
# a character vector inside a formula is treated as one twelve-element
# variable, which makes lm() fail.
Lm_Airbnb <- lm(
  Airbnb ~ PopDen + PGDP + HPrice + Distance + BusDen + MetroDen +
    CaterDen + ShopDen + RecrDen + UnivDen + HotelDen + AttrDen,
  data = Data_Airbnb
)

summary(Lm_Airbnb) # View fitting results

# Predictions for the same rows the model was fitted on (in-sample).
lm.pred_Airbnb <- predict(Lm_Airbnb, newdata = Data_Airbnb)

# Put the linear-model predictions next to the original data for comparison.
lm.pred_Airbnb1 <- data.frame(lm.pred_Airbnb, Data_Airbnb)

# Multicollinearity test: variance inflation factor (VIF) per predictor.
# vif() has no `digits` argument, so the rounding is applied to its result.
round(vif(Lm_Airbnb), digits = 3)

## Random forest regression--------------------------------------------------------------

# Fix the random seed so the bootstrap samples and split candidates are
# reproducible between runs.
set.seed(1234)

# Random forest regression of Airbnb on the twelve neighbourhood predictors.
# Predictors are formula terms joined by `+` (a quoted character vector is
# not a valid set of terms), and `PopDen` uses the same capitalisation as in
# the linear model. importance = TRUE stores the permutation importance
# needed for the %IncMSE plot.
Rf_Airbnb <- randomForest(
  Airbnb ~ PopDen + PGDP + HPrice + Distance + BusDen + MetroDen +
    CaterDen + ShopDen + RecrDen + UnivDen + HotelDen + AttrDen,
  data = Data_Airbnb,
  ntree = 500,
  importance = TRUE
)

# Cross-validation

set.seed(1234) # Reproducible fold assignment

# Select the predictors by name rather than by position: the positional
# range 2:12 covers only eleven columns, while the models use twelve
# predictors, and it silently depends on the column order of the CSV.
cv_predictors <- c(
  "PopDen", "PGDP", "HPrice", "Distance", "BusDen", "MetroDen",
  "CaterDen", "ShopDen", "RecrDen", "UnivDen", "HotelDen", "AttrDen"
)

# rfcv() is the random forest cross-validation function: it refits the
# forest with a shrinking number of predictors (halved at each step because
# step = 0.5) and records the cross-validated error for each size.
result <- rfcv(
  trainx = Data_Airbnb[, cv_predictors, drop = FALSE],
  trainy = Data_Airbnb$Airbnb,
  cv.fold = 2,
  scale = "log",
  step = 0.5
)

result$error.cv # Cross-validated error for each number of predictors

# Results of random forest regression

# Predictions of the fitted forest for every row of the data set.
forest.pred_Airbnb <- predict(object = Rf_Airbnb, newdata = Data_Airbnb)

# Place the predictions in the first column, followed by the original data,
# so predicted and actual values can be compared row by row.
forest.pred_Airbnb1 <- data.frame(
  forest.pred_Airbnb = forest.pred_Airbnb,
  Data_Airbnb
)

# Checking out the chart

# Save the current graphics settings so they can be restored afterwards.
opar <- par(no.readonly = TRUE)

par(lwd = 2, cex = 1, cex.axis = 1, font = 2, cex.lab = 1, tck = -.02)

# Plot the cross-validated error against the number of predictors used.
# The data come from the rfcv() result (n.var, error.cv), not from the
# prediction vector; the x axis is logarithmic because rfcv() halves the
# number of predictors at each step.
plot(
  result$n.var, result$error.cv,
  log = "x", type = "o",
  lwd = 2, font.lab = 2, font = 2, ann = FALSE, family = "Times"
)

title(
  xlab = "Number of features",
  ylab = "Cross-validation error",
  font.lab = 2
)

par(opar) # Restore the saved graphics settings

# Variable importance - %IncMSE
# varImpPlot() needs the fitted randomForest object, not the vector of
# predictions. With importance = TRUE in the fit it draws every stored
# importance measure; pass type = 1 to show only %IncMSE.
varImpPlot(Rf_Airbnb, family = "Times")

dev.off() # Close the current graphics device

# The partial dependencies of variables

# Save the current graphics settings; they are restored after the plots.
opar <- par(no.readonly = TRUE)

# Partial dependence of the predicted Airbnb value on a single predictor.
# partialPlot() takes the fitted randomForest object (not the prediction
# vector) and the name of the variable. The `which.class` argument only
# applies to classification forests, so it is not passed here.
partialPlot(
  x = Rf_Airbnb, pred.data = Data_Airbnb, x.var = "PopDen",
  main = " ", xlab = " ", ylab = " ", col = "black"
)

# The same applies to the other variables "Distance", "BusDen", etc.
partialPlot(
  x = Rf_Airbnb, pred.data = Data_Airbnb, x.var = "PGDP",
  main = " ", xlab = " ", ylab = " ", col = "black"
)

par(opar) # Restore the saved graphics settings

# Comparison of multiple linear regression and random forest regression results ----------------------------------------------------------------------------------------------

# R-value

# Pearson correlation between predicted and observed Airbnb values.

# Multiple linear regression R-value
cor(x = lm.pred_Airbnb, y = Data_Airbnb$Airbnb)

# Random forest regression R-value
cor(x = forest.pred_Airbnb, y = Data_Airbnb$Airbnb)

# Mean absolute error (MAE)

# Mean absolute error between observed and predicted values.
#
# actual:    numeric vector of observed values.
# predicted: numeric vector of predictions, same length as `actual`.
# na.rm:     if TRUE, pairs with a missing value are ignored; the default
#            FALSE keeps the original behaviour (any NA gives NA).
# Returns a single number: mean(|actual - predicted|).
MAE <- function(actual, predicted, na.rm = FALSE) {
  # Guard against silent recycling, which would give a meaningless error
  # when the two vectors do not line up element by element.
  if (length(actual) != length(predicted)) {
    stop("`actual` and `predicted` must have the same length", call. = FALSE)
  }
  mean(abs(actual - predicted), na.rm = na.rm)
}

# Mean absolute error of multiple linear regression
MAE(actual = Data_Airbnb$Airbnb, predicted = lm.pred_Airbnb)

# Mean absolute error of random forest regression
MAE(actual = Data_Airbnb$Airbnb, predicted = forest.pred_Airbnb)

# Close the graphics device that is still open from the plots above.
dev.off()