基于R语言的机器学习学习笔记（多元线性回归, 随机森林，XGBoost）

小琳子要开心呀

已于 2023-05-16 22:04:37 修改

阅读量2k

点赞数 3

文章标签： r语言机器学习随机森林

于 2023-05-06 11:34:16 首次发布

本文链接：https://blog.csdn.net/weixin_43465015/article/details/130335911

版权

基于R语言的机器学习学习笔记

随机森林
- 使用R语言实现随机森林（randomForest）
- 相关知识
多元线性回归
- 使用R语言实现多元线性回归
XGboost（eXtreme Gradient Boosting）

随机森林

使用R语言实现随机森林（randomForest）

## This example uses the airquality data set that ships with R. The response
## (Ozone) is continuous, so the random forest performs regression rather than
## the classification task shown in most other blog posts.

# Install the package (run once)
# install.packages("randomForest")
library("randomForest")

# Load data ----
data(airquality)       # built-in data (data() lists what is available); airquality is all continuous, iris has a categorical column
summary(airquality)    # summary statistics; see also str(), head(), tail(), dim()
# read.csv("C:/test.csv")    # or read your own data instead

# Prepare data ----
# Split into a training set (model building) and a test set (prediction),
# typically 7:3 or 8:2. caret::createDataPartition() can do this as well.
D <- na.omit(airquality)    # drop rows containing NA
n_obs <- nrow(D)            # number of complete rows (named n_obs so base::nrow is not shadowed)
set.seed(1)                 # fix the RNG seed so the split is reproducible
train_idx <- sample(seq_len(n_obs), round(n_obs * 0.7))    # random 70 % of the row indices
D_trainSet <- D[train_idx, ]     # training set
D_testSet <- D[-train_idx, ]     # test set

# Build the random forest model ----
set.seed(2)    # seed for a reproducible forest
# Predict Ozone (continuous) from all other columns. ntree sets the number of
# trees (default 500); maxnodes limits the number of terminal nodes per tree.
RF <- randomForest(Ozone ~ ., data = D_trainSet, importance = TRUE, na.action = na.omit)
print(RF)             # model summary, including % variance explained
plot(RF)              # error versus number of trees; for a continuous response the error is the MSE
hist(treesize(RF))    # histogram of the number of nodes per tree

# Variable importance ----
imp <- importance(RF)                               # no type argument: both importance measures are returned
varImpPlot(RF, main = "variable importance")        # dot chart of the importance ranking
# write.csv(imp, file = 'D:/RF_importance.csv')     # save as csv

# Predict the test set with the fitted model ----
# The original stored the result in `predict` but plotted an undefined
# `predictSet`; the predictions are now computed once and reused below.
pred_testSet <- predict(RF, D_testSet)
plot(D_testSet$Ozone, pred_testSet)                 # quick observed-vs-predicted scatter plot

# Compare accuracy on the training set and the test set ----
pred_trainSet <- predict(RF, D_trainSet)
cat('R2_trainSet', summary(lm(D_trainSet$Ozone ~ pred_trainSet))$r.squared)    # R^2 of observed ~ predicted (training)

cat('R2_testSet', summary(lm(D_testSet$Ozone ~ pred_testSet))$r.squared)       # R^2 of observed ~ predicted (test)

# Confusion matrices and ROC curves mentioned in online tutorials are tools for
# categorical responses; they do not apply to a continuous response.
# The code below is optional extra analysis ------------------------------------

# Accuracy statistics for the test-set predictions -----------------------------
# Relies on D_testSet and pred_testSet created earlier in this script.
df <- data.frame(obs = D_testSet$Ozone, pred = pred_testSet) |> na.omit()    # observed vs predicted, NA rows dropped
summ <- summary(lm(df$obs ~ df$pred))
N <- length(df$obs)
intercept <- round(summ$coefficients[1, 1], 4)
slope <- round(summ$coefficients[2, 1], 4)
R2 <- round(summ$r.squared, 3)
R <- round(cor(df)[1, 2], 3)
P <- formatC(summ$coefficients[2, 4], digits = 4, format = "f")    # p-value of the slope as fixed-point text
residual <- df$obs - df$pred
MSE <- mean(residual^2)
MAE <- mean(abs(residual))
RMSE <- round(sqrt(MSE), 2)                      # root mean squared error (was the typo `roune`, which stopped the script)
NRMSE <- round(RMSE / mean(df$obs) * 100, 2)     # RMSE relative to the mean observation, in percent
NRMAE <- round(MAE / mean(df$obs) * 100, 2)      # MAE relative to the mean observation, in percent

# Scatter plot with ggplot2 ----------------------------------------------------
library(ggplot2)
# Common axis range over both columns so the 1:1 line is a true diagonal.
axis_min <- min(df, na.rm = TRUE)
axis_max <- max(df, na.rm = TRUE)
label_txt <- paste0('RMSE = ', round(RMSE, 3), '\n',
                    'NRMSE = ', round(NRMSE, 2), ' %', '\n',
                    'R^2 = ', round(R2, 3), '\n')
fig_rf <- ggplot(df, aes(obs, pred)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, linetype = "longdash", colour = 'black', size = 0.5) +    # 1:1 reference line
  geom_smooth(se = TRUE, method = 'lm', size = 0.5, colour = 'black', fill = 'gray') +            # fitted regression line
  lims(x = c(axis_min, axis_max), y = c(axis_min, axis_max)) +
  labs(x = paste('Observation'), y = paste('Prediction')) +
  annotate('text', x = axis_min, y = axis_max, hjust = 'left', vjust = 'top',
           label = label_txt, size = 4, family = "sans") +                                        # accuracy annotation
  theme_bw()
fig_rf
ggsave(paste0('D:/', 'point_RF', '.tiff'), fig_rf, width = 10, height = 9, units = c("cm"), dpi = 600)

多元线性回归

使用R语言实现多元线性回归

# Get data ----
data(state)                                 # the data set is named `state`; it provides state.x77 (datasets package)
D <- data.frame(state.x77) |> na.omit()     # matrix -> data frame, drop NA rows
str(D); head(D)                             # inspect the data

# Prepare data (split into training and test sets) ----
set.seed(11)                                # seed for a reproducible split
ind <- sample(nrow(D), floor(nrow(D) * 0.7))    # row indices for a 7:3 split
# Rows (observations) are sampled, so the index goes in the ROW position.
# The original `D[, ind]` indexed columns and fails with "undefined columns selected".
trainSet <- D[ind, ]                        # training set
testSet <- D[-ind, ]                        # test set

# Build the model ----
model <- lm(Murder ~ ., trainSet)           # predict Murder from all other columns
summary(model)                              # fitted coefficients and diagnostics

# Multicollinearity ----
cor(D)                         # correlation matrix between the variables
model_step <- step(model)      # stepwise selection by AIC; this can drop redundant predictors but is not a collinearity test
summary(model_step)

# Predict with the model ----
pred <- predict(model_step, newdata = testSet)            # predict Murder for the test set with the stepwise model
df_pred <- data.frame(obs = testSet$Murder, pred = pred)  # observed vs predicted
summ <- summary(lm(obs ~ pred, df_pred))

# Accuracy statistics ----
residual <- df_pred$obs - df_pred$pred
MSE <- mean(residual^2)
RMSE <- sqrt(MSE)
NRMSE <- RMSE / mean(df_pred$obs) * 100     # was mean(df$obs): `df` is left over from the random forest section
R2 <- summ$r.squared
P <- summ$coefficients[2, 4]

# Scatter plot ----
library(ggplot2); library(ggpubr)
# Axis range must come from this section's data (df_pred), not the stale `df`.
axis_min <- min(df_pred, na.rm = TRUE)
axis_max <- max(df_pred, na.rm = TRUE)
label_txt <- paste0('RMSE = ', round(RMSE, 3), '\n',
                    'NRMSE = ', round(NRMSE, 2), '%', '\n',
                    'R^2 = ', round(R2, 3), '\n')
fig_mlr <- ggplot(df_pred, aes(obs, pred)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, linetype = "longdash", colour = 'black', size = 0.5) +    # 1:1 reference line
  geom_smooth(se = TRUE, method = 'lm', size = 0.5, colour = 'black', fill = 'gray') +            # fitted regression line
  lims(x = c(axis_min, axis_max), y = c(axis_min, axis_max)) +
  labs(x = paste('Observation'), y = paste('Prediction')) +
  annotate('text', x = axis_min, y = axis_max, hjust = 'left', vjust = 'top',
           label = label_txt, size = 4, family = "sans") +                                        # accuracy annotation
  theme_bw()
fig_mlr
ggsave(paste0('D:/', 'point_MLR', '.tiff'), fig_mlr, width = 10, height = 9, units = c("cm"), dpi = 600)

参考链接：
# R语言 —— 多元线性回归
https://blog.csdn.net/m0_51339444/article/details/124590708
# R语言——多元线性回归
https://blog.csdn.net/weixin_41030360/article/details/80891738

XGboost（eXtreme Gradient Boosting）

使用R语言的xgboost包

# Install and load packages ----
# install.packages("xgboost")
library(xgboost)
library(ggplot2)

# Load data ----
data(airquality)
str(airquality)

# Split data ----
D <- na.omit(airquality)                               # drop rows containing NA
set.seed(1)                                            # fix the RNG seed so the split is reproducible
train_idx <- sample(nrow(D), floor(nrow(D) * 0.7))     # random 7:3 split of the row indices
trainSet <- D[train_idx, ]       # training set
testSet <- D[-train_idx, ]       # test set

# Convert to xgboost's xgb.DMatrix format ----
# Ozone (column 1) is the label; every other column is a feature.
feature_cols <- setdiff(names(D), "Ozone")
dtrain <- xgb.DMatrix(data = as.matrix(trainSet[, feature_cols]), label = trainSet$Ozone)
dtest <- xgb.DMatrix(data = as.matrix(testSet[, feature_cols]), label = testSet$Ozone)
watchlist <- list(train = dtrain, test = dtest)    # data sets whose metric is reported after every round

# Build the model ----
params <- list(eta = 0.5,         # learning rate, default 0.3
               max_depth = 3      # maximum depth of each tree, default 6
               # objective defaults to "reg:squarederror" (regression task, squared-error loss)
               # nthread defaults to the maximum number of available threads
               )
# booster defaults to "gbtree".
# NOTE(review): recent xgboost releases appear to rename `watchlist` to `evals` - confirm against the installed version.
xgb <- xgb.train(params = params, data = dtrain, nrounds = 10, watchlist = watchlist)

# Feature importance ----
# `model` is passed by name: in xgboost 1.x the first positional argument of
# xgb.importance() is feature_names, so a positional model is misinterpreted.
imp_xgb <- xgb.importance(model = xgb)
print(imp_xgb)
xgb.ggplot.importance(imp_xgb)    # importance ranking plot

# Inspect individual trees of the model
# xgb.plot.tree(model = xgb, trees = 1, plot_width = 500, plot_height = 500)
# xgb.plot.tree(model = xgb, trees = 1:3, plot_width = 500, plot_height = 500)

# Predict ----
pred_test <- predict(xgb, newdata = dtest)

# Accuracy statistics (RMSE) ----
# The original used `pred`, which is not this model's prediction; use pred_test.
df <- data.frame(obs = testSet$Ozone, pred = pred_test)
residual <- df$obs - df$pred
MSE <- mean(residual^2)
RMSE <- sqrt(MSE)
NRMSE <- RMSE / mean(df$obs) * 100
# Linear fit of observed against predicted values
summ <- summary(lm(obs ~ pred, df))
# intercept <-  round(summ$coefficients[1,1],4);  slope <- round(summ$coefficients[2,1],4)
# P <- formatC(summ$coefficients[2,4], digits = 4, format = "f" )
R2 <- summ$r.squared
R <- cor(df)[1, 2]

# Scatter plot ----
library(ggplot2); library(ggpubr)
axis_min <- min(df, na.rm = TRUE)
axis_max <- max(df, na.rm = TRUE)
label_txt <- paste0('RMSE = ', round(RMSE, 3), '\n',
                    'NRMSE = ', round(NRMSE, 2), '%', '\n',
                    'R^2 = ', round(R2, 3), '\n')
# Plot this section's `df`; the original plotted df_pred from the regression section.
fig_xgb <- ggplot(df, aes(obs, pred)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, linetype = "longdash", colour = 'black', size = 0.5) +    # 1:1 reference line
  geom_smooth(se = TRUE, method = 'lm', size = 0.5, colour = 'black', fill = 'gray') +            # fitted regression line
  lims(x = c(axis_min, axis_max), y = c(axis_min, axis_max)) +
  labs(x = paste('Observation'), y = paste('Prediction')) +
  annotate('text', x = axis_min, y = axis_max, hjust = 'left', vjust = 'top',
           label = label_txt, size = 4) +                                                         # accuracy annotation
  theme_bw()
fig_xgb
# Saved under its own name; the original wrote 'point_MLR' and overwrote the regression figure.
ggsave(paste0('D:/', 'point_XGB', '.tiff'), fig_xgb, width = 10, height = 9, units = c("cm"), dpi = 500)

XGBoost（eXtreme Gradient Boosting）原理：
XGBoost 是在决策树的基础上进行迭代的方法，它以 boosting 的方式把多棵决策树组合起来。每棵新树通常通过梯度提升来拟合先前模型的误差（即实际值与预测值之间的差异），从而减小整体误差。反复执行这个过程，不断降低误差，直到达到指定的迭代次数或停止阈值，模型即训练完成。
主要参数介绍：
调参注意事项:
当我们允许模型变得更复杂（例如更深）时，模型具有更好的拟合训练数据的能力，从而产生更少偏差的模型。
然而，如此复杂的模型需要更多的数据来拟合。
控制过拟合
当你观察到训练准确率很高，但测试准确率很低时，很可能你遇到了过拟合问题。
通常有两种方法可以控制 XGBoost 中的过度拟合：
第一种方式是直接控制模型复杂度。
这包括max_depth 和 min_child_weight 和 gamma。
第二种方法是添加随机性以使训练对噪声具有鲁棒性。
这包括 subsample 和 colsample_bytree。
您还可以减小步长 eta。这样做时请记住增加 num_round。
更快的训练性能
有一个名为 tree_method 的参数，将其设置为 hist 或 gpu_hist 可以加快计算速度。

参考链接:
# XGBoost（二）：R语言实现
https://www.jianshu.com/p/38009bec6a55
# R语言机器学习-xgboost (知乎)
https://zhuanlan.zhihu.com/p/607919007
# 官方文档(这里很全面, 有时间建议浏览一下)
https://xgboost.readthedocs.io/en/stable/index.html
# 官方文档 (调参注意事项)
https://xgboost.readthedocs.io/en/stable/tutorials/param_tuning.html#

小琳子要开心呀

关注

3
点赞
踩
50

收藏

觉得还不错? 一键收藏
0
评论
基于R语言的机器学习学习笔记（多元线性回归, 随机森林，XGBoost）

R语言importance {randomForest}关于变量重要性度量（the variable importance measures）的定义。以下是变量重要性测量的定义。第一个衡量标准是。
复制链接

扫一扫