Implementing Logistic Regression from Scratch with Cross-Entropy Loss and Backpropagation (No Packages)
Background
We first introduce the basic knowledge needed to implement logistic regression from scratch with cross-entropy loss and backpropagation.
Logistic regression
Logistic regression is a classic classification method from statistical learning; it can handle both binary and multi-class problems. The continuous output of the linear part is mapped through the Sigmoid activation function to a probability between 0 and 1.
Formula:
$$P(Y=1\mid x)=\frac{\exp(wx+b)}{1+\exp(wx+b)}, \qquad P(Y=0\mid x)=\frac{1}{1+\exp(wx+b)}.$$
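As a quick numeric illustration, these two probabilities sum to 1 for any input; the values of w, b, and x below are arbitrary and purely for demonstration:
> w = 0.8; b = -0.2; x = 1.5 # arbitrary illustrative values
> p1 = exp(w * x + b) / (1 + exp(w * x + b)) # P(Y = 1 | x)
> p0 = 1 / (1 + exp(w * x + b)) # P(Y = 0 | x)
> p1 + p0 # always equals 1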
Matrix calculus
For more detail on matrix derivatives, see the open-source reference The Matrix Cookbook.
Backpropagation
Backpropagation is the cornerstone of neural-network parameter optimization. Below we give a backpropagation algorithm combined with gradient descent (the code in this post uses full-batch updates). For more detail on backpropagation in neural networks, see Deep Learning (Goodfellow et al., the "flower book").
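Concretely, each iteration moves the parameters against the gradient of the loss $J$ with learning rate $\alpha$; this update rule is exactly what the binary.optimize function later in the post implements:
$$w \leftarrow w - \alpha\frac{\partial J}{\partial w}, \qquad b \leftarrow b - \alpha\frac{\partial J}{\partial b}.$$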
Loss function
Suppose the dataset contains m samples:
$$Z = \begin{bmatrix} z_1 & z_2 & z_3 & z_4 & \cdots & z_m \end{bmatrix}^{T}.$$
$$w = \begin{bmatrix} w_1 & w_2 & w_3 & w_4 & \cdots & w_k \end{bmatrix}^{T}.$$
$$X = \begin{bmatrix} x_{11} & x_{12} & x_{13} & x_{14} & \cdots & x_{1m} \\ x_{21} & x_{22} & x_{23} & x_{24} & \cdots & x_{2m} \\ x_{31} & x_{32} & x_{33} & x_{34} & \cdots & x_{3m} \\ \vdots & \vdots & \vdots & \vdots & \ddots & \vdots \\ x_{k1} & x_{k2} & x_{k3} & x_{k4} & \cdots & x_{km} \end{bmatrix}$$
Each of the m columns of X holds the k features of one sample.
$$b = \begin{bmatrix} b_0 & b_0 & b_0 & b_0 & \cdots & b_0 \end{bmatrix}^{T}.$$
$$Z = w^{T}X + b.$$
$$g(z_i) = \frac{1}{1+\exp(-z_i)}, \quad z_i \in Z.$$
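The derivation below repeatedly uses the standard derivative identity of the sigmoid:
$$g'(z) = g(z)\big(1 - g(z)\big).$$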
The loss function (the averaged binary cross-entropy) is:
$$J_{\mathrm{Sigmoid}} = -\frac{1}{m}\sum_{i=1}^{m}\Big[y_{i}\log\big(g(z_i)\big)+(1-y_{i})\log\big(1-g(z_i)\big)\Big].$$
$$\begin{aligned} \frac{\partial J_{\mathrm{Sigmoid}}}{\partial w} &= -\frac{1}{m}\sum_{i=1}^{m}\bigg[y_i\frac{1}{g(z_i)}g(z_i)\big(1-g(z_i)\big)\frac{\partial z_i}{\partial w}+(1-y_i)\frac{-1}{1-g(z_i)}g(z_i)\big(1-g(z_i)\big)\frac{\partial z_i}{\partial w}\bigg] \\ &= -\frac{1}{m}\sum_{i=1}^{m}\bigg[y_i\big(1-g(z_i)\big)\frac{\partial z_i}{\partial w}+(y_i-1)g(z_i)\frac{\partial z_i}{\partial w}\bigg] \\ &= \frac{1}{m}\sum_{i=1}^{m}\big(g(z_i)-y_i\big)\frac{\partial z_i}{\partial w} \end{aligned}$$
Here $\frac{\partial z_i}{\partial w}$ denotes the derivative of a scalar with respect to a vector; since $z_i = w^{T}x_i + b$, it equals $x_i$, the $i$-th column of $X$.
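Stacking these per-sample terms gives the vectorized form used later in the code:
$$\frac{\partial J_{\mathrm{Sigmoid}}}{\partial w} = \frac{1}{m}\,X\,\big(g(Z) - Y\big)^{T},$$
which corresponds line for line to dw = (1 / m) * (X %*% t(G - Y)) in binary.summary below.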
$$\begin{aligned} \frac{\partial J_{\mathrm{Sigmoid}}}{\partial b} &= -\frac{1}{m}\sum_{i=1}^{m}\bigg[y_i\frac{1}{g(z_i)}g(z_i)\big(1-g(z_i)\big)\frac{\partial z_i}{\partial b}+(1-y_i)\frac{-1}{1-g(z_i)}g(z_i)\big(1-g(z_i)\big)\frac{\partial z_i}{\partial b}\bigg] \\ &= -\frac{1}{m}\sum_{i=1}^{m}\bigg[y_i\big(1-g(z_i)\big)\frac{\partial z_i}{\partial b}+(y_i-1)g(z_i)\frac{\partial z_i}{\partial b}\bigg] \\ &= \frac{1}{m}\sum_{i=1}^{m}\big(g(z_i)-y_i\big) \end{aligned}$$
Here $\frac{\partial z_i}{\partial b}$ denotes a scalar-by-scalar derivative; since $z_i = w^{T}x_i + b$, it equals 1, which is why it drops out of the final line (and why the code computes db as the mean of G - Y).
Code implementation
Sigmoid activation function
> binary.sigmoid = function(x) 1 / (1 + exp(-x)) # the sigmoid function
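For extreme inputs the sigmoid output can round to exactly 0 or 1, which makes log(G) or log(1 - G) in the loss non-finite. A common guard, shown here only as an optional sketch and not part of the original post, is to clip the activations away from 0 and 1:
> binary.sigmoid.safe = function(x){ # hypothetical helper, not in the original
+ g = 1 / (1 + exp(-x))
+ pmin(pmax(g, 1e-12), 1 - 1e-12) } # keeps log(g) and log(1 - g) finite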
Gradient and loss computation
> binary.summary = function(w, b, X, Y){
+ m = ncol(X)
+ G = binary.sigmoid(t(w) %*% X + b) # sigmoid activations, 1 x m
+ dw = (1 / m) * (X %*% t(G - Y)) # gradient of the loss w.r.t. w, k x 1
+ db = (1 / m) * rowSums(G - Y) # gradient of the loss w.r.t. b
+ Loss = (-1 / m) * sum(Y * log(G) + (1 - Y) * log(1 - G)) # cross-entropy loss
+
+ gradient = list(dw, db) # collect the gradients
+ return(list(gradient, Loss)) }
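A quick way to sanity-check the analytic gradients above is a central finite-difference comparison. The sketch below uses tiny made-up values for X, Y, w, and b (purely illustrative, not from the post's dataset):
> set.seed(1)
> Xs = matrix(rnorm(6), nrow = 2) # 2 features, 3 samples
> Ys = matrix(c(0, 1, 1), nrow = 1) # toy labels
> ws = matrix(rnorm(2)); bs = 0.1; eps = 1e-6
> loss.at = function(w) binary.summary(w, bs, Xs, Ys)[[2]]
> dw1.numeric = (loss.at(ws + c(eps, 0)) - loss.at(ws - c(eps, 0))) / (2 * eps)
> dw.analytic = binary.summary(ws, bs, Xs, Ys)[[1]][[1]]
> c(dw1.numeric, dw.analytic[1]) # the two numbers should agree closely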
Binary prediction
> binary.predict = function(w, b, X){
+ m = ncol(X)
+ temp = matrix(0, nrow = m, ncol = 1) # predicted labels, m x 1
+ G = binary.sigmoid(t(w) %*% X + b)
+ for(i in 1:m){ # threshold the probabilities at 0.5
+ if(G[1,i] > 0.5) { temp[i,1] = 1
+ } else { temp[i,1] = 0}}
+ return(temp) }
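The loop can also be written as a single vectorized expression; binary.predict2 below is behaviorally identical (the name is just for illustration):
> binary.predict2 = function(w, b, X)
+ matrix(as.numeric(binary.sigmoid(t(w) %*% X + b) > 0.5), ncol = 1)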
Backpropagation
> binary.optimize = function(w, b, X, Y, learning.rate){
+ res = binary.summary(w, b, X, Y) # one pass computes both gradient and loss
+ grad = res[[1]]
+ Loss = res[[2]]
+
+ dw = matrix(grad[[1]])
+ db = grad[[2]]
+
+ w = w - learning.rate * dw # move against the gradient
+ b = b - learning.rate * db
+
+ params = list(w, b, Loss)
+ return(params) } # one gradient-descent step
Training and prediction
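binary.model below calls an initialization helper, binary.Initalize, that does not appear in this excerpt. A minimal sketch consistent with how it is used (a list holding a k x 1 weight matrix and a scalar bias; the zero initialization is an assumption) is:
> binary.Initalize = function(k){
+ w = matrix(0, nrow = k, ncol = 1) # initial weights (zero init is an assumption)
+ b = 0 # initial bias
+ return(list(w, b)) }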
> binary.model = function(Xtrain, ytrain, Xtest, ytest, learning.rate,
+ number){ # number of iterations
+ k = nrow(Xtrain)
+ w = binary.Initalize(k)[[1]]
+ b = binary.Initalize(k)[[2]]
+ df = matrix(0, nrow = number, ncol = 3, dimnames = list(NULL, c('train',
+ 'test', 'Loss'))) # per-iteration accuracy and loss
+ for(i in 1:number){
+
+ res = binary.optimize(w, b, Xtrain, ytrain, learning.rate)
+ w = as.matrix(res[[1]])
+ b = res[[2]]
+ ytrain.pred = binary.predict(w, b, Xtrain)
+ ytest.pred = binary.predict(w, b, Xtest)
+ df[i,] = c(mean(ytrain == ytrain.pred) * 100,
+ mean(ytest == ytest.pred) * 100, res[[3]])
+ }
+ return(df) }
Visualization
Classification accuracy and loss under different learning rates and iteration counts. Below we only show learning rates of 0.01, 0.005, and 0.002 with 500 and 1000 iterations; interested readers are welcome to try other settings and suggest improvements.
> S1 = binary.model(df_train, y_train, df_test, y_test, 0.01, 500)
> S2 = binary.model(df_train, y_train, df_test, y_test, 0.005, 500)
> S3 = binary.model(df_train, y_train, df_test, y_test, 0.002, 500)
> S4 = binary.model(df_train, y_train, df_test, y_test, 0.01, 1000)
> S5 = binary.model(df_train, y_train, df_test, y_test, 0.005, 1000)
> S6 = binary.model(df_train, y_train, df_test, y_test, 0.002, 1000)
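The plotting code that follows uses reshape2's melt with an id column named V4 and cowplot's plot_grid, which assumes each result matrix was first converted to a data frame with an appended iteration-index column. One way to do that (an assumption, not shown in the original) is:
> library(ggplot2); library(reshape2); library(cowplot)
> S1 = data.frame(S1, V4 = 1:nrow(S1)); S2 = data.frame(S2, V4 = 1:nrow(S2))
> S3 = data.frame(S3, V4 = 1:nrow(S3)); S4 = data.frame(S4, V4 = 1:nrow(S4))
> S5 = data.frame(S5, V4 = 1:nrow(S5)); S6 = data.frame(S6, V4 = 1:nrow(S6))
With this layout, melt orders rows by variable, so a slice like [1:1000,] for a 500-iteration run (or [1:2000,] for 1000 iterations) keeps the train and test accuracy series and drops the Loss column.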
> p1 = ggplot(melt(S1, id.vars = c('V4'))[1:1000,],
+ aes(x = V4, y = value, colour = variable))+
+ geom_point()+
+ geom_line()+
+ labs(colour = 'accuracy')+
+ xlab('')
>
> p2 = ggplot(melt(S2,id.vars = c('V4'))[1:1000,],
+ aes(x = V4, y = value, colour = variable))+
+ geom_point()+
+ geom_line()+
+ scale_colour_manual(values = c('#4DBBD5FF', '#00A087FF'))+
+ labs(colour = 'accuracy')+
+ xlab('')
>
>
> p3 = ggplot(melt(S3,id.vars = c('V4'))[1:1000,],
+ aes(x = V4, y = value, colour = variable))+
+ geom_point()+
+ geom_line()+
+ scale_colour_manual(values = c('#4DBBD5FF', '#00A087FF'))+
+ labs(colour = 'accuracy')+
+ xlab('')
>
>
> p4 = ggplot(melt(S4,id.vars = c('V4'))[1:2000,],
+ aes(x = V4, y = value, colour = variable))+
+ geom_point()+
+ geom_line()+
+ scale_colour_manual(values = c('#DC0000FF', '#7E6148FF'))+
+ labs(colour = 'accuracy')+
+ xlab('')
>
>
>
> p5 = ggplot(melt(S5,id.vars = c('V4'))[1:2000,],
+ aes(x = V4, y = value, colour = variable))+
+ geom_point()+
+ geom_line()+
+ scale_colour_manual(values = c('#3C5488FF', '#F39B7FFF'))+
+ labs(colour = 'accuracy')+
+ xlab('')
>
> p6 = ggplot(melt(S6,id.vars = c('V4'))[1:2000,],
+ aes(x = V4, y = value, colour = variable))+
+ geom_point()+
+ geom_line()+
+ scale_colour_manual(values = c('#DC0000FF', '#7E6148FF'))+
+ labs(colour = 'accuracy')+
+ xlab('')
>
> plot_grid(p1, p2, p3, nrow = 3, ncol = 1) # accuracy, 500-iteration runs
> plot_grid(p4, p5, p6, nrow = 3, ncol = 1) # accuracy, 1000-iteration runs
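The base-R loss plots below reference y1 through y6, which are not defined in this excerpt; presumably they are the Loss columns of the six runs, extracted along these lines (an assumption):
> y1 = S1[,'Loss']; y2 = S2[,'Loss']; y3 = S3[,'Loss'] # 500-iteration loss curves
> y4 = S4[,'Loss']; y5 = S5[,'Loss']; y6 = S6[,'Loss'] # 1000-iteration loss curves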
> plot(NULL, type = 'n', xlab = ' ', ylab = 'Loss',
+ xlim = c(1,500), ylim = c(0,1),
+ xaxt = 'n', yaxt = 'n', cex.lab = .7, main = 'iterations = 500')
> lines(y1, col = '#E64B35FF', lty = 1, lwd = 2)
> lines(y2, col = '#4DBBD5FF', lty = 1, lwd = 2)
> lines(y3, col = '#00A087FF', lty = 1, lwd = 2)
> legend('topright', legend = c('learning rate 0.01',
+ 'learning rate 0.005', 'learning rate 0.002'),
+ col = c('#E64B35FF', '#4DBBD5FF', '#00A087FF'),
+ lty = c(1, 1, 1), lwd = c(2, 2, 2),
+ cex = .65, inset = .001)
> axis(side = 1, col = '#7E6148FF', cex.axis = .7)
> axis(side = 2, col = '#7E6148FF', cex.axis = .7)
>
> plot(NULL, type = 'n', xlab = ' ', ylab = 'Loss',
+ xlim = c(1,1000), ylim = c(0,1),
+ xaxt = 'n', yaxt = 'n', cex.lab = .7, main = 'iterations = 1000')
> lines(y4, col = '#3C5488FF', lty = 2, lwd = 2)
> lines(y5, col = '#F39B7FFF', lty = 2, lwd = 2)
> lines(y6, col = '#8491B4FF', lty = 2, lwd = 2)
> legend('topright', legend = c('learning rate 0.01',
+ 'learning rate 0.005', 'learning rate 0.002'),
+ col = c('#3C5488FF', '#F39B7FFF', '#8491B4FF'),
+ lty = c(2, 2, 2), lwd = c(2, 2, 2),
+ cex = .65, inset = .001)
> axis(side = 1, col = '#7E6148FF', cex.axis = .7)
> axis(side = 2, col = '#7E6148FF', cex.axis = .7)
Data and code downloads
The link is as follows:
Download the data and code for this post.
That is all for implementing logistic regression with cross-entropy loss and backpropagation; questions and suggestions from interested readers are very welcome!