logistic regression on hadoop

最新推荐文章于 2022-06-18 12:22:16 发布

xiewenbo

最新推荐文章于 2022-06-18 12:22:16 发布

阅读量836

点赞数

分类专栏： hadoop 回归分析 regression 机器学习和数据挖掘

hadoop 同时被 3 个专栏收录

104 篇文章 1 订阅

订阅专栏

机器学习和数据挖掘

64 篇文章 2 订阅

订阅专栏

回归分析 regression

7 篇文章 0 订阅

订阅专栏

# Copyright 2011 Revolution Analytics
#    
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#      http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

 
## see spark implementation http://www.spark-project.org/examples.html
## see nice derivation here http://people.csail.mit.edu/jrennie/writing/lr.pdf

library(rmr2)

## Fit a logistic regression separating plane by batch gradient ascent, where
## every gradient evaluation is one rmr2 mapreduce job.
##
## Arguments:
##   input       passed unchanged to mapreduce(); the map function treats each
##               chunk it receives as a matrix with the label in column 1 and
##               the predictors in the remaining columns. The update formula
##               is the one for labels coded -1/+1.
##   iterations  number of gradient steps; 0 returns the all-zero start plane.
##   dims        number of predictor columns, i.e. the length of the plane.
##   alpha       step size that multiplies the gradient at every step.
##   verbose     when TRUE (the default) the map function prints the chunk,
##               the current plane and every intermediate quantity; pass
##               FALSE to silence the trace.
##               NOTE(review): on the hadoop backend confirm where a mapper's
##               print() output ends up before relying on it.
##
## Returns the plane after the last step; it starts as a 1 x dims matrix and
## is assumed to stay one (the job is expected to yield one 1 x dims gradient).
## @knitr logistic.regression-signature
logistic.regression = 
  function(input, iterations, dims, alpha, verbose = TRUE){

## @knitr logistic.regression-map
  lr.map =          
    function(., M) {
      Y = M[, 1] 
      ## drop = FALSE keeps X a matrix even with a single predictor column,
      ## so the reducer's column sums keep working for dims = 1
      X = M[, -1, drop = FALSE]
      ## sigmoid weight of each row under the current plane, computed once
      weight = g(-Y * as.numeric(X %*% t(plane)))
      ## per-row gradient contributions, one row per observation
      contribution = Y * X * weight
      if (verbose) {
        print("M")
        print(M)
        print("plane")
        print(plane)
        print("X:")
        print(X)
        print("Y:")
        print(Y)
        print("X*Y")
        print(X * Y)
        print("g(-Y * as.numeric(X %*% t(plane)))")
        print(weight)
        print("Y * X * g(-Y * as.numeric(X %*% t(plane)))")
        print(contribution)
        print("t(as.matrix(apply(Y * X * g(-Y * as.numeric(X %*% t(plane))),2,sum)))")
        print(t(as.matrix(colSums(contribution))))
      }
      ## a single constant key sends every contribution to the same reduce group
      keyval(1, contribution)}
## @knitr logistic.regression-reduce
  ## collapse the stacked contributions into one row of column totals; also
  ## used as the combiner because the job is run with combine = TRUE
  lr.reduce =
    function(k, Z) 
      keyval(k, t(as.matrix(colSums(Z))))
## @knitr logistic.regression-main
  ## plane and g are found by lr.map through its enclosing environment
  plane = t(rep(0, dims))
  g = function(z) 1/(1 + exp(-z))
  ## seq_len() makes iterations = 0 a no-op instead of looping over c(1, 0)
  for (i in seq_len(iterations)) {
    gradient = 
      values(
        from.dfs(
          mapreduce(
            input,
            map = lr.map,     
            reduce = lr.reduce,
            combine = TRUE)))
    plane = plane + alpha * gradient }
  plane }
## @knitr end

## Driver: run one gradient step on the same synthetic data under each rmr2
## backend, keep the resulting plane per backend in `out`, then require the
## two results to match.
test.size = 10
out = list()
for (be in c("local", "hadoop")) {
  rmr.options(backend = be)
  ## reseed on every pass so both backends are fed identical noise
  set.seed(0)
## @knitr logistic.regression-data
  eps = rnorm(test.size)
  testdata = 
    to.dfs(
      as.matrix(
        data.frame(
          y = ifelse(eps > 0, 1, -1),
          x1 = seq_len(test.size), 
          x2 = seq_len(test.size) + eps)))
## @knitr end  
  out[[be]] = 
## @knitr logistic.regression-run 
    logistic.regression(
      input = testdata, iterations = 1, dims = 2, alpha = 0.05)
## @knitr end  
  ## the label is the sign of x2 - x1, so the data above is separable and the
  ## max likelihood solution diverges, (-inf, inf); only one step is taken
}
## both backends must produce the same plane up to numerical tolerance
stopifnot(
  isTRUE(all.equal(out[["local"]], out[["hadoop"]], tolerance = 1E-7)))

Output printed by the print() calls inside lr.map when the script above is run:

[1] "M"
y x1 x2
[1,] 1 1 2.262954
[2,] -1 2 1.673767
[3,] 1 3 4.329799
[4,] 1 4 5.272429
[5,] 1 5 5.414641
[6,] -1 6 4.460050
[7,] -1 7 6.071433
[8,] -1 8 7.705280
[9,] -1 9 8.994233
[10,] 1 10 12.404653
[1] "plane"
[,1] [,2]
[1,] 0 0
[1] "X:"
x1 x2
[1,] 1 2.262954
[2,] 2 1.673767
[3,] 3 4.329799
[4,] 4 5.272429
[5,] 5 5.414641
[6,] 6 4.460050
[7,] 7 6.071433
[8,] 8 7.705280
[9,] 9 8.994233
[10,] 10 12.404653
[1] "Y:"
[1] 1 -1 1 1 1 -1 -1 -1 -1 1
[1] "X*Y"
x1 x2
[1,] 1 2.262954
[2,] -2 -1.673767
[3,] 3 4.329799
[4,] 4 5.272429
[5,] 5 5.414641
[6,] -6 -4.460050
[7,] -7 -6.071433
[8,] -8 -7.705280
[9,] -9 -8.994233
[10,] 10 12.404653
[1] "g(-Y * as.numeric(X %*% t(plane)))"
[1] 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
[1] "Y * X * g(-Y * as.numeric(X %*% t(plane)))"
x1 x2
[1,] 0.5 1.1314771
[2,] -1.0 -0.8368833
[3,] 1.5 2.1648996
[4,] 2.0 2.6362147
[5,] 2.5 2.7073207
[6,] -3.0 -2.2300250
[7,] -3.5 -3.0357165
[8,] -4.0 -3.8526398
[9,] -4.5 -4.4971164
[10,] 5.0 6.2023267
[1] "t(as.matrix(apply(Y * X * g(-Y * as.numeric(X %*% t(plane))),2,sum)))"
x1 x2
[1,] -4.5 0.3898579