Logistic regression on Hadoop with R (rmr2)

# Copyright 2011 Revolution Analytics
#    
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#      http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

 
## see spark implementation http://www.spark-project.org/examples.html
## see nice derivation here http://people.csail.mit.edu/jrennie/writing/lr.pdf

library(rmr2)

## @knitr logistic.regression-signature
## Fit a logistic regression separating plane by repeated gradient steps,
## where each gradient is computed with one rmr2 map-reduce job.
##
## Arguments:
##   input       data handed to mapreduce(); the map function receives it in
##               chunks as a matrix whose first column is the label and whose
##               remaining columns are the features. The gradient expression
##               presumably expects labels coded as -1/+1 -- confirm against
##               the data you pass in.
##   iterations  number of gradient steps; 0 returns the all-zero start plane.
##   dims        number of feature columns (length of the plane).
##   alpha       step size multiplied into each gradient.
##   verbose     when TRUE (the default) the map function prints its
##               intermediate values, as the original traced version did;
##               pass FALSE to silence the trace.
##
## Returns the final plane as a 1 x dims matrix.
logistic.regression = 
  function(input, iterations, dims, alpha, verbose = TRUE){

## @knitr logistic.regression-map
  lr.map =          
    function(., M) {
      Y = M[, 1]
      ## drop = FALSE keeps X a matrix even for a one-row chunk or a single
      ## feature column, so the column sums in the reducer stay well defined
      X = M[, -1, drop = FALSE]
      ## compute each intermediate once; the trace below reuses them
      weights = g(-Y * as.numeric(X %*% t(plane)))
      contributions = Y * X * weights
      if (isTRUE(verbose)) {
        print("M")
        print(M)
        print("plane")
        print(plane)
        print("X:")
        print(X)
        print("Y:")
        print(Y)
        print("X*Y")
        print(X * Y)
        print("g(-Y * as.numeric(X %*% t(plane)))")
        print(weights)
        print("Y * X * g(-Y * as.numeric(X %*% t(plane)))")
        print(contributions)
        print("t(as.matrix(apply(Y * X * g(-Y * as.numeric(X %*% t(plane))),2,sum)))")
        print(t(colSums(contributions)))
      }
      ## a single constant key sends every row to the same reduce group
      keyval(1, contributions)}
## @knitr logistic.regression-reduce
  ## sums the per-row contributions column-wise into one 1 x dims row
  lr.reduce =
    function(k, Z) 
      keyval(k, t(colSums(Z)))
## @knitr logistic.regression-main
  ## plane and g are picked up by lr.map through its enclosing environment
  plane = t(rep(0, dims))
  g = function(z) 1/(1 + exp(-z))
  ## seq_len() performs no step for iterations = 0 (1:0 would run twice)
  for (i in seq_len(iterations)) {
    gradient = 
      values(
        from.dfs(
          mapreduce(
            input,
            map = lr.map,     
            reduce = lr.reduce,
            combine = TRUE)))
    plane = plane + alpha * gradient }
  plane }
## @knitr end

## Driver: run one gradient step on the same generated data with each rmr2
## backend, then require that both backends give the same plane.
test.size = 10
out = list()
for (be in c("local", "hadoop")) {
  rmr.options(backend = be)
  ## create test set; the seed is reset so every backend sees the same data
  set.seed(0)
## @knitr logistic.regression-data
  eps = rnorm(test.size)
  testdata = 
    to.dfs(
      as.matrix(
        data.frame(
          y = ifelse(eps > 0, 1, -1),
          x1 = seq_len(test.size), 
          x2 = seq_len(test.size) + eps)))
## @knitr end  
  out[[be]] = 
## @knitr logistic.regression-run 
    logistic.regression(
      testdata, iterations = 1, dims = 2, alpha = 0.05)
## @knitr end  
  ## note: the maximum likelihood solution diverges to (-inf, inf) for a
  ## separable dataset such as the one above
}
backends.agree = all.equal(out$local, out$hadoop, tolerance = 1E-7)
stopifnot(isTRUE(backends.agree))


[1] "M"
       y x1        x2
 [1,]  1  1  2.262954
 [2,] -1  2  1.673767
 [3,]  1  3  4.329799
 [4,]  1  4  5.272429
 [5,]  1  5  5.414641
 [6,] -1  6  4.460050
 [7,] -1  7  6.071433
 [8,] -1  8  7.705280
 [9,] -1  9  8.994233
[10,]  1 10 12.404653
[1] "plane"
     [,1] [,2]
[1,]    0    0
[1] "X:"
      x1        x2
 [1,]  1  2.262954
 [2,]  2  1.673767
 [3,]  3  4.329799
 [4,]  4  5.272429
 [5,]  5  5.414641
 [6,]  6  4.460050
 [7,]  7  6.071433
 [8,]  8  7.705280
 [9,]  9  8.994233
[10,] 10 12.404653
[1] "Y:"
 [1]  1 -1  1  1  1 -1 -1 -1 -1  1
[1] "X*Y"
      x1        x2
 [1,]  1  2.262954
 [2,] -2 -1.673767
 [3,]  3  4.329799
 [4,]  4  5.272429
 [5,]  5  5.414641
 [6,] -6 -4.460050
 [7,] -7 -6.071433
 [8,] -8 -7.705280
 [9,] -9 -8.994233
[10,] 10 12.404653
[1] "g(-Y * as.numeric(X %*% t(plane)))"
 [1] 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
[1] "Y * X * g(-Y * as.numeric(X %*% t(plane)))"
        x1         x2
 [1,]  0.5  1.1314771
 [2,] -1.0 -0.8368833
 [3,]  1.5  2.1648996
 [4,]  2.0  2.6362147
 [5,]  2.5  2.7073207
 [6,] -3.0 -2.2300250
 [7,] -3.5 -3.0357165
 [8,] -4.0 -3.8526398
 [9,] -4.5 -4.4971164
[10,]  5.0  6.2023267
[1] "t(as.matrix(apply(Y * X * g(-Y * as.numeric(X %*% t(plane))),2,sum)))"
       x1        x2
[1,] -4.5 0.3898579




评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值