# Copyright 2011 Revolution Analytics
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
## see spark implementation http://www.spark-project.org/examples.html
## see nice derivation here http://people.csail.mit.edu/jrennie/writing/lr.pdf
library(rmr2)
## @knitr logistic.regression-signature
## Fit a logistic regression by batch gradient ascent; every step obtains
## the gradient of the log-likelihood from one rmr2 mapreduce job.
##
## input:      mapreduce input whose records reach the map function as a
##             numeric matrix, label in column 1 (expected to be coded
##             -1/+1) and the features in the remaining columns.
## iterations: number of gradient steps; 0 returns the all-zero start plane.
## dims:       number of feature columns.
## alpha:      step size applied to each gradient.
## Returns the fitted plane as a 1 x dims matrix.
logistic.regression <-
  function(input, iterations, dims, alpha) {
## @knitr logistic.regression-map
    ## Emit, under the single key 1, one gradient row per observation.
    ## `plane` and `g` are looked up in the enclosing function when the
    ## map runs, so each job sees the current iterate.
    lr.map <-
      function(., M) {
        Y <- M[, 1]
        ## drop = FALSE keeps X a matrix for one-row or one-feature chunks,
        ## so the emitted value always has one column per feature
        X <- M[, -1, drop = FALSE]
        keyval(
          1,
          Y * X *
            g(-Y * as.numeric(X %*% t(plane))))
      }
## @knitr logistic.regression-reduce
    ## Collapse all rows for a key into one 1 x dims row of column sums.
    ## Summation is associative, which is what lets it double as combiner.
    lr.reduce <-
      function(k, Z)
        keyval(k, t(as.matrix(colSums(Z))))
## @knitr logistic.regression-main
    plane <- t(rep(0, dims))
    g <- function(z) 1 / (1 + exp(-z))
    ## seq_len() runs zero steps for iterations = 0 (1:0 would run two)
    for (i in seq_len(iterations)) {
      gradient <-
        values(
          from.dfs(
            mapreduce(
              input,
              map = lr.map,
              reduce = lr.reduce,
              combine = TRUE)))
      plane <- plane + alpha * gradient
    }
    plane
  }
## @knitr end
## Fit one gradient step on the same synthetic data under each rmr2
## backend and keep the resulting planes, keyed by backend name.
out <- list()
test.size <- 10
for (be in c("local", "hadoop")) {
  rmr.options(backend = be)
## create test set
  ## reseeding per backend makes both runs see identical data
  set.seed(0)
## @knitr logistic.regression-data
  eps <- rnorm(test.size)
  testdata <-
    to.dfs(
      cbind(
        y = ifelse(eps > 0, 1, -1),
        x1 = seq_len(test.size),
        x2 = seq_len(test.size) + eps))
## @knitr end
  out[[be]] <-
## @knitr logistic.regression-run
    logistic.regression(
      testdata, 1, 2, 0.05)
## @knitr end
  ## a separable dataset such as the one above has no finite maximum
  ## likelihood solution: the estimate diverges towards (-inf, inf)
}
## both backends must agree on the fitted plane
stopifnot(
  isTRUE(
    all.equal(
      out[["local"]],
      out[["hadoop"]],
      tolerance = 1E-7)))
[1] "M"
y x1 x2
[1,] 1 1 2.262954
[2,] -1 2 1.673767
[3,] 1 3 4.329799
[4,] 1 4 5.272429
[5,] 1 5 5.414641
[6,] -1 6 4.460050
[7,] -1 7 6.071433
[8,] -1 8 7.705280
[9,] -1 9 8.994233
[10,] 1 10 12.404653
[1] "plane"
[,1] [,2]
[1,] 0 0
[1] "X:"
x1 x2
[1,] 1 2.262954
[2,] 2 1.673767
[3,] 3 4.329799
[4,] 4 5.272429
[5,] 5 5.414641
[6,] 6 4.460050
[7,] 7 6.071433
[8,] 8 7.705280
[9,] 9 8.994233
[10,] 10 12.404653
[1] "Y:"
[1] 1 -1 1 1 1 -1 -1 -1 -1 1
[1] "X*Y"
x1 x2
[1,] 1 2.262954
[2,] -2 -1.673767
[3,] 3 4.329799
[4,] 4 5.272429
[5,] 5 5.414641
[6,] -6 -4.460050
[7,] -7 -6.071433
[8,] -8 -7.705280
[9,] -9 -8.994233
[10,] 10 12.404653
[1] "g(-Y * as.numeric(X %*% t(plane)))"
[1] 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
[1] "Y * X * g(-Y * as.numeric(X %*% t(plane)))"
x1 x2
[1,] 0.5 1.1314771
[2,] -1.0 -0.8368833
[3,] 1.5 2.1648996
[4,] 2.0 2.6362147
[5,] 2.5 2.7073207
[6,] -3.0 -2.2300250
[7,] -3.5 -3.0357165
[8,] -4.0 -3.8526398
[9,] -4.5 -4.4971164
[10,] 5.0 6.2023267
[1] "t(as.matrix(apply(Y * X * g(-Y * as.numeric(X %*% t(plane))),2,sum)))"
x1 x2
[1,] -4.5 0.3898579