Spark MLlib Machine Learning (《Spark MLlib机器学习》), Chapter 15 code
1. Neural network class
package NN
import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import org.apache.spark.Logging
import org.apache.spark.mllib.linalg._
import breeze.linalg.{
Matrix => BM,
CSCMatrix => BSM,
DenseMatrix => BDM,
Vector => BV,
DenseVector => BDV,
SparseVector => BSV,
axpy => brzAxpy,
svd => brzSvd
}
import breeze.numerics.{
exp => Bexp,
tanh => Btanh
}
import scala.collection.mutable.ArrayBuffer
import java.util.Random
import scala.math._
/**
 * label: target (label) matrix
 * nna: output values of the nodes in every layer of the network, a(0), a(1), a(2), ...
 * error: error matrix between the output layer and the target values
 */
case class NNLabel(label: BDM[Double], nna: ArrayBuffer[BDM[Double]], error: BDM[Double]) extends Serializable
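// A sketch of the intended invariant (an assumption inferred from the fields
// above, following the DeepLearnToolbox convention this code mirrors): for
// layer = 3, nna holds the activations a(0) (input), a(1) (hidden) and
// a(2) (output), and error = label - a(2).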
/**
 * Configuration parameters
 */
case class NNConfig(
size: Array[Int],
layer: Int,
activation_function: String,
learningRate: Double,
momentum: Double,
scaling_learningRate: Double,
weightPenaltyL2: Double,
nonSparsityPenalty: Double,
sparsityTarget: Double,
inputZeroMaskedFraction: Double,
dropoutFraction: Double,
testing: Double,
output_function: String) extends Serializable
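// A minimal sketch (hypothetical values mirroring the defaults documented in
// NeuralNet below) of building a configuration by hand:
//   val cfg = NNConfig(Array(10, 5, 1), 3, "sigm", 2.0, 0.5, 1.0,
//     0.0, 0.0, 0.05, 0.0, 0.0, 0.0, "sigm")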
/**
 * NN (neural network)
 */
class NeuralNet(
private var size: Array[Int],
private var layer: Int,
private var activation_function: String,
private var learningRate: Double,
private var momentum: Double,
private var scaling_learningRate: Double,
private var weightPenaltyL2: Double,
private var nonSparsityPenalty: Double,
private var sparsityTarget: Double,
private var inputZeroMaskedFraction: Double,
private var dropoutFraction: Double,
private var testing: Double,
private var output_function: String,
private var initW: Array[BDM[Double]]) extends Serializable with Logging {
// var size=Array(5, 10, 7, 1)
// var layer=4
// var activation_function="tanh_opt"
// var learningRate=2.0
// var momentum=0.5
// var scaling_learningRate=1.0
// var weightPenaltyL2=0.0
// var nonSparsityPenalty=0.0
// var sparsityTarget=0.05
// var inputZeroMaskedFraction=0.0
// var dropoutFraction=0.0
// var testing=0.0
// var output_function="sigm"
/**
 * size = architecture;
 * n = numel(nn.size);
 * activation_function = sigm; activation function of the hidden layers: 'sigm' (sigmoid) or 'tanh_opt' (optimal tanh).
 * learningRate = 2; learning rate. Note: typically needs to be lower when using the 'sigm' activation function and non-normalized inputs.
 * momentum = 0.5; momentum.
 * scaling_learningRate = 1; scaling factor for the learning rate (applied each epoch).
 * weightPenaltyL2 = 0; L2 regularization penalty on the weights.
 * nonSparsityPenalty = 0; non-sparsity penalty on the hidden-unit activations.
 * sparsityTarget = 0.05; sparsity target.
 * inputZeroMaskedFraction = 0; fraction of inputs zeroed as noise; used for denoising autoencoders.
 * dropoutFraction = 0; dropout level: on each mini-batch, randomly drop this fraction of the hidden units (http://www.cs.toronto.edu/~hinton/absps/dropout.pdf).
 * testing = 0; internal variable; nntest sets this to one.
 * output = 'sigm'; output unit: 'sigm' (= logistic), 'softmax' or 'linear'.
 */
def this() = this(NeuralNet.Architecture, 3, NeuralNet.Activation_Function, 2.0, 0.5, 1.0, 0.0, 0.0, 0.05, 0.0, 0.0, 0.0, NeuralNet.Output, Array(BDM.zeros[Double](1, 1)))
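// For reference, the two hidden-layer activations named above are conventionally
// defined as follows (a sketch; the concrete implementations are assumed to live
// in the NeuralNet companion object):
//   sigm(x)     = 1.0 / (1.0 + exp(-x))
//   tanh_opt(x) = 1.7159 * tanh(2.0 / 3.0 * x)   // LeCun's scaled tanh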
/** Set the network architecture. Default: [10, 5, 1]. */
def setSize(size: Array[Int]): this.type = {
this.size = size
this
}
/** Set the number of layers. Default: 3. */
def setLayer(layer: Int): this.type = {
this.layer = layer
this
}
/** Set the hidden-layer activation function. Default: sigm. */
def setActivation_function(activation_function: String): this.type = {
this.activation_function = activation_function
this
}
/** Set the learning rate. Default: 2. */
def setLearningRate(learningRate: Double): this.type = {
this.learningRate = learningRate
this
}
/** Set the momentum. Default: 0.5. */
def setMomentum(momentum: Double): this.type = {
this.momentum = momentum
this
}
/** Set the learning-rate scaling factor (scaling_learningRate). Default: 1. */
def setScaling_learningRate(scaling_learningRate: Double): this.type = {
this.scaling_learningRate = scaling_learningRate
this
}
/** Set the L2 weight-penalty factor. Default: 0. */
def setWeightPenaltyL2(weightPenaltyL2: Double): this.type = {
this.weightPenaltyL2 = weightPenaltyL2
this
}
/** Set the sparsity-penalty factor. Default: 0. */
def setNonSparsityPenalty(nonSparsityPenalty: Double): this.type = {
this.nonSparsityPenalty = nonSparsityPenalty
this
}
/** Set the sparsity target. Default: 0.05. */
def setSparsityTarget(sparsityTarget: Double): this.type = {
this.sparsityTarget = sparsityTarget
this
}
/** Set the input zero-masking (noise) fraction. Default: 0. */
def setInputZeroMaskedFraction(inputZeroMaskedFraction: Double): this.type = {
this.inputZeroMaskedFraction = inputZeroMaskedFraction
this
}
/** Set the dropout fraction. Default: 0. */
def setDropoutFraction(dropoutFraction: Double): this.type = {
this.dropoutFraction = dropoutFraction
this
}
/** Set testing. Default: 0. */
def setTesting(testing: Double): this.type = {
this.testing = testing
this
}
/** Set the output function. Default: sigm. */
def setOutput_function(output_function: String): this.type = {
this.output_function = output_function
this
}
/** Set the initial weights. Default: a single 1x1 zero matrix, meaning random initialization is used. */
def setInitW(initW: Array[BDM[Double]]): this.type = {
this.initW = initW
this
}
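// Usage sketch (hypothetical parameter values): configure a network with the
// chained setters, then train it with opts = Array(batchsize, numepochs, validation):
//   val nn = new NeuralNet()
//     .setSize(Array(5, 7, 1))
//     .setLayer(3)
//     .setActivation_function("tanh_opt")
//     .setOutput_function("sigm")
//   val model = nn.NNtrain(train_d, Array(100.0, 20.0, 0.2))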
/**
 * Run the neural network training algorithm.
 */
def NNtrain(train_d: RDD[(BDM[Double], BDM[Double])], opts: Array[Double]): NeuralNetModel = {
val sc = train_d.sparkContext
var initStartTime = System.currentTimeMillis()
var initEndTime = System.currentTimeMillis()
// Assemble the training configuration (broadcast to the executors below)
var nnconfig = NNConfig(size, layer, activation_function, learningRate, momentum, scaling_learningRate,
weightPenaltyL2, nonSparsityPenalty, sparsityTarget, inputZeroMaskedFraction, dropoutFraction, testing,
output_function)
// Initialize the weights
var nn_W = NeuralNet.InitialWeight(size)
// If the caller supplied initial weights (anything other than the single
// 1x1 zero-matrix placeholder), use them instead of the random ones
if (!(initW.length == 1 && initW(0) == BDM.zeros[Double](1, 1))) {
for (i <- initW.indices) {
nn_W(i) = initW(i)
}
}
// Initialize the weight-velocity (momentum) matrices
var nn_vW = NeuralNet.InitialWeightV(size)
// Initialize each layer's average activation nn.p
// (average activations, for use with the sparsity penalty)
var nn_p = NeuralNet.InitialActiveP(size)
// Split the samples into training and cross-validation sets
val validation = opts(2)
val splitW1 = Array(1.0 - validation, validation)
val train_split1 = train_d.randomSplit(splitW1, System.nanoTime())
val train_t = train_split1(0)
val train_v = train_split1(1)
// m: number of training samples
val m = train_t.count
// batchsize is the mini-batch size used for batch gradient descent;
// from it, compute the number of batches per epoch
val batchsize = opts(0).toInt
val numepochs = opts(1).toInt
val numbatches = (m / batchsize).toInt
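// Example: with 10000 input samples and opts = Array(100.0, 20.0, 0.2), the 20%
// validation split leaves m of about 8000, so batchsize = 100, numepochs = 20,
// and numbatches = 8000 / 100 = 80.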
// L records the training error of every processed mini-batch
var L = Array.fill(numepochs * numbatches)(0.0)
var n = 0
// per-epoch loss on the training and validation sets
var loss_train_e = Array.fill(numepochs)(0.0)
var loss_val_e = Array.fill(numepochs)(0.0)
// numepochs is the number of passes (epochs) over the training data
for (i <- 1 to numepochs) {
initStartTime = System.currentTimeMillis()
// equal split weights: each of the numbatches groups gets 1/numbatches of the samples
val splitW2 = Array.fill(numbatches)(1.0 / numbatches)
// broadcast the configuration for this epoch
val bc_config = sc.broadcast(nnconfig)
for (l <- 1 to numbatches) {
// Broadcast the current weights and weight velocities
val bc_nn_W = sc.broadcast(nn_W)
val bc_nn_vW = sc.broadcast(nn_vW)
// Randomly partition the training samples by the group weights and take this batch's slice
val train_split2 = train_t.randomSplit(splitW2, System.nanoTime())
val batch_xy1 = train_split2(l - 1)
// val train_split3 = train_t.filter { f => (f._1 >= batchsize * (l - 1) + 1) && (f._1 <= batchsize * (l)) }
// val batch_xy1 = train_split3.map(f => (f._2, f._3))
// Add noise to the input (used by the denoising autoencoder)