前面已经陈述过logistic的理论的了,在此就不赘述了(http://blog.csdn.net/legotime/article/details/51312393)
Logistic 函数(分类时有个名字叫Sigmoid函数)如下:
logistic函数早期是用于人口预测的。但随着人们对其的应用扩展,开始慢慢应用于分类问题,而且是神经网络中一个
经常使用的过渡函数,图1是将logistic函数
图1
它的原理是:在分二类的情况下,当h的计算值大于0.5时,让h等于1,h的计算值小于等于于0.5时,让h等于0。這样
对于输入一个X
那么结果就分类 0 或 1,所以达到了分类的效果。当然logistic函数可以应用于多个类的情况。
-----------------------------------------------------------------------------------------------------
spark Logistic模型训练图
-------------------------------------------------------------------------------------------------------
源码分析
package org.apache.spark.mllib.classification
import org.apache.spark.SparkContext
import org.apache.spark.annotation.Since
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.classification.impl.GLMClassificationModel
import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors}
import org.apache.spark.mllib.linalg.BLAS.dot
import org.apache.spark.mllib.optimization._
import org.apache.spark.mllib.pmml.PMMLExportable
import org.apache.spark.mllib.regression._
import org.apache.spark.mllib.util.{DataValidators, Loader, Saveable}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.storage.StorageLevel
/**
* 利用(Multinomial/Binary)logistic回归来训练分类模型
*
* @param weights 特征的权重
* @param intercept 偏置(二元回归的时候是一个值,在多元回归的时候会和特征融合在一起.)
* @param numFeatures 特征的维度
* @param numClasses 多元回归分析中的类分类问题的可能结果的个数。默认情况下,它是二元Logistic回归,numclasses将被设置为2。
*/
@Since("0.8.0")
class LogisticRegressionModel @Since("1.3.0") (
@Since("1.0.0") override val weights: Vector,
@Since("1.0.0") override val intercept: Double,
@Since("1.3.0") val numFeatures: Int,
@Since("1.3.0") val numClasses: Int)
extends GeneralizedLinearModel(weights, intercept) with ClassificationModel with Serializable
with Saveable with PMMLExportable {
if (numClasses == 2) {
require(weights.size == numFeatures,
s"LogisticRegressionModel with numClasses = 2 was given non-matching values:" +
s" numFeatures = $numFeatures, but weights.size = ${weights.size}")
} else {
val weightsSizeWithoutIntercept = (numClasses - 1) * numFeatures
val weightsSizeWithIntercept = (numClasses - 1) * (numFeatures + 1)
require(weights.size == weightsSizeWithoutIntercept || weights.size == weightsSizeWithIntercept,
s"LogisticRegressionModel.load with numClasses = $numClasses and numFeatures = $numFeatures" +
s" expected weights of length $weightsSizeWithoutIntercept (without intercept)" +
s" or $weightsSizeWithIntercept (with intercept)," +
s" but was given weights of length ${weights.size}")
}
private val dataWithBiasSize: Int = weights.size / (numClasses - 1)
private val weightsArray: Array[Double] = weights match {
case dv: DenseVector => dv.values
case _ =>
throw new IllegalArgumentException(
s"weights only supports dense vector but got type ${weights.getClass}.")
}
/**
* 构建一个LogisticRegressionModel,权重和偏置都是二维的。
*/
@Since("1.0.0")
def this(weights: Vector, intercept: Double) = this(weights, intercept, weights.size, 2)
private var threshold: Option[Double] = Some(0.5)
/**
* 设置 阈值,对于二分类情况下。這个阈值用于当y大于它时,就在 positive,当小于它时,就分来negative
* 默认情况之恶个這个阈值设置为 0.5
*/
@Since("1.0.0")
def setThreshold(threshold: Double): this.type = {
this.threshold = Some(threshold)
this
}
/**
*返回的阈值(如果有的话),用于将原始预测分数转换为0 / 1预测。它仅用于二进制分类。
*/
@Since("1.3.0")
def getThreshold: Option[Double] = threshold
/**
* 清除阈值,以便“预测”将输出预测值。
* 它仅用于二进制分类
*/
@Since("1.0.0")
def clearThreshold(): this.type = {
threshold = None
this
}
override protected def predictPoint(
dataMatrix: Vector,
weightMatrix: Vector,
intercept: Double) = {
require(dataMatrix.size == numFeatures)
// 如果 dataMatrix和 weightMatrix 具有相同的维度, 那么它是二分类的logistic回归
if (numClasses == 2) {
val margin = dot(weightMatrix, dataMatrix) + intercept
val score = 1.0 / (1.0 + math.exp(-margin))
threshold match {
case Some(t) => if (score > t) 1.0 else 0.0
case None => score
}
} else {
/**
* Compute and find the one with maximum margins. If the maxMargin is negative, then the
* prediction result will be the first class.
*
* PS, if you want to compute the probabilities for each outcome instead of the outcome
* with maximum probability, remember to subtract the maxMargin from margins if maxMargin
* is positive to prevent overflow.
*/
var bestClass = 0
var maxMargin = 0.0
val withBias = dataMatrix.size + 1 == dataWithBiasSize
(0 until numClasses - 1).foreach { i =>
var margin = 0.0
dataMatrix.foreachActive { (index, value) =>
if (value != 0.0) margin += value * weightsArray((i * dataWithBiasSize) + index)
}
// Intercept is required to be added into margin.
if (withBias) {
margin += weightsArray((i * dataWithBiasSize) + dataMatrix.size)
}
if (margin > maxMargin) {
maxMargin = margin
bestClass = i + 1
}
}
bestClass.toDouble
}
}
@Since("1.3.0")
override def save(sc: SparkContext, path: String): Unit = {
GLMClassificationModel.SaveLoadV1_0.save(sc, path, this.getClass.getName,
numFeatures, numClasses, weights, intercept, threshold)
}
override protected def formatVersion: String = "1.0"
override def toString: String = {
s"${super.toString}, numClasses = ${numClasses}, threshold = ${threshold.getOrElse("None")}"
}
}
@Since("1.3.0")
object LogisticRegressionModel extends Loader[LogisticRegressionModel] {
@Since("1.3.0")
override def load(sc: SparkContext, path: String): LogisticRegressionModel = {
val (loadedClassName, version, metadata) = Loader.loadMetadata(sc, path)
// Hard-code class name string in case it changes in the future
val classNameV1_0 = "org.apache.spark.mllib.classification.LogisticRegressionModel"
(loadedClassName, version) match {
case (className, "1.0") if className == classNameV1_0 =>
val (numFeatures, numClasses) = ClassificationModel.getNumFeaturesClasses(metadata)
val data = GLMClassificationModel.SaveLoadV1_0.loadData(sc, path, classNameV1_0)
// numFeatures, numClasses, weights are checked in model initialization
val model =
new LogisticRegressionModel(data.weights, data.intercept, numFeatures, numClasses)
data.threshold match {
case Some(t) => model.setThreshold(t)
case None => model.clearThreshold()
}
model
case _ => throw new Exception(
s"LogisticRegressionModel.load did not recognize model with (className, format version):" +
s"($loadedClassName, $version). Supported:\n" +
s" ($classNameV1_0, 1.0)")
}
}
}
/**
* 用随机梯度下降算法来训练二分类的logitic回归的分类模型
* 默认情况下用L2正则化,它可以通过[[LogisticRegressionWithSGD.optimizer]].来改变
* note:二分类以上的K分类的logistic回归分类 ,Lables 可以为 {0, 1, ..., k - 1}
*/
@Since("0.8.0")
@deprecated("Use ml.classification.LogisticRegression or LogisticRegressionWithLBFGS", "2.0.0")
class LogisticRegressionWithSGD private[mllib] (
private var stepSize: Double,
private var numIterations: Int,
private var regParam: Double,
private var miniBatchFraction: Double)
extends GeneralizedLinearAlgorithm[LogisticRegressionModel] with Serializable {
private val gradient = new LogisticGradient()
private val updater = new SquaredL2Updater()
@Since("0.8.0")
override val optimizer = new GradientDescent(gradient, updater)
.setStepSize(stepSize)
.setNumIterations(numIterations)
.setRegParam(regParam)
.setMiniBatchFraction(miniBatchFraction)
override protected val validators = List(DataValidators.binaryLabelValidator)
/**
* 构建一个默认情况下的逻辑回归,默认参数是{stepSize: 1.0,numIterations: 100, regParm: 0.01, miniBatchFraction: 1.0}.
*/
@Since("0.8.0")
def this() = this(1.0, 100, 0.01, 1.0)
override protected[mllib] def createModel(weights: Vector, intercept: Double) = {
new LogisticRegressionModel(weights, intercept)
}
}
/**
* 最先用的方法是随机梯度下降
* NOTE: Logistic 回归的label应该是 {0, 1}
*/
@Since("0.8.0")
@deprecated("Use ml.classification.LogisticRegression or LogisticRegressionWithLBFGS", "2.0.0")
object LogisticRegressionWithSGD {
// NOTE(shivaram): We use multiple train methods instead of default arguments to support
// Java programs.
/**
* 给定一个 pair RDD(label, features) 训练一个logistic回归模型。我们通过特定步长来固定迭代次数。
* 每次迭代用miniBatchFraction来计算梯度。
* NOTE: Labels used in Logistic Regression should be {0, 1}
*
* @param input RDD of (label, array of features) pairs.
* @param numIterations Number of iterations of gradient descent to run.(迭代次数)
* @param stepSize Step size to be used for each iteration of gradient descent.(步长)
* @param miniBatchFraction Fraction of data to be used per iteration.(一次用于迭代的数据量)
* @param initialWeights Initial set of weights to be used. Array should be equal in size to
* the number of features in the data.
*/
@Since("1.0.0")
def train(
input: RDD[LabeledPoint],
numIterations: Int,
stepSize: Double,
miniBatchFraction: Double,
initialWeights: Vector): LogisticRegressionModel = {
new LogisticRegressionWithSGD(stepSize, numIterations, 0.0, miniBatchFraction)
.run(input, initialWeights)
}
/**
/**
* 给定一个 pair RDD(label, features) 训练一个logistic回归模型。我们通过特定步长来固定迭代次数。
* 每次迭代用miniBatchFraction来计算梯度。
* NOTE: Labels used in Logistic Regression should be {0, 1}
*
* @param input RDD of (label, array of features) pairs.
* @param numIterations Number of iterations of gradient descent to run.
* @param stepSize Step size to be used for each iteration of gradient descent.
* @param miniBatchFraction Fraction of data to be used per iteration.
*/
@Since("1.0.0")
def train(
input: RDD[LabeledPoint],
numIterations: Int,
stepSize: Double,
miniBatchFraction: Double): LogisticRegressionModel = {
new LogisticRegressionWithSGD(stepSize, numIterations, 0.0, miniBatchFraction)
.run(input)
}
/**
/**
* 给定一个 pair RDD(label, features) 训练一个logistic回归模型。我们通过特定步长来固定迭代次数。
* 每次迭代用miniBatchFraction来计算梯度。
* NOTE: Labels used in Logistic Regression should be {0, 1}
*
* @param input RDD of (label, array of features) pairs.
* @param stepSize Step size to be used for each iteration of Gradient Descent.
* @param numIterations Number of iterations of gradient descent to run.
* @return a LogisticRegressionModel which has the weights and offset from training.
*/
@Since("1.0.0")
def train(
input: RDD[LabeledPoint],
numIterations: Int,
stepSize: Double): LogisticRegressionModel = {
train(input, numIterations, stepSize, 1.0)
}
/**
/**
* 给定一个 pair RDD(label, features) 训练一个logistic回归模型。我们通过特定步长来固定迭代次数。
* 每次迭代用miniBatchFraction来计算梯度。
* NOTE: Labels used in Logistic Regression should be {0, 1}
*
* @param input RDD of (label, array of features) pairs.
* @param numIterations Number of iterations of gradient descent to run.
* @return a LogisticRegressionModel which has the weights and offset from training.
*/
@Since("1.0.0")
def train(
input: RDD[LabeledPoint],
numIterations: Int): LogisticRegressionModel = {
train(input, numIterations, 1.0, 1.0)
}
}
/**
* 用Limited-memory BFGS算法来训练二分类/K分类的logitic回归的分类模型,默认情况下是用L2正则化
* note:二分类以上的K分类的logistic回归分类 ,Lables 可以为 {0, 1, ..., k - 1}
* 早期是用 LogisticRegressionWithLBFGS来实现正则化,包括偏置。如果updates是(L1Updater, or SquaredL2Updater) ,
* 那么它应该是来自 ml.LogisticRegression
* 否则就是现在的 mllib下的广义线性算法(GeneralizedLinearAlgorithm)来训练,
*/
@Since("1.1.0")
class LogisticRegressionWithLBFGS
extends GeneralizedLinearAlgorithm[LogisticRegressionModel] with Serializable {
this.setFeatureScaling(true)
@Since("1.1.0")
override val optimizer = new LBFGS(new LogisticGradient, new SquaredL2Updater)
override protected val validators = List(multiLabelValidator)
private def multiLabelValidator: RDD[LabeledPoint] => Boolean = { data =>
if (numOfLinearPredictor > 1) {
DataValidators.multiLabelValidator(numOfLinearPredictor + 1)(data)
} else {
DataValidators.binaryLabelValidator(data)
}
}
/**
* 在多分类(k)的logistic回归中,设置用于类分类问题的可能结果的数量。默认情况下k = 2
*/
@Since("1.3.0")
def setNumClasses(numClasses: Int): this.type = {
require(numClasses > 1)
numOfLinearPredictor = numClasses - 1
if (numClasses > 2) {
optimizer.setGradient(new LogisticGradient(numClasses))
}
this
}
override protected def createModel(weights: Vector, intercept: Double) = {
if (numOfLinearPredictor == 1) {
new LogisticRegressionModel(weights, intercept)
} else {
new LogisticRegressionModel(weights, intercept, numFeatures, numOfLinearPredictor + 1)
}
}
/**
* Run Logistic Regression with the configured parameters on an input RDD
* of LabeledPoint entries.
*
*
* 如果在之前声明了更新的方法是 ml包下面的,那么就是,如果不是那么选择的是mllib包下的更新方法
*/
override def run(input: RDD[LabeledPoint]): LogisticRegressionModel = {
run(input, generateInitialWeights(input), userSuppliedWeights = false)
}
/**
* Run Logistic Regression with the configured parameters on an input RDD
* of LabeledPoint entries.
*
*
* 如果在之前声明了更新的方法是 ml包下面的,那么就是,如果不是那么选择的是mllib包下的更新方法
*/
*note:因为在ml包下没有配置LBFGS更新方法,所以optimizer.setNumCorrections()是无效的
*/
override def run(input: RDD[LabeledPoint], initialWeights: Vector): LogisticRegressionModel = {
run(input, initialWeights, userSuppliedWeights = true)
}
private def run(input: RDD[LabeledPoint], initialWeights: Vector, userSuppliedWeights: Boolean):
LogisticRegressionModel = {
// ml's Logistic regression only supports binary classification currently.
if (numOfLinearPredictor == 1) {
def runWithMlLogisitcRegression(elasticNetParam: Double) = {
// Prepare the ml LogisticRegression based on our settings
val lr = new org.apache.spark.ml.classification.LogisticRegression()
lr.setRegParam(optimizer.getRegParam())
lr.setElasticNetParam(elasticNetParam)
lr.setStandardization(useFeatureScaling)
if (userSuppliedWeights) {
val uid = Identifiable.randomUID("logreg-static")
lr.setInitialModel(new org.apache.spark.ml.classification.LogisticRegressionModel(
uid, initialWeights.asML, 1.0))
}
lr.setFitIntercept(addIntercept)
lr.setMaxIter(optimizer.getNumIterations())
lr.setTol(optimizer.getConvergenceTol())
// Convert our input into a DataFrame
val sqlContext = new SQLContext(input.context)
import sqlContext.implicits._
val df = input.map(_.asML).toDF()
// Determine if we should cache the DF
val handlePersistence = input.getStorageLevel == StorageLevel.NONE
// Train our model
val mlLogisticRegresionModel = lr.train(df, handlePersistence)
// convert the model
val weights = Vectors.dense(mlLogisticRegresionModel.coefficients.toArray)
createModel(weights, mlLogisticRegresionModel.intercept)
}
optimizer.getUpdater() match {
case x: SquaredL2Updater => runWithMlLogisitcRegression(0.0)
case x: L1Updater => runWithMlLogisitcRegression(1.0)
case _ => super.run(input, initialWeights)
}
} else {
super.run(input, initialWeights)
}
}
}
SparkML实验
import org.apache.spark.mllib.classification.{LogisticRegressionModel, LogisticRegressionWithLBFGS} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils object LinearRegressionWithSGDExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("LinearRegressionWithSGDExample").setMaster("local") val sc = new SparkContext(conf) val data = MLUtils.loadLibSVMFile(sc, "C:\\Users\\alienware\\IdeaProjects\\sparkCore\\data\\mllib\\sample_libsvm_data.txt") // Split data into training (60%) and test (40%). val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) val training = splits(0).cache() val test = splits(1) // Run training algorithm to build the model val model = new LogisticRegressionWithLBFGS() .setNumClasses(2) .run(training) // Compute raw scores on the test set. val predictionAndLabels = test.map { case LabeledPoint(label, features) => val prediction = model.predict(features) (prediction, label) } predictionAndLabels.foreach(println) // Get evaluation metrics. val metrics = new MulticlassMetrics(predictionAndLabels) val precision = metrics.precision println("Precision = " + precision) // Save and load model model.save(sc, "target/tmp/scalaLogisticRegressionWithLBFGSModel") val sameModel = LogisticRegressionModel.load(sc, "target/tmp/scalaLogisticRegressionWithLBFGSModel") sc.stop() } } //预测数据和实际数据(1.0,1.0)
(1.0,1.0)
(0.0,0.0)
(1.0,1.0)
(0.0,0.0)
(0.0,0.0)
(1.0,1.0)
(1.0,1.0)
(1.0,1.0)
(0.0,0.0)
(1.0,1.0)
(1.0,1.0)
(0.0,0.0)
(1.0,1.0)
(0.0,0.0)
(0.0,0.0)
(1.0,1.0)
(1.0,1.0)
(1.0,1.0)
(1.0,1.0)
(0.0,0.0)
(1.0,1.0)
(1.0,1.0)
(1.0,1.0)
(1.0,1.0)
(0.0,0.0)
(1.0,1.0)
(0.0,0.0)
(1.0,1.0)
(1.0,1.0)
(1.0,1.0)
(1.0,1.0)
(1.0,1.0)
(0.0,0.0)
Precision = 1.0