spark mllib之分类和回归

Problem Type Supported Methods
二分类: 线性SVM(linear SVMs), 逻辑回归(logistic regression), 决策树(decision trees), 随机森林(random forests), 梯度提升树(gradient-boosted trees), 朴素贝叶斯(naive Bayes)
多分类: 逻辑回归(logistic regression), 决策树(decision trees), 随机森林(random forests), 朴素贝叶斯(naive Bayes)
回归: 线性最小二乘法(linear least squares), LASSO回归, 岭回归(ridge regression), 决策树(decision trees), 随机森林(random forests), 梯度提升树(gradient-boosted trees), 保序回归(isotonic regression)

分类的目的是把样本划分到不同的类别中。最常见的是二分类,通常分为positive和negative两类;如果类别多于两个,则称为多分类。spark.mllib提供两种线性分类方法:支持向量机(SVM)和逻辑回归。线性SVM只支持二分类,逻辑回归则同时支持二分类和多分类。spark.mllib对这两种方法都支持L1和L2正则化。训练集由LabeledPoint对象组成,形如(label, v1, v2, v3, v4, …, vn),其中v都是数值;非数值特征需要先转成one-hot编码。

BinaryClassification:

package com.demo.spark.mllib

import scopt.OptionParser
import org.apache.spark.SparkConf
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.log4j.Logger
import org.apache.log4j.Level
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.optimization.L1Updater
import org.apache.spark.mllib.optimization.SquaredL2Updater
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
/**
 * Example application that trains a binary classifier (logistic regression or
 * linear SVM) on LIBSVM-formatted data and prints the area under the ROC curve
 * measured on a held-out 20% split.
 */
object BinaryCLassification {

  /** Training algorithms selectable from the command line. */
  object Alogrithm extends Enumeration {
    type Alogrithm = Value
    val SVM, LR = Value
  }

  /** Regularization types selectable from the command line. */
  object RegType extends Enumeration {
    type RegType = Value
    val L1, L2 = Value
  }

  import Alogrithm._
  import RegType._

  /**
   * Command-line parameters of the application.
   *
   * @param input         path to the labeled examples in LIBSVM format
   * @param numIterations number of optimizer iterations
   * @param stepSize      initial step size; applied to the SVM optimizer only
   * @param alogrithm     which model to train (LR or SVM)
   * @param regType       which updater to use (L1 or L2)
   * @param regParam      regularization parameter handed to the optimizer
   */
  case class Params(
      input: String = null,
      numIterations: Int = 100,
      stepSize: Double = 1,
      alogrithm: Alogrithm = LR,
      regType: RegType = L2,
      regParam: Double = 0.01) extends AbstractParams[Params]

  /**
   * Loads the data, trains the selected model on 80% of it and prints the
   * area under the ROC curve computed on the remaining 20%.
   *
   * @param params parsed command-line parameters
   */
  def run(params: Params): Unit = {
    val conf = new SparkConf()
    val sc = new SparkContext(conf)
    // Stop the context even when loading or training fails.
    try {
      Logger.getRootLogger.setLevel(Level.WARN)

      val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()
      val splits = examples.randomSplit(Array(0.8, 0.2))
      val training = splits(0).cache()
      val testing = splits(1).cache()

      // Materialize both splits while the parent RDD is still cached; without
      // an action here the cache on `examples` would be dropped before it was
      // ever populated and the input would be re-read for each split.
      training.count()
      testing.count()
      examples.unpersist(blocking = true)

      val updater = params.regType match {
        case L1 => new L1Updater
        case L2 => new SquaredL2Updater
      }

      val model = params.alogrithm match {
        case LR =>
          val lr = new LogisticRegressionWithLBFGS()
          lr.optimizer
            .setNumIterations(params.numIterations)
            .setUpdater(updater)
            .setRegParam(params.regParam)
          // clearThreshold() makes predict() return raw scores, which is what
          // the ROC computation below needs.
          lr.run(training).clearThreshold()
        case SVM =>
          val svm = new SVMWithSGD
          svm.optimizer
            .setNumIterations(params.numIterations)
            .setStepSize(params.stepSize)
            .setUpdater(updater)
            .setRegParam(params.regParam)
          svm.run(training).clearThreshold()
      }

      val prediction = model.predict(testing.map(_.features))
      val predictionAndLabel = prediction.zip(testing.map(_.label))
      val metrics = new BinaryClassificationMetrics(predictionAndLabel)
      println(metrics.areaUnderROC())
    } finally {
      sc.stop()
    }
  }

  /**
   * Parses the command line and runs the application; exits with status 1
   * when the arguments cannot be parsed.
   */
  def main(args: Array[String]): Unit = {
    val defaultParam = Params()
    val parser = new OptionParser[Params]("BinaryClassification") {
      head("BinaryClassification: an example app for binary classification.")
      opt[Int]("numIterations").text("Number of iteration").action((x, c) => c.copy(numIterations = x))
      opt[Double]("stepSize").text("initial step size(ignored by logistic regression)," +
          s"default: ${defaultParam.stepSize}").action((x, c) => c.copy(stepSize = x))
      opt[String]("alogrithm").text(s"alogrithm (${Alogrithm.values.mkString(",")})," +
          s"default:${defaultParam.alogrithm}").action((x, c) => c.copy(alogrithm = Alogrithm.withName(x)))
      opt[String]("regType").text(s"regularization type(${RegType.values.mkString(",")})" +
          s"deafult:${defaultParam.regType}").action((x, c) => c.copy(regType = RegType.withName(x)))
      // Parsed as a Double and stored; previously the value was read and discarded.
      opt[Double]("regParam").text(s"regularization parameter,default:${defaultParam.regParam}")
        .action((x, c) => c.copy(regParam = x))
      arg[String]("<input>").required().text("input path to labeled examples in LIBSVM format").action((x, c) => c.copy(input = x))
      note(
        """
           For example, the following command runs this app on a synthetic dataset:

           bin/spark-submit --class com.demo.spark.mllib.BinaryCLassification \
            examples/target/scala-*/spark-examples-*.jar \
            --alogrithm LR --regType L2 --regParam 1.0 \
            data/mllib/sample_binary_classification_data.txt
        """
      )
    }
    parser.parse(args, defaultParam) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

}


import scala.reflect.runtime.universe._

/**
 * Base class for parameter case classes.
 *
 * Replaces the default [[toString]] with a rendering that lists every case
 * class field of `T` by name and current value.
 *
 * @tparam T the concrete parameter case class extending this base
 */
abstract class AbstractParams[T: TypeTag] {

  /**
   * Renders the case accessors declared by `T`, read reflectively from this
   * instance, as one block:
   * {
   *   [field name]:\t[field value],\n
   *   [field name]:\t[field value]\n
   * }
   * Entries are separated by a comma followed by a newline.
   */
  override def toString: String = {
    val self = runtimeMirror(getClass.getClassLoader).reflect(this)
    val entries = for {
      member <- typeTag[T].tpe.declarations
      if member.isMethod && member.asMethod.isCaseAccessor
    } yield {
      val accessor = member.asMethod
      val value = self.reflectField(accessor).get
      s"  ${accessor.name.toString}:\t$value"
    }
    entries.mkString("{\n", ",\n", "\n}")
  }
}

决策树、随机森林的样例见之前的文章(原文此处的链接缺失)。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值