Naive Bayes with Spark MLlib

Since I couldn't find a suitable dataset, I wrote a small program that generates random data:

package mllib

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.DenseVector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD

import scala.util.Random

/**
  * created by LMR on 2019/6/10
  */
object NaiveBayesDataProduce {

  def main(args: Array[String]): Unit = {

    val random = new Random()
    val numInstances = 1000   // number of labeled points to generate
    val numFeatures = 3       // features per point
    val numClasses = 3        // number of distinct class labels

    // Build random labeled points: non-negative integer features in [0, 5)
    // and a random class label in [0, numClasses).
    val data: Array[LabeledPoint] = Array.fill[LabeledPoint](numInstances)(null)
    for (i <- 0 until numInstances) {
      val array: Array[Double] = Array.fill[Double](numFeatures)(0.0)
      for (j <- 0 until numFeatures) {
        array(j) = random.nextInt(5)
      }
      val vector = new DenseVector(array)
      val label: Int = random.nextInt(numClasses)
      data(i) = LabeledPoint(label, vector)
    }

    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("naiveBayes")
    val sc = new SparkContext(conf)

    // Parallelize the array and save it in LibSVM format for the model code below
    val dataRDD: RDD[LabeledPoint] = sc.parallelize(data)
    MLUtils.saveAsLibSVMFile(dataRDD, "E://output")
  }
}
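saveAsLibSVMFile writes the data as LibSVM-format text, one point per line: the label followed by 1-based index:value pairs. This is exactly what loadLibSVMFile in the next program expects. A couple of illustrative lines for the generator above (made-up values, not actual output):

2.0 1:4.0 2:0.0 3:1.0
0.0 1:2.0 2:3.0 3:4.0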

The Naive Bayes model:

package mllib

import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.log4j.{Level, Logger}

/**
  * created by LMR on 2019/6/10
  */
object Naive_Bayes { // Naive Bayes in MLlib requires non-negative feature values

  // Keep Spark's console output quiet except for errors
  Logger.getRootLogger.setLevel(Level.ERROR)
  def main(args: Array[String]): Unit = {

    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("naiveBayes")
    val sc = new SparkContext(conf)

    val data: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "E://output")

    // Split the data into training (60%) and test (40%) sets
    val splits: Array[RDD[LabeledPoint]] = data.randomSplit(Array(0.6,0.4), seed = 11L)
    val train: RDD[LabeledPoint] = splits(0)
    val test: RDD[LabeledPoint] = splits(1)

    // Train a multinomial Naive Bayes model with Laplace smoothing (lambda = 1.0)
    val model: NaiveBayesModel = NaiveBayes.train(train, lambda = 1.0, modelType = "multinomial")

    // Run the model on the test set: (prediction, true label) pairs
    val predictionAndLabel: RDD[(Double, Double)] = test.map(p => (model.predict(p.features), p.label))

    // Print the first 20 (prediction, label) pairs for a quick sanity check
    val print_prediction: Array[(Double, Double)] = predictionAndLabel.take(20)
    for (elem <- print_prediction) {println(elem._1 + "\t" + elem._2)}

    // Accuracy: fraction of test points whose prediction matches the true label
    val accuracy: Double = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
    println("accuracy = " + accuracy)

  }

}
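Beyond the single accuracy number, per-class behavior is easier to judge from a confusion matrix, and the trained model can be persisted for reuse. A minimal sketch that could be appended inside main above, assuming the model and predictionAndLabel values from the code and a hypothetical output path "E://naiveBayesModel":

    import org.apache.spark.mllib.evaluation.MulticlassMetrics

    // Confusion matrix and overall accuracy computed from (prediction, label) pairs
    val metrics = new MulticlassMetrics(predictionAndLabel)
    println(metrics.confusionMatrix)
    println(metrics.accuracy)  // accuracy is available in Spark 2.0+

    // Persist the trained model and load it back later ("E://naiveBayesModel" is a placeholder path)
    model.save(sc, "E://naiveBayesModel")
    val sameModel = NaiveBayesModel.load(sc, "E://naiveBayesModel")

Note that because the labels in the generated data are independent of the features, the accuracy here will hover around random guessing (about 1/3 for three classes); the point is only to exercise the API end to end.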

Full code and data: git link
