Text classification with MLlib's Naive Bayes algorithm


Principle: text classification with Naive Bayes
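In brief: for a document d = (w_1, …, w_n), multinomial Naive Bayes assumes the words are conditionally independent given the class, and picks the class c that maximizes the class prior times the per-word likelihoods:

$$\hat{c} = \arg\max_{c} \; P(c)\prod_{i=1}^{n} P(w_i \mid c)$$

With Laplace smoothing (the lambda = 1.0 passed to NaiveBayes.train below), each word likelihood is estimated from training counts as

$$P(w \mid c) = \frac{\operatorname{count}(w, c) + \lambda}{\sum_{w'} \big(\operatorname{count}(w', c) + \lambda\big)}$$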

Code walkthrough: Chinese text classification with Spark MLlib's Naive Bayes

Model training
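The training input is expected to be one record per line: a category ID and pre-segmented text separated by a tab, with the words inside the text separated by spaces. A hypothetical two-line example (the real data lives under /user/chenjinghui/hbaseData4train/):

0	天气 晴朗 适合 出行
1	球队 赢得 比赛 冠军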

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.ml.feature.HashingTF
import org.apache.spark.ml.feature.IDF
import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.Row
case class RawDataRecord1(category: String, text: String)
object TestNaiveBayes {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    val sc = new SparkContext(conf)

    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    import sqlContext.implicits._

    // Load the raw data: each line is "category<TAB>segmented text"
    val srcRDD = sc.textFile("/user/chenjinghui/hbaseData4train/").map {
      x =>
        val data = x.split("\t")
        RawDataRecord1(data(0), data(1))
    }

    // Use 70% of the data for training and 30% for testing
    val splits = srcRDD.randomSplit(Array(0.7, 0.3))
    val trainingDF = splits(0).toDF()
    val testDF = splits(1).toDF()

    // Tokenize: split each text string into an array of words
    val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
    val wordsData = tokenizer.transform(trainingDF)
    println("output1:")
    wordsData.select($"category", $"text", $"words").take(1).foreach(println)

    // Compute each word's term frequency via feature hashing into 50,000 dimensions
    val hashingTF = new HashingTF().setNumFeatures(50000).setInputCol("words").setOutputCol("rawFeatures")
    val featurizedData = hashingTF.transform(wordsData)
    println("output2:")
    featurizedData.select($"category", $"words", $"rawFeatures").take(1).foreach(println)


    // Compute the TF-IDF weight of each word
    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)
    val rescaledData = idfModel.transform(featurizedData)
    println("output3:")
    rescaledData.select($"category", $"features").take(1).foreach(println)

    // Convert to the LabeledPoint format expected by NaiveBayes
    val trainDataRdd = rescaledData.select($"category", $"features").map {
      case Row(label: String, features: Vector) =>
        LabeledPoint(label.toDouble, Vectors.dense(features.toArray))
    }
    println("output4:")
    trainDataRdd.take(1).foreach(println)

    // Train the multinomial Naive Bayes model (lambda = 1.0 enables Laplace smoothing)
    val model = NaiveBayes.train(trainDataRdd, lambda = 1.0, modelType = "multinomial")

    // Apply the same feature pipeline (tokenize, hash, TF-IDF) to the test set
    val testwordsData = tokenizer.transform(testDF)
    val testfeaturizedData = hashingTF.transform(testwordsData)
    val testrescaledData = idfModel.transform(testfeaturizedData)
    val testDataRdd = testrescaledData.select($"category", $"features").map {
      case Row(label: String, features: Vector) =>
        LabeledPoint(label.toDouble, Vectors.dense(features.toArray))
    }

    // Predict on the test set with the trained model
    val testpredictionAndLabel = testDataRdd.map(p => (model.predict(p.features), p.label))

    // Compute overall classification accuracy
    val testaccuracy = 1.0 * testpredictionAndLabel.filter(x => x._1 == x._2).count() / testDataRdd.count()
    println("output5:")
    println(testaccuracy)

    // Persist the trained model to HDFS for later reuse
    model.save(sc, "/user/chenjinghui/model2")

  }
}
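Overall accuracy can hide per-class weaknesses. As an optional follow-up, a minimal sketch using MLlib's MulticlassMetrics on the same (prediction, label) pairs computed above yields a confusion matrix and per-label precision/recall:

import org.apache.spark.mllib.evaluation.MulticlassMetrics

// Build metrics from the RDD of (prediction, label) pairs
val metrics = new MulticlassMetrics(testpredictionAndLabel)
// Confusion matrix: rows are true labels, columns are predicted labels
println(metrics.confusionMatrix)
// Per-label precision and recall
metrics.labels.foreach { l =>
  println(s"label $l: precision=${metrics.precision(l)}, recall=${metrics.recall(l)}")
}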


Model usage:

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.ml.feature.HashingTF
import org.apache.spark.ml.feature.IDF
import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.mllib.classification.NaiveBayesModel
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.Row
case class RawDataRecord(category: String, text: String)
object NaiveBayesModelUse {
  def main(args: Array[String]) {
    val conf = new SparkConf().setMaster("yarn-client")
    val sc = new SparkContext(conf)
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    import sqlContext.implicits._

    // Map the raw data into a DataFrame: "category" is the class ID and "text" is the
    // pre-segmented text, with words separated by spaces
    val srcDF = sc.textFile("/user/chenjinghui/news2.txt").map {
      x =>
        val data = x.split("\t")
        RawDataRecord(data(0),data(1))
    }.toDF()
    // Split the segmented text into an array of words
    val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
    val wordsData = tokenizer.transform(srcDF)
    val hashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(50000)
    val featurizedData = hashingTF.transform(wordsData)
    // Compute TF-IDF values. Note: strictly, the IDF model fitted on the training data
    // should be reused here; re-fitting IDF on new data yields weights that differ
    // from those the model was trained with.
    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)
    val rescaledData = idfModel.transform(featurizedData)

    // Convert to the (label, feature vector) format needed for prediction
    val trainDataRdd = rescaledData.select($"category", $"features").map {
      case Row(label: String, features: Vector) =>
        (label, Vectors.dense(features.toArray))
    }
    // Load the model saved by the training job
    val model = NaiveBayesModel.load(sc, "/user/chenjinghui/model2")
    // Classify the data with the trained model and save the results
    val testpredictionAndLabel = trainDataRdd.map(p => (model.predict(p._2), p._1))
    testpredictionAndLabel.saveAsTextFile("/user/chenjinghui/result2/")
  }
}
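Each line written to /user/chenjinghui/result2/ is a tuple of (predicted label, original category), e.g. (3.0,3), which can be spot-checked or joined back to the source text for inspection.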

Execution scripts

spark-submit --class TestNaiveBayes --master spark://dmp04:7077 --num-executors 6 --driver-memory 3g --executor-memory 512m --total-executor-cores 6 ./text-tain.jar

spark-submit --master spark://dmp04:7077 --class NaiveBayesModelUse --num-executors 4 --driver-memory 2g --executor-memory 512m --total-executor-cores 6 ./text-tain.jar
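Two caveats about these commands: --num-executors only takes effect on YARN, so against the standalone master spark://dmp04:7077 the executor resources are governed by --total-executor-cores and --executor-memory instead. Also, NaiveBayesModelUse hard-codes setMaster("yarn-client"), and a master set in SparkConf overrides the --master flag passed to spark-submit; remove that call or keep the two consistent.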

