Principle: text classification with Naive Bayes
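In brief, multinomial Naive Bayes assumes that, given a class c, the terms of a document are generated independently, so classification reduces to picking the class with the highest posterior score:

\hat{c} = \arg\max_c \Big( \log P(c) + \sum_i f_i \log P(w_i \mid c) \Big)

where f_i is the weight of term w_i in the document (here its TF-IDF value), P(c) is estimated from class frequencies, and P(w_i | c) from per-class term counts with Laplace smoothing (lambda = 1.0 in the code below).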
Code walkthrough: Chinese text classification with Spark MLlib (Naive Bayes)
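Both programs below share one feature pipeline: pre-segmented text is tokenized, hashed into term-frequency vectors with HashingTF, reweighted with IDF, and the resulting TF-IDF vectors are fed to MLlib's multinomial NaiveBayes. The first program trains the model on a 70/30 train/test split, reports accuracy, and saves the model; the second reloads the saved model and runs batch prediction on new data.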
Model training
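The training data is read from HDFS as plain text, one document per line, in the form category<TAB>pre-segmented text (tokens separated by spaces). For illustration only (the real categories and vocabulary depend on the data under /user/chenjinghui/hbaseData4train/), a line might look like:

1	股市 今日 大幅 上涨  (the separator between "1" and the text is a tab character)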
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.ml.feature.HashingTF
import org.apache.spark.ml.feature.IDF
import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.Row
case class RawDataRecord1(category: String, text: String)
object TestNaiveBayes {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
val sc = new SparkContext(conf)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
// Each input line is expected to be "category<TAB>pre-segmented text" (tokens separated by spaces)
val srcRDD = sc.textFile("/user/chenjinghui/hbaseData4train/").map {
x =>
val data = x.split("\t")
RawDataRecord1(data(0), data(1))
}
// 70% of the data is used for training, 30% for testing
val splits = srcRDD.randomSplit(Array(0.7, 0.3))
val trainingDF = splits(0).toDF()
val testDF = splits(1).toDF()
// Split the pre-segmented text into an array of words
val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
val wordsData = tokenizer.transform(trainingDF)
println("output1:")
wordsData.select($"category", $"text", $"words").show(1)
// Hash each word into a 50,000-dimensional term-frequency vector
val hashingTF = new HashingTF().setNumFeatures(50000).setInputCol("words").setOutputCol("rawFeatures")
val featurizedData = hashingTF.transform(wordsData)
println("output2:")
featurizedData.select($"category", $"words", $"rawFeatures").show(1)
// Compute the TF-IDF weight of each term
val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
val idfModel = idf.fit(featurizedData)
val rescaledData = idfModel.transform(featurizedData)
println("output3:")
rescaledData.select($"category", $"features").show(1)
// Convert to the LabeledPoint format expected by MLlib's NaiveBayes
val trainDataRdd = rescaledData.select($"category", $"features").map {
case Row(label: String, features: Vector) =>
LabeledPoint(label.toDouble, Vectors.dense(features.toArray))
}
println("output4:")
trainDataRdd.take(1).foreach(println)
// Train a multinomial Naive Bayes model with Laplace smoothing (lambda = 1.0)
val model = NaiveBayes.train(trainDataRdd, lambda = 1.0, modelType = "multinomial")
// Apply the same feature pipeline and format conversion to the test split
val testwordsData = tokenizer.transform(testDF)
val testfeaturizedData = hashingTF.transform(testwordsData)
val testrescaledData = idfModel.transform(testfeaturizedData)
val testDataRdd = testrescaledData.select($"category", $"features").map {
case Row(label: String, features: Vector) =>
LabeledPoint(label.toDouble, Vectors.dense(features.toArray))
}
// Predict on the test set with the trained model
val testpredictionAndLabel = testDataRdd.map(p => (model.predict(p.features), p.label))
// Compute classification accuracy on the test set
val testaccuracy = 1.0 * testpredictionAndLabel.filter(x => x._1 == x._2).count() / testDataRdd.count()
println("output5:")
println(testaccuracy)
// Persist the trained model so the prediction job can reload it
model.save(sc, "/user/chenjinghui/model2")
}
}
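Model prediction

The second program reloads the model saved above and applies the same Tokenizer / HashingTF / IDF feature pipeline to new data before predicting each record's class. The hashing dimension (setNumFeatures(50000)) must match the training job, otherwise the hashed feature indices will not line up with the model.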
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.ml.feature.HashingTF
import org.apache.spark.ml.feature.IDF
import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.mllib.classification.NaiveBayesModel
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.Row
case class RawDataRecord(category: String, text: String)
object NaiveBayesModelUse {
def main(args: Array[String]) {
// Note: a master set directly on SparkConf takes precedence over the --master flag passed to spark-submit
val conf = new SparkConf().setMaster("yarn-client")
val sc = new SparkContext(conf)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
// Map the raw data into a DataFrame: "category" is the class id, "text" is the pre-segmented text with tokens separated by spaces
val srcDF = sc.textFile("/user/chenjinghui/news2.txt").map {
x =>
val data = x.split("\t")
RawDataRecord(data(0), data(1))
}.toDF()
// Split the pre-segmented text into an array of words
val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
val wordsData = tokenizer.transform(srcDF)
// setNumFeatures must match the value used at training time (50000)
val hashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(50000)
val featurizedData = hashingTF.transform(wordsData)
// Compute TF-IDF values (note: the IDF weights are re-fit on this data set, since the training-time IDF model was not saved)
val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
val idfModel = idf.fit(featurizedData)
val rescaledData = idfModel.transform(featurizedData)
// Convert to (category, feature vector) pairs for prediction
val trainDataRdd = rescaledData.select($"category", $"features").map {
case Row(label: String, features: Vector) =>
(label, Vectors.dense(features.toArray))
}
// Load the model saved by the training job
val model = NaiveBayesModel.load(sc, "/user/chenjinghui/model2")
// Classify each record with the trained model
val testpredictionAndLabel = trainDataRdd.map(p => (model.predict(p._2), p._1))
// Save (predictedLabel, originalCategory) pairs as text
testpredictionAndLabel.saveAsTextFile("/user/chenjinghui/result2/")
}
}
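Each line of the saved output under /user/chenjinghui/result2/ is a (predictedLabel, originalCategory) tuple rendered by Scala's default toString, so a result line looks roughly like (3.0,3) — illustrative values only.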
Submission scripts
spark-submit --class TestNaiveBayes --master spark://dmp04:7077 --num-executors 6 --driver-memory 3g --executor-memory 512m --total-executor-cores 6 ./text-tain.jar
spark-submit --master spark://dmp04:7077 --class NaiveBayesModelUse --num-executors 4 --driver-memory 2g --executor-memory 512m --total-executor-cores 6 ./text-tain.jar
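A note on the flags: --num-executors applies only when running on YARN; with the standalone master spark://dmp04:7077, executor resources are controlled by --executor-memory and --total-executor-cores. Also, because NaiveBayesModelUse hard-codes setMaster("yarn-client") in its SparkConf, that setting takes precedence over the --master flag in the second command.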