The Naive Bayes classifier in Spark 1.2.0 supports multi-class classification; a typical application is text classification.
Let's start with a usage example:
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
val data = sc.textFile("data/mllib/sample_naive_bayes_data.txt")
val parsedData = data.map { line =>
val parts = line.split(',')
LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
}
// Split data into training (60%) and test (40%).
val splits = parsedData.randomSplit(Array(0.6, 0.4), seed = 11L)
val training = splits(0)
val test = splits(1)
val model = NaiveBayes.train(training, lambda = 1.0) // lambda is the additive-smoothing factor: it prevents a single zero-valued feature component from forcing the estimated probability of the whole instance to zero
val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
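As the parsing code above implies, each line of the input file holds one instance: a numeric label, a comma, then space-separated feature values. Hypothetical lines in that format (not necessarily the actual contents of sample_naive_bayes_data.txt, which ships with the Spark distribution) look like:

0,1 0 0
1,0 1 0
2,0 0 1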
When NaiveBayes.train() is called, it executes the following:
def train(input: RDD[LabeledPoint], lambda: Double): NaiveBayesModel = {
new NaiveBayes(lambda).run(input)
}
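Note that Spark 1.2.0 also provides an overload without the lambda argument, which falls back to the default smoothing factor of 1.0; roughly (signature reconstructed from memory, treat as approximate):

def train(input: RDD[LabeledPoint]): NaiveBayesModel = {
  new NaiveBayes().run(input)
}

Both overloads are thin wrappers: all the actual work happens in run().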
Stepping into run():
def run(data: RDD[LabeledPoint]) = {
val requireNonnegativeValues: Vector => Unit = (v: Vector) => {
val values = v match {
case sv: SparseVector =>
sv.values
case dv: DenseVector =>
dv.values
}
if (!values.forall(_ >= 0.0)) {
throw new SparkException(s"Naive Bayes requires nonnegative feature values but found $v.")
}
}
// Aggregates term frequencies per label.
// TODO: Calling combineByKey and collect creates two stages, we can implement something
// TODO: similar to reduceByKeyLocally to save one stage.
val aggregated = data.map(p => (p.label, p.features)).combineByKey[(Long, BDV[Double])]( // aggregates per label: (number of documents, element-wise sum of their feature vectors)
createCombiner = (v: Vector) => {
requireNonnegativeValues(v)
(1L, v.toBreeze.toDenseVector)
},
mergeValue = (c: (Long, BDV[Double]), v: Vector) => {
requireNonnegativeValues(v)
(c._1 + 1L, c._2 += v.toBreeze) // c._1 + 1 counts the number of documents under this label
},
mergeCombiners = (c1: (Long, BDV[Double]), c2: (Long, BDV[Double])) =>
(c1._1 + c2._1, c1._2 += c2._2)
).collect()
val numLabels = aggregated.length
var numDocuments = 0L
aggregated.foreach { case (_, (n, _)) =>
numDocuments += n
}
val numFeatures = aggregated.head match { case (_, (_, v)) => v.size }
val labels = new Array[Double](numLabels)
val pi = new Array[Double](numLabels)
val theta = Array.fill(numLabels)(new Array[Double](numFeatures))
val piLogDenom = math.log(numDocuments + numLabels * lambda)
var i = 0
aggregated.foreach { case (label, (n, sumTermFreqs)) =>
labels(i) = label
val thetaLogDenom = math.log(brzSum(sumTermFreqs) + numFeatures * lambda) // denominator: the sum of all feature values under this label, plus smoothing
pi(i) = math.log(n + lambda) - piLogDenom // log prior probability of each class
var j = 0
while (j < numFeatures) {
theta(i)(j) = math.log(sumTermFreqs(j) + lambda) - thetaLogDenom // numerator is the per-feature sum Xi; the result is log((Xi + lambda) / (sum + numFeatures * lambda))
j += 1
}
i += 1
}
new NaiveBayesModel(labels, pi, theta) // the returned model
}
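In formula terms, run() fits a multinomial Naive Bayes model with additive (Laplace) smoothing. Writing lambda as λ, N_c for the document count of class c, N for the total document count, k = numLabels, n = numFeatures, and T_ci for the summed value of feature i over all documents of class c, the loop above computes:

pi(c)       = log( (N_c + λ) / (N + k·λ) )
theta(c, i) = log( (T_ci + λ) / (Σ_j T_cj + n·λ) )

which is exactly piLogDenom (resp. thetaLogDenom) subtracted from the smoothed log-numerator.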
Finally, let's look at the model's prediction code:
override def predict(testData: Vector): Double = {
labels(brzArgmax(brzPi + brzTheta * testData.toBreeze)) // brzArgmax returns the index of the largest element in the vector, i.e., it picks the entry of the labels array with the highest posterior score
}
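So predict() scores each class c as pi(c) + Σ_i theta(c, i) · x_i (the matrix-vector product brzTheta * testData.toBreeze computes all the per-class sums at once) and returns the label with the largest score. A minimal sketch of the same computation with plain Scala arrays (a hypothetical helper, assuming dense features; not part of the Spark API):

// Equivalent of predict() without Breeze: score each class by its log-prior
// plus the weighted sum of per-feature log-conditionals, then take the argmax.
def predictSketch(labels: Array[Double], pi: Array[Double],
                  theta: Array[Array[Double]], x: Array[Double]): Double = {
  val scores = pi.indices.map { c =>
    var s = pi(c)                 // log P(c)
    var i = 0
    while (i < x.length) {
      s += theta(c)(i) * x(i)     // + x_i * log P(feature_i | c)
      i += 1
    }
    s
  }
  labels(scores.zipWithIndex.maxBy(_._1)._2) // label with the highest score
}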
************* The End *************