// A Spark-based naive Bayes classifier, implemented following the sentiment analysis in TDA.
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
import scala.util.parsing.json._
import java.util.StringTokenizer
import scala.collection.mutable.HashMap
import java.io.PrintWriter
import java.io.File
object SimpleApp{
def main(args: Array[String]){
var totalWordNumber = 0
var happyProb = 0.0
var sadProb = 0.0
//var classiferPara: (Int, Double, Double) = (0,0.0,0.0)
//val logFile = "/home/hrl/spark-0.9.1/README.md"
val sc = new SparkContext("local", "Simple App", "/home/hrl/spark-0.9.1", List("target/scala-2.10/simple-project_2.10-1.0.jar"))
//val logData = sc.textFile(logFile, 2).cache()
val owsFile = "/home/hrl/spark-0.9.1/apps/sentimenAnalysis/data/ows.json"
//val owsFile = "/home/hrl/spark-0.9.1/apps/sentimenAnalysis/data/ows_sample.json"
val outputFile = "/home/hrl/spark-0.9.1/apps/sentimenAnalysis/data/output.txt"
//val logData = sc.textFile(owsFile)
val owsData = sc.textFile(owsFile)
//only twittContent
val twittTextPrimitive = owsData.map(parseJson _)
val twittText = twittTextPrimitive.map(filterStopWord)
val twittTextWithSent = twittText.map(parseSentiment _)
//twittTextWithSent.collect()
//(twitt, (1,0)) (happy, sad)
val twittWord = twittTextWithSent.map(mapSentWord _)
//twittWord.collect
// twittWordGroup is an instance of HashMap[Word, (Int, Int)], not an RDD
val twittWordGroup = groupWordNumber(twittWord)
//twittWordGroup.collect
//totalWordNumber = countTotalWord(twittWordGroup)
//(totalWordNumber, happyProb, sadProb)
val classiferPara:(Int, Double, Double) = classiferParameters(twittWordGroup)
val twitt = "thing NYC could do to #Occupy is what they are doing right now. Suppression always always has the opposite effect"
val classProbs = classify(twitt, twittWordGroup, classiferPara)
writeToFile(outputFile, twittWordGroup)
println("job done")
//val numAs = logData.filter(line => line.contains("a")).count()
//val numBs = logData.filter(line => line.contains("b")).count()
//println("Lines with a: %s, Lines with b: %s".format(numAs, numB