基于 Spark 的朴素贝叶斯分类器

根据 TDA 中的情感分析实现的、基于 Spark 的朴素贝叶斯分类器



import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd._
import scala.util.parsing.json._
import java.util.StringTokenizer

import scala.collection.mutable.HashMap

import java.io.PrintWriter
import java.io.File



object SimpleApp{
    def main(args: Array[String]){
        var totalWordNumber = 0
        var happyProb = 0.0
        var sadProb = 0.0

        

 
        //var classiferPara: (Int, Double, Double) = (0,0.0,0.0)

        //val logFile = "/home/hrl/spark-0.9.1/README.md"
        val sc = new SparkContext("local", "Simple App", "/home/hrl/spark-0.9.1", List("target/scala-2.10/simple-project_2.10-1.0.jar"))
        //val logData = sc.textFile(logFile, 2).cache()
        val owsFile = "/home/hrl/spark-0.9.1/apps/sentimenAnalysis/data/ows.json"
        //val owsFile = "/home/hrl/spark-0.9.1/apps/sentimenAnalysis/data/ows_sample.json"
        val outputFile = "/home/hrl/spark-0.9.1/apps/sentimenAnalysis/data/output.txt"
        //val logData = sc.textFile(owsFile)
        val owsData = sc.textFile(owsFile)
        
        //only twittContent
        val twittTextPrimitive = owsData.map(parseJson _)
        val twittText = twittTextPrimitive.map(filterStopWord)
        val twittTextWithSent = twittText.map(parseSentiment _)
        //twittTextWithSent.collect()

        //(twitt, (1,0)) (happy, sad)
        val twittWord = twittTextWithSent.map(mapSentWord _)
        //twittWord.collect

        //twittWordGroup is instance of HashMap[Word, (Int, Int)] not RDD
        val twittWordGroup = groupWordNumber(twittWord)
        //twittWordGroup.collect
        
        //totalWordNumber = countTotalWord(twittWordGroup)
        //(totalWordNumber, happyProb, sadProb)
        val classiferPara:(Int, Double, Double) = classiferParameters(twittWordGroup)
        
        val twitt = "thing NYC could do to #Occupy is what they are doing right now. Suppression always  always has the opposite effect"

        val classProbs = classify(twitt, twittWordGroup, classiferPara)

        writeToFile(outputFile, twittWordGroup)
        println("job done")
        //val numAs = logData.filter(line => line.contains("a")).count()
        //val numBs = logData.filter(line => line.contains("b")).count()
        //println("Lines with a: %s, Lines with b: %s".format(numAs, numB
  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值