// Spark 新闻相似度计算 (Spark news similarity computation)

/**
  * Created by zxl on 2016/5/5.
  * 余弦相似度计算 (cosine-similarity computation)
  */

import java.sql.{Connection, DriverManager, ResultSet}
import java.text.SimpleDateFormat
import java.util.Date
;
import kafka.serializer.StringDecoder
import kafka.producer._
import org.apache.log4j.{Level, Logger}
import org.apache.spark._
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.linalg.{Vector, Vectors,SparseVector =>SV}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.Row
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka._
import org.apache.spark.sql.Row;

object Cossimi {
  /** One parsed input line: `category` is the document id/label, `text` is the article body. */
  final case class RawDataRecord(category: String, text: String)
  /**
    * Entry point: computes pairwise cosine similarity between news articles.
    *
    * Pipeline: read "category,text" records from HDFS -> tokenize -> hashing TF
    * (500k buckets) -> IDF rescaling -> for each document, cosine similarity
    * against every other document -> persist the top-5 most similar pairs per
    * document via `insert_sim` (defined elsewhere in this file).
    */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("cos")
    val sc = new SparkContext(sparkConf)

    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    import sqlContext.implicits._
    Logger.getRootLogger.setLevel(Level.ERROR)

    // Each input line is "category,text".
    // NOTE(review): only the first two comma-separated fields are used, so an
    // article body containing commas is truncated — confirm the input format.
    val srcRDD = sc.textFile("/home/spark/art.txt").map { line =>
      val fields = line.split(",")
      RawDataRecord(fields(0), fields(1))
    }

    val trainingDF = srcRDD.toDF()
    trainingDF.take(2).foreach(println)

    // Split the raw text into a "words" column.
    val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
    val wordsData = tokenizer.transform(trainingDF)

    println("output1")

    //===============hashing tf =============
    // Term frequencies via feature hashing, then inverse-document-frequency rescaling.
    val hashingTF = new HashingTF().setNumFeatures(500000).setInputCol("words").setOutputCol("rawFeatures")
    val featurizedData = hashingTF.transform(wordsData)

    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)
    val rescaledData = idfModel.transform(featurizedData)

    // Materialize every (category, features) row on the driver; the closure
    // below captures this list, so it is shipped to each executor.
    val content_list = rescaledData.select($"category", $"features").collect().toList

    println("out put 2")

    // For each document, score it against all other documents.
    val docSims = rescaledData.select($"category", $"features").map {
      case Row(id1, idf1) =>
        import breeze.linalg._
        val sv1 = idf1.asInstanceOf[SV]
        val bsv1 = new SparseVector[Double](sv1.indices, sv1.values, sv1.size)
        // Skip the document itself (same category id), then compute
        // cos(a, b) = a . b / (|a| * |b|) against each remaining document.
        content_list.filter(_(0) != id1).map { case Row(id2, idf2) =>
          val sv2 = idf2.asInstanceOf[SV]
          val bsv2 = new SparseVector[Double](sv2.indices, sv2.values, sv2.size)
          val cosSim = bsv1.dot(bsv2) / (norm(bsv1) * norm(bsv2))
          (id1, id2, cosSim)
        }
    }

    println("insert mysql ......")
    // Keep only the 5 highest-scoring neighbours per document and persist them.
    // NOTE(review): insert_sim runs inside foreach, i.e. on the executors — it
    // must be serializable and manage its own DB connection; confirm.
    docSims.foreach { simList =>
      simList.sortWith(_._3 > _._3).take(5).foreach { case (newsId, otherId, score) =>
        insert_sim(newsId.toString, otherId.toString, score)
      }
    }
    sc.stop()
  }
  // NOTE(review): blog-page residue (like/favourite counters and payment-widget
  // text) removed here. The original capture is truncated at this point — the
  // definition of insert_sim and the closing brace of object Cossimi are missing.