spark 中rdd与dataframe的合并

最新推荐文章于 2023-03-10 14:03:54 发布

0x12A2A7F

最新推荐文章于 2023-03-10 14:03:54 发布

阅读量3k

点赞数 1

分类专栏： Big Data 文章标签： Spark

Big Data 专栏收录该内容

77 篇文章 1 订阅

订阅专栏

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// scalastyle:off println
package com.spark.test

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.mllib.classification.NaiveBayesModel
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.Time
import org.apache.spark.sql.SQLContext
import org.apache.spark.SparkContext
import org.apache.spark.ml.feature.Tokenizer
import org.ansj.splitWord.analysis.ToAnalysis
import org.ansj.util.FilterModifWord
import java.util.Arrays
import org.apache.spark.mllib.feature.HashingTF
import scala.collection.JavaConversions._
import org.apache.spark.mllib.feature.IDF
import org.apache.spark.mllib.feature.IDFModel
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.HTable
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.util.Bytes

object NetworkNewsClassify1 {
  var sameModel = null 

  /** Case class for converting RDD to DataFrame */
  case class Record(content: String,time:String,title:String)


  /** Lazily instantiated singleton instance of SQLContext */
  object SQLContextSingleton {

    @transient  private var instance: SQLContext = _

    def getInstance(sparkContext: SparkContext): SQLContext = {
      if (instance == null) {
        instance = new SQLContext(sparkContext)
      }
      instance
    }
  }

  def main(args: Array[String]) {
//    if (args.length < 2) {
//      System.err.println("Usage: NetworkWordCount <hostname> <port>")
//      System.exit(1)
//    }

    StreamingExamples.setStreamingLogLevels()

    // Create the context with a 1 second batch size
    val sparkConf = new SparkConf().setAppName("NetworkNewsClassify")
    sparkConf.setMaster("local[2]");
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    // Create a socket stream on target ip:port and count the   获取json信息
    val lines = ssc.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_SER)
    val myNaiveBayesModel = NaiveBayesModel.load(ssc.sparkContext, "D:/myNaiveBayesModel")
    //将接送转换成rdd
    lines.foreachRDD((rdd: RDD[String], time: Time) => {
      // Get the singleton instance of SQLContext
      val sqlContext = SQLContextSingleton.getInstance(rdd.sparkContext)
      import sqlContext.implicits._

      val newsDF = sqlContext.read.json(rdd)
      newsDF.count();
      val featurizedData = newsDF.map{
          line => 
            val temp = ToAnalysis.parse(line.getAs("title"))
            //加入停用词 
            FilterModifWord.insertStopWords(Arrays.asList("r","n"))
            //加入停用词性???? 
            FilterModifWord.insertStopNatures("w",null,"ns","r","u","e")
            val filter = FilterModifWord.modifResult(temp)
            //此步骤将会只取分词，不附带词性
            val words = for(i<-Range(0,filter.size())) yield filter.get(i).getName
            //println(words.mkString("  ;  "));
            //计算每个词在文档中的词频
            new HashingTF(500000).transform(words)
      }.cache()
      if(featurizedData.count()>0){
        //计算每个词的TF-IDF
        val idf = new IDF()
        val idfModel = idf.fit(featurizedData)
        val tfidfData = idfModel.transform(featurizedData);
        //分类预测
        val resultData = myNaiveBayesModel.predict(tfidfData)
        println(resultData)

        //将result结果与newsDF信息join在一起
        //**??? 不会实现了。。。**
        //保存新闻到hbase中

      }

    })


    ssc.start()
    ssc.awaitTermination()
  }
}

其中newsDF是新闻信息，包含字段（title，body，date），resultData 是通过贝叶斯模型预测的新闻类型，我现在希望把result结果作为一个type字段与newsDF合并（join），保存到hbase中，这个合并的操作怎么做呢