Spark: Merging Two RDDs by Key (the join and cogroup Operators)

At work I regularly need to merge two RDDs by key, so I'm writing down how I handle it. Both the join and cogroup operators can merge by key; the difference shows up when an RDD contains multiple entries with the same key, in which case the two operators produce different output. I found an approach online (see the reference at the end), tested it myself, and the code is below:

import org.apache.spark.sql.SparkSession

object Test {
  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder().appName("test").
      master("local[2]").getOrCreate()

    val sc = spark.sparkContext
    sc.setLogLevel("WARN")

    /**
      * id      name
      * 1       zhangsan
      * 2       lisi
      * 3       wangwu
      */
    val idName = sc.parallelize(Array((1, "zhangsan"), (2, "lisi"), (3, "wangwu")))

    /**
      * id      age
      * 1       30
      * 2       29
      * 4       21
      */
    val idAge = sc.parallelize(Array((1, 30), (2, 29), (4, 21)))

    println("cogroup")

    /**
      * (1,(CompactBuffer(zhangsan),CompactBuffer(30)))
      * (2,(CompactBuffer(lisi),CompactBuffer(29)))
      * (3,(CompactBuffer(wangwu),CompactBuffer()))
      * (4,(CompactBuffer(),CompactBuffer(21)))
      */
    idName.cogroup(idAge).collect().foreach(println)

    println("join")
    // fullOuterJoin于cogroup的结果类似, 只是数据结构不一样
    /**
      * (1,(Some(zhangsan),Some(30)))
      * (2,(Some(lisi),Some(29)))
      * (3,(Some(wangwu),None))
      * (4,(None,Some(21)))
      */
    idName.fullOuterJoin(idAge).collect().foreach(println)

    /**
      * id      score
      * 1       100
      * 2       90
      * 2       95
      */
    val idScore = sc.parallelize(Array((1, 100), (2, 90), (2, 95)))

    println("cogroup, 出现相同id时")

    /**
      * (1,(CompactBuffer(zhangsan),CompactBuffer(100)))
      * (2,(CompactBuffer(lisi),CompactBuffer(90, 95)))
      * (3,(CompactBuffer(wangwu),CompactBuffer()))
      */
    idName.cogroup(idScore).collect().foreach(println)

    println("join, 出现相同id时")

    /**
      * (1,(Some(zhangsan),Some(100)))
      * (2,(Some(lisi),Some(90)))
      * (2,(Some(lisi),Some(95)))
      * (3,(Some(wangwu),None))
      */
    idName.fullOuterJoin(idScore).collect().foreach(println)
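
    println("inner join, with duplicate ids")

    /**
      * A minimal sketch added for comparison, assuming standard Spark semantics:
      * the plain (inner) join keeps only keys present in both RDDs and emits one
      * row per matching pair, so id 3 is dropped and id 2 appears twice.
      *
      * (1,(zhangsan,100))
      * (2,(lisi,90))
      * (2,(lisi,95))
      */
    idName.join(idScore).collect().foreach(println)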
  }
}
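
As the comment in the code notes, cogroup and fullOuterJoin carry similar information in different shapes: cogroup yields one pair of Iterables per key, while fullOuterJoin yields one (Option, Option) row per combination. As a rough sketch (reusing the idName and idAge RDDs defined above and only standard RDD operations), the cogroup output can be reshaped into the fullOuterJoin form like this:

    // Flatten each (Iterable[name], Iterable[age]) pair into one row per combination,
    // padding an empty side with None so keys missing from one RDD still show up.
    val flattened = idName.cogroup(idAge).flatMapValues { case (names, ages) =>
      val ns: Iterable[Option[String]] = if (names.isEmpty) Iterable(None) else names.map(Some(_))
      val as: Iterable[Option[Int]]    = if (ages.isEmpty) Iterable(None) else ages.map(Some(_))
      for (n <- ns; a <- as) yield (n, a)
    }
    // Prints the same four rows as idName.fullOuterJoin(idAge) above.
    flattened.collect().foreach(println)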

Reference: https://blog.csdn.net/wo334499/article/details/51689563
