RDD: Resilient Distributed Dataset
The basic data structure of Spark, and an immutable dataset. The data in an RDD is logically partitioned, and each partition can be computed independently on a cluster node. Elements can be of any Java, Scala, Python, or custom type.
An RDD is a read-only collection of partitioned records and has a built-in fault-tolerance mechanism.
Ways to create an RDD: (1) parallelize an existing collection; (2) reference a dataset in external storage, e.g. an HDFS file. A sketch of both follows.
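A minimal sketch of both creation paths, in the same style as the later examples (imports omitted as they are throughout these notes; the file path is a placeholder):

val conf = new SparkConf().setAppName("CreateRDD").setMaster("local[4]")
val sc = new SparkContext(conf)
//(1) parallelize an in-memory collection
val rddA = sc.parallelize(Array(1, 2, 3, 4))
//(2) reference an external dataset
val rddB = sc.textFile("d:/scala/test.txt")
println(rddA.count() + " / " + rddB.count())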
Hadoop spends roughly 90% of its time on HDFS read/write (I/O).
Spark computes in memory and shares data across jobs; memory I/O is on the order of 10 to 100 times faster than network and disk.
An RDD has 5 main internal properties (see the sketch after this list):
1. A list of partitions.
2. A compute function for each split.
3. A list of dependencies on other RDDs.
4. Optionally, a Partitioner for key-value RDDs.
5. Optionally, a list of preferred block locations (HDFS block locations).
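These five properties map onto members of Spark's abstract RDD class; a simplified sketch (condensed from the Spark source, not the full signatures):

abstract class RDD[T] {
  //2. compute function for each split
  def compute(split: Partition, context: TaskContext): Iterator[T]
  //1. list of partitions
  protected def getPartitions: Array[Partition]
  //3. dependencies on other RDDs
  protected def getDependencies: Seq[Dependency[_]]
  //5. preferred block locations for a split
  protected def getPreferredLocations(split: Partition): Seq[String]
  //4. partitioner for key-value RDDs
  val partitioner: Option[Partitioner]
}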
Default parallelism (local mode), derived as follows:
local backend: defaultParallelism() = scheduler.conf.getInt("spark.default.parallelism", totalCores)
taskScheduler.defaultParallelism = backend.defaultParallelism()
sc.defaultParallelism = taskScheduler.defaultParallelism
defaultMinPartitions = math.min(defaultParallelism, 2)
sc.textFile(path, defaultMinPartitions) //so minPartitions is 1 or 2
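A quick check of these defaults (the printed values assume master local[4]):

val conf = new SparkConf().setAppName("Defaults").setMaster("local[4]")
val sc = new SparkContext(conf)
println(sc.defaultParallelism)   //4 : totalCores under local[4]
println(sc.defaultMinPartitions) //2 : math.min(4, 2)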
RDD transformations
Each transformation returns a pointer to a new RDD and creates a dependency between the RDDs. Every RDD carries a compute function and a pointer to its parent RDD.
map() //transform each element by applying a function
//(T) => U
flatMap() //flatten, T => TraversableOnce[U]
reduceByKey(func) //aggregate values by key.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]") ;
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/test.txt",4)
val rdd2 = rdd1.flatMap(_.split(" ")) ;
//val rdd3 = rdd2.map(word=>{println("start") ;val t = (word,1) ;println(t + " : end") ; t})
val rdd3 = rdd2.mapPartitions(it=>{
import scala.collection.mutable.ArrayBuffer ;
val buf = ArrayBuffer[String]()
val tname = Thread.currentThread().getName
println(tname + " : mapPartitions start ");
for (e <- it) {
buf.+=("_" + e);
}
buf.iterator
});
val rdd5 = rdd3.map(word=>{
val tname = Thread.currentThread().getName
println(tname + " : map " + word);
(word,1)});
val rdd4 = rdd5.reduceByKey(_ + _)
val r = rdd4.collect()
r.foreach(println)
}
mapPartitions() //apply a function per partition: receives the partition's Iterator and returns a new Iterator.
//Iterator<T> => Iterator<U>
mapPartitionsWithIndex(func) //same, but also receives the partition index, (Int, Iterator<T>) => Iterator<U>
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]") ;
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/test.txt",4)
val rdd2 = rdd1.flatMap(_.split(" ")) ;
val rdd3 = rdd2.mapPartitionsWithIndex((index,it) => {
import scala.collection.mutable.ArrayBuffer;
val buf = ArrayBuffer[String]()
val tname = Thread.currentThread().getName
println(tname + " : " + index + " : mapPartitions start ");
for (e <- it) {
buf.+=("_" + e);
}
buf.iterator
});
val rdd5 = rdd3.map(word=>{
val tname = Thread.currentThread().getName
println(tname + " : map " + word);
(word,1)});
val rdd4 = rdd5.reduceByKey(_ + _)
val r = rdd4.collect()
r.foreach(println)
}
filter() //keep only the elements matching a predicate, (T) => Boolean
//count the occurrences of "hello"
def wordCount3(args: Array[String]): Unit ={
val conf = new SparkConf();
// conf.setAppName("wc");
// conf.setMaster("local");
val sc = new SparkContext(conf);
val r = sc.textFile(args(0)).flatMap(line=>line.split(" ")).filter(_.equals("hello")).map((_,1)).reduceByKey(_ + _).collect();
r.foreach(println);
}
sample(withReplacement, fraction, seed) //sample a subset of the RDD.
//withReplacement: whether an element may be drawn more than once.
//fraction: expected fraction of the data to sample, in [0, 1].
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]") ;
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/test.txt",4)
val rdd2 = rdd1.flatMap(_.split(" "))
val rdd3 = rdd2.sample(false,0.5)
rdd3.collect().foreach(println)
}
union() //similar to SQL UNION.
//select * from persons where id < 10
//union select * from persons where id > 29;
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]") ;
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/log.txt",4)
//all lines containing error
val errorRDD = rdd1.filter(_.toLowerCase.contains("error"))
//all lines containing warn
val warnRDD = rdd1.filter(_.toLowerCase.contains("warn"));
val allRDD = errorRDD.union(warnRDD);
allRDD.collect().foreach(println)
}
intersection //intersection: keep only the elements present in both RDDs.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]") ;
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/log.txt",4)
//all lines containing error
val errorRDD = rdd1.filter(_.toLowerCase.contains("error"))
//all lines containing warn
val warnRDD = rdd1.filter(_.toLowerCase.contains("warn"));
val intersecRDD = errorRDD.intersection(warnRDD);
intersecRDD.collect().foreach(println)
}
distinct([numTasks]) //deduplicate: remove repeated elements.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]") ;
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/log.txt",4)
val rdd2 = rdd1.flatMap(_.split(" "))
val rdd3 = rdd2.distinct()
rdd3.collect().foreach(println)
}
groupByKey() //(K,V) => (K,Iterable<V>)
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]") ;
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/stus.txt",4)
val rdd2 = rdd1.map(line=>{
val key = line.split(" ")(3)
(key,line)
})
val rdd3 = rdd2.groupByKey()
rdd3.collect().foreach(t=>{
val key = t._1;
println(key + " : ====================")
for (e <- t._2){
println(e)
}
})
}
aggregateByKey(zeroValue)(seqOp, combOp, [numTasks]) //aggregate by key: seqOp folds each value into the accumulator within a partition, combOp merges the per-partition accumulators.
//in the example below: key: String, accumulator U: Int, zeroValue = 3
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]") ;
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/stus.txt",4)
val rdd2 = rdd1.flatMap(_.split(" "))
val rdd3 = rdd2.map((_,1))
def seq(a: Int, b: Int): Int = {
math.max(a, b)
}
def comb(a: Int, b: Int): Int = {
a + b
}
//seq takes the max of each key's values within a partition (seeded with zeroValue 3); comb sums the per-partition maxima
val rdd4 = rdd3.aggregateByKey(3)(seq, comb)
rdd4.collect().foreach(println)
}
sortByKey //sort a (K,V) RDD by key (see the sketch below)
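A minimal sortByKey sketch in the same style as the other examples (same placeholder path):

def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]")
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/test.txt",4)
val rdd2 = rdd1.flatMap(_.split(" ")).map((_,1)).reduceByKey(_ + _)
//sort the word counts by word; pass false for descending order
val rdd3 = rdd2.sortByKey(true)
rdd3.collect().foreach(println)
}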
join(otherDataset, [numTasks]) //join, (K,V).join(K,W) => (K,(V,W))
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]") ;
//roster: id and name
val sc = new SparkContext(conf)
val namesRDD1 = sc.textFile("d:/scala/names.txt");
val namesRDD2 = namesRDD1.map(line=>{
var arr = line.split(" ")
(arr(0).toInt,arr(1))
})
//total scores: id and score
val scoreRDD1 = sc.textFile("d:/scala/scores.txt");
val scoreRDD2 = scoreRDD1.map(line => {
var arr = line.split(" ")
(arr(0).toInt, arr(1).toInt)
})
val rdd = namesRDD2.join(scoreRDD2)
rdd.collect().foreach(t=>{
println(t._1 + " : " + t._2)
})
}
cogroup //co-group
//(K,V).cogroup(K,W) => (K,(Iterable<V>,Iterable<W>))
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]") ;
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/cogroup-1.txt",4)
//K,V
val rdd2 = rdd1.map(line=>{
val arr = line.split(" ")
(arr(0),arr(1))
})
//K,W
val rdd3 = sc.textFile("d:/scala/cogroup-2.txt", 4)
//key,value
val rdd4 = rdd3.map(line => {
val arr = line.split(" ")
(arr(0), arr(1))
})
val rdd = rdd2.cogroup(rdd4)
rdd.collect().foreach(t=>{
println(t._1 + ":=================")
for( e <- t._2._1){
println(e)
}
for (e <- t._2._2) {
println(e)
}
})
}
cartesian(otherDataset) //Cartesian product, RDD[T] x RDD[U] => RDD[(T,U)]
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]") ;
val sc = new SparkContext(conf)
val rdd1 = sc.parallelize(Array("tom","tomas","tomasle","tomson"))
val rdd2 = sc.parallelize(Array("1234","3456","5678","7890"))
val rdd = rdd1.cartesian(rdd2);
rdd.collect().foreach(t=>println(t))
}
pipe //pipe each RDD element through a shell script or command; the output lines form a new RDD (works on Linux).
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]") ;
val sc = new SparkContext(conf)
val rdd = sc.parallelize(Array("file:///d:","file:///e:","file:///f:",3))
val rdd0 = rdd.pipe("ls ")
rdd0.collect().foreach(println)
}
coalesce(numPartitions) //reduce the number of partitions (no shuffle by default, so it cannot increase the count)
repartition //can increase or decrease the number of partitions (always shuffles)
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[8]") ;
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/test.txt",4)
println("rdd1' parti : " + rdd1.partitions.length)
//coalesce can only shrink: coalesce(5) on a 4-partition RDD stays at 4
val rdd11 = rdd1.coalesce(5);
//repartition can increase or decrease
val rdd111 = rdd11.repartition(5)
val rdd2 = rdd111.flatMap(_.split(" "))
println("rdd2' parti : " + rdd2.partitions.length)
//
val rdd3 = rdd2.map((_,1))
println("rdd3' parti : " + rdd3.partitions.length)
}
repartitionAndSortWithinPartitions(partitioner) //repartition by the given partitioner and sort within each partition; more efficient than repartition followed by sorting. See the sketch below.
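A minimal sketch, assuming a HashPartitioner with 2 partitions (the input pairs are made up for illustration):

def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]")
val sc = new SparkContext(conf)
val rdd1 = sc.parallelize(Array((3,"c"),(1,"a"),(4,"d"),(2,"b")), 4)
//hash-partition into 2 partitions, sorting by key inside each partition
val rdd2 = rdd1.repartitionAndSortWithinPartitions(new org.apache.spark.HashPartitioner(2))
rdd2.collect().foreach(println)
}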
RDD Action
collect() //gather the RDD's elements into an array on the driver.
count() //count the RDD's elements.
reduce() //aggregate all elements into a single value.
first //return the first element, equivalent to take(1).
take(n) //return the first n elements.
takeSample(withReplacement, num, [seed]) //return a random sample of num elements.
takeOrdered(n, [ordering]) //return the first n elements in sorted order.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[8]") ;
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/test.txt",4)
//println(rdd1.first())
rdd1.take(3).foreach(println)
}
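The remaining actions from the list above can be sketched against the same placeholder file:

def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[8]")
val sc = new SparkContext(conf)
val words = sc.textFile("d:/scala/test.txt",4).flatMap(_.split(" "))
println(words.count()) //number of words
println(words.map(_.length).reduce(_ + _)) //total number of characters
words.takeSample(false, 3).foreach(println) //3 random words, no replacement
words.takeOrdered(3).foreach(println) //first 3 words alphabetically
}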
saveAsTextFile(path) //save the RDD as text files
saveAsSequenceFile(path) //save as a Hadoop SequenceFile
saveAsObjectFile(path) //(Java and Scala) save as serialized Java objects
countByKey() //for (K,V) RDDs: return a local Map of key -> count to the driver; see the sketch below
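A quick countByKey sketch; note it returns a local Map rather than an RDD, so it is only safe when the number of distinct keys is small:

def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]")
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/test.txt",4)
//countByKey() returns scala.collection.Map[String, Long] on the driver
val counts = rdd1.flatMap(_.split(" ")).map((_,1)).countByKey()
counts.foreach(println)
}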
Data skew:
//salt the key (append a random suffix), aggregate, then strip the salt and aggregate again
import scala.util.Random
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]") ;
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/test.txt",4)
rdd1.flatMap(_.split(" ")).map((_,1)).map(t=>{
//salt: append a random 0-99 suffix so a hot key is spread over up to 100 reduce keys
val word = t._1
val r = Random.nextInt(100)
(word + "_" + r,1)
}).reduceByKey(_ + _,4).map(t=>{
//strip the salt and aggregate once more to recover the true counts
val word = t._1;
val count = t._2;
val w = word.split("_")(0)
(w,count)
}).reduceByKey(_ + _,4).saveAsTextFile("d:/scala/out/lean");
}