1.mapPartitions
Scala
/*
map processes one element at a time.
mapPartitions processes all the data of one partition in a single call and
returns the result as the corresponding partition of the child RDD.
Because one call handles a whole partition, it usually runs noticeably faster
than map, and it suits cases where you want to create certain objects only
once per partition rather than once per element. The drawback is that the
whole partition is processed at once, so when memory is tight it can cause an
OOM (out of memory) error.
*/
def mapPartitions(sc: SparkContext): Unit = {
  val rdd: RDD[String] = sc.parallelize(Array("hello world", "hi boy", "hello girl", "hi hello"))
  val resRdd = rdd.mapPartitions(it => {
    val buffer = new ArrayBuffer[String]()
    while (it.hasNext) {
      // split one line and iterate over the resulting words, not over `it`,
      // otherwise the rest of the partition would be skipped
      val split = it.next().split(" ")
      for (word <- split) {
        buffer += word + "-@"
      }
    }
    buffer.toIterator
  })
  resRdd.foreach(println(_))
}
Java
private static void mapPartitions(JavaSparkContext sc) {
    JavaRDD<String> pRdd = sc.parallelize(Arrays.asList("hello world", "hi boy", "hello girl", "hi hello"));
    JavaRDD<String> resRdd = pRdd.mapPartitions(new FlatMapFunction<Iterator<String>, String>() {
        private static final long serialVersionUID = 1L;

        @Override
        public Iterator<String> call(Iterator<String> it) throws Exception {
            ArrayList<String> res = new ArrayList<String>();
            while (it.hasNext()) {
                String[] split = it.next().split(" ");
                for (String words : split) {
                    res.add(words + "@");
                }
                // System.out.println(res);
            }
            return res.iterator();
        }
    });
    System.out.println(resRdd.collect());
    // System.out.println(resRdd.toString()); // prints the RDD reference, not its contents
}
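The versions above collect the whole partition's output into an ArrayBuffer/ArrayList before returning it, which is exactly the OOM risk noted in the comment. A minimal Scala sketch of a lazier variant (my own addition, assuming the same imports and sc as the examples above): transform the iterator itself, so words are produced on demand instead of being buffered.

// Hypothetical variant, not part of the original listing.
def mapPartitionsLazy(sc: SparkContext): Unit = {
  val rdd: RDD[String] = sc.parallelize(Array("hello world", "hi boy", "hello girl", "hi hello"))
  val resRdd = rdd.mapPartitions(it =>
    // Iterator.flatMap is evaluated element by element, so the whole
    // partition is never materialized in a local buffer.
    it.flatMap(line => line.split(" ").map(word => word + "-@"))
  )
  resRdd.foreach(println(_))
}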
2.mapPartitionsWithIndex
Scala
def mapPartitionsWithIndex(sc: SparkContext): Unit = {
  val arrRdd: RDD[String] = sc.parallelize(Array("hello world", "hi boy", "hello girl", "hi hello"))
  // Compared with mapPartitions there is an extra `index` parameter:
  // the number of the partition being processed, counting from 0.
  val resRdd = arrRdd.mapPartitionsWithIndex((index, it) => {
    val bf = new ArrayBuffer[String]()
    while (it.hasNext) {
      val split = it.next().split(" ")
      // iterate over the split words, not over `it`
      for (word <- split) {
        bf += word + "_" + index
      }
    }
    bf.toIterator
  })
  resRdd.foreach(println(_))
}
Java
private static void mapPartitionsWithIndex(JavaSparkContext sc) {
    // JavaRDD<String> arrRdd = sc.parallelize(Arrays.asList("hello world", "hi boy", "hello girl", "hi hello"));
    JavaRDD<Integer> rdd1 = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), 3);
    JavaRDD<Tuple2<Integer, Integer>> resRdd = rdd1.mapPartitionsWithIndex(
            new Function2<Integer, Iterator<Integer>, Iterator<Tuple2<Integer, Integer>>>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Iterator<Tuple2<Integer, Integer>> call(Integer index, Iterator<Integer> it)
                        throws Exception {
                    ArrayList<Tuple2<Integer, Integer>> tuple2s = new ArrayList<Tuple2<Integer, Integer>>();
                    while (it.hasNext()) {
                        Integer next = it.next();
                        // pair each element with the number of the partition it lives in
                        tuple2s.add(new Tuple2<Integer, Integer>(index, next));
                    }
                    return tuple2s.iterator();
                }
            }, false);
    resRdd.foreach(new VoidFunction<Tuple2<Integer, Integer>>() {
        @Override
        public void call(Tuple2<Integer, Integer> t) throws Exception {
            System.out.println(t);
        }
    });
}
println("=======================================================================================")
private static void mapPartitionsWithIndex1(JavaSparkContext sc) {
    JavaRDD<String> rdd1 = sc.parallelize(Arrays.asList("hello world", "hi boy", "hello girl", "hi hello"));
    JavaRDD<Tuple2<Integer, String>> resRdd = rdd1
            .mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<Tuple2<Integer, String>>>() {
                @Override
                public Iterator<Tuple2<Integer, String>> call(Integer index, Iterator<String> it) throws Exception {
                    ArrayList<Tuple2<Integer, String>> tuple2 = new ArrayList<Tuple2<Integer, String>>();
                    while (it.hasNext()) {
                        String[] split = it.next().split(" ");
                        for (String words : split) {
                            tuple2.add(new Tuple2<Integer, String>(index, words));
                        }
                    }
                    return tuple2.iterator();
                }
            }, false);
    resRdd.foreach(new VoidFunction<Tuple2<Integer, String>>() {
        @Override
        public void call(Tuple2<Integer, String> t) throws Exception {
            System.out.println(t);
        }
    });
}
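A common practical use of the partition index is to treat one partition specially, for example dropping a header line that lands in partition 0 when a text file is read. A rough Scala sketch under that assumption (the file name "data.csv" and its header layout are hypothetical, not from the original code):

def dropHeader(sc: SparkContext): Unit = {
  // hypothetical input file whose first line is a header
  val lines: RDD[String] = sc.textFile("data.csv")
  val noHeader = lines.mapPartitionsWithIndex((index, it) =>
    // the header is the first record of partition 0; drop it, keep everything else
    if (index == 0) it.drop(1) else it
  )
  noHeader.foreach(println(_))
}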
3.sample
Scala
// Sampling: think of estimating the average height of a whole population.
// Sampling is a statistical idea: a small amount of data can stand in for the whole data set.
def sample(sc: SparkContext): Unit = {
  val numRdd: RDD[Int] = sc.parallelize(Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
  // withReplacement: whether to sample with replacement
  // fraction: expected fraction of the data to sample
  // seed: the random-number seed (not a sampling interval)
  // val resRdd1: RDD[Int] = numRdd.sample(false, 0.5)
  val resRdd2: RDD[Int] = numRdd.sample(true, 0.8)
  // resRdd1.foreach(print(_))
  resRdd2.foreach(println(_))
}
Java
private static void sample(JavaSparkContext sc) {
    JavaRDD<Integer> numRdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 0));
    // withReplacement = false: sample without replacement; pass true to sample with replacement
    JavaRDD<Integer> resRdd = numRdd.sample(false, 0.5);
    resRdd.foreach(new VoidFunction<Integer>() {
        private static final long serialVersionUID = 1L;

        @Override
        public void call(Integer t) throws Exception {
            System.out.println(t.toString());
        }
    });
}
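Because the third parameter is a random seed, passing the same seed makes a sample reproducible across runs. A small Scala sketch of my own (same sc and imports as the examples above) showing this, plus takeSample for when an exact number of elements is wanted instead of a fraction:

def sampleWithSeed(sc: SparkContext): Unit = {
  val numRdd: RDD[Int] = sc.parallelize(Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
  // same seed => same sample on every run
  val s1 = numRdd.sample(false, 0.5, seed = 42L)
  val s2 = numRdd.sample(false, 0.5, seed = 42L)
  println(s1.collect().mkString(",") == s2.collect().mkString(",")) // true
  // takeSample is an action: it returns exactly `num` elements to the driver
  println(numRdd.takeSample(false, 3, seed = 42L).mkString(","))
}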
4.repartition
Scala
// Changing the number of partitions: repartition, and also coalesce, which merges partitions
/*
def repartition(numPartitions: Int)(implicit ord: Ordering[T] = null): RDD[T] = withScope {
  coalesce(numPartitions, shuffle = true)
}
repartition just calls coalesce with shuffle = true, so it always triggers a shuffle.
def coalesce(numPartitions: Int, shuffle: Boolean = false) does not shuffle by default,
so repartition is simply a special case of coalesce.
Why does shuffle = true make it possible to increase the number of partitions?
With a shuffle, the data of one parent partition can be spread across several partitions of the child RDD.
Question: when should each operator be used?
repartition: mainly used to increase the number of partitions and therefore the degree of parallelism;
with too little parallelism the job runs slowly.
coalesce: merges partitions to reduce parallelism. Reduce parallelism when resources are scarce:
every partition becomes a task, and too many tasks only waste resources.
filter: after a filter the data is often uneven (and much smaller), so merging partitions is worth trying,
because fewer tasks means less resource consumption (a sketch of this pattern follows the Java example below).
*/
def repartition(sc: SparkContext): Unit = {
  val numRdd: RDD[Int] = sc.parallelize(Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 0), 3)
  println(numRdd.getNumPartitions) // 3
  // shuffle = true lets us increase the partition count and raise the parallelism
  val rdd1 = numRdd.repartition(5)
  println(rdd1.getNumPartitions) // 5
  val c1Rdd: RDD[Int] = numRdd.coalesce(5, false)
  println(c1Rdd.getNumPartitions) // 3: without a shuffle the partition count cannot grow
  val c2Rdd: RDD[Int] = numRdd.coalesce(5, true)
  println(c2Rdd.getNumPartitions) // 5
  val c3Rdd: RDD[Int] = numRdd.coalesce(2, false)
  println(c3Rdd.getNumPartitions) // 2
}
Java
private static void repartition(JavaSparkContext sc) {
    JavaRDD<Integer> numRdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 0), 3);
    System.out.println(numRdd.getNumPartitions()); // 3
    JavaRDD<Integer> rdd1 = numRdd.repartition(5);
    System.out.println(rdd1.getNumPartitions()); // 5
    JavaRDD<Integer> c1Rdd = numRdd.coalesce(5, false);
    System.out.println(c1Rdd.getNumPartitions()); // 3
    JavaRDD<Integer> c2Rdd = numRdd.coalesce(5, true);
    System.out.println(c2Rdd.getNumPartitions()); // 5
    JavaRDD<Integer> c3Rdd = numRdd.coalesce(2, false);
    System.out.println(c3Rdd.getNumPartitions()); // 2
}
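The filter use case from the comment block above, sketched in Scala (the data size and the roughly-1%-survival filter are illustrative assumptions): after an aggressive filter the many partitions are nearly empty, and coalesce without a shuffle folds them into a few fuller ones.

def filterThenCoalesce(sc: SparkContext): Unit = {
  val bigRdd: RDD[Int] = sc.parallelize(1 to 10000, 100)
  // keep roughly 1% of the data; the 100 partitions are now almost empty
  val filtered = bigRdd.filter(_ % 100 == 0)
  println(filtered.getNumPartitions) // still 100
  // merge into a handful of partitions without a shuffle
  val compacted = filtered.coalesce(4)
  println(compacted.getNumPartitions) // 4
  println(compacted.count()) // 100 elements survive the filter
}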
5.union
Scala
def union(sc: SparkContext): Unit = {
  val numRdd1: RDD[Int] = sc.parallelize(Array(1, 2, 3, 4, 5), 1)
  val numRdd2: RDD[Int] = sc.parallelize(Array(4, 5, 6, 7, 8), 3)
  val resRdd: RDD[Int] = numRdd1.union(numRdd2)
  resRdd.foreach(println(_))
  // the parents' partitions are concatenated: 1 + 3 = 4
  println("resNumPartitons :" + resRdd.getNumPartitions)
}
Java
private static void union(JavaSparkContext sc) {
    JavaRDD<Integer> rdd1 = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5), 1);
    JavaRDD<Integer> rdd2 = sc.parallelize(Arrays.asList(4, 5, 6, 7, 8), 3);
    JavaRDD<Integer> resRdd = rdd1.union(rdd2);
    // System.out.println(resRdd.collect());
    System.out.println(resRdd.getNumPartitions());
    resRdd.foreach(new VoidFunction<Integer>() {
        private static final long serialVersionUID = 1L;

        @Override
        public void call(Integer t) throws Exception {
            System.out.println(t);
        }
    });
}
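Note that Spark's union, unlike the set union in mathematics, keeps duplicates (4 and 5 appear twice above), and the result simply concatenates the parents' partitions, so it has 1 + 3 = 4 of them. If true set semantics are needed, follow union with distinct; a short Scala sketch:

def unionDistinct(sc: SparkContext): Unit = {
  val numRdd1: RDD[Int] = sc.parallelize(Array(1, 2, 3, 4, 5), 1)
  val numRdd2: RDD[Int] = sc.parallelize(Array(4, 5, 6, 7, 8), 3)
  val unioned = numRdd1.union(numRdd2)
  println(unioned.count()) // 10: duplicates are kept
  // distinct shuffles the data to remove the duplicates
  println(unioned.distinct().count()) // 8
}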
6.cartesian
Scala
def cartesian(sc: SparkContext): Unit = {
  val rdd01: RDD[String] = sc.parallelize(Array("张学友", "刘德华", "黎明", "郭富城"), 2)
  val rdd02: RDD[String] = sc.parallelize(Array("咖喱", "辣椒", "香肠"), 3)
  val resRdd: RDD[(String, String)] = rdd01.cartesian(rdd02)
  resRdd.foreach(println(_))
  // the partition counts multiply: 2 * 3 = 6
  println(resRdd.getNumPartitions)
}
Java
private static void cartesian(JavaSparkContext sc) {
    JavaRDD<String> rdd1 = sc.parallelize(Arrays.asList("一色", "雪乃", "团子", "彩加"), 2);
    JavaRDD<String> rdd2 = sc.parallelize(Arrays.asList("比企谷", "八幡", "大老师"), 3);
    JavaPairRDD<String, String> resRdd = rdd1.cartesian(rdd2);
    System.out.println(resRdd.getNumPartitions());
    System.out.println(resRdd.collect());
    resRdd.foreach(new VoidFunction<Tuple2<String, String>>() {
        @Override
        public void call(Tuple2<String, String> t) throws Exception {
            System.out.println(t);
        }
    });
}
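cartesian pairs every element of the first RDD with every element of the second, so both the record count and the partition count multiply (4 × 3 = 12 pairs in 2 × 3 = 6 partitions above), which gets expensive very quickly on large inputs. A quick Scala check of those numbers (my own sketch, same sc and imports as above):

def cartesianSize(sc: SparkContext): Unit = {
  val a: RDD[Int] = sc.parallelize(1 to 4, 2)
  val b: RDD[Int] = sc.parallelize(1 to 3, 3)
  val product = a.cartesian(b)
  println(product.count()) // 4 * 3 = 12 pairs
  println(product.getNumPartitions) // 2 * 3 = 6 partitions
}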
Scala: package and main
package day03
import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.ArrayBuffer
object Demo1 {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)
    val conf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[2]")
    val sc = new SparkContext(conf)
    // mapPartitions(sc)
    // mapPartitionsWithIndex(sc)
    // sample(sc)
    // repartition(sc)
    // union(sc)     // union, as in the set union in mathematics
    cartesian(sc)    // Cartesian product
    sc.stop()
  }
}
Java: package and main
package day03;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
public class Demo01 {
    public static void main(String[] args) {
        Logger.getLogger("org").setLevel(Level.ERROR);
        SparkConf conf = new SparkConf().setAppName("Demo01").setMaster("local[2]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // mapPartitions(sc);
        // mapPartitionsWithIndex(sc);
        mapPartitionsWithIndex1(sc);
        // sample(sc);
        // repartition(sc);
        // union(sc);     // union, as in the set union in mathematics
        // cartesian(sc); // Cartesian product
        sc.stop();
    }
}