/**
* Java版本导包相关
*/
import org.apache.spark.Partitioner;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.*;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
/**
* Scala版本导包相关
*/
import org.apache.spark.rdd.RDD
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import scala.collection.mutable.ArrayBuffer
---------------------------写在前面的东西--------------------------------------
public class TransformationOperator {
/**
 * Shared setup for the Java examples: a local-mode SparkConf/JavaSparkContext
 * reused by every operator demo below.
 */
public static SparkConf conf = new SparkConf().setMaster("local").setAppName("test");
public static JavaSparkContext sc = new JavaSparkContext(conf);
// Convenience shorthand so the demos can call println(...) directly.
public static void println(String str) {
System.out.println(str);
}
// operators
// main()
object TransformationOperator {
/**
 * Shared setup for the Scala examples: a local-mode SparkContext plus a
 * sample RDD of names, reused by the Scala demos below.
 */
val conf: SparkConf = new SparkConf()
conf.setMaster("local")
conf.setAppName("TransformationOperator")
val sc: SparkContext = new SparkContext(conf)
val list: List[String] = List("张无忌","赵敏","周芷若")
val rdd: RDD[String] = sc.parallelize(list)
// operators
// main()
---------------------------Transformation算子--------------------------------------
map()
/**
 * map(): applies a function to every element, one element at a time.
 * Prints "Hello <name>" for each of the three names.
 */
public static void map() {
    final List<String> names = Arrays.asList("张无忌", "赵敏", "周芷若");
    // Build the RDD by parallelizing a local collection.
    final JavaRDD<String> sourceRDD = sc.parallelize(names);
    final JavaRDD<String> greetingRDD = sourceRDD.map(new Function<String, String>() {
        @Override
        public String call(String name) throws Exception {
            return "Hello " + name;
        }
    });
    greetingRDD.foreach(new VoidFunction<String>() {
        @Override
        public void call(String greeting) throws Exception {
            println(greeting);
        }
    });
}
/**
 * Prefixes every name in the shared RDD with "name:" and prints it:
 * name:张无忌 / name:赵敏 / name:周芷若
 */
def map(): Unit = {
  val prefixed = rdd.map(name => "name:" + name)
  prefixed.foreach(x => println(x))
}
flatMap()
/**
 * flatMap(): one input line fans out to many words; each word is then
 * greeted with "Hello " and printed.
 */
public static void flatMap() {
    final List<String> lines = Arrays.asList("张无忌 赵敏", "宋青书 周芷若");
    final JavaRDD<String> lineRDD = sc.parallelize(lines);
    final JavaRDD<String> wordRDD = lineRDD.flatMap(new FlatMapFunction<String, String>() {
        @Override
        public Iterator<String> call(String line) throws Exception {
            // Split each space-separated line into individual names.
            return Arrays.asList(line.split(" ")).iterator();
        }
    });
    final JavaRDD<String> greetingRDD = wordRDD.map(new Function<String, String>() {
        @Override
        public String call(String word) throws Exception {
            return "Hello " + word;
        }
    });
    greetingRDD.foreach(new VoidFunction<String>() {
        @Override
        public void call(String greeting) throws Exception {
            println(greeting);
        }
    });
}
/**
 * flatMap(): splits each space-separated line into words and prints
 * "Hello:<word>" for each.
 *
 * Expected output:
 * Hello:张无忌
 * Hello:赵敏
 * Hello:宋青书
 * Hello:周芷若
 *
 * Fix: the original chained two flatMaps, the first splitting on "," —
 * the input contains no commas, so that pass was a no-op and is removed.
 */
def flatMap() = {
  val rdd1: RDD[String] = sc.parallelize(List("张无忌 赵敏", "宋青书 周芷若"))
  rdd1.flatMap(_.split(" ")).foreach(word => println("Hello:" + word))
}
filter()
/**
 * filter(): keeps only the even numbers from 1..7 and prints them.
 */
public static void filter() {
    final List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5, 6, 7);
    final JavaRDD<Integer> numberRDD = sc.parallelize(numbers);
    // Returning true keeps the element — here, the even ones.
    final JavaRDD<Integer> evenRDD = numberRDD.filter(new Function<Integer, Boolean>() {
        @Override
        public Boolean call(Integer value) throws Exception {
            return value % 2 == 0;
        }
    });
    evenRDD.foreach(new VoidFunction<Integer>() {
        @Override
        public void call(Integer value) throws Exception {
            println(value + "");
        }
    });
}
/**
 * Keeps only the even numbers; prints 2, 4, 6.
 */
def filter() = {
  val numbers: List[Int] = List(1, 2, 3, 4, 5, 6, 7)
  val numberRDD: RDD[Int] = sc.parallelize(numbers)
  val evens = numberRDD.filter(n => n % 2 == 0)
  evens.foreach(n => println(n))
}
groupByKey()
不会做局部汇总
/**
 * groupByKey(): groups a pair RDD's values by key. Unlike reduceByKey,
 * there is no map-side combine, so every value is shuffled.
 * Expected console output (key order not guaranteed):
 * 峨眉
 * 周芷若灭绝师太
 * 武当
 * 宋青书张三丰
 */
public static void groupBykey() {
final List<Tuple2<String, String>> list = Arrays.asList(
new Tuple2<String, String>("峨眉", "周芷若"),
new Tuple2<String, String>("武当", "宋青书"),
new Tuple2<String, String>("峨眉", "灭绝师太"),
new Tuple2<String, String>("武当", "张三丰")
);
final JavaPairRDD<String, String> rdd = sc.parallelizePairs(list);
final JavaPairRDD<String, Iterable<String>> groupBykeyRDD = rdd.groupByKey();
groupBykeyRDD.foreach(new VoidFunction<Tuple2<String, Iterable<String>>>() {
@Override
public void call(Tuple2<String, Iterable<String>> tuple) throws Exception {
// Print the group key, then all of its members run together on one line.
final String menpai = tuple._1;
final Iterator<String> iterator = tuple._2.iterator();
println(menpai + " ");
while (iterator.hasNext()) {
final String name = iterator.next();
System.out.print(name);
}
println("");
}
});
}
/**
 * groupByKey() also accepts an explicit partitioner; when omitted the
 * default is hash partitioning.
 * Output:
 * (峨眉,CompactBuffer(周芷若, 灭绝师太))
 * (武当,CompactBuffer(宋青书, 张三丰))
 */
def groupBykey() = {
  val pairs: List[(String, String)] = List(("峨眉", "周芷若"), ("武当", "宋青书"), ("峨眉", "灭绝师太"), ("武当", "张三丰"))
  val pairRDD: RDD[(String, String)] = sc.parallelize(pairs)
  val grouped = pairRDD.groupByKey()
  grouped.foreach(entry => println(entry))
}
reduceByKey()
会进行局部汇总,减少shuffle量,所以尽量使用该算子,避免使用groupByKey()
/**
 * reduceByKey(): sums the scores per key. Values are combined map-side
 * first, so less data is shuffled than with groupByKey — prefer it.
 * Expected: 峨眉 100 / 武当 129.
 */
public static void reduceBykey() {
    final List<Tuple2<String, Integer>> scores = Arrays.asList(
            new Tuple2<String, Integer>("峨眉", 40),
            new Tuple2<String, Integer>("武当", 30),
            new Tuple2<String, Integer>("峨眉", 60),
            new Tuple2<String, Integer>("武当", 99)
    );
    final JavaPairRDD<String, Integer> scoreRDD = sc.parallelizePairs(scores);
    final JavaPairRDD<String, Integer> totals = scoreRDD.reduceByKey(new Function2<Integer, Integer, Integer>() {
        @Override
        public Integer call(Integer left, Integer right) throws Exception {
            return left + right;
        }
    });
    totals.foreach(new VoidFunction<Tuple2<String, Integer>>() {
        @Override
        public void call(Tuple2<String, Integer> entry) throws Exception {
            println(entry._1 + " " + entry._2);
        }
    });
}
/**
 * Sums values per key. Output: (峨眉,100) / (武当,129).
 */
def reduceBykey() = {
  val scores: List[(String, Int)] = List(("峨眉", 40), ("武当", 30), ("峨眉", 60), ("武当", 99))
  val scoreRDD: RDD[(String, Int)] = sc.parallelize(scores)
  val totals = scoreRDD.reduceByKey((a, b) => a + b)
  totals.foreach(t => println(t))
}
sortByKey()
/**
 * sortByKey(false): sorts the pair RDD by key in descending order.
 * 98 -> 东方不败, 85 -> 令狐冲, 83 -> 任我行, 80 -> 岳不群
 */
public static void sortBykey() {
    final List<Tuple2<Integer, String>> rankings = Arrays.asList(
            new Tuple2<Integer, String>(98, "东方不败"),
            new Tuple2<Integer, String>(80, "岳不群"),
            new Tuple2<Integer, String>(85, "令狐冲"),
            new Tuple2<Integer, String>(83, "任我行")
    );
    final JavaPairRDD<Integer, String> rankRDD = sc.parallelizePairs(rankings);
    final JavaPairRDD<Integer, String> sorted = rankRDD.sortByKey(false);
    sorted.foreach(new VoidFunction<Tuple2<Integer, String>>() {
        @Override
        public void call(Tuple2<Integer, String> entry) throws Exception {
            println(entry._1 + " -> " + entry._2);
        }
    });
}
/**
 * Descending sort by key.
 * Output:
 * 98->东方不败
 * 85->令狐冲
 * 83->任我行
 * 80->岳不群
 *
 * Fix: the original called rdd.sortBy(word => word, false, 0) — a
 * numPartitions of 0 yields a shuffled RDD with zero partitions, so
 * nothing would be printed. sortByKey(false) matches the section title
 * and the Java version, and keeps the parent's partition count.
 */
def sortBykey() = {
  val list: List[(Int, String)] = List((98, "东方不败"), (80, "岳不群"), (85, "令狐冲"), (83, "任我行"))
  val rdd: RDD[(Int, String)] = sc.parallelize(list)
  rdd.sortByKey(false).foreach(x => println(x._1 + "->" + x._2))
}
join()
/**
 * join(): inner join of two pair RDDs on the student-id key.
 * Expected output (key order not guaranteed):
 * 学号:1 名字:东方不败 分数:99
 * 学号:3 名字:林平之 分数:97
 * 学号:2 名字:令狐冲 分数:98
 */
public static void join() {
final List<Tuple2<Integer, String>> names = Arrays.asList(
new Tuple2<Integer, String>(1, "东方不败"),
new Tuple2<Integer, String>(2, "令狐冲"),
new Tuple2<Integer, String>(3, "林平之")
);
final List<Tuple2<Integer, Integer>> scores = Arrays.asList(
new Tuple2<Integer, Integer>(1, 99),
new Tuple2<Integer, Integer>(2, 98),
new Tuple2<Integer, Integer>(3, 97)
);
final JavaPairRDD<Integer, String> nemesrdd = sc.parallelizePairs(names);
final JavaPairRDD<Integer, Integer> scoresrdd = sc.parallelizePairs(scores);
/**
 * <Integer,      student id (the join key)
 * Tuple2<String, name (from nemesrdd)
 * Integer>>      score (from scoresrdd)
 */
final JavaPairRDD<Integer, Tuple2<String, Integer>> joinRDD = nemesrdd.join(scoresrdd);
// final JavaPairRDD<Integer, Tuple2<Integer, String>> join = scoresrdd.join(nemesrdd);
joinRDD.foreach(new VoidFunction<Tuple2<Integer, Tuple2<String, Integer>>>() {
@Override
public void call(Tuple2<Integer, Tuple2<String, Integer>> tuple) throws Exception {
println("学号:" + tuple._1 + " 名字:" + tuple._2._1 + " 分数:" + tuple._2._2);
}
});
}
/**
 * Connects matching keys of two pair RDDs, like a SQL inner join.
 * Raw joined tuples look like (1,(东方不败,99)) and are printed as
 * 学号:1 名字:东方不败 分数:99, etc.
 */
def join() = {
  val names: List[(Int, String)] = List((1, "东方不败"), (2, "令狐冲"), (3, "林平之"))
  val scores: List[(Int, Int)] = List((1, 99), (2, 98), (3, 97))
  val nameRDD: RDD[(Int, String)] = sc.parallelize(names)
  val scoreRDD: RDD[(Int, Int)] = sc.parallelize(scores)
  val joined = nameRDD.join(scoreRDD)
  joined.foreach { case (id, (name, score)) =>
    println("学号:" + id + " 名字:" + name + " 分数:" + score)
  }
}
union()
/**
 * union(): concatenates two RDDs without deduplicating.
 * Prints 1 2 3 4 3 4 5 6 (one per line).
 */
public static void union() {
    final List<Integer> first = Arrays.asList(1, 2, 3, 4);
    final List<Integer> second = Arrays.asList(3, 4, 5, 6);
    final JavaRDD<Integer> firstRDD = sc.parallelize(first);
    final JavaRDD<Integer> secondRDD = sc.parallelize(second);
    final JavaRDD<Integer> combined = firstRDD.union(secondRDD);
    combined.foreach(new VoidFunction<Integer>() {
        @Override
        public void call(Integer value) throws Exception {
            println(value + "");
        }
    });
}
/**
 * union() merges two RDDs into one, keeping duplicates.
 * Prints 1 2 3 4 3 4 5 6 (one per line).
 */
def union() = {
  val first: List[Int] = List(1, 2, 3, 4)
  val second: List[Int] = List(3, 4, 5, 6)
  val firstRDD: RDD[Int] = sc.parallelize(first)
  val secondRDD: RDD[Int] = sc.parallelize(second)
  firstRDD.union(secondRDD).foreach(n => println(n))
}
leftOuterJoin()
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * leftOuterJoin(): keeps every key of the left RDD; the right side becomes
 * Some(value) when matched and None otherwise. Both RDDs must be key-value
 * pairs; a common use is blacklist filtering.
 * Output:
 * (22,(qwe,None))
 * (3,(zxc,Some(true)))
 * (2,(asd,Some(true)))
 */
object Test {
  def main(args: Array[String]): Unit = {
    val flags = List((1L, true), (2L, true), (3L, true))
    val records = List((22L, "qwe"), (2L, "asd"), (3L, "zxc"))
    val conf: SparkConf = new SparkConf().setMaster("local").setAppName("Test")
    val sc: SparkContext = new SparkContext(conf)
    val flagRDD: RDD[(Long, Boolean)] = sc.parallelize(flags)
    val recordRDD: RDD[(Long, String)] = sc.parallelize(records)
    val joined: RDD[(Long, (String, Option[Boolean]))] = recordRDD.leftOuterJoin(flagRDD)
    joined.foreach(entry => println(entry))
  }
}
intersection()
/**
 * intersection(): the elements present in both RDDs.
 * Prints 4 and 3.
 */
public static void intersection() {
    final List<Integer> first = Arrays.asList(1, 2, 3, 4);
    final List<Integer> second = Arrays.asList(3, 4, 5, 6);
    final JavaRDD<Integer> firstRDD = sc.parallelize(first);
    final JavaRDD<Integer> secondRDD = sc.parallelize(second);
    final JavaRDD<Integer> common = firstRDD.intersection(secondRDD);
    common.foreach(new VoidFunction<Integer>() {
        @Override
        public void call(Integer value) throws Exception {
            println(value + "");
        }
    });
}
/**
 * RDD1.intersection(RDD2) returns the deduplicated common elements.
 * It shuffles data, so it is relatively expensive.
 * Prints 4 and 3.
 */
def intersection(): Unit = {
  val first: List[Int] = List(1, 2, 3, 4)
  val second: List[Int] = List(3, 4, 5, 6)
  val firstRDD: RDD[Int] = sc.parallelize(first)
  val secondRDD: RDD[Int] = sc.parallelize(second)
  firstRDD.intersection(secondRDD).foreach(n => println(n))
}
distinct()
/**
 * distinct(): removes duplicate elements (requires a shuffle).
 * Prints 4 1 3 2 (order not guaranteed).
 */
public static void distinct() {
    final List<Integer> values = Arrays.asList(1, 2, 3, 3, 4, 4);
    final JavaRDD<Integer> valueRDD = sc.parallelize(values);
    final JavaRDD<Integer> uniqueRDD = valueRDD.distinct();
    uniqueRDD.foreach(new VoidFunction<Integer>() {
        @Override
        public void call(Integer value) throws Exception {
            println(value + " ");
        }
    });
}
/**
 * distinct() drops duplicates; it involves a shuffle, so it is costly.
 * Prints 4 1 3 2 (order not guaranteed).
 */
def distinct() = {
  val values: List[Int] = List(1, 2, 3, 3, 4, 4)
  val valueRDD: RDD[Int] = sc.parallelize(values)
  valueRDD.distinct().foreach(n => println(n))
}
cartesian()
/**
 * cartesian(): every pairing of the two RDDs' elements.
 * a->0 a->1 a->2 b->0 b->1 b->2
 */
public static void cartesian() {
    final List<String> letters = Arrays.asList("a", "b");
    final List<Integer> digits = Arrays.asList(0, 1, 2);
    final JavaRDD<String> letterRDD = sc.parallelize(letters);
    final JavaRDD<Integer> digitRDD = sc.parallelize(digits);
    final JavaPairRDD<String, Integer> product = letterRDD.cartesian(digitRDD);
    product.foreach(new VoidFunction<Tuple2<String, Integer>>() {
        @Override
        public void call(Tuple2<String, Integer> pair) throws Exception {
            println(pair._1 + "->" + pair._2);
        }
    });
}
/**
 * Cartesian product: each raw tuple (letter,digit) is printed as letter->digit.
 * (a,0) a->0
 * (a,1) a->1
 * (a,2) a->2
 * (b,0) b->0
 * (b,1) b->1
 * (b,2) b->2
 */
def cartesian()={
val list1: List[String] = List("a", "b")
val list2: List[Int] = List(0, 1, 2)
val rdd1: RDD[String] = sc.parallelize(list1)
val rdd2: RDD[Int] = sc.parallelize(list2)
rdd1.cartesian(rdd2).foreach(x=>println(x._1+"->"+x._2))
}
mapPartitions()
/**
 * map processes one record per call (e.g. one DB/file access per record);
 * mapPartitions receives a whole partition's iterator in one call,
 * reducing the number of fetches, so it is usually the higher-performance
 * choice. Beware: with a badly chosen partitioning, a single partition may
 * hold too much data to buffer in memory.
 * Output:
 * hello-1
 * hello-2
 * hello-3
 * hello-4
 * hello-5
 * hello-6
 */
public static void mapPartitions() {
final List<Integer> list = Arrays.asList(1, 2, 3, 4, 5, 6);
// The second argument requests two partitions for this RDD.
final JavaRDD<Integer> rdd = sc.parallelize(list, 2);
rdd.mapPartitions(new FlatMapFunction<Iterator<Integer>, String>() {
// Called once per partition with that partition's iterator.
@Override
public Iterator<String> call(Iterator<Integer> iterator) throws Exception {
List<String> list = new ArrayList<String>();
while (iterator.hasNext()) {
list.add("hello-" + iterator.next());
}
return list.iterator();
}
}).foreach(new VoidFunction<String>() {
@Override
public void call(String s) throws Exception {
println(s);
}
});
}
/**
 * mapPartitions() hands each partition's iterator to the function in one
 * call — "partition first, then map within each partition". It pays off
 * when the mapping needs expensive per-call setup (connections, buffers)
 * that can be shared across a whole partition.
 * Output: hello-1 ... hello-6 (one per line)
 *
 * Fix: the original passed the identity (_.toIterator) to mapPartitions
 * and did the real work in foreach, so the operator being demonstrated
 * did nothing. The "hello-" mapping now happens inside mapPartitions.
 */
def mapPartitions() = {
  val list: List[Int] = List(1, 2, 3, 4, 5, 6)
  val rdd: RDD[Int] = sc.parallelize(list, 2)
  rdd.mapPartitions(_.map("hello-" + _)).foreach(println(_))
}
repartition()
/**
 * repartition(): increases the number of partitions (full shuffle).
 * Example motivation: two HDFS blocks -> two partitions -> one task per
 * partition; repartition when the initial partition count is too low.
 * Prints 1,3,5,2,4,6.
 *
 * Fix: removed the redundant (JavaRDD<Integer>) cast — parallelize
 * already returns JavaRDD<Integer>.
 */
public static void repartition() {
    final List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5, 6);
    final JavaRDD<Integer> singlePartitionRDD = sc.parallelize(numbers, 1);
    // repartition(n) is equivalent to coalesce(n, shuffle = true)
    singlePartitionRDD.repartition(2)
            .foreach(new VoidFunction<Integer>() {
                @Override
                public void call(Integer value) throws Exception {
                    println(value + "");
                }
            });
}
/**
 * repartition() raises the partition count (shuffling the data) —
 * used when the original partition count is too low.
 * Prints 1 3 5 2 4 6 (one per line).
 */
def repartition() = {
  val numbers: List[Int] = List(1, 2, 3, 4, 5, 6)
  val singlePartitionRDD: RDD[Int] = sc.parallelize(numbers, 1)
  singlePartitionRDD.repartition(2).foreach(n => println(n))
}
aggregateByKey()
/**
 * Word count with aggregateByKey.
 * Expected output: you ->1, jump ->2, i ->1
 */
public static void aggregateByKey() {
final List<String> list = Arrays.asList("you,jump", "i,jump");
final JavaRDD<String> rdd = sc.parallelize(list);
rdd.flatMap(new FlatMapFunction<String, String>() {
@Override
public Iterator<String> call(String line) throws Exception {
return Arrays.asList(line.split(",")).iterator();
}
})
// PairFunction type params: input type, output key type, output value type
.mapToPair(new PairFunction<String, String, Integer>() {
@Override
public Tuple2<String, Integer> call(String word) throws Exception {
return new Tuple2<String, Integer>(word, 1);
}
})
// aggregateByKey(zeroValue, seqOp, combOp): zeroValue is each key's
// starting value (10 would start every word at 10; 0 counts from zero);
// seqOp merges within a partition, combOp merges across partitions.
// Fine-grained control, but verbose; the ops could equally well
// concatenate strings instead of summing.
.aggregateByKey(0, new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer v1, Integer v2) throws Exception {
return v1 + v2;//partition-local merge
}
}, new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer v1, Integer v2) throws Exception {
return v1 + v2;//cross-partition merge
}
}
).foreach(new VoidFunction<Tuple2<String, Integer>>() {
@Override
public void call(Tuple2<String, Integer> tuple) throws Exception {
println(tuple._1 + " ->" + tuple._2);
}
});
}
/**
 * Word count via aggregateByKey(zeroValue)(seqOp, combOp): seqOp combines
 * within a partition, combOp merges across partitions. Overloads accept a
 * custom Partitioner or an explicit partition count; otherwise the RDD's
 * existing partitioner is reused, falling back to HashPartitioner.
 * Output: (you,1) (jump,2) (i,1)
 */
def aggregateByKey() = {
  val lines: List[String] = List("you,jump", "i,jump")
  val lineRDD: RDD[String] = sc.parallelize(lines)
  val counts = lineRDD
    .flatMap(line => line.split(","))
    .map(word => (word, 1))
    .aggregateByKey(0)((partial, n) => partial + n, (a, b) => a + b)
  counts.foreach(c => println(c))
}
coalesce()
/**
 * coalesce(): shrinks the number of partitions (many -> few).
 * Prints 1,2,3,4,5,6.
 *
 * Fix: removed the redundant (JavaRDD<Integer>) cast — parallelize
 * already returns JavaRDD<Integer>.
 */
public static void coalesce() {
    final List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5, 6);
    final JavaRDD<Integer> rdd = sc.parallelize(numbers, 3);
    rdd.coalesce(1)
            .foreach(new VoidFunction<Integer>() {
                @Override
                public void call(Integer value) throws Exception {
                    println(value + "");
                }
            });
}
/**
 * coalesce() reduces the partition count (many -> few).
 * Prints 1,2,3,4,5,6.
 */
def coalesce() = {
  val numbers: List[Int] = List(1, 2, 3, 4, 5, 6)
  val threePartitionRDD: RDD[Int] = sc.parallelize(numbers, 3)
  threePartitionRDD.coalesce(1).foreach(n => println(n))
}
mapPartitionsWithIndex()
/**
 * map: processes one record per call.
 * mapPartitions: processes one partition's data per call.
 * mapPartitionsWithIndex: one partition per call, and also receives
 * that partition's index.
 *
 * Output (one per line):
 * 0_1
 * 0_2
 * 0_3
 * 0_4
 * 1_5
 * 1_6
 * 1_7
 * 1_8
 */
public static void mapPartitionsWithIndex() {
final List<Integer> list = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8);
final JavaRDD<Integer> rdd = sc.parallelize(list, 2);//partitioning options: HashPartitioner / RangePartitioner / custom
rdd.mapPartitionsWithIndex(new Function2<Integer, Iterator<Integer>, Iterator<String>>() {
@Override
public Iterator<String> call(Integer index, Iterator<Integer> iterator) throws Exception {
// Tag every element with the index of the partition it lives in.
final ArrayList<String> list = new ArrayList<>();
while (iterator.hasNext()) {
list.add(index + "_" + iterator.next());
}
return list.iterator();
}
}, true)
.foreach(new VoidFunction<String>() {
@Override
public void call(String s) throws Exception {
println(s);
}
});
}
/**
 * map: one record per call; mapPartitions: one partition per call;
 * mapPartitionsWithIndex: one partition per call plus its partition index.
 * Each element is printed as <partitionIndex>_<value>:
 * 0_1 0_2 1_3 1_4 1_5 2_6 2_7 2_8
 *
 * Fixes: removed the unused `result` variable and the Any-typed
 * ArrayBuffer (which widened the element type); the partition iterator
 * is now mapped lazily instead of being buffered in full.
 */
def mapPartitionsWithIndex() = {
  val list: List[Int] = List(1, 2, 3, 4, 5, 6, 7, 8)
  val rdd: RDD[Int] = sc.parallelize(list, 3)
  rdd.mapPartitionsWithIndex(
    (index, iter) => iter.map(value => index + "_" + value),
    true
  ).foreach(println(_))
}
cogroup()
/**
 * When called on datasets of type (K, V) and (K, W),
 * returns a dataset of (K, (Iterable<V>, Iterable<W>)) tuples.
 *
 * ID:1 Name: [东方不败, 东方不败] Scores: [90, 98]
 * ID:3 Name: [岳不群, 岳不群] Scores: [89, 67]
 * ID:2 Name: [林平之, 林平之] Scores: [91, 78]
 *
 */
public static void cogroup() {
final List<Tuple2<Integer, String>> list1 = Arrays.asList(
new Tuple2<Integer, String>(1, "东方不败"),
new Tuple2<Integer, String>(2, "林平之"),
new Tuple2<Integer, String>(3, "岳不群"),
new Tuple2<Integer, String>(1, "东方不败"),
new Tuple2<Integer, String>(2, "林平之"),
new Tuple2<Integer, String>(3, "岳不群")
);
final List<Tuple2<Integer, Integer>> list2 = Arrays.asList(
new Tuple2<Integer, Integer>(1, 90),
new Tuple2<Integer, Integer>(2, 91),
new Tuple2<Integer, Integer>(3, 89),
new Tuple2<Integer, Integer>(1, 98),
new Tuple2<Integer, Integer>(2, 78),
new Tuple2<Integer, Integer>(3, 67)
);
final JavaPairRDD<Integer, String> rdd1 = sc.parallelizePairs(list1);
final JavaPairRDD<Integer, Integer> rdd2 = sc.parallelizePairs(list2);
// NOTE(review): the cast below is redundant — cogroup already returns
// this exact type.
final JavaPairRDD<Integer, Tuple2<Iterable<String>, Iterable<Integer>>> rdd3 =
(JavaPairRDD<Integer, Tuple2<Iterable<String>, Iterable<Integer>>>) rdd1.cogroup(rdd2);
rdd3.foreach(new VoidFunction<Tuple2<Integer, Tuple2<Iterable<String>, Iterable<Integer>>>>() {
@Override
public void call(Tuple2<Integer, Tuple2<Iterable<String>, Iterable<Integer>>> tuple) throws Exception {
final Integer id = tuple._1;
final Iterable<String> names = tuple._2._1;
final Iterable<Integer> scores = tuple._2._2;
println("ID:" + id + " Name: " + names + " Scores: " + scores);
}
});
}
/**
 * groupByKey groups a single RDD; cogroup() groups several RDDs that share
 * a key: RDD1.cogroup(RDD2) yields
 * (key, (Iterable[values from RDD1], Iterable[values from RDD2])), and
 * RDD1.cogroup(RDD2, RDD3, ..., RDDN) extends this to more iterables.
 *
 * Raw tuples:
 * (1,(CompactBuffer(东方不败, 东方不败),CompactBuffer(90, 98)))
 * (3,(CompactBuffer(岳不群, 岳不群),CompactBuffer(89, 67)))
 * (2,(CompactBuffer(林平之, 林平之),CompactBuffer(91, 78)))
 * Printed as:
 * ID:1Name:CompactBuffer(东方不败, 东方不败)Scores:CompactBuffer(90, 98)
 * ID:3Name:CompactBuffer(岳不群, 岳不群)Scores:CompactBuffer(89, 67)
 * ID:2Name:CompactBuffer(林平之, 林平之)Scores:CompactBuffer(91, 78)
 */
def cogroup() = {
  val names: List[(Int, String)] = List((1, "东方不败"), (2, "林平之"), (3, "岳不群"), (1, "东方不败"), (2, "林平之"), (3, "岳不群"))
  val scores: List[(Int, Int)] = List((1, 90), (2, 91), (3, 89), (1, 98), (2, 78), (3, 67))
  val nameRDD: RDD[(Int, String)] = sc.parallelize(names)
  val scoreRDD: RDD[(Int, Int)] = sc.parallelize(scores)
  nameRDD.cogroup(scoreRDD).foreach { case (id, (ns, ss)) =>
    println("ID:" + id + "Name:" + ns + "Scores:" + ss)
  }
}
repartitionAndSortWithinPartitions()
/**
 * Repartition (few -> many) and sort within each partition in one pass.
 * The custom Partitioner below sends even keys to partition 0 and odd keys
 * to partition 1; keys are sorted within each partition during the shuffle.
 */
public static void repartitionAndSortWithinPartitions() {//tuning-oriented operator
final List<Integer> list = Arrays.asList(1, 2, 11, 3, 12, 4, 5);
final JavaRDD<Integer> rdd = sc.parallelize(list, 1);
final JavaPairRDD<Integer, Integer> pairRDD = rdd.mapToPair(new PairFunction<Integer, Integer, Integer>() {
@Override
public Tuple2<Integer, Integer> call(Integer number) throws Exception {
// Pair each number with itself so it has a key to partition/sort on.
return new Tuple2<>(number, number);
}
});
// Alternatives: new HashPartitioner(2), new RangePartitioner<>(...)
pairRDD.repartitionAndSortWithinPartitions(new Partitioner() {
@Override
public int numPartitions() {
return 2;
}
@Override
public int getPartition(Object key) {
final Integer number = Integer.valueOf(key.toString());
if (number % 2 == 0) {
return 0;
} else {
return 1;
}
}
}).mapPartitionsWithIndex(new Function2<Integer, Iterator<Tuple2<Integer, Integer>>,
Iterator<String>>() {
@Override
public Iterator<String> call(Integer index, Iterator<Tuple2<Integer, Integer>> iterator) throws Exception {
// Prefix each pair with its partition index to make the layout visible.
final ArrayList<String> list = new ArrayList<>();
while (iterator.hasNext()) {
list.add(index + "_" + iterator.next());
}
return list.iterator();
}
}, false)
.foreach(new VoidFunction<String>() {
@Override
public void call(String s) throws Exception {
println(s);
}
});
}
/**
 * When a repartition must be followed by a sort, prefer
 * repartitionAndSortWithinPartitions: it sorts while shuffling, which can
 * beat a separate repartition + sort. Requires a key-value RDD.
 */
def repartitionAndSortWithinPartitions() = {
  val pairs = List((1, 3), (1, 2), (5, 4), (1, 4), (2, 3), (2, 4))
  val pairRDD: RDD[(Int, Int)] = sc.parallelize(pairs, 1)
  val partitioned = pairRDD.repartitionAndSortWithinPartitions(new HashPartitioner(3))
  partitioned.foreach(p => println(p))
}
sample()
/**
 * sample(): random sampling, with or without replacement.
 */
public static void sample() {
final List<Integer> list = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 9, 10);
final JavaRDD<Integer> rdd = sc.parallelize(list);
/**
 * withReplacement: Boolean
 * true: sample with replacement
 * false: sample without replacement
 * fraction: Double
 * expected probability of selecting each element
 * seed: Long
 * random seed (optional; a fixed seed makes the sample deterministic)
 */
final JavaRDD<Integer> rdd2 = rdd.sample(false, 0.5);
rdd2.foreach(new VoidFunction<Integer>() {
@Override
public void call(Integer integer) throws Exception {
println(integer + "");
}
});
}
/**
 * Random sampling: the output varies run to run. Supplying the optional
 * third argument (a seed) makes the result deterministic.
 */
def sample() = {
  val values: List[Int] = List(1, 2, 3, 4, 5, 6, 7, 9, 10)
  val valueRDD: RDD[Int] = sc.parallelize(values)
  // withReplacement = false, fraction = 0.5
  valueRDD.sample(false, 0.5).foreach(n => println(n))
}
pipe()
// pipe(): stream RDD elements through an external shell command
// (the example invocation is left commented out).
public static void pipe() {
final List<Integer> list = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 9, 10);
final JavaRDD<Integer> rdd = sc.parallelize(list);
// final JavaRDD<String> pipe = rdd.pipe("sh wordcouont.sh");
}
/**
 * pipe() runs each partition's elements through a shell command, producing
 * a new RDD from the command's stdout — a way to reuse Linux text-processing
 * tools from Spark.
 * NOTE(review): the piped RDD is discarded here and pipe() is lazy, so the
 * shell command never actually runs until an action (e.g. collect) is
 * called on the result — confirm this is intentional.
 */
def pipe()={
val list: List[Int] = List(1,2,3,4,5,6,7,9,10)
val rdd: RDD[Int] = sc.parallelize(list)
rdd.pipe("sh wordcount.sh")
}