RDD算子大全,你想要的我都有
一 转换算子TransformRDD
SC
- Scala
object mapdemo {
def main(args: Array[String]): Unit = {
println("----------创建SparkContext--------------")
val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("mapdemo")
val sc: SparkContext = SparkContext.getOrCreate(conf)
- Java
public class ParallelizeJava {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("java1");
JavaSparkContext sc = new JavaSparkContext(conf);
parallelize
- Scala
val rdd1:RDD[Int]=sc.parallelize(List(1,2,3,4))
- Java
List<String> strings = Arrays.asList("hello world", "hello java", "hello spark");
JavaRDD<String> rdd1 = sc.parallelize(strings);
List<String> collect = rdd1.collect();
for (String s : collect) {
System.out.println(s);
}
textFile
- Java
//本地文件相对路径
JavaRDD<String> stringJavaRDD = sc.textFile("in/word.txt");
List<String> collect1 = stringJavaRDD.collect();
for (String s : collect1) {
System.out.println(s);
}
//hdfs路径
JavaRDD<String> stringhdfsJavaRDD = sc.textFile("hdfs://Zhuuu:9000/sparkWorkSpace/s*");
List<String> collect2 = stringhdfsJavaRDD.collect();
for (String s : collect2) {
System.out.println(s);
}
filter
- Scala
val rdd1: RDD[Int] = sc.parallelize( 1 to 10,3)
val rdd2: RDD[Int] = rdd1.filter(_%2==0)
val rdd3: RDD[Int] = rdd1.filter(_<4)
- Java
JavaRDD<String> lines = sc.textFile("in");
JavaRDD<String> filterRdd = lines.filter(new Function<String, Boolean>() {
@Override
public Boolean call(String v1) throws Exception {
return v1.contains("spark");
}
});
List<String> collect = filterRdd.collect();
for (String s : collect) {
System.out.println(s);
}
mapValue
- Scala
val mapValuesRdd1: RDD[String] = sc.parallelize(List("tiger","dog","lion","cat","panther","eagle"))
val mapValuesRdd2: RDD[(Int, String)] = mapValuesRdd1.map(x=>(x.length,x))
val mapValuesRdd3: RDD[(Int, String)] = mapValuesRdd2.mapValues(x=>"_"+x+"_")
mapValuesRdd3.collect().foreach(println)
/*
(5,_tiger_)
(3,_dog_)
(4,_lion_)
(3,_cat_)
(7,_panther_)
(5,_eagle_)
distinct
- Scala
val rdd1: RDD[Int] = sc.parallelize(List(1,2,3,4,5,3,4,5,6))
val rdd2: RDD[Int] = rdd1.distinct()
println(rdd1.collect().size)
println(rdd2.collect().size)
println(rdd1.partitions.length)
println(rdd2.partitions.length)
val rdd3: RDD[Int] = rdd1.distinct(2)
println(rdd3.partitions.length)
/*
6
3
3
2
- Java
JavaRDD<Integer> rdd1 = sc.parallelize(Arrays.asList(1, 2, 3, 4, 3, 4, 5));
JavaRDD<Integer> distinct = rdd1.distinct();
for (Integer integer : distinct.collect()) {
System.out.print(integer+",");
}
union/++
- Scala
val r1: RDD[Int] = sc.parallelize(1 to 3)
val r2: RDD[Int] = sc.makeRDD(3 to 4)
r1.union(r2).collect().foreach(println)
(r1++r2).collect().foreach(println)
/*
1
2
3
3
4
1
2
3
3
4
- Java
JavaRDD<Integer> rdd2 = sc.parallelize(Arrays.asList(3,10, 11, 12));
for (Integer integer : rdd1.union(rdd2).collect()) {
System.out.print(integer+",");
}
intersection
- Scala
r1.intersection(r2).collect().foreach(println)
/*
3
- Java
JavaRDD<Integer> intersection = rdd1.intersection(rdd2);
List<Integer> collect = intersection.collect();
for (Integer integer : collect) {
System.out.print(integer+",");
}
subtract
- Java
JavaRDD<Integer> subtract = rdd1.subtract(rdd2);
List<Integer> collect1 = subtract.collect();
for (Integer integer : collect1) {
System.out.print(integer+",");
}
cartesian
- Java
JavaPairRDD<Integer, Integer> cartesian = rdd1.cartesian(rdd2);
List<Tuple2<Integer, Integer>> collect2 = cartesian.collect();
for (Tuple2<Integer, Integer> integerIntegerTuple2 : collect2) {
// System.out.println("("+integerIntegerTuple2._1+" "+integerIntegerTuple2._2+")");
System.out.println(integerIntegerTuple2);
}
toDebugString
- 查看血统关系
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object toDebugString {
  def main(args: Array[String]): Unit = {
    // Local Spark context used only to inspect RDD lineage.
    val conf = new SparkConf().setMaster("local[*]").setAppName("todebugString")
    val sc   = new SparkContext(conf)

    // Build a three-stage lineage and dump it after each transformation;
    // toDebugString prints the chain of parent RDDs (the "lineage").
    val lines: RDD[String] = sc.textFile("in/simple.txt", 3)
    println(lines.toDebugString)

    val words: RDD[String] = lines.flatMap(line => line.split(" "))
    println(words.toDebugString)

    val pairs: RDD[(String, Int)] = words.map(word => (word, 1))
    println(pairs.toDebugString)
  }
}
/*
//(3)代表3个分区
(3) in/simple.txt MapPartitionsRDD[1] at textFile at toDebugString.scala:10 []
| in/simple.txt HadoopRDD[0] at textFile at toDebugString.scala:10 []
(3) MapPartitionsRDD[2] at flatMap at toDebugString.scala:12 []
| in/simple.txt MapPartitionsRDD[1] at textFile at toDebugString.scala:10 []
| in/simple.txt HadoopRDD[0] at textFile at toDebugString.scala:10 []
(3) MapPartitionsRDD[3] at map at toDebugString.scala:14 []
| MapPartitionsRDD[2] at flatMap at toDebugString.scala:12 []
| in/simple.txt MapPartitionsRDD[1] at textFile at toDebugString.scala:10 []
| in/simple.txt HadoopRDD[0] at textFile at toDebugString.scala:10 []
dependencies
- 查看宽窄依赖
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object dependencies {
  def main(args: Array[String]): Unit = {
    // Local Spark context used to probe dependency kinds (narrow vs shuffle).
    val conf = new SparkConf().setMaster("local[*]").setAppName("dependencies")
    val sc   = new SparkContext(conf)

    val left: RDD[(Int, Int)]  = sc.parallelize(List((1, 2), (2, 3), (4, 5)))
    val right: RDD[(Int, Int)] = sc.parallelize(List((1, 3)))

    // join — printed as a OneToOneDependency (narrow) in the sample run below.
    val joined: RDD[(Int, (Int, Int))] = left.join(right)
    println(joined.dependencies)

    // sortByKey repartitions by key range — a ShuffleDependency (wide).
    val sorted: RDD[(Int, Int)] = left.sortByKey()
    println(sorted.dependencies)
  }
}
/*
List(org.apache.spark.OneToOneDependency@790132f7)
List(org.apache.spark.ShuffleDependency@66434cc8)
二 动作算子ActionRDD
first
- Scala
val rdd=sc.parallelize(List(1,2,3,3))
val i:Int=rdd.first()
println(i) //1
- Java
JavaRDD<Integer> rdd=sc.parallelize(Arrays.asList(1,2,3,4));
Integer first=rdd.first();
System.out.println(first); //1
take
- Scala
val rdd=sc.parallelize(List(1,2,3,4))
val rdd2:Array[Int]=rdd.take(2)
println(rdd2.mkString(",")) //1,2
- Java
JavaRDD<Integer> rdd=sc.parallelize(Arrays.asList(1,2,3,4));
List<Integer> take=rdd.take(2);
System.out.println(take); //[1,2]
collect
- Scala
val rdd=sc.parallelize(List(1,2,3,4))
rdd.collect.foreach(println)
- Java
JavaRDD<Integer> rdd=sc.parallelize(Arrays.asList(1,2,3,4));
List<Integer> collect=rdd.collect();
count
- Scala
val rdd=sc.parallelize(List(1,2,3,4))
val i:Long=rdd.count()
- Java
JavaRDD<Integer> rdd=sc.parallelize(Arrays.asList(1,2,3,4));
long count=rdd.count();
countByKey&collectAsMap
- Scala
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object CountByKey {
  /**
   * Demonstrates countByKey: an action that returns a local
   * Map(key -> number of pairs with that key) to the driver.
   */
  def main(args: Array[String]): Unit = {
    //1. Create the SparkConf and set the app name
    val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
    //2. Create the SparkContext, the entry point for submitting a Spark app
    val sc: SparkContext = new SparkContext(conf)

    val rdd1: RDD[(Int, Int)] = sc.makeRDD(List((1, 2), (1, 3), (2, 3)))
    println("-------countByKey------")
    val countkey: collection.Map[Int, Long] = rdd1.countByKey()
    // Iterate the (key, count) entries directly instead of
    // keySet + countkey.get(elem).get — Option.get can throw.
    for ((key, cnt) <- countkey) {
      println(key + "=" + cnt)
    }

    // Release the context, matching the file's other examples.
    sc.stop()
  }
}
/*
-------countByKey------
1=2
2=1
- Java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* @author: Zhuuu_ZZ
* @Date 2020/11/9
* @Description:
*/
public class CountByKeyJava {
    public static void main(String[] args) {
        // Local Spark context for the countByKey demo.
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("countbykey");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // countByKey requires a JavaPairRDD; fromJavaRDD converts the type.
        JavaRDD<Tuple2<Integer, Integer>> tupleRdd = sc.parallelize(
                Arrays.asList(new Tuple2(1, 2), new Tuple2(1, 3), new Tuple2(2, 5)));
        JavaPairRDD<Integer, Integer> pairRdd = JavaPairRDD.fromJavaRDD(tupleRdd);

        // Action: per-key counts come back to the driver as a local Map.
        Map<Integer, Long> counts = pairRdd.countByKey();
        for (Integer key : counts.keySet()) {
            Long cnt = counts.get(key);
            System.out.println("(" + key + "," + cnt + ")");
        }
    }
}
/*
(1,2)
(2,1)
countByValue
- Scala
val rdd=sc.parallelize(List(1,2,3,3))
val intToLong: collection.Map[Int, Long] = rdd.countByValue()
println(intToLong) //Map(1 -> 1, 2 -> 1, 3 -> 2)
- Java
JavaRDD<Integer> rdd=sc.parallelize(Arrays.asList(1,2,3,3));
Map<Integer,Long> integerLongMap=rdd.countByValue();
System.out.println(integerLongMap); //{1=1, 2=1, 3=2}
collectAsMap
- Scala
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object CountByKey {
  def main(args: Array[String]): Unit = {
    //1. Create the SparkConf and set the app name
    val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
    //2. Create the SparkContext, the entry point for submitting a Spark app
    val sc: SparkContext = new SparkContext(conf)

    val pairs: RDD[(Int, Int)] = sc.makeRDD(List((1, 2), (1, 3), (2, 3)))
    // collectAsMap brings all pairs back as a local Map. Map keys are
    // unordered and unique, so for duplicate keys the last value wins.
    val asMap: collection.Map[Int, Int] = pairs.collectAsMap()
    println(asMap)
  }
}
/*
Map(2 -> 3, 1 -> 3)
- Java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* @author: Zhuuu_ZZ
* @Date 2020/11/9
* @Description:
*/
public class CountByKeyJava {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("countbykey");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaRDD<Tuple2<Integer,Integer>> rdd1 = sc.parallelize(Arrays.asList(new Tuple2(1, 2), new Tuple2(1, 3), new Tuple2(2, 5)));
System.out.println("---JavaPairRDD方式会导致collectAsMap出bug----");
JavaPairRDD<Integer, Integer> mapRDD = JavaPairRDD.fromJavaRDD(rdd1);
Map<Integer, Integer> map = mapRDD.collectAsMap();
//Exception in thread "main" java.lang.ClassCastException: [Ljava.lang.Object; cannot be cast to [Lscala.Tuple2;
System.out.println("---所以JavaRDD转换JavaPairRDD时采用mapToPair方式就不会报错----");
JavaPairRDD<Integer, Integer> mapRdd = rdd1.mapToPair(new PairFunction<Tuple2<Integer, Integer>, Integer, Integer>() {
@Override
public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> tp2) throws Exception {
return tp2;
}
});
Map<Integer, Integer> map1 = mapRdd.collectAsMap();
System.out.println(map1);
// {2=5, 1=3} //map键无序且唯一
Reduce
- Scala
val actionReduce: RDD[Int] = sc.parallelize(1 to 10,2)//分区大于1时数据分到不同分区会在自己分区内相加,然后分区间加,所以显得无序。
val sum: Int = actionReduce.reduce((x,y)=>{println(x,y);x+y})
println("总和: "+sum)
/*
(6,7)
(1,2)
(3,3)
(13,8)
(6,4)
(21,9)
(10,5)
(30,10)
(15,40)
总和: 55
- Java
Integer reduce = rdd.reduce(new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer integer, Integer integer2) throws Exception {
return integer + integer2;
}
});
aggregate
- Scala
val rdd1: RDD[Int] = sc.parallelize(List(1,2,3,3),1)
val agg: Int = rdd1.aggregate(1)((x,y)=>{println("区内",x,y);x+y},(x,y)=>{println("区间",x,y);x+y})
println(agg)
/*
(区内,1,1)
(区内,2,2)
(区内,4,3)
(区内,7,3)
(区间,1,10)
11
*/
val rdd2: RDD[Int] = sc.parallelize(List(1,2,3,3),4)
val agg2: Int = rdd2.aggregate(1)((x,y)=>{println("区内",x,y);x+y},(x,y)=>{println("区间",x,y);x+y})
println(agg2)
/*
(区内,1,3)
(区内,1,1)
(区内,1,2)
(区内,1,3)
(区间,1,3)
(区间,4,4)
(区间,8,2)
(区间,10,4)
14
*/
fold
总结:结果=集合数据之和+(分区数+1)*初始值
- Scala
val rdd1: RDD[Int] = sc.parallelize(List(1,2,3,3),1)
val fold1: Int = rdd1.fold(0)((x,y)=>{println(x,y);x+y})
println(fold1)
/*
(1,1)
(2,2)
(4,3)
(7,3)
(1,10)
11
*/
val rdd2: RDD[Int] = sc.parallelize(List(1,2,3,3),4)
val fold2: Int = rdd2.fold(1)((x,y)=>{println(x,y);x+y})
println(fold2)
/*
(1,3)
(1,2)
(1,1)
(1,3)
(1,3)
(4,4)
(8,4)
(12,2)
14
*/
val rdd3: RDD[Int] = sc.parallelize(List(1,2,3,3),4)
val fold3: Int = rdd3.fold(1)((x,y)=>{println(x,y);x+y})
println(fold3)
/*
(1,3)
(1,1)
(1,2)
(1,3)
(1,1)
(2,1)
(3,2)
(5,1)
(6,4)
(10,1)
(11,3)
(14,4)
18
*/
top
- Scala
val rdd1=sc.parallelize(List(1,2,3,4))
val top: Array[Int]=rdd1.top(2)
println(top.mkString(",")) //4,3
takeOrdered
- Scala
val rdd1=sc.parallelize(List(1,2,3,4))
val takeOrder: Array[Int]=rdd1.takeOrdered(2)
println(takeOrder.mkString(",")) //1,2
saveAsTextFile
- Scala
actionReduce.saveAsTextFile("file:///D:/hi/rdd1.txt") //储存在windows本地
actionReduce.saveAsTextFile("hdfs://Zhuuu:9000/sparkWorkSpace/rdd6.txt") //储存在hdfs路径
在windows本地hosts添加ip 主机名
**注意:**上面的Zhuuu是虚拟机ip地址在windows系统的代称。具体看下图
这边强烈建议这里的主机名要和虚拟机主机名一致,否则,即使这里没有错误,以后在IDEA中spark操作hive等操作时还是会报错。
三 聚合操作…ByKey
Java中各种…ByKey的前提都要为JavaPairRDD,如果不是请mapToPair,flatMapToPair去转换类型和结构。而Java中的map和flatMap只能改变结构不能改变类型,即它只是JavaRDD类型。
reduceByKey
- Scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
object reduceByKeyScala {
  def main(args: Array[String]): Unit = {
    //1. Create the SparkConf and set the app name
    val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
    //2. Create the SparkContext, the entry point for submitting a Spark app
    val sc: SparkContext = new SparkContext(conf)

    // reduceByKey merges the values of equal keys; the println inside the
    // merge function traces each combine step.
    val pairs: RDD[(Int, Int)] =
      sc.parallelize(List((1, 2), (3, 4), (3, 5), (4, 6), (4, 7), (4, 8)), 1)
    val reduced: RDD[(Int, Int)] = pairs.reduceByKey { (x, y) =>
      println("one:" + x, "two:" + y)
      x + y
    }
    reduced.collect().foreach(println)

    println("----------wordcount----------")
    // Classic word count: split on whitespace, pair each word with 1, sum.
    val lines: RDD[String] = sc.textFile("in/simple.txt")
    lines
      .flatMap(_.split("\\s+"))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
      .collect()
      .foreach(println)

    //4. Shut the context down
    sc.stop()
  }
}
- Java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
/**
* @author: Zhuuu_ZZ
* @Date 2020/11/6
* @Description: reduceByKey的前提是PairRDD
*/
public class ReduceByKeyJava {
public static void main(String[] args) {
// Local Spark context for the reduceByKey demo.
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("reducebykey");
JavaSparkContext sc = new JavaSparkContext(conf);
// reduceByKey needs a JavaPairRDD, so first map every word to a (word, 1) pair.
JavaRDD<String> rdd1 = sc.parallelize(Arrays.asList("hello", "haha", "hello", "hello"));
JavaPairRDD<String, Integer> rdd2 = rdd1.mapToPair(new PairFunction<String, String, Integer>() {
@Override
public Tuple2<String, Integer> call(String s) throws Exception {
return new Tuple2<>(s, 1);
}
});
// Merge the values of equal keys by summing them.
JavaPairRDD<String, Integer> rdd3 = rdd2.reduceByKey(new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer v1, Integer v2) throws Exception {
return v1 + v2;
}
});
List<Tuple2<String, Integer>> collect = rdd3.collect();
for (Tuple2<String, Integer> stringIntegerTuple2 : collect) {
System.out.println(stringIntegerTuple2);
}
// textFile yields one element per line; flatMapToPair flattens each line into
// one (word, 1) tuple per word so every tuple ends up in a single collection.
// mapToPair would instead keep one element per line.
JavaRDD<String> textFileRDD = sc.textFile("in/sample.txt");
JavaPairRDD<String, Integer> stringIntegerJavaPairRDD = textFileRDD.flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
@Override
public Iterator<Tuple2<String, Integer>> call(String s) throws Exception {
List<Tuple2<String, Integer>> list = new ArrayList<>();
String[] s1 = s.split(" ");
for (String s2 : s1) {
Tuple2<String, Integer> tp2 = new Tuple2<>(s2, 1);
list.add(tp2);
}
return list.iterator();
}
});
// Word count over the file contents: sum the 1s per word.
JavaPairRDD<String, Integer> stringIntegerJavaPairRDD1 = stringIntegerJavaPairRDD.reduceByKey(new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer v1, Integer v2) throws Exception {
return v1 + v2;
}
});
List<Tuple2<String, Integer>> collect1 = stringIntegerJavaPairRDD1.collect();
for (Tuple2<String, Integer> co : collect1) {
System.out.println(co);
}
}
}
groupByKey
- Scala
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object GroupByScala {
// Demonstrates groupByKey vs groupBy on a pair RDD, plus several ways of
// walking the local result of collectAsMap (while+iterator, for-comprehension).
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("groupby")
val sc = new SparkContext(conf)
val list = List(
("zhangsan", 99),
("zhangsan", 87),
("lisi", 85),
("lisi", 88),
("wangwu", 100))
val rdd1: RDD[(String, Int)] = sc.parallelize(list)
println("---------groupByKey----------")
// groupByKey keeps only the values per key: (name, Iterable[score]).
val rdd2: RDD[(String, Iterable[Int])] = rdd1.groupByKey()
val tuples: Array[(String, Iterable[Int])] = rdd2.collect()
tuples.foreach(println)
tuples.foreach(x=>{
x._2.foreach(y=>println(x._1,y))
})
println("-----groupBy----------")
// groupBy keeps the whole tuple per key: (name, Iterable[(name, score)]).
val rdd3: RDD[(String, Iterable[(String, Int)])] = rdd1.groupBy(_._1)
val tp: Array[(String, Iterable[(String, Int)])] = rdd3.collect()
tp.foreach(println)
tp.foreach(x=>{
x._2.foreach(println)
})
println("----collectAsMap---")
// Bring the grouped result back to the driver as a local Map.
val map: collection.Map[String, Iterable[(String, Int)]] = rdd3.collectAsMap()
val keys: collection.Set[String] = map.keySet
val iterator: Iterator[String] = keys.iterator
while (iterator.hasNext){
val str: String = iterator.next()
val values: Option[Iterable[(String, Int)]] = map.get(str)
// Option.iterator yields zero or one element, so this walks the Option safely.
val iter: Iterator[Iterable[(String, Int)]] = values.iterator
while (iter.hasNext){
val tuples: Iterable[(String, Int)] = iter.next()
val it: Iterator[(String, Int)] = tuples.iterator
while (it.hasNext){
val tuple: (String, Int) = it.next()
println(tuple)
}
}
}
for (elem <- keys) {
val value: Option[Iterable[(String, Int)]] = map.get(elem)
val v: Iterable[(String, Int)] = value.get
val iterat: Iterator[(String, Int)] = v.iterator//reusing this one iterator: the enhanced for and the while loop together would print only once
// for (e <- iterat) { //an enhanced for drives the iterator internally, and an iterator is exhausted after a single pass, so whichever loop runs first is the only one that prints
// println(e+"for")
// }
println("--------迭代器--------")
val iterator: Iterator[(String, Int)] = v.iterator //a freshly created iterator starts over, so this loop prints the elements again
while (iterator.hasNext){
val tuple: (String, Int) = iterator.next()
println(tuple+"迭代器")
}
}
}
}
- Java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
import java.util.*;
/**
* @author: Zhuuu_ZZ
* @Date 2020/11/9
* @Description: //可以使用JavaPairRDD.fromJavaRDD把JavaRDD类型改为JavaPairRDD,和mapToPair,flatMapToPair不同的是它只能改变类型
*
*/
public class GroupByJava {
public static void main(String[] args) {
// Local Spark context for the groupByKey demo.
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("groupByJava");
JavaSparkContext sc = new JavaSparkContext(conf);
// (name, score) tuples; duplicate names will be grouped together.
List<Tuple2<String,Integer>> list=new ArrayList<>();
Tuple2<String, Integer> tp1 = new Tuple2<>("zhangsan", 88);
Tuple2<String, Integer> tp2 = new Tuple2<>("zhangsan", 98);
Tuple2<String, Integer> tp3 = new Tuple2<>("wangwu", 87);
Tuple2<String, Integer> tp4 = new Tuple2<>("wangwu", 86);
Tuple2<String, Integer> tp5 = new Tuple2<>("lisi", 100);
list.add(tp1);
list.add(tp2);
list.add(tp3);
list.add(tp4);
list.add(tp5);
JavaRDD<Tuple2<String, Integer>> rdd1 = sc.parallelize(list);
// fromJavaRDD only changes the RDD's type to JavaPairRDD — unlike
// mapToPair/flatMapToPair it cannot reshape the elements.
JavaPairRDD<String, Integer> MapRdd = JavaPairRDD.fromJavaRDD(rdd1);
JavaPairRDD<String, Iterable<Integer>> keyRdd = MapRdd.groupByKey();
Map<String, Iterable<Integer>> map = keyRdd.collectAsMap();
Set<String> keys = map.keySet();
System.out.println("---打印数组---");
for (String key : keys) {
Iterable<Integer> values = map.get(key);//an Iterable prints its elements directly, while printing an Iterator shows only its address — it must be walked with hasNext/next
System.out.println(key+ values);
}
System.out.println("----for-----");
// Same data, walked with an explicit Iterator per key.
for (String key : keys) {
Iterable<Integer> integers = map.get(key);
Iterator<Integer> iterator = integers.iterator();
while (iterator.hasNext()){
System.out.println(key+iterator.next());
}
}
System.out.println("-----迭代器-----");
// Same data again, with iterators over both the keys and the values.
Iterator<String> keyIter = keys.iterator();
while (keyIter.hasNext()){
String next = keyIter.next();
Iterable<Integer> integers = map.get(next);
Iterator<Integer> iterator = integers.iterator();
while (iterator.hasNext()){
Integer next1 = iterator.next();
System.out.println(next+next1);
}
}
}
}
sortByKey
- Scala
val sortByKeyRdd: RDD[(Int, String)] = mapValuesRdd2.sortByKey()//默认为true,升序
sortByKeyRdd.collect().foreach(println)
val sortByKeyRdd2: RDD[(Int, String)] = mapValuesRdd2.sortByKey(false)//逆序
sortByKeyRdd2.collect().foreach(println)
- Java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* @author: Zhuuu_ZZ
* @Date 2020/11/6
* @Description://Java中各种...ByKey的前提都要为JavaPairRDD,如果不是请mapToPair,flatMapToPair去转换类型和结构。
*/
public class SortByKeyJava {
public static void main(String[] args) {
// Local Spark context for the sortByKey demos.
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("sortbykey");
JavaSparkContext sc = new JavaSparkContext(conf);
// Example 1: explicit (key, value) list; sortByKey requires a JavaPairRDD.
List<Tuple2<Integer, String>> list = new ArrayList<>();
list.add(new Tuple2<>(5, "hello"));
list.add(new Tuple2<>(3, "world"));
list.add(new Tuple2<>(6, "spark"));
list.add(new Tuple2<>(2, "kb09"));
list.add(new Tuple2<>(8, "china"));
list.add(new Tuple2<>(1, "java"));
JavaRDD<Tuple2<Integer, String>> rdd1 = sc.parallelize(list);
// mapToPair only changes the type here: each tuple is returned unchanged.
JavaPairRDD<Integer, String> rdd2 = rdd1.mapToPair(new PairFunction<Tuple2<Integer, String>, Integer, String>() {
@Override
public Tuple2<Integer, String> call(Tuple2<Integer, String> tuple2) throws Exception {
// return new Tuple2<>(tuple2._1, tuple2._2);
return tuple2;
}
});
// sortByKey() defaults to ascending key order.
for (Tuple2<Integer, String> integerStringTuple2 : rdd2.sortByKey().collect()) {
System.out.println(integerStringTuple2);
}
// Example 2: same pattern, RDD built directly from Arrays.asList.
JavaRDD<Tuple2<Integer, String>> rdd = sc.parallelize(Arrays.asList(new Tuple2(5, "hello"), new Tuple2(3, "world"), new Tuple2(7, "spark")));
JavaPairRDD<Integer, String> integerStringJavaPairRDD = rdd.mapToPair(new PairFunction<Tuple2<Integer, String>, Integer, String>() {
@Override
public Tuple2<Integer, String> call(Tuple2<Integer, String> tuple2) throws Exception {
// return new Tuple2<>(tuple2._1, tuple2._2);
return tuple2;
}
});
for (Tuple2<Integer, String> integerStringTuple2 : integerStringJavaPairRDD.sortByKey().collect()) {
System.out.println(integerStringTuple2);
}
// Example 3: derive the key (string length) inside mapToPair, then sort by it.
JavaRDD<String> parallelize = sc.parallelize(Arrays.asList("hello", "java", "sparkgood", "hi", "nicetomeet"));
JavaPairRDD<Integer, String> integerStringJavaPairRDD1 = parallelize.mapToPair(new PairFunction<String, Integer, String>() {
@Override
public Tuple2<Integer, String> call(String s) throws Exception {
return new Tuple2<>(s.length(), s);
}
});
for (Tuple2<Integer, String> s : integerStringJavaPairRDD1.sortByKey().collect()) {
System.out.println(s);
}
}
}
combineByKey
- Scala
import org.apache.spark.rdd.RDD
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
//Scala中各种...ByKey的前提元素为RDD[(,)]二元组即可,如果不是请map或flatMap转换类型和结构。
object CombineByKeyScala {
// Per-exam record: student name, subject, score.
case class ScoreDetail(name:String,subject:String,score:Int)
def main(args:Array[String]):Unit={
//1. Create the SparkConf and set the app name
val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
//2. Create the SparkContext, the entry point for submitting a Spark app
val sc: SparkContext = new SparkContext(conf)
val scores = List(
ScoreDetail("tianming", "Math", 98),
ScoreDetail("tianming", "English", 88),
ScoreDetail("laowang", "Math", 75),
ScoreDetail("laowang", "English", 78),
ScoreDetail("lisi", "Math", 90),
ScoreDetail("lisi", "English", 80),
ScoreDetail("zhangsan", "Math", 91),
ScoreDetail("zhangsan", "English", 80),
ScoreDetail("zhu", "Math", 96),
ScoreDetail("zhu", "English", 88),
ScoreDetail("tang", "Math", 91),
ScoreDetail("tang", "English", 80)
)
// Key every record by student name, then hash-partition into 3 partitions.
val rdd1: List[(String, ScoreDetail)] = for (elem <- scores) yield {(elem.name,elem)}
val rdd2: RDD[(String, ScoreDetail)] = sc.parallelize(rdd1).partitionBy(new HashPartitioner(3)).cache()
rdd2.foreachPartition(x=>x.foreach(println))
// val rdd1: RDD[ScoreDetail] = sc.parallelize(scores)
// val rdd2: RDD[(String, ScoreDetail)] = rdd1.map(x=>(x.name,x))
rdd2.collect().foreach(println) //walk every (name, detail) tuple on the driver
println("----foreachPartition-----") //iterates the partitions in parallel, which is more efficient
rdd2.foreachPartition(parContent=>{//the outer traversal is over partitions, i.e. each element is an iterator
parContent.foreach(x=>println(x._1,x._2)) //the inner traversal walks the elements of that iterator
})
// combineByKey builds a (totalScore, examCount) accumulator per student:
//   1st arg (createCombiner): first record of a key -> (score, 1)
//   2nd arg (mergeValue): fold another record of the same key into the accumulator
//   3rd arg (mergeCombiners): merge per-partition accumulators of the same key
val rdd3: RDD[(String, (Int, Int))] = rdd2.combineByKey(
(x: ScoreDetail) => (x.score, 1),
(acc: (Int, Int), x: ScoreDetail) => (acc._1 + x.score, acc._2 + 1),
(acc1: (Int, Int), acc2: (Int, Int)) => (acc1._1 + acc2._1, acc1._2 + acc2._2)
)
// Integer average score per student (sum / count, truncating division).
val stuAvg1: RDD[(String, Int)] = rdd3.map(x=>(x._1,x._2._1/x._2._2))
stuAvg1.collect().foreach(println)
// rdd3.map({case (key,value)=>(key,value._1/value._2)})
//4. Shut the context down
sc.stop()
}
}
- Java
public class ScoreDetailsJava implements Serializable { //只有实现了Serializable接口的类的对象才能被实列化
public String name;
public String subject;
public Integer score;
public ScoreDetailsJava(String name, String subject, Integer score) {
this.name = name;
this.subject = subject;
this.score = score;
}
import scala.Tuple2;
import java.util.ArrayList;
import java.util.List;
/**
* @author: Zhuuu_ZZ
* @Date 2020/11/6
* @Description: Scala map-->Java mapToPair
* Scala flatMap-->Java flatMapToPair
*/
public class CombineByKeyJava {
public static void main(String[] args) {
// Local Spark context for the combineByKey demo.
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("combine");
JavaSparkContext sc = new JavaSparkContext(conf);
// Two exam records (Math, English) per student.
List<ScoreDetailsJava> list=new ArrayList<>();
list.add(new ScoreDetailsJava("tianming", "Math", 98) );
list.add(new ScoreDetailsJava("tianming", "English", 88));
list.add(new ScoreDetailsJava("laowang", "Math", 75));
list.add(new ScoreDetailsJava("laowang", "English", 78));
list.add(new ScoreDetailsJava("lisi", "Math", 90));
list.add(new ScoreDetailsJava("lisi", "English", 80));
list.add(new ScoreDetailsJava("zhangsan", "Math", 91));
list.add(new ScoreDetailsJava("zhangsan", "English", 80));
list.add(new ScoreDetailsJava("zhu", "Math", 96));
list.add(new ScoreDetailsJava("zhu", "English", 88));
list.add(new ScoreDetailsJava("tang", "Math", 91));
list.add(new ScoreDetailsJava("tang", "English", 80));
JavaRDD<ScoreDetailsJava> rdd1 = sc.parallelize(list);
// Key each record by student name; combineByKey needs a JavaPairRDD.
PairFunction<ScoreDetailsJava, String, ScoreDetailsJava> pairFunction = new PairFunction<ScoreDetailsJava, String, ScoreDetailsJava>() {
@Override
public Tuple2<String, ScoreDetailsJava> call(ScoreDetailsJava scoreDetailsJava) throws Exception {
return new Tuple2<>(scoreDetailsJava.name, scoreDetailsJava);
}
};
JavaPairRDD<String, ScoreDetailsJava> rdd2 = rdd1.mapToPair(pairFunction);
//createCombiner: first record of a key -> (score, 1) accumulator
Function<ScoreDetailsJava, Tuple2<Integer, Integer>> fun1 = new Function<ScoreDetailsJava, Tuple2<Integer, Integer>>() {
@Override
public Tuple2<Integer, Integer> call(ScoreDetailsJava v1) throws Exception {
return new Tuple2<>(v1.score, 1);
}
};
//mergeValue: fold another record of the same key into (sum, count)
Function2<Tuple2<Integer, Integer>, ScoreDetailsJava, Tuple2<Integer, Integer>> fun2 = new Function2<Tuple2<Integer, Integer>, ScoreDetailsJava, Tuple2<Integer, Integer>>() {
@Override
public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> v1, ScoreDetailsJava v2) throws Exception {
return new Tuple2<>(v1._1 + v2.score, v1._2 + 1);
}
};
//mergeCombiners: merge per-partition (sum, count) accumulators of the same key
Function2<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>, Tuple2<Integer, Integer>> fun3 = new Function2<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>, Tuple2<Integer, Integer>>() {
@Override
public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> v1, Tuple2<Integer, Integer> v2) throws Exception {
return new Tuple2<>(v1._1 + v2._1, v1._2 + v2._2);
}
};
JavaPairRDD<String, Tuple2<Integer, Integer>> rdd3 = rdd2.combineByKey(fun1, fun2, fun3);
// for (Tuple2<String, Tuple2<Integer, Integer>> stringTuple2Tuple2 : rdd3.collect()) {
// System.out.println(stringTuple2Tuple2);
// }
//1. Reshape each (name, (sum, count)) into (name, average) with a PairFunction
PairFunction<Tuple2<String, Tuple2<Integer, Integer>>, String, Integer> avgFunction = new PairFunction<Tuple2<String, Tuple2<Integer, Integer>>, String, Integer>() {
@Override
public Tuple2<String, Integer> call(Tuple2<String, Tuple2<Integer, Integer>> tp2) throws Exception {
return new Tuple2<>(tp2._1, tp2._2._1 / tp2._2._2);
}
};
JavaPairRDD<String, Integer> stringIntegerJavaPairRDD = rdd3.mapToPair(avgFunction);
List<Tuple2<String, Integer>> collect = stringIntegerJavaPairRDD.collect();
for (Tuple2<String, Integer> stringIntegerTuple2 : collect) {
System.out.println(stringIntegerTuple2);
}
//2. Alternatively, collect and compute the average while iterating on the driver
List<Tuple2<String, Tuple2<Integer, Integer>>> collect1 = rdd3.collect();
for (Tuple2<String, Tuple2<Integer, Integer>> tp2 : collect1) {
System.out.println(tp2._1+" "+tp2._2._1/tp2._2._2);
}
}
}
foldByKey
- Scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
object foldByKeyScala {
  def main(args: Array[String]): Unit = {
    //1. Create the SparkConf and set the app name
    val conf: SparkConf = new SparkConf().setAppName("SparkCoreTest").setMaster("local[*]")
    //2. Create the SparkContext, the entry point for submitting a Spark app
    val sc: SparkContext = new SparkContext(conf)

    // Five partitions. The zero value is applied once per partition, so when
    // equal keys land in the same partition the initial value is added only
    // once; otherwise it is added multiple times (fold seeds each partition
    // first, then merges values of equal keys).
    val pairs: RDD[(String, Int)] =
      sc.parallelize(List(("A", 2), ("A", 3), ("B", 5), ("C", 6), ("C", 7)), 5)
    pairs
      .foldByKey(10) { (x, y) =>
        println("one:" + x + "two:" + y)
        x + y
      }
      .collect()
      .foreach(println)

    //4. Shut the context down
    sc.stop()
  }
}
cogroup
- Scala
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object CoGroupScala {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("cogroup")
    val sc = new SparkContext(conf)

    // Three score lists keyed by student name.
    val scores1: RDD[(String, Int)] = sc.parallelize(
      List(("zhangsan", 11), ("zhangsan", 111), ("lisi", 11), ("wangwu", 11)))
    val scores2: RDD[(String, Int)] = sc.parallelize(
      List(("zhangsan", 21), ("zhaoliu", 21), ("wangwu", 21)))
    val scores3: RDD[(String, Int)] = sc.parallelize(
      List(("zhangsan", 31), ("lisi", 31), ("liqi", 31)))

    // cogroup gathers the values from each RDD per key: (key, (vals1, vals2)).
    val grouped2: RDD[(String, (Iterable[Int], Iterable[Int]))] =
      scores1.cogroup(scores2)
    grouped2.collect().foreach(println)

    // Three-way variant: (key, (vals1, vals2, vals3)).
    val grouped3: RDD[(String, (Iterable[Int], Iterable[Int], Iterable[Int]))] =
      scores1.cogroup(scores2, scores3)
    grouped3.collect().foreach(println)
  }
}
- Java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
import scala.Tuple3;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
/**
* @author: Zhuuu_ZZ
* @Date 2020/11/9
* @Description:
*/
public class CoGroupJava {
public static void main(String[] args) {
// Local Spark context for the cogroup demo.
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("cogroupjava");
JavaSparkContext sc = new JavaSparkContext(conf);
// Three score lists keyed by student name.
JavaRDD<Tuple2<String,Integer>> rdd1 = sc.parallelize(Arrays.asList(
new Tuple2<String,Integer>("zhangsan", 11),
new Tuple2<String,Integer>("zhangsan", 111),
new Tuple2<String,Integer>("lisi", 11),
new Tuple2<String,Integer>("wangwu", 11)
));
JavaRDD<Tuple2<String,Integer>> rdd2 = sc.parallelize(Arrays.asList(
new Tuple2<String,Integer>("zhangsan", 21),
new Tuple2<String,Integer>("zhangsan", 211),
new Tuple2<String,Integer>("zhaosi", 21),
new Tuple2<String,Integer>("wangwu", 21)
));
JavaRDD<Tuple2<String,Integer>> rdd3 = sc.parallelize(Arrays.asList(
new Tuple2<String,Integer>("zhangsan", 31),
new Tuple2<String,Integer>("zhangsan", 311),
new Tuple2<String,Integer>("lisi", 31),
new Tuple2<String,Integer>("xiaowu", 31)
));
// cogroup requires JavaPairRDDs; fromJavaRDD changes only the type.
JavaPairRDD<String, Integer> mapRdd1 = JavaPairRDD.fromJavaRDD(rdd1);
JavaPairRDD<String, Integer> mapRdd2 = JavaPairRDD.fromJavaRDD(rdd2);
JavaPairRDD<String, Integer> mapRdd3 = JavaPairRDD.fromJavaRDD(rdd3);
// Two-way cogroup: (key, (values-from-rdd1, values-from-rdd2)).
JavaPairRDD<String, Tuple2<Iterable<Integer>, Iterable<Integer>>> cogroup =mapRdd1.cogroup(mapRdd2);
Map<String, Tuple2<Iterable<Integer>, Iterable<Integer>>> stringTuple2Map = cogroup.collectAsMap();
Set<String> strings = stringTuple2Map.keySet();
for (String string : strings) {
Tuple2<Iterable<Integer>, Iterable<Integer>> values = stringTuple2Map.get(string);
System.out.println(string+values);
}
System.out.println("-----三个rdd---");
// Three-way cogroup: (key, (vals1, vals2, vals3)), walked with an iterator.
JavaPairRDD<String, Tuple3<Iterable<Integer>, Iterable<Integer>, Iterable<Integer>>> cogroup1 =mapRdd1.cogroup(mapRdd2, mapRdd3);
Map<String, Tuple3<Iterable<Integer>, Iterable<Integer>, Iterable<Integer>>> stringTuple3Map = cogroup1.collectAsMap();
Set<String> keys = stringTuple3Map.keySet();
Iterator<String> iterator = keys.iterator();
while (iterator.hasNext()){
String key = iterator.next();
Tuple3<Iterable<Integer>, Iterable<Integer>, Iterable<Integer>> value = stringTuple3Map.get(key);
System.out.println(key+value);
}
}
}
四 键值对关联操作
subtractByKey
join
leftOuterJoin
rightOuterJoin
fullOuterJoin
- Scala
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object subtractScala {
  /** Demo of the key-based join family: subtractByKey / join / left / right / full outer join. */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("sub")
    val sc = new SparkContext(conf)
    // Plain tuple literals instead of `new Tuple2` — same pairs, same types.
    val left: RDD[(String, Int)] = sc.parallelize(List(("A", 1), ("B", 2), ("C", 3)))
    val right: RDD[(String, Int)] = sc.parallelize(List(("B", 8)))
    println("------subtractByKey------")
    left.subtractByKey(right).collect().foreach(println)
    println("----------join------------")
    val joined: RDD[(String, (Int, Int))] = left.join(right)
    joined.collect().foreach(println)
    println("----------leftOuterJoin---------")
    left.leftOuterJoin(right).collect().foreach(println)
    println("---------rightOuterJoin------")
    left.rightOuterJoin(right).collect().foreach(println)
    println("---fullOuterJoin----")
    left.fullOuterJoin(right).collect().foreach(println)
  }
}
/*
------subtractByKey------
(A,1)
(C,3)
----------join------------
(B,(2,8))
----------leftOuterJoin---------
(A,(1,None))
(B,(2,Some(8)))
(C,(3,None))
---------rightOuterJoin------
(B,(Some(2),8))
---fullOuterJoin----
(A,(Some(1),None))
(B,(Some(2),Some(8)))
(C,(Some(3),None))
- Java
import org.apache.avro.generic.GenericData;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.Optional;
import scala.Tuple2;
import java.util.*;
/**
* @author: Zhuuu_ZZ
* @Date 2020/11/9
* @Description:
*/
/**
 * Demo of the key-based join family on JavaPairRDD:
 * subtractByKey / join / leftOuterJoin / rightOuterJoin / fullOuterJoin.
 * Each result is pulled to the driver with collectAsMap() and printed.
 *
 * @author: Zhuuu_ZZ
 */
public class subtractJava {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("java");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<Tuple2<String, Integer>> rdd1 = sc.parallelize(Arrays.asList(new Tuple2<String, Integer>("C", 1), new Tuple2<String, Integer>("B", 2), new Tuple2<String, Integer>("A", 3)));
        JavaRDD<Tuple2<String, Integer>> rdd2 = sc.parallelize(Arrays.asList(new Tuple2<String, Integer>("B", 8), new Tuple2<String, Integer>("D", 10)));
        JavaPairRDD<String, Integer> map1 = JavaPairRDD.fromJavaRDD(rdd1);
        JavaPairRDD<String, Integer> map2 = JavaPairRDD.fromJavaRDD(rdd2);
        // Each collectAsMap() is its own Spark action; the results land on the driver.
        Map<String, Integer> subtract = map1.subtractByKey(map2).collectAsMap(); // pairs of map1 whose key is absent from map2
        Map<String, Tuple2<Integer, Integer>> join = map1.join(map2).collectAsMap(); // inner join: keys present in both
        Map<String, Tuple2<Integer, Optional<Integer>>> leftjoin = map1.leftOuterJoin(map2).collectAsMap();
        Map<String, Tuple2<Optional<Integer>, Integer>> rightjoin = map1.rightOuterJoin(map2).collectAsMap();
        Map<String, Tuple2<Optional<Integer>, Optional<Integer>>> fulljoin = map1.fullOuterJoin(map2).collectAsMap();
        System.out.println("-----subtractByKey----");
        for (String s : subtract.keySet()) {
            System.out.println("("+s+","+subtract.get(s)+")");
        }
        System.out.println("------join-----------");
        for (String s : join.keySet()) {
            System.out.println(s+join.get(s));
        }
        System.out.println("---------leftOuterJoin-----");
        for (String s : leftjoin.keySet()) {
            // fix: removed an unused local that duplicated the leftjoin.get(s) lookup
            System.out.println(s+leftjoin.get(s));
        }
        System.out.println("------rightOuterJoin");
        Iterator<String> iterator = rightjoin.keySet().iterator();
        while (iterator.hasNext()){
            String next = iterator.next();
            System.out.println(next+rightjoin.get(next));
        }
        System.out.println("--fullOuterJoin-----");
        Iterator<String> iterator1 = fulljoin.keySet().iterator();
        while (iterator1.hasNext()){
            String next = iterator1.next();
            Tuple2<Optional<Integer>, Optional<Integer>> values = fulljoin.get(next);
            System.out.println(next+"=Tuple2("+values._1+","+values._2+")");
        }
    }
}
五 创建键值对RDD
map
- Scala
package test
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object map {
  /**
   * map demo: split each text line into a word array, and separately build
   * (firstWord, 1) pairs. Fixes: explicit `: Unit` return type on main, and
   * collect() is now called once (each collect() is a separate Spark job that
   * recomputes the RDD).
   */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("maprdd")
    val sc: SparkContext = SparkContext.getOrCreate(conf)
    val rdd1: RDD[String] = sc.textFile("file:///D:\\IDEA\\Data\\Spark\\mysparkdemo\\in/sample.txt")
    val rdd2: RDD[Array[String]] = rdd1.map(x => x.split("\\s+"))
    // Collect once and reuse the driver-side array for all three print styles.
    val rows: Array[Array[String]] = rdd2.collect()
    rows.foreach(println) // prints array identity — Array has no element-wise toString
    rows.foreach(x => println(x.mkString)) // mkString joins the words of each line
    for (elem <- rows) {
      println(elem.mkString)
    }
    val rdd3: RDD[(String, Int)] = rdd1.map(x => (x.split(" ")(0), 1))
    val tuples: Array[(String, Int)] = rdd3.collect()
    for (elem <- tuples) {
      println(elem)
    }
  }
}
- Java
// map: one output element per input line — each line becomes a List of its words,
// so the resulting RDD's element type is Iterable (raw type in the original).
JavaRDD<Iterable> map = rdd1.map(new Function<String, Iterable>() {
@Override
public Iterable call(String v1) throws Exception {
String[] s = v1.split(" ");
return Arrays.asList(s);
}
});
List<Iterable> collect1 = map.collect();
for (Iterable iterable : collect1) { // walk the collected list; each element is itself an Iterable of words
Iterator iterator1 = iterable.iterator(); // ask each inner Iterable for its own iterator
while (iterator1.hasNext()){ // print the individual words inside this line
System.out.println(iterator1.next());
}
}
// Contrast with the loop above: here only the OUTER list is iterated,
// so each print shows a whole word-list object rather than single words.
Iterator<Iterable> iterator = collect1.iterator(); // iterator over the collected list of word-lists
while (iterator.hasNext()){ // is there another word-list?
System.out.println(iterator.next()); // print the word-list itself (NOTE: snippet is truncated here in the source)
mapToPair
- Java
// mapToPair: turn each line into a (firstWord, 1) tuple, producing a JavaPairRDD.
JavaPairRDD<String, Integer> mapPair = rdd1.mapToPair(new PairFunction<String, String, Integer>() {
@Override
public Tuple2<String, Integer> call(String s) throws Exception {
String[] s1 = s.split(" ");
return new Tuple2<>(s1[0], 1); // key = first word of the line, value = 1
}
});
List<Tuple2<String, Integer>> collect = mapPair.collect();
for (Tuple2<String, Integer> stringIntegerTuple2 : collect) {
System.out.println(stringIntegerTuple2);
}
flatMap
- Scala
package test
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object flatmap {
  /** flatMap demo: split lines into individual words, then build (word, 1) pairs. */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("flatmap").setMaster("local[*]")
    val sc: SparkContext = SparkContext.getOrCreate(conf)
    val lines: RDD[String] = sc.textFile("in/sample.txt")
    // "\\s" splits on a single whitespace character (a plain " " would also work here)
    val words: RDD[String] = lines.flatMap(_.split("\\s"))
    println(words.first())
    words.collect().foreach(println)
    // word-count style pairing: every word becomes (word, 1)
    lines.flatMap(_.split(" ")).map((_, 1)).collect().foreach(println)
  }
}
- Java
// flatMap: each input line yields MANY output elements (its words), flattened
// into a single JavaRDD<String>; the function must return an Iterator.
JavaRDD<String> stringJavaRDD = rdd1.flatMap(new FlatMapFunction<String, String>() {
@Override
public Iterator<String> call(String s) throws Exception {
String[] s1 = s.split(" ");
return Arrays.asList(s1).iterator();
}
});
List<String> collect2 = stringJavaRDD.collect();
for (String s : collect2) {
System.out.println(s);
}
flatMapToPair
- Java
JavaPairRDD<String, Integer> flat = rdd1.flatMapToPair(new PairFlatMapFunction<String, String, Integer>() { // input: a String line; output: zero or more (String, Integer) tuples
@Override
public Iterator<Tuple2<String, Integer>> call(String s) throws Exception {
ArrayList<Tuple2<String,Integer>> list = new ArrayList<>();
String[] split = s.split("\\s");
for (int i = 0; i <split.length ; i++) {
Tuple2<String, Integer> tup2 = new Tuple2<>(split[i], 1); // one (word, 1) pair per word
list.add(tup2);
}
return list.iterator();
}
});
List<Tuple2<String, Integer>> collect3 = flat.collect();
for (Tuple2<String, Integer> stringIntegerTuple2 : collect3) {
System.out.println(stringIntegerTuple2);
}
六 分区操作
foreach&foreachPartition
- Scala
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object foreachAndforeachpartition {
  /** Compares foreach (invoked per element) with foreachPartition (invoked per partition iterator). */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("foreach")
    val sc = new SparkContext(conf)
    val nums: RDD[Int] = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7, 8), 3)
    // nums.foreach(println) // would print every element individually
    // One call per partition: materialize the iterator to show the whole partition at once.
    nums.foreachPartition(part => println(part.toList))
    // Still one call per partition, but print each element inside it.
    nums.foreachPartition(part => part.foreach(println))
  }
}
- Java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
/**
* @author: Zhuuu_ZZ
* @Date 2020/11/10
* @Description:
*/
/**
 * Compares foreach (called once per element) with foreachPartition
 * (called once per partition, receiving that partition's iterator).
 *
 * @author: Zhuuu_ZZ
 */
public class foreachAndforeahcpartition {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("foreach");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<Integer> rdd1 = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5), 3);
        // fix: removed an unused rdd1.collect() — it ran a full Spark job and
        // shipped every element to the driver without ever using the result.
        System.out.println("----foreach---");
        rdd1.foreach(new VoidFunction<Integer>() {
            @Override
            public void call(Integer integer) throws Exception {
                System.out.println(integer);
            }
        });
        System.out.println("----foreachPartition---");
        rdd1.foreachPartition(new VoidFunction<Iterator<Integer>>() {
            @Override
            public void call(Iterator<Integer> iterator) throws Exception {
                System.out.println(iterator); // prints the iterator object itself, one line per partition
                while (iterator.hasNext()){
                    Integer next = iterator.next();
                    System.out.println(next);
                }
            }
        });
    }
}
mapPartitions
- scala
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object mapPartitions {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setMaster("local[3]").setAppName("mapPartitons")
val sc = new SparkContext(conf)
val rdd1: RDD[Int] = sc.parallelize(List(1,2,3,4,5))
// mapPartitions runs the function once per partition; x is that partition's iterator.
// Each element i is mapped to the pair (i, i*i).
val rdd2: RDD[(Int, Int)] = rdd1.mapPartitions(x => {
var tuples: List[(Int, Int)] = List[(Int, Int)]()
while (x.hasNext) {
val i: Int = x.next()
tuples = tuples.:+(i, i * i) // :+ appends, so the original element order is kept
}
tuples.iterator
})
rdd2.collect().foreach(println)
// NOTE(review): the closing braces of main and the object are missing in this
// snippet as pasted in the source document.
/*
(1,1)
(2,4)
(3,9)
(4,16)
(5,25)
- Java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
/**
* @author: Zhuuu_ZZ
* @Date 2020/11/10
* @Description:
*/
/**
 * mapPartitions demo: the function runs once per partition and doubles
 * every value of the (name, age) pairs found in that partition.
 *
 * @author: Zhuuu_ZZ
 */
public class mapPartitions {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("mapPartitions");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<Tuple2<String, Integer>> rdd1 = sc.parallelize(Arrays.asList(
                new Tuple2<String, Integer>("zhu", 24),
                new Tuple2<String, Integer>("tang", 23),
                new Tuple2<String, Integer>("zz", 25)
        ), 2);
        JavaPairRDD<String, Integer> rdd2 = JavaPairRDD.fromJavaRDD(rdd1);
        JavaRDD<Tuple2<String, Integer>> doubledRdd = rdd2.mapPartitions(new FlatMapFunction<Iterator<Tuple2<String, Integer>>, Tuple2<String, Integer>>() {
            @Override
            public Iterator<Tuple2<String, Integer>> call(Iterator<Tuple2<String, Integer>> partition) throws Exception {
                // Buffer the transformed pairs of this partition, then hand back an iterator.
                ArrayList<Tuple2<String, Integer>> out = new ArrayList<>();
                while (partition.hasNext()) {
                    Tuple2<String, Integer> pair = partition.next();
                    out.add(new Tuple2<>(pair._1, pair._2 * 2));
                }
                return out.iterator();
            }
        });
        doubledRdd.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            @Override
            public void call(Tuple2<String, Integer> pair) throws Exception {
                System.out.println(pair);
            }
        });
    }
}
mapPartitionsWithIndex
- Scala
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object mapPartitionsWithIndex {
  /** Tags every element with the index of the partition it lives in. */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[3]").setAppName("mapPartitonsWithIndex")
    val sc = new SparkContext(conf)
    val pairs: RDD[(Int, Int)] = sc.parallelize(List(
      (1, 2), (1, 3), (1, 4), (1, 5), (2, 4), (3, 5), (3, 6)
    ))
    // Mapping the partition iterator lazily preserves element order,
    // exactly like the original append-to-list loop did.
    val indexed: RDD[(Int, (Int, Int))] = pairs.mapPartitionsWithIndex((idx: Int, it: Iterator[(Int, Int)]) =>
      it.map(elem => (idx, elem))
    )
    indexed.collect().foreach(println)
  }
}
/*
(0,(1,2))
(0,(1,3))
(1,(1,4))
(1,(1,5))
(2,(2,4))
(2,(3,5))
(2,(3,6))
- Java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
/**
* @author: Zhuuu_ZZ
* @Date 2020/11/10
* @Description:
*/
/**
 * mapPartitionsWithIndex demo: tags every (name, age) pair with the index of
 * the partition it belongs to, then prints the result twice — once by
 * collect()-ing on the driver and once with a distributed foreach.
 *
 * @author: Zhuuu_ZZ
 */
public class mapPartitionWithIndex {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("mapPartitions");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<Tuple2<String, Integer>> rdd1 = sc.parallelize(Arrays.asList(
                new Tuple2<String, Integer>("zhu", 24),
                new Tuple2<String, Integer>("tang", 23),
                new Tuple2<String, Integer>("zz", 25)
        ), 2);
        JavaPairRDD<String, Integer> rdd2 = JavaPairRDD.fromJavaRDD(rdd1);
        JavaRDD<Tuple2<Integer, Tuple2<String, Integer>>> indexedRdd = rdd2.mapPartitionsWithIndex(new Function2<Integer, Iterator<Tuple2<String, Integer>>, Iterator<Tuple2<Integer, Tuple2<String, Integer>>>>() {
            @Override
            public Iterator<Tuple2<Integer, Tuple2<String, Integer>>> call(Integer partIndex, Iterator<Tuple2<String, Integer>> partition) throws Exception {
                ArrayList<Tuple2<Integer, Tuple2<String, Integer>>> out = new ArrayList<>();
                while (partition.hasNext()) {
                    out.add(new Tuple2<>(partIndex, partition.next()));
                }
                return out.iterator();
            }
        }, false); // preservesPartitioning = false
        for (Tuple2<Integer, Tuple2<String, Integer>> tagged : indexedRdd.collect()) {
            System.out.println(tagged);
        }
        indexedRdd.foreach(new VoidFunction<Tuple2<Integer, Tuple2<String, Integer>>>() {
            @Override
            public void call(Tuple2<Integer, Tuple2<String, Integer>> tagged) throws Exception {
                System.out.println(tagged);
            }
        });
    }
}
glom
- Scala
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object glom {
  /** glom demo: collapse each partition into a single Array of its elements. */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[3]").setAppName("glom")
    val sc = new SparkContext(conf)
    val nums: RDD[Int] = sc.parallelize(List(1, 2, 3, 4, 5))
    // One Array per partition — handy for inspecting how data is laid out.
    val perPartition: RDD[Array[Int]] = nums.glom()
    for (partition <- perPartition.collect()) {
      // partition.foreach(println) // would print each element on its own line
      println(partition.mkString(","))
    }
  }
}
- Java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;
import java.util.Arrays;
import java.util.List;
/**
* @author: Zhuuu_ZZ
* @Date 2020/11/10
* @Description:
*/
/**
 * glom demo: each partition of the RDD becomes one List, printed as a whole.
 *
 * @author: Zhuuu_ZZ
 */
public class glom {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("glom");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<Integer> rdd1 = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5), 4);
        JavaRDD<List<Integer>> perPartition = rdd1.glom();
        perPartition.foreach(new VoidFunction<List<Integer>>() {
            @Override
            public void call(List<Integer> partition) throws Exception {
                System.out.println(partition); // List.toString shows the whole partition, e.g. [4, 5]
            }
        });
    }
}
/*
[3]
[1]
[2]
[4, 5]
HashPartitioner&RangePartitioner
- Scala
import org.apache.spark.{HashPartitioner, RangePartitioner, SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
object hashPartition {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setMaster("local[1]").setAppName("partition")
val sc = new SparkContext(conf)
// Pairs each element with its partition index. Note that `res.::` PREPENDS,
// so the elements of one partition come out in reverse order — this matches
// the sample output shown after this snippet.
def mapPartIndexFunc(i1:Int,iter:Iterator[(Int,Int)]):Iterator[(Int,(Int,Int))]={
var res: List[(Int, (Int, Int))] = List[(Int,(Int,Int))]()
while (iter.hasNext){
val next: (Int, Int) = iter.next()
res=res.::(i1,next)
}
res.iterator
}
println("没有添加分区数时,即分区数为1")
// Helper: tag every pair with its partition index and print on the driver.
def printRDDPart(rdd:RDD[(Int,Int)]):Unit={
val rdd1: RDD[(Int, (Int, Int))] = rdd.mapPartitionsWithIndex(mapPartIndexFunc)
rdd1.collect().foreach(println)
}
val rdd1: RDD[(Int, Int)] = sc.parallelize(List(
(1, 2), (3, 3), (5, 4), (1, 5), (2, 4), (4, 5), (2, 6), (6, 8)
))
printRDDPart(rdd1)
// A partition count can be given to parallelize directly, or applied afterwards via partitionBy.
println("-----------hashPartitioner-----------")
val rdd2: RDD[(Int, Int)] = rdd1.partitionBy(new HashPartitioner(3))// original note: partition = key % 3 — the remainder picks the partition
printRDDPart(rdd2)
println("---------RangePartitioner----------")
// Assigns keys to partitions by sorted key ranges; suited to keys with a natural
// ordering (original note also says keys should be non-negative).
val rdd3: RDD[(Int, Int)] = rdd1.partitionBy(new RangePartitioner(3,rdd1))
printRDDPart(rdd3)
}
}
/*
没有添加分区数时,即分区数为1
(0,(6,8))
(0,(2,6))
(0,(4,5))
(0,(2,4))
(0,(1,5))
(0,(5,4))
(0,(3,3))
(0,(1,2))
-----------hashPartitioner-----------
(0,(6,8))
(0,(3,3))
(1,(4,5))
(1,(1,5))
(1,(1,2))
(2,(2,6))
(2,(2,4))
(2,(5,4))
---------RangePartitioner----------
(0,(2,6))
(0,(2,4))
(0,(1,5))
(0,(1,2))
(1,(4,5))
(1,(3,3))
(2,(6,8))
(2,(5,4))
- Java
import org.apache.spark.HashPartitioner;
import org.apache.spark.Partitioner;
import org.apache.spark.RangePartitioner;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
/**
* @author: Zhuuu_ZZ
* @Date 2020/11/10
* @Description:
*/
public class partitionJava {
// Tags every (key, value) pair with the index of the partition it lives in,
// then prints the result twice: via collect() on the driver, and via foreach.
static void printPartRDD(JavaPairRDD<Integer,Integer> pairRDD){
JavaRDD<Tuple2<Integer, Tuple2<Integer, Integer>>> mapWithIndex = (JavaRDD<Tuple2<Integer, Tuple2<Integer, Integer>>>) pairRDD.mapPartitionsWithIndex(new Function2<Integer, Iterator<Tuple2<Integer, Integer>>, Iterator<Tuple2<Integer, Tuple2<Integer, Integer>>>>() {
@Override
public Iterator<Tuple2<Integer, Tuple2<Integer, Integer>>> call(Integer v1, Iterator<Tuple2<Integer, Integer>> v2) throws Exception {
// v1 = partition index, v2 = iterator over this partition's pairs
ArrayList<Tuple2<Integer, Tuple2<Integer, Integer>>> list = new ArrayList<>();
while (v2.hasNext()) {
Tuple2<Integer, Integer> next = v2.next();
Tuple2<Integer, Tuple2<Integer, Integer>> tp = new Tuple2<>(v1, next);
list.add(tp);
}
return list.iterator();
}
}, false);
// print via collect() + enhanced for (iteration happens on the driver)
List<Tuple2<Integer, Tuple2<Integer, Integer>>> collect = mapWithIndex.collect();
for (Tuple2<Integer, Tuple2<Integer, Integer>> integerTuple2Tuple2 : collect) {
System.out.println(integerTuple2Tuple2);
}
// print via foreach (runs as a distributed action)
mapWithIndex.foreach(new VoidFunction<Tuple2<Integer, Tuple2<Integer, Integer>>>() {
@Override
public void call(Tuple2<Integer, Tuple2<Integer, Integer>> integerTuple2Tuple2) throws Exception {
System.out.println(integerTuple2Tuple2);
}
});
}
public static void main(String[] args) {
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("par");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaRDD<Tuple2<Integer, Integer>> rdd1 = sc.parallelize(Arrays.asList(
new Tuple2<Integer, Integer>(1, 2),
new Tuple2<Integer, Integer>(2, 3),
new Tuple2<Integer, Integer>(3, 4),
new Tuple2<Integer, Integer>(4, 5),
new Tuple2<Integer, Integer>(5, 6)
));
System.out.println("---------mapPartitionsWithIndex---------");
JavaPairRDD<Integer, Integer> rdd2 = JavaPairRDD.fromJavaRDD(rdd1);
printPartRDD(rdd2);
System.out.println("---HashPartitioner---");
// Repartition by key hash into 3 partitions, then show the new layout.
JavaPairRDD<Integer, Integer> hashRDD = rdd2.partitionBy(new HashPartitioner(3));
printPartRDD(hashRDD);
}
}
/*
---------mapPartitionsWithIndex---------
(4,(2,3))
(2,(1,2))
(11,(5,6))
(9,(4,5))
(7,(3,4))
---HashPartitioner---
(0,(3,4))
(1,(1,2))
(2,(2,3))
(1,(4,5))
(2,(5,6))
自定义分区规则
- Scala
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, Partitioner, SparkConf, SparkContext}
object myPartition {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("mypartition")
val sc = new SparkContext(conf)
// Custom partitioning rule: keys >= 4 go to partition 0, key 3 to partition 1,
// everything else to partition 2.
class custompartitioner(num:Int) extends Partitioner{
override def numPartitions: Int = num
override def getPartition(key: Any): Int = {
if(key.toString.toInt>=4){
0
}
else if(key.toString.toInt==3){
1
}else{
2
}
}
}
// Output helper: pair each element with the index of the partition it is in.
// `res.::` prepends, so each partition's elements appear in reverse order.
def mapPartIndexFunc(i1:Int,iter:Iterator[(Int,Int)]):Iterator[(Int,(Int,Int))]={
var res: List[(Int, (Int, Int))] = List[(Int,(Int,Int))]()
while (iter.hasNext){
val next: (Int, Int) = iter.next()
res=res.::(i1,next)
}
res.iterator
}
// Tag with partition index via mapPartitionsWithIndex and print on the driver.
def printRDDPart(rdd:RDD[(Int,Int)]):Unit={
val rdd1: RDD[(Int, (Int, Int))] = rdd.mapPartitionsWithIndex(mapPartIndexFunc)
rdd1.collect().foreach(println)
}
// Apply the custom partitioner with 3 partitions and show the resulting layout.
val rdd1: RDD[(Int, Int)] = sc.parallelize(List((1,1), (5,10), (5,9), (2,4), (3,5), (3,6),(4,7), (4,8),(2,3), (1,2)))
val rdd2: RDD[(Int, Int)] = rdd1.partitionBy(new custompartitioner(3))
printRDDPart(rdd2)
}
}
/*
(0,(4,8))
(0,(4,7))
(0,(5,9))
(0,(5,10))
(1,(3,6))
(1,(3,5))
(2,(1,2))
(2,(2,3))
(2,(2,4))
(2,(1,1))
- Java
import org.apache.spark.Partitioner;
/**
* @author: Zhuuu_ZZ
* @Date 2020/11/10
* @Description:
*/
/**
 * Custom partitioner: keys >= 4 map to partition 0, key 3 to partition 1,
 * all remaining keys to partition 2.
 */
public class MyPartition extends Partitioner {
// number of partitions; NOTE(review): the no-arg constructor leaves this at 1,
// while getPartition can return indices up to 2 — confirm callers always pass >= 3.
int i=1;
public MyPartition(int i){
this.i=i;
}
public MyPartition(){}
@Override
public int numPartitions() {
return i;
}
@Override
public int getPartition(Object key) {
// Keys are parsed from their string form; non-numeric keys would throw here.
int keyCode=Integer.parseInt(key.toString());
if(keyCode>=4){
return 0;
}
else if (keyCode==3){
return 1;
}
else {
return 2;
}
}
}
public class MyPart {
// Tags every (key, value) pair with its partition index and prints the layout.
static void printPartRDD(JavaPairRDD<Integer,Integer> pairRDD){
JavaRDD<Tuple2<Integer, Tuple2<Integer, Integer>>> mapWithIndex = (JavaRDD<Tuple2<Integer, Tuple2<Integer, Integer>>>) pairRDD.mapPartitionsWithIndex(new Function2<Integer, Iterator<Tuple2<Integer, Integer>>, Iterator<Tuple2<Integer, Tuple2<Integer, Integer>>>>() {
@Override
public Iterator<Tuple2<Integer, Tuple2<Integer, Integer>>> call(Integer v1, Iterator<Tuple2<Integer, Integer>> v2) throws Exception {
// v1 = partition index, v2 = iterator over this partition's pairs
ArrayList<Tuple2<Integer, Tuple2<Integer, Integer>>> list = new ArrayList<>();
while (v2.hasNext()) {
Tuple2<Integer, Integer> next = v2.next();
Tuple2<Integer, Tuple2<Integer, Integer>> tp = new Tuple2<>(v1, next);
list.add(tp);
}
return list.iterator();
}
}, false);
// alternative: collect() first, then print with an enhanced for loop
// List<Tuple2<Integer, Tuple2<Integer, Integer>>> collect = mapWithIndex.collect();
// for (Tuple2<Integer, Tuple2<Integer, Integer>> integerTuple2Tuple2 : collect) {
// System.out.println(integerTuple2Tuple2);
// }
// print via foreach (runs as a distributed action)
mapWithIndex.foreach(new VoidFunction<Tuple2<Integer, Tuple2<Integer, Integer>>>() {
@Override
public void call(Tuple2<Integer, Tuple2<Integer, Integer>> integerTuple2Tuple2) throws Exception {
System.out.println(integerTuple2Tuple2);
}
});
}
public static void main(String[] args) {
SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("par");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaRDD<Tuple2<Integer, Integer>> rdd1 = sc.parallelize(Arrays.asList(
new Tuple2<Integer, Integer>(1, 2),
new Tuple2<Integer, Integer>(2, 3),
new Tuple2<Integer, Integer>(3, 4),
new Tuple2<Integer, Integer>(4, 5),
new Tuple2<Integer, Integer>(5, 6)
));
JavaPairRDD<Integer, Integer> rdd2 = JavaPairRDD.fromJavaRDD(rdd1);
System.out.println("--------MyPartition------");
// Repartition with the custom MyPartition rule (3 partitions), then show the layout.
JavaPairRDD<Integer, Integer> mypartition = rdd2.partitionBy(new MyPartition(3));
printPartRDD(mypartition);
}
}
/*
--------MyPartition------
(1,(3,4))
(0,(4,5))
(0,(5,6))
(2,(1,2))
(2,(2,3))
详细完整清楚的请点击下方链接
链接: Spark算子大全.