Spark 常用算子
Transformation算子
join
RDDa<String,String>(k,v).join(RDDb<String,String>(k,w)),相同的key join,作用在(k,v)格式的rdd上,返回(k,(v,w))格式的rdd
isPresent(),可以判断有值无值
- Java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import java.util.Arrays;
public class SuanZi {
    public static void main(String[] args) {
        // Run locally with 4 threads.
        SparkConf conf = new SparkConf().setMaster("local[4]").setAppName("test");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // (id, name) pairs.
        JavaPairRDD<String, String> name = sc.parallelizePairs(Arrays.asList(
                new Tuple2<>("1", "zhangsan"),
                new Tuple2<>("2", "lisi"),
                new Tuple2<>("3", "wangwu"),
                new Tuple2<>("4", "zhaosi")
        ));
        // (id, score) pairs.
        JavaPairRDD<String, String> score = sc.parallelizePairs(Arrays.asList(
                new Tuple2<>("1", "100"),
                new Tuple2<>("2", "400"),
                new Tuple2<>("3", "300"),
                new Tuple2<>("4", "200")
        ));
        // Inner join on the key: each matching key yields (k, (name, score)).
        name.join(score).foreach(record -> System.out.println(record));
        sc.close();
    }
}
运行结果:
(2,(lisi,400))
(4,(zhaosi,200))
(3,(wangwu,300))
(1,(zhangsan,100))
- Scala
import org.apache.spark.{SparkConf, SparkContext}
object Suanzi {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("test")
    val sc = new SparkContext(conf)
    // (id, name) pairs — plain tuple literals are the idiomatic Scala form.
    val name = sc.parallelize(Seq(
      ("1", "zhangsan"),
      ("2", "wangwu"),
      ("3", "lisi"),
      ("4", "zhaosi")
    ))
    // (id, score) pairs.
    val score = sc.parallelize(Seq(
      ("1", "100"),
      ("2", "200"),
      ("3", "300"),
      ("4", "400")
    ))
    // Inner join on the key: yields (k, (name, score)).
    name.join(score).foreach(println)
    // BUG FIX: sc.stop() was commented out, leaving the SparkContext
    // (and its resources) unreleased when main returns.
    sc.stop()
  }
}
运行结果:
(4,(zhaosi,400))
(2,(wangwu,200))
(3,(lisi,300))
(1,(zhangsan,100))
leftOuterJoin
RDDa.leftOuterJoin(RDDb),以左侧为主
- Java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.Optional;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import java.util.Arrays;
public class SuanZi {
public static void main(String[] args) {
SparkConf conf = new SparkConf();
conf.setMaster("local[4]").setAppName("test");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaPairRDD<String, String> name = sc.parallelizePairs(Arrays.asList(
new Tuple2<>("1", "zhangsan"),
new Tuple2<>("2", "lisi"),
new Tuple2<>("3", "wangwu"),
new Tuple2<>("4", "zhaosi")
));
JavaPairRDD<String, String> score = sc.parallelizePairs(Arrays.asList(
new Tuple2<>("1", "100"),
new Tuple2<>("2", "400"),
new Tuple2<>("3", "300"),
new Tuple2<>("5", "200")
));
name.leftOuterJoin(score).foreach(new VoidFunction<Tuple2<String, Tuple2<String, Optional<String>>>>() {
@Override
public void call(Tuple2<String, Tuple2<String, Optional<String>>> tuple2) throws Exception {
//leftOuterJoin结果:
// System.out.println(tuple2);
//判断有值则输出
if(tuple2._2._2.isPresent()){
System.out.println("id: " + tuple2._1 + "\t name: " + tuple2._2._1 + " \t score: " + tuple2._2._2.get();
}
}
});
sc.close();
}
}
运行结果:
//判断有值则输出
id: 1 name: zhangsan score: 100
id: 3 name: wangwu score: 300
id: 2 name: lisi score: 400
//leftOuterJoin结果:
(4,(zhaosi,Optional.empty))
(3,(wangwu,Optional[300]))
(2,(lisi,Optional[400]))
(1,(zhangsan,Optional[100]))
- Scala
rightOuterJoin
以右侧为主
- Java
复制上面代码,将leftOuterJoin()替换成rightOuterJoin()即可
- Scala
union
将rdd合并,分区个数( unionRDD.partitions().size() )为两个rdd之和
intersection
a.intersection(b) ,交集,取两个rdd之间相同的元素
subtract
a.subtract(b) , 差集,取两个rdd之间不同的元素
mapPartitions
parallelize 与 parallelizePairs(返回k,v格式的rdd) 的区别
map 与 mapPartitions 的区别
Tips
数据量大的时候慎用map
- Java
public class SuanZi {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setMaster("local[4]").setAppName("test");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> parallelize = sc.parallelize(Arrays.asList("a", "d", "c", "b"));
        // mapPartitions calls the function once per partition with an iterator
        // over that partition's elements (vs. map, which is called per element).
        parallelize.mapPartitions(new FlatMapFunction<Iterator<String>, String>() {
            @Override
            public Iterator<String> call(Iterator<String> stringIterator) throws Exception {
                // BUG FIX: the original drained the iterator and then returned
                // the exhausted iterator, so the resulting RDD was empty and
                // count() returned 0. Buffer the elements so they can be both
                // printed and passed downstream.
                java.util.List<String> buffered = new java.util.ArrayList<>();
                while (stringIterator.hasNext()) {
                    String value = stringIterator.next();
                    System.out.println(value);
                    buffered.add(value);
                }
                return buffered.iterator();
            }
        }).count();
        sc.close();
    }
}
运行结果:
a
d
c
b
distinct
去重
源码:
// Spark source: distinct is built from map -> reduceByKey -> map.
// Each element x becomes the key of a (x, null) pair, reduceByKey collapses
// duplicate keys (keeping the first value), and the final map extracts the
// key back out — so distinct incurs a shuffle via reduceByKey.
def distinct(numPartitions: Int)(implicit ord: Ordering[T] = null): RDD[T] = withScope {
map(x => (x, null)).reduceByKey((x, y) => x, numPartitions).map(_._1)
}
cogroup
rdda.cogroup(rddb) ,对比两个rdd中元素
mapPartitionWithIndex
Return a new RDD by applying a function to each partition of this RDD, while tracking the index of the original partition.
通过将函数应用于此RDD的每个分区来返回新的RDD,同时跟踪原始分区的索引,1.x 中不能返回
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
public class Test1220 {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setMaster("local[4]").setAppName("test");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // Explicitly create 4 partitions so each element lands in its own one.
        JavaRDD<String> parallelize = sc.parallelize(Arrays.asList("1", "2", "3", "4"), 4);
        System.out.println(parallelize.partitions().size());
        // JavaRDD<String> repartition = parallelize.repartition(3);
        // mapPartitionsWithIndex passes the partition index alongside the
        // iterator over that partition's elements.
        JavaRDD<String> stringJavaRDD = parallelize.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {
            @Override
            public Iterator<String> call(Integer index, Iterator<String> iterator) throws Exception {
                // BUG FIX: the original drained the iterator and then returned
                // the exhausted iterator, so the downstream foreach received
                // empty partitions and printed nothing. Buffer the elements
                // so they can be both printed and passed downstream.
                List<String> buffered = new ArrayList<>();
                while (iterator.hasNext()) {
                    String value = iterator.next();
                    System.out.println("index is " + index + ", value = " + value);
                    buffered.add(value);
                }
                return buffered.iterator();
            }
        }, true);
        stringJavaRDD.foreach(new VoidFunction<String>() {
            @Override
            public void call(String s) throws Exception {
                System.out.println(s);
            }
        });
        // System.out.println(repartition.partitions().size());
        sc.close();
    }
}
repartition
常用于增多分区,有shuffle产生,如果减少到1个分区,则没有shuffle产生(多对一)
coalesce
常用于减少分区
/**
 * Spark source: returns this RDD reduced to numPartitions partitions.
 * numPartitions: target number of partitions
 * shuffle: whether a shuffle is performed; false produces no shuffle
 * Note: trying to INCREASE the partition count with shuffle = false is a
 * no-op — partitions cannot grow without a shuffle.
 */
def coalesce(numPartitions: Int, shuffle: Boolean): JavaRDD[T] =
rdd.coalesce(numPartitions, shuffle)
groupByKey
作用在k,v格式的RDD上,
zipWithIndex
将rdd中的元素与其下标组合,返回(元素,下标)格式的rdd
zip
rddA.zip(rddB) ,个数不相同时报错
Action算子
foreachPartition
foreach 和 foreachPartition 区别
foreachPartition 和 mapPartitions 的区别?
Tips
数据量大时候慎用 foreach
foreachPartition 分区遍历
foreachPartition 是 action 算子,mapPartitions 是 transformation 算子
,如果后续还需处理,则选择mapPartitions
countByKey
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
import java.util.Arrays;
import java.util.Map;
import java.util.Set;
public class Test{
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setMaster("local[4]").setAppName("test");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaPairRDD<String, String> pairRDD = sc.parallelizePairs(Arrays.asList(
                new Tuple2<>("1", "zhangsan"),
                new Tuple2<>("1", "zhangsan1"),
                new Tuple2<>("1", "zhangsan2"),
                new Tuple2<>("2", "zhangsan3"),
                new Tuple2<>("3", "zhangsan4"),
                new Tuple2<>("4", "zhangsan5")
        ));
        // countByKey is an action: it returns a Map<key, occurrence count>
        // to the driver (key "1" appears 3 times here).
        Set<Map.Entry<String, Long>> entrySet = pairRDD.countByKey().entrySet();
        // FIX: use the parameterized Map.Entry<String, Long> instead of the
        // raw type, so getKey()/getValue() are type-safe without casts.
        for (Map.Entry<String, Long> entry : entrySet) {
            System.out.println("key = " + entry.getKey() + ", value = " + entry.getValue());
        }
        sc.close();
    }
}
运行结果:
key = 4, value = 1
key = 1, value = 3
key = 2, value = 1
key = 3, value = 1
countByValue
作用在k,v格式RDD上,k,v分别相同 则+1。
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
import java.util.Arrays;
import java.util.Map;
import java.util.Set;
public class Test1220 {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setMaster("local[4]").setAppName("test");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaPairRDD<String, String> pairRDD = sc.parallelizePairs(Arrays.asList(
                new Tuple2<>("1", "zhangsan"),
                new Tuple2<>("1", "zhangsan"),
                new Tuple2<>("1", "zhangsan2"),
                new Tuple2<>("2", "zhangsan3"),
                new Tuple2<>("3", "zhangsan3"),
                new Tuple2<>("4", "zhangsan5")
        ));
        // countByValue counts whole (k, v) tuples: only records where BOTH
        // key and value match are accumulated together — ("1","zhangsan")
        // appears twice, everything else once.
        Set<Map.Entry<Tuple2<String, String>, Long>> entrySet = pairRDD.countByValue().entrySet();
        // FIX: use the parameterized Map.Entry instead of the raw type,
        // so getKey()/getValue() are type-safe without casts.
        for (Map.Entry<Tuple2<String, String>, Long> entry : entrySet) {
            System.out.println("key = " + entry.getKey() + ", value = " + entry.getValue());
        }
        sc.close();
    }
}
运行结果:
key = (1,zhangsan), value = 2
key = (1,zhangsan2), value = 1
key = (2,zhangsan3), value = 1
key = (4,zhangsan5), value = 1
key = (3,zhangsan3), value = 1
reduce
public class Test1220 {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf();
        sparkConf.setMaster("local[4]").setAppName("test");
        JavaSparkContext context = new JavaSparkContext(sparkConf);
        JavaRDD<Integer> numbers = context.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6));
        // reduce is an action: it folds all elements pairwise with the given
        // function and returns the single aggregate to the driver (sum = 21).
        Integer total = numbers.reduce((left, right) -> left + right);
        System.out.println("result = " + total);
        context.close();
    }
}
运行结果:
result = 21