一、Transformations算子
1.map
特点就是:一对一,进来一个String,出去一个String
// map is one-to-one: every input line produces exactly one output String (the line plus "*").
JavaRDD<String> map = lines.map(line -> line + "*");
// Print each transformed element.
map.foreach(element -> System.out.println(element));
2.flatMap
一变多,进来一行,出去一堆单词
//切割一行数据,返回一个Iterator
lines.flatMap(new FlatMapFunction<String, String>() {
@Override
public Iterator<String> call(String s) throws Exception {
List<String> list = Arrays.asList(s.split(" "));
return list.iterator();
}
}).foreach(new VoidFunction<String>() {
@Override
public void call(String s) throws Exception {
System.out.println(s);
}
});
3.filter:过滤输出
返回一个只包含符合过滤条件的元素的新RDD
// Keep only the lines that are exactly "hello world".
JavaRDD<String> result = lines.filter(line -> "hello world".equals(line));
// Print how many lines passed the filter.
System.out.println(result.count());
result.foreach(matched -> System.out.println(matched));
4.mapToPair
进来一行String,出去的是一个Tuple。PairFunction的3个泛型,分别对应call的入参类型、返回Tuple的K类型和V类型
// Turn each line into a (line, line + "@@@") pair.
JavaPairRDD<String, String> result = lines.mapToPair(line -> new Tuple2<>(line, line + "@@@"));
// Prints e.g. "hello world<------>hello world@@@"; printing the tuple itself works as well.
result.foreach(pair -> System.out.println(pair._1() + "<------>" + pair._2()));
5.计数:
// Stage 1: break every line into words.
JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
// Stage 2: emit (word, 1) for every occurrence.
JavaPairRDD<String, Integer> ones = words.mapToPair(word -> new Tuple2<>(word, 1));
// Stage 3: add up the ones per word.
JavaPairRDD<String, Integer> counts = ones.reduceByKey((left, right) -> left + right);
// Print each (word, count) tuple.
counts.foreach(wordCount -> System.out.println(wordCount));
6.排序
思路:sortByKey只能按key排序,所以先统计词频,再把(K,V)反转为(V,K),按key排序,最后再反转回(K,V)
// Word count first: (word, occurrences).
JavaPairRDD<String, Integer> reduceRDD = lines
        .flatMap(line -> Arrays.asList(line.split(" ")).iterator())
        .mapToPair(word -> new Tuple2<>(word, 1))
        .reduceByKey((a, b) -> a + b);
// sortByKey orders by key only, so swap to (count, word) and sort descending ...
JavaPairRDD<Integer, String> byCountDesc = reduceRDD
        .mapToPair(wordAndCount -> wordAndCount.swap())
        .sortByKey(false);
// ... then swap back to (word, count) and print, e.g. (hello,6), already ordered by word count.
byCountDesc
        .mapToPair(countAndWord -> countAndWord.swap())
        .foreach(wordAndCount -> System.out.println(wordAndCount));
7.sample
// Arguments: withReplacement = true, fraction = 0.1, seed = 100.
JavaRDD<String> sample = lines.sample(true, 0.1, 100L);
// Print the sampled lines.
sample.foreach(picked -> System.out.println(picked));
8.join
// Join rdd1 with rdd2 by key; each result value is a Tuple2 of (value from rdd1, value from rdd2).
JavaPairRDD<String, Tuple2<Integer, Integer>> result = rdd1.join(rdd2);
8.1leftOuterJoin
// Left outer join: the rdd2 side is wrapped in an Optional because it may have no match.
JavaPairRDD<String, Tuple2<Integer, Optional<Integer>>> result = rdd1.leftOuterJoin(rdd2);
result.foreach(tp -> {
    Tuple2<Integer, Optional<Integer>> values = tp._2;
    // Fall back to 1000 when the right side is absent instead of calling get().
    Integer rightOrDefault = values._2.orElse(1000);
    System.out.println("key=" + tp._1 + ";v1=" + values._1 + ";optionl=" + rightOrDefault);
});
8.2rightOuterJoin
// Right outer join: every key of rdd2 is kept, so the rdd1 side is an Optional that may be absent.
JavaPairRDD<String, Tuple2<Optional<Integer>, Integer>> result = rdd1.rightOuterJoin(rdd2);
result.foreach(new VoidFunction<Tuple2<String, Tuple2<Optional<Integer>, Integer>>>() {
    @Override
    public void call(Tuple2<String, Tuple2<Optional<Integer>, Integer>> tp) throws Exception {
        String key = tp._1;
        Optional<Integer> v1 = tp._2._1;
        Integer v2 = tp._2._2;
        // Do not call v1.get(): it throws for keys that exist only in rdd2.
        // Use the same 1000 fallback as the leftOuterJoin example.
        System.out.println("key=" + key + ";v1=" + v1.orElse(1000) + ";v2=" + v2);
    }
});
9.union合并
// Merge the two pair RDDs and print every entry as key:value.
JavaPairRDD<String, Integer> result = rdd1.union(rdd2);
result.foreach(pair -> System.out.println(pair._1 + ":" + pair._2));
10.intersection取交集
// Keep only the entries found in both RDDs and print every entry as key:value.
JavaPairRDD<String, Integer> result = rdd1.intersection(rdd2);
result.foreach(pair -> System.out.println(pair._1 + ":" + pair._2));
11.subtract取差集
// Keep the entries of rdd1 that do not appear in rdd2 and print every entry as key:value.
JavaPairRDD<String, Integer> result = rdd1.subtract(rdd2);
result.foreach(pair -> System.out.println(pair._1 + ":" + pair._2));
12.distinct去重
// Drop duplicate elements, then print what is left.
JavaRDD<String> result = rdd1.distinct();
result.foreach(unique -> System.out.println(unique));
13.mapPartitions
作用于每个partition上的数据
// The function receives a whole partition as an iterator, so the "创建"/"关闭" messages
// are printed once per call rather than once per element.
JavaRDD<String> stringJavaRDD = rdd1.mapPartitions(partition -> {
    List<String> buffered = new ArrayList<>();
    System.out.println("创建");
    partition.forEachRemaining(element -> {
        buffered.add(element);
        System.out.println("插入:" + element);
    });
    System.out.println("关闭");
    return buffered.iterator();
});
// count() is the action that makes the mapPartitions function above actually run.
stringJavaRDD.count();
14.mapPartitionsWithIndex
// Tag every element with the index of the partition it lives in.
JavaRDD<String> rdd1 = rdd.mapPartitionsWithIndex((index, partition) -> {
    List<String> tagged = new ArrayList<>();
    partition.forEachRemaining(value -> tagged.add("index=" + index + ";value=" + value));
    return tagged.iterator();
}, false);
// Prints lines such as: index=0;value=a
rdd1.collect().forEach(tagged -> System.out.println(tagged));
15.repartition
可以增多、减少分区。宽依赖算子,会产生shuffle;
这里区别于coalesce:coalesce同样可以增加、减少分区,但coalesce是窄依赖算子,默认无shuffle,可通过把第二个参数设置为true来开启shuffle。当coalesce在不开启shuffle的情况下由少的分区分到多的分区时,不起作用,分区数保持不变。(repartition(n)相当于coalesce(n, true)。)
因此可以变相的理解为:repartition常用于增多分区,coalesce常用于减少分区
// Redistribute rdd1 into 4 partitions.
JavaRDD<String> rdd2 = rdd1.repartition(4);
// Tag each element with its new partition index so the effect of repartition is visible.
JavaRDD<String> withIndex = rdd2.mapPartitionsWithIndex((index, partition) -> {
    List<String> tagged = new ArrayList<>();
    partition.forEachRemaining(value -> tagged.add("index=" + index + ";value=" + value));
    return tagged.iterator();
}, false);
// mapPartitionsWithIndex is a lazy transformation: without an action nothing runs and nothing is
// shown. collect() triggers the job so the partition assignment can be printed.
for (String s : withIndex.collect()) {
    System.out.println(s);
}
16.zip & zipWithIndex
zip:两个RDD可以通过zip按位置一一配对压缩在一起,形成K-V格式RDD。输出结果如:(a,1)
zipWithIndex:把RDD中的每个元素和它自己的下标(Long类型,0,1,2…)压缩在一起,形成K-V格式RDD。如:(a,0)
// zip pairs the elements of rdd with the elements of rdd1, e.g. (a,1).
JavaPairRDD<String, String> zip = rdd.zip(rdd1);
// zipWithIndex pairs every element of rdd with its Long index (0, 1, 2, ...), e.g. (a,0).
JavaPairRDD<String, Long> zipWithIndex = rdd.zipWithIndex();
二、Action算子
1.collect
// collect() returns the elements of the RDD as a List; print each one.
List<String> list = lines.collect();
list.forEach(System.out::println);
2.count
返回几行
// count() returns the number of elements (lines) in the RDD.
long count = lines.count();
System.out.println(count);
3.first
返回第一行
// first() returns the first element (line) of the RDD.
String first = lines.first();
System.out.println(first);
4.take
返回指定几行
// take(5) returns up to the requested number of lines as a List; print each one.
List<String> list = lines.take(5);
list.forEach(System.out::println);
5.foreachPartition
rdd1.foreachPartition(new VoidFunction<Iterator<String>>() {
@Override
public void call(Iterator<String> stringIterator) throws Exception {
}
});
6.reduce &countByKey & countByValue
聚合执行对应逻辑,输出15
// Fold all elements together with the given function; 1+2+3+4+5 = 15.
Integer sum = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5)).reduce(new Function2<Integer, Integer, Integer>() {
    @Override
    public Integer call(Integer v1, Integer v2) throws Exception {
        return v1 + v2;
    }
});
// The original discarded the result; keep it and print it so the example really outputs 15.
System.out.println(sum);
countByKey:按照key分组,统计每个key出现的次数,返回Map<K, Long>
// Sample data: four pairs with distinct keys a, b, c, d.
List<Tuple2<String, Integer>> pairs = Arrays.asList(
        new Tuple2<>("a", 1), new Tuple2<>("b", 1),
        new Tuple2<>("c", 1), new Tuple2<>("d", 1));
// countByKey returns, for each key, how many pairs carry that key.
Map<String, Long> map = sc.parallelizePairs(pairs).countByKey();
countByValue:把整个元素(K-V对整体)作为value分组,计算每个元素出现的次数。例如元素(a,100)出现2次时输出:((a,100),2);下面的示例数据中每个元素只出现1次,输出形如((a,1),1)
// countByValue treats each whole (key, value) pair as the thing being counted.
// (The copy-pasted reduce job that used to precede this was unrelated and its result was
// discarded, so it has been removed.)
Map<Tuple2<String, Integer>, Long> map = sc.parallelizePairs(Arrays.asList(new Tuple2<String, Integer>("a", 1),
        new Tuple2<String, Integer>("b", 1), new Tuple2<String, Integer>("c", 1),
        new Tuple2<String, Integer>("d", 1))).countByValue();
for (Map.Entry<Tuple2<String, Integer>, Long> entry : map.entrySet()) {
    Tuple2<String, Integer> key = entry.getKey();
    Long value = entry.getValue();
    // Print each distinct pair with its occurrence count, e.g. ((a,1),1).
    System.out.println("(" + key + "," + value + ")");
}