以下为Java版本的代码
first
返回第一个元素;
JavaRDD<String> rdd = sc.parallelize(Arrays.asList("hello", "scala", "python", "spark"));
//todo first()
System.out.println(rdd.first()); //hello
take
返回前n个元素;
//todo take
System.out.println(rdd.take(2)); //[hello, scala]
collect
返回RDD中的所有元素;
//todo collect
System.out.println(rdd.collect()); //[hello, scala, python, spark]
count
返回RDD中的元素个数;
//todo count
System.out.println(rdd.count()); //4
countByValue
各元素在RDD中出现的次数;
//todo countByValue
Map<String, Long> stringLongMap = rdd.countByValue();
Set<String> strings = stringLongMap.keySet();
for (String string : strings) {
System.out.println(string+":"+stringLongMap.get(string));
//hello:1 scala:1 python:1 spark:1 (每个元素各出现一次)
}
reduce
并行整合RDD中的所有数据;
//todo reduce
String reduce = rdd.reduce(new Function2<String, String, String>() {
@Override
public String call(String s, String s2) throws Exception {
return s + ","+s2;
}
});
System.out.println(reduce);//例如 hello,scala,python,spark (合并顺序取决于分区方式)
aggregate
和reduce相似,不过需要赋予一个初始值;与reduce不同,aggregate可以返回与原RDD元素类型不同的结果;
//todo aggregate 需要设定一个初始值1
JavaRDD<Integer> rdd2 = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
Tuple2<Integer, Integer> aggregate = rdd2.aggregate(new Tuple2<Integer, Integer>(0, 0), new Function2<Tuple2<Integer, Integer>, Integer, Tuple2<Integer, Integer>>() {
@Override
public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> t, Integer i) throws Exception {
return new Tuple2<>(t._1 + i, t._2 + 1);
}
}, new Function2<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>, Tuple2<Integer, Integer>>() {
@Override
public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> i1, Tuple2<Integer, Integer> i2) throws Exception {
return new Tuple2<>(i1._1 + i2._1, i1._2 + i2._2);
}
}
);
int i = aggregate._1 + aggregate._2;
System.out.println(i);//20
System.out.println(aggregate); //(15,5)
System.out.println("===================");
Integer aggregate1 = rdd2.aggregate(5, new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer i1, Integer i2) throws Exception {
return i1 + i2;
}
}, new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer i1, Integer i2) throws Exception {
return i1 + i2;
}
}
);
System.out.println(aggregate1); //30
fold
和aggregate类似,都需要一个初始值。在计算时按照分区进行,每个分区计算完成后和初始值折叠,然后分区之间还会进行fold.例如:
rdd.fold(1)(_ + _)。假设有两个分区。各个分区内的元素相加后再加上1,然后分区之间再相加,再加上1。
//todo fold
Integer fold = rdd2.fold(5, new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer i1, Integer i2) throws Exception {
return i1 + i2;
}
});
System.out.println(fold); //30
top
按照降序或者指定的排序规则,返回指定的元素个数;
takeOrdered
与top用法相同,不过默认是升序;
foreach
对RDD中的每个元素使用给定的函数。
countByKey
按相同的key统计元素个数。
//todo top 按照降序或指定规则返回前n个元素
System.out.println(rdd2.top(2));//[5, 4]
//todo takeOrdered 与top用法相同,但默认按升序
System.out.println(rdd2.takeOrdered(2)); //[1, 2]
//todo foreach 对RDD中的每个元素使用给定的函数
rdd.foreach(new VoidFunction<String>() {
@Override
public void call(String s) throws Exception {
System.out.println(s); //hello scala python spark
}
});
Java版本
package Action;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.*;
import scala.Int;
import scala.Tuple2;
import java.util.*;
/**
 * Demonstrates the common Spark RDD actions from the Java API:
 * first, take, collect, count, countByValue, reduce, aggregate, fold,
 * top, takeOrdered, foreach and countByKey.
 *
 * <p>Fixes over the previous revision: the second dataset reused the local
 * names {@code rdd} and {@code strings}, which is a duplicate-local-variable
 * compile error in Java — they are renamed {@code pairData} / {@code key}.
 * Output comments are corrected to match the actual data, and the context
 * is stopped at the end.
 */
public class first {
    public static void main(String[] args) {
        // local[2]: two partitions. The fold/aggregate results below (30)
        // depend on this partition count, since the zero value is applied
        // once per partition and once more in the final combine.
        SparkConf conf = new SparkConf().setAppName("first").setMaster("local[2]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> rdd = sc.parallelize(Arrays.asList("hello", "scala", "python", "spark"));

        //todo first() - return the first element
        System.out.println(rdd.first()); //hello
        //todo take - return the first n elements
        System.out.println(rdd.take(2)); //[hello, scala]
        //todo collect - return all elements to the driver
        System.out.println(rdd.collect()); //[hello, scala, python, spark]
        //todo count - number of elements
        System.out.println(rdd.count()); //4
        //todo countByValue - occurrences of each distinct element
        Map<String, Long> stringLongMap = rdd.countByValue();
        Set<String> strings = stringLongMap.keySet();
        for (String string : strings) {
            // Every element is unique here, so each count is 1;
            // iteration order of the map is unspecified.
            System.out.println(string + ":" + stringLongMap.get(string));
            //hello:1 scala:1 python:1 spark:1
        }

        //todo reduce - combine all elements in parallel
        String reduce = rdd.reduce(new Function2<String, String, String>() {
            @Override
            public String call(String s, String s2) throws Exception {
                return s + "," + s2;
            }
        });
        // e.g. hello,scala,python,spark — exact order depends on how
        // partition results are combined (string concat is not commutative).
        System.out.println(reduce);

        //todo aggregate - like reduce, but takes a zero value and may
        // return a type different from the RDD's element type.
        JavaRDD<Integer> rdd2 = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
        // Compute (sum, count) in a single pass.
        Tuple2<Integer, Integer> aggregate = rdd2.aggregate(
                new Tuple2<Integer, Integer>(0, 0),
                // seqOp: fold one element into the per-partition accumulator
                new Function2<Tuple2<Integer, Integer>, Integer, Tuple2<Integer, Integer>>() {
                    @Override
                    public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> t, Integer i) throws Exception {
                        return new Tuple2<>(t._1 + i, t._2 + 1);
                    }
                },
                // combOp: merge two per-partition accumulators
                new Function2<Tuple2<Integer, Integer>, Tuple2<Integer, Integer>, Tuple2<Integer, Integer>>() {
                    @Override
                    public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> i1, Tuple2<Integer, Integer> i2) throws Exception {
                        return new Tuple2<>(i1._1 + i2._1, i1._2 + i2._2);
                    }
                });
        int i = aggregate._1 + aggregate._2;
        System.out.println(i); //20
        System.out.println(aggregate); //(15,5)
        System.out.println("===================");

        // Same-type aggregate with zero value 5:
        // 15 (sum) + 5 per partition (x2) + 5 in the final combine = 30.
        Integer aggregate1 = rdd2.aggregate(5, new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer i1, Integer i2) throws Exception {
                return i1 + i2;
            }
        }, new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer i1, Integer i2) throws Exception {
                return i1 + i2;
            }
        });
        System.out.println(aggregate1); //30 (with 2 partitions)

        //todo fold - like aggregate but seqOp and combOp are the same
        // function and the result type equals the element type.
        Integer fold = rdd2.fold(5, new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer i1, Integer i2) throws Exception {
                return i1 + i2;
            }
        });
        System.out.println(fold); //30 (with 2 partitions)

        //todo top - first n elements in descending order
        System.out.println(rdd2.top(2)); //[5, 4]
        //todo takeOrdered - same as top but ascending by default
        System.out.println(rdd2.takeOrdered(2)); //[1, 2]

        //todo foreach - apply the function to each element on the executors;
        // print order across partitions is nondeterministic.
        rdd.foreach(new VoidFunction<String>() {
            @Override
            public void call(String s) throws Exception {
                System.out.println(s); //hello scala python spark
            }
        });

        //todo countByKey - number of elements per key.
        // NOTE: renamed from "rdd"/"strings": the previous revision redeclared
        // those locals, which does not compile in Java.
        JavaRDD<Tuple2<String, Integer>> pairData = sc.parallelize(Arrays.asList(
                new Tuple2<String, Integer>("sam", 1),
                new Tuple2<String, Integer>("sam", 2),
                new Tuple2<String, Integer>("sam", 3),
                new Tuple2<String, Integer>("john", 1),
                new Tuple2<String, Integer>("john", 3),
                new Tuple2<String, Integer>("mary", 1)
        ));
        JavaPairRDD<String, Integer> countRdd = JavaPairRDD.fromJavaRDD(pairData);
        System.out.println(countRdd.countByKey()); //{sam=3, john=2, mary=1}
        Map<String, Long> countMap = countRdd.countByKey();
        for (String key : countMap.keySet()) {
            System.out.print(key + ":" + countMap.get(key) + " "); //sam:3 john:2 mary:1
        }

        sc.stop();
    }
}