1. Operators
reduce: aggregates all elements of the RDD. The first and second elements are combined, the result is combined with the third element, that result with the fourth, and so on.
collect: fetches all elements of the RDD back to the local driver.
count: returns the total number of elements in the RDD.
take(n): returns the first n elements of the RDD.
saveAsTextFile: saves the RDD's elements to a text file, calling toString on each element.
countByKey: counts the number of elements for each key.
foreach: iterates over each element of the RDD (see the sketch in ⑦ below).
2. Examples
① reduce
Example 1: Sum (Java)
package cn.spark.study.core;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

import java.util.Arrays;
import java.util.List;

public class Action_8 {
    public static void main(String[] args) {
        reduce();
    }

    private static void reduce() {
        SparkConf conf = new SparkConf()
                .setMaster("local")
                .setAppName("reduce");
        JavaSparkContext sc = new JavaSparkContext(conf);
        List<Integer> numList = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
        JavaRDD<Integer> numRDD = sc.parallelize(numList);
        // Combine elements pairwise: ((1 + 2) + 3) + ... = 55
        Integer sum = numRDD.reduce(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
        System.out.println(sum);
        sc.close();
    }
}
Example 2: Sum (Scala)
package cn.spark.study.core

import org.apache.spark.{SparkConf, SparkContext}

object Action {
  def main(args: Array[String]): Unit = {
    myReduce()
  }

  def myReduce(): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("reduce")
    val sc = new SparkContext(conf)
    val number = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
    val numbers = sc.parallelize(number)
    // Combine elements pairwise: ((1 + 2) + 3) + ... = 55
    val sum: Int = numbers.reduce(_ + _)
    println(sum)
    sc.stop()
  }
}
② collect
Example 1: Multiply each element by 2 (Java)
package cn.spark.study.core;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

import java.util.Arrays;
import java.util.List;

public class Action_8 {
    public static void main(String[] args) {
        collect();
    }

    private static void collect() {
        SparkConf conf = new SparkConf()
                .setMaster("local")
                .setAppName("collect");
        JavaSparkContext sc = new JavaSparkContext(conf);
        List<Integer> numList = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
        JavaRDD<Integer> numRDD = sc.parallelize(numList);
        JavaRDD<Integer> numc2 = numRDD.map(new Function<Integer, Integer>() {
            @Override
            public Integer call(Integer integer) throws Exception {
                return integer * 2;
            }
        });
        // collect pulls the whole RDD back to the driver; use with care on large data sets
        List<Integer> collect = numc2.collect();
        for (Integer a : collect) {
            System.out.println(a);
        }
        sc.close();
    }
}
Example 2: Multiply each element by 2 (Scala)
package cn.spark.study.core

import org.apache.spark.{SparkConf, SparkContext}

object Action {
  def main(args: Array[String]): Unit = {
    myCollect()
  }

  def myCollect(): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("collect")
    val sc = new SparkContext(conf)
    val number = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
    val numbers = sc.parallelize(number)
    val doubleNum = numbers.map(_ * 2)
    // collect pulls the whole RDD back to the driver
    val ints: Array[Int] = doubleNum.collect()
    for (num <- ints) {
      println(num)
    }
    sc.stop()
  }
}
③ count
Example 1: Get the total number of elements in the RDD (Java)
package cn.spark.study.core;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import java.util.Arrays;
import java.util.List;

public class Action_8 {
    public static void main(String[] args) {
        count();
    }

    private static void count() {
        // Create the SparkConf
        SparkConf conf = new SparkConf()
                .setMaster("local")
                .setAppName("count");
        // Create the SparkContext
        JavaSparkContext sc = new JavaSparkContext(conf);
        // Parallelize the collection 1 to 10
        List<Integer> numberList = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
        JavaRDD<Integer> numbers = sc.parallelize(numberList);
        long count = numbers.count();
        System.out.println(count);
        sc.close();
    }
}
Example 2: Get the total number of elements in the RDD (Scala)
package cn.spark.study.core

import org.apache.spark.{SparkConf, SparkContext}

object Action {
  def main(args: Array[String]): Unit = {
    myCount()
  }

  def myCount(): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("count")
    val sc = new SparkContext(conf)
    val number = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
    val numbers = sc.parallelize(number)
    val count = numbers.count()
    println(count)
    sc.stop()
  }
}
④ take(n)
Example 1: Take the first three elements (Java)
package cn.spark.study.core;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import java.util.Arrays;
import java.util.List;

public class Action_8 {
    public static void main(String[] args) {
        take();
    }

    private static void take() {
        // Create the SparkConf
        SparkConf conf = new SparkConf()
                .setMaster("local")
                .setAppName("take");
        // Create the SparkContext
        JavaSparkContext sc = new JavaSparkContext(conf);
        // Parallelize the collection 1 to 10
        List<Integer> numberList = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
        JavaRDD<Integer> numbers = sc.parallelize(numberList);
        // Like collect, take fetches data from the remote cluster, but only the first n elements
        List<Integer> take = numbers.take(3);
        for (Integer a : take) {
            System.out.println(a);
        }
        sc.close();
    }
}
Example 2: Take the first three elements (Scala)
package cn.spark.study.core

import org.apache.spark.{SparkConf, SparkContext}

object Action {
  def main(args: Array[String]): Unit = {
    myTake()
  }

  def myTake(): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("take")
    val sc = new SparkContext(conf)
    val number = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
    val numbers = sc.parallelize(number)
    val take: Array[Int] = numbers.take(3)
    for (num <- take) {
      println(num)
    }
    sc.stop()
  }
}
⑤ saveAsTextFile
Saves the RDD's elements to a text file, calling toString on each element.
Example 1: Multiply each element by 3 and save (Java)
package cn.spark.study.core;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

import java.util.Arrays;
import java.util.List;

public class Action_8 {
    public static void main(String[] args) {
        saveAsTextFile();
    }

    private static void saveAsTextFile() {
        // Create the SparkConf
        SparkConf conf = new SparkConf()
                .setMaster("local")
                .setAppName("saveAsTextFile");
        // Create the SparkContext
        JavaSparkContext sc = new JavaSparkContext(conf);
        // Parallelize the collection 1 to 10
        List<Integer> numberList = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
        JavaRDD<Integer> numbers = sc.parallelize(numberList);
        JavaRDD<Integer> numc3 = numbers.map(new Function<Integer, Integer>() {
            @Override
            public Integer call(Integer integer) throws Exception {
                return integer * 3;
            }
        });
        // Note: saveAsTextFile creates a directory at this path containing part-xxxxx files
        numc3.saveAsTextFile("hdfs://hadoop1:9000/numc3.txt");
        sc.close();
    }
}
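Example 2: Multiply each element by 3 and save (Scala)
A minimal Scala sketch in the same style as the other Scala examples above. The output path here is an assumption; point it at any HDFS or local location your job can write to.

package cn.spark.study.core

import org.apache.spark.{SparkConf, SparkContext}

object Action {
  def main(args: Array[String]): Unit = {
    mySaveAsTextFile()
  }

  def mySaveAsTextFile(): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("saveAsTextFile")
    val sc = new SparkContext(conf)
    val numbers = sc.parallelize(Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
    // saveAsTextFile writes a directory of part files; the path below is a placeholder
    numbers.map(_ * 3).saveAsTextFile("hdfs://hadoop1:9000/numc3_scala")
    sc.stop()
  }
}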
⑥ countByKey
Example 1: Count the number of students in each class (Java)
package cn.spark.study.core;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.util.Arrays;
import java.util.List;
import java.util.Map;

public class Action_8 {
    public static void main(String[] args) {
        countByKey();
    }

    private static void countByKey() {
        // Create the SparkConf
        SparkConf conf = new SparkConf()
                .setMaster("local")
                .setAppName("countByKey");
        // Create the SparkContext
        JavaSparkContext sc = new JavaSparkContext(conf);
        // Mock a collection of (class, student) pairs
        List<Tuple2<String, String>> studentList = Arrays.asList(
                new Tuple2<String, String>("class1", "leo"),
                new Tuple2<String, String>("class2", "jack"),
                new Tuple2<String, String>("class1", "marry"),
                new Tuple2<String, String>("class2", "tom"),
                new Tuple2<String, String>("class2", "da")
        );
        // Parallelize the collection to create a JavaPairRDD
        JavaPairRDD<String, String> student = sc.parallelizePairs(studentList);
        // countByKey counts the number of students per class,
        // i.e. the number of elements for each key
        // (on newer Spark versions countByKey() returns Map<String, Long>)
        Map<String, Object> stringObjectMap = student.countByKey();
        for (Map.Entry<String, Object> a : stringObjectMap.entrySet()) {
            System.out.println(a.getKey() + ":" + a.getValue());
        }
        sc.close();
    }
}
Example 2: Count the number of students in each class (Scala)
package cn.spark.study.core

import org.apache.spark.{SparkConf, SparkContext}

object Action {
  def main(args: Array[String]): Unit = {
    myCountByKey()
  }

  def myCountByKey(): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("countByKey")
    val sc = new SparkContext(conf)
    val studentList = Array(
      Tuple2("class1", "leo"),
      Tuple2("class2", "jack"),
      Tuple2("class1", "tom"),
      Tuple2("class2", "jen"),
      Tuple2("class2", "marry"))
    val studentRDD = sc.parallelize(studentList)
    // countByKey returns a Map of key -> element count
    val stringToLong: collection.Map[String, Long] = studentRDD.countByKey()
    for ((key, value) <- stringToLong) {
      println(key + ":" + value)
    }
    sc.stop()
  }
}
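⑦ foreach
foreach is listed among the operators in section 1, so here is a minimal Scala sketch in the same style as the examples above. Note that the function passed to foreach runs on the executors: with setMaster("local") the println output appears in the local console, but on a real cluster it would land in the executor logs rather than on the driver.

package cn.spark.study.core

import org.apache.spark.{SparkConf, SparkContext}

object Action {
  def main(args: Array[String]): Unit = {
    myForeach()
  }

  def myForeach(): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("foreach")
    val sc = new SparkContext(conf)
    val numbers = sc.parallelize(Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
    // foreach is an action: the function is applied to every element on the executors
    numbers.foreach(num => println(num))
    sc.stop()
  }
}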