RDD 转换操作案例
涵盖 map、flatMap、filter、groupByKey、reduceByKey、join、cogroup 等常用算子。
其中 join 将两个 (Key, Value) 集合基于相同的 Key 进行内连接,结果为 (Key, (Value1, Value2))。
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
object Transformations {

  /**
   * Demonstrates basic RDD transformations (map, filter, flatMap) on a local
   * Spark context, printing each intermediate result to stdout.
   *
   * Fix: uses explicit `: Unit =` instead of the deprecated procedure syntax.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // SparkConf holds the application configuration; "local" runs everything in one JVM.
    val conf = new SparkConf().setAppName("Transformations").setMaster("local")
    // SparkContext is the sole entry point for creating RDDs — the gateway to the cluster.
    val sc = new SparkContext(conf)

    val nums = sc.parallelize(1 to 10) // build an RDD from a local collection

    // map applies the given function to every element, producing a new RDD.
    val mapped = nums.map(_ * 2)
    mapped.collect().foreach(println)

    // filter keeps only the elements for which the predicate returns true (even numbers here).
    val filtered = nums.filter(_ % 2 == 0)
    filtered.collect().foreach(println)

    val bigData = Array("Scala Spark", "Java Hadoop", "Java Tachyon")
    val bigDataString = sc.parallelize(bigData)
    // flatMap splits each line into words, then flattens all the per-line results into one RDD.
    val words = bigDataString.flatMap(_.split(" "))
    words.collect().foreach(println)

    sc.stop() // release the context and its resources
  }
}
main 方法中调用的每一个功能都应当模块化:将每个转换操作封装为独立的函数,便于单独测试和复用。
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
object Transformations {

  /**
   * Runs each transformation demo against one shared SparkContext, then shuts it down.
   *
   * Fixes: `groupByKeyTransformation` was defined but never invoked — it is now
   * included; procedure syntax replaced with explicit `: Unit =`.
   */
  def main(args: Array[String]): Unit = {
    val sc = sparkContext("Transformation Operations") // create the shared SparkContext
    mapTransformation(sc)
    filterTransformation(sc)
    flatMapTransformation(sc)
    groupByKeyTransformation(sc) // previously defined but never called
    reduceByKeyTransformation(sc)
    joinTransformation(sc)
    sc.stop()
  }

  /**
   * Builds a local-mode SparkContext for this application.
   *
   * Fix: the `name` parameter was previously ignored (app name was hardcoded
   * to "Transformations"); it is now used as the application name.
   *
   * @param name the Spark application name shown in the UI/logs
   * @return a new SparkContext — the sole entry point for creating RDDs
   */
  def sparkContext(name: String): SparkContext = {
    val conf = new SparkConf().setAppName(name).setMaster("local")
    new SparkContext(conf)
  }

  /** map: applies a function to every element; here doubles the numbers 1..10. */
  def mapTransformation(sc: SparkContext): Unit = {
    val nums = sc.parallelize(1 to 10) // build an RDD from a local collection
    val mapped = nums.map(_ * 2)
    mapped.collect().foreach(println)
  }

  /** filter: keeps only elements matching the predicate; here the even numbers in 1..20. */
  def filterTransformation(sc: SparkContext): Unit = {
    val nums = sc.parallelize(1 to 20)
    // filter yields a new RDD containing only the elements for which the predicate is true
    val filtered = nums.filter(_ % 2 == 0)
    filtered.collect().foreach(println)
  }

  /** flatMap: splits each string into words, then flattens the per-string results into one RDD. */
  def flatMapTransformation(sc: SparkContext): Unit = {
    val bigData = Array("Scala Spark", "Java Hadoop", "Java Tachyon")
    val bigDataString = sc.parallelize(bigData) // ParallelCollectionRDD of strings
    val words = bigDataString.flatMap(_.split(" "))
    words.collect().foreach(println)
  }

  /** groupByKey: groups all values sharing the same key into a single collection per key. */
  def groupByKeyTransformation(sc: SparkContext): Unit = {
    val data = Array(Tuple2(100, "Spark"), Tuple2(100, "Tachyon"), Tuple2(70, "Hadoop"), Tuple2(80, "HBase"))
    val dataRDD = sc.parallelize(data)
    val grouped = dataRDD.groupByKey() // value side becomes an Iterable of all values for that key
    grouped.collect().foreach(println)
  }

  /**
   * reduceByKey: classic word count — split lines into words, pair each with 1,
   * then sum the counts per word. Reads from "words.txt" in the working directory.
   */
  def reduceByKeyTransformation(sc: SparkContext): Unit = {
    val lines = sc.textFile("words.txt")
    lines.flatMap(_.split(" "))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
      .collect()
      .foreach(println)
  }

  /** join: inner-joins two (id, value) RDDs on their keys, pairing name with score. */
  def joinTransformation(sc: SparkContext): Unit = {
    val studentNames = Array(
      Tuple2(1, "Spark"),
      Tuple2(2, "Tachyon"),
      Tuple2(3, "Hadoop")
    )
    val studentScores = Array(
      Tuple2(1, 100),
      Tuple2(2, 95),
      Tuple2(3, 90)
    )
    val names = sc.parallelize(studentNames)
    val scores = sc.parallelize(studentScores)
    val studentInformation = names.join(scores) // (id, (name, score)) for ids present in both
    studentInformation.collect().foreach(println)
    // Expected output (order not guaranteed):
    //(1,(Spark,100))
    //(3,(Hadoop,90))
    //(2,(Tachyon,95))
  }
}
//Java实现cogroup操作
import java.util.Arrays;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
/**
 * Demonstrates the cogroup transformation in Java: for each key, cogroup pairs
 * ALL values from the first RDD with ALL values from the second as two Iterables.
 */
public class CogroupOps {

    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("Cogroup Transformation").setMaster("local");
        JavaSparkContext jsc = new JavaSparkContext(sparkConf);

        // Student id -> name pairs.
        List<Tuple2<Integer, String>> studentNames = Arrays.asList(
                new Tuple2<>(1, "Spark"),
                new Tuple2<>(2, "Tachyon"),
                new Tuple2<>(3, "Hadoop"));

        // Student id -> score pairs; ids may repeat (multiple scores per student).
        List<Tuple2<Integer, Integer>> studentScores = Arrays.asList(
                new Tuple2<>(1, 100),
                new Tuple2<>(2, 90),
                new Tuple2<>(3, 70),
                new Tuple2<>(1, 110),
                new Tuple2<>(2, 95),
                new Tuple2<>(2, 60));

        JavaPairRDD<Integer, String> nameRDD = jsc.parallelizePairs(studentNames);
        JavaPairRDD<Integer, Integer> scoreRDD = jsc.parallelizePairs(studentScores);

        // cogroup yields, per key, a pair of (all names, all scores) as Iterables.
        JavaPairRDD<Integer, Tuple2<Iterable<String>, Iterable<Integer>>> cogrouped =
                nameRDD.cogroup(scoreRDD);

        // Print each grouped record; runs on the executors in local mode.
        cogrouped.foreach(new VoidFunction<Tuple2<Integer, Tuple2<Iterable<String>, Iterable<Integer>>>>() {
            @Override
            public void call(Tuple2<Integer, Tuple2<Iterable<String>, Iterable<Integer>>> record) throws Exception {
                System.out.println("Student ID:" + record._1);
                System.out.println("Name:" + record._2._1);
                System.out.println("Score:" + record._2._2);
                System.out.println("===========");
            }
        });

        jsc.close();
    }
}
/*
Student ID:1
Name:[Spark]
Score:[100, 110]
===========
Student ID:3
Name:[Hadoop]
Score:[70]
===========
Student ID:2
Name:[Tachyon]
Score:[90, 95, 60]
*/