Calling SparkContext's parallelize method on a collection copies the collection's elements to create a distributed dataset that can be operated on in parallel.
The Scala implementations of the common transformations are as follows:
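For instance, a minimal sketch (assuming a SparkContext named sc is already available):

val rdd = sc.parallelize(1 to 10, 4) // the optional second argument sets the number of partitions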
1. map: multiply each element in the collection by 2
import org.apache.spark.{SparkConf, SparkContext}

object TransformationOperation {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TransformationOperation").setMaster("local")
    val sc = new SparkContext(conf)
    val numbers = Array(1, 2, 3, 4, 5)
    val numberRdd = sc.parallelize(numbers)
    // multiply every element by 2
    val multiNumbers = numberRdd.map(item => item * 2)
    multiNumbers.foreach(item => println(item))
  }
}
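Run locally, this prints 2, 4, 6, 8, 10, one element per line. Note that foreach executes on the executors, so on a real cluster the output lands in the executor logs rather than the driver console.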
2. filter: keep only the even numbers in the collection
def filter(): Unit = {
  val conf = new SparkConf().setAppName("filter").setMaster("local")
  val sc = new SparkContext(conf)
  val numbers = Array(1, 2, 3, 4, 5)
  val numberRdd = sc.parallelize(numbers)
  // keep the elements for which the predicate returns true
  val evenNumbers = numberRdd.filter(item => item % 2 == 0)
  evenNumbers.foreach(item => println(item))
}
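This prints 2 and 4, the even numbers in the array.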
3. flatMap: split lines into words
def flatMap(): Unit = {
  val conf = new SparkConf().setAppName("flatMap").setMaster("local")
  val sc = new SparkContext(conf)
  val lines = Array("hello world", "nihao hello", "you are welcome")
  // split each line on spaces and flatten the arrays into a single RDD of words
  val wordRDD = sc.parallelize(lines).flatMap(line => line.split(" "))
  wordRDD.foreach(word => println(word))
}
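Unlike map, which emits exactly one output element per input element, flatMap can emit zero or more: each line is split into an array of words, and the arrays are flattened so the resulting RDD holds individual words.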
4. groupByKey: group the scores by class
def groupByKey(): Unit = {
  val conf = new SparkConf().setAppName("groupByKey").setMaster("local")
  val sc = new SparkContext(conf)
  val scoreList = Array(("class1", 80), ("class2", 70), ("class1", 90), ("class2", 75))
  val scores = sc.parallelize(scoreList, 1)
  // gather all scores that share the same class key
  val groupScores = scores.groupByKey()
  groupScores.foreach(score => {
    println(score._1 + ":")
    score._2.foreach(singleScore => println(singleScore))
    println("============")
  })
}
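groupByKey returns an RDD[(String, Iterable[Int])], one entry per class holding all of its scores. Because every value is shuffled across the network without any map-side combining, reduceByKey (next) or aggregateByKey is usually preferred when the end goal is an aggregate rather than the full group.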
5. reduceByKey: compute each class's total score
def reduceByKey(): Unit = {
  val conf = new SparkConf().setAppName("reduceByKey").setMaster("local")
  val sc = new SparkContext(conf)
  val scoreList = Array(("class1", 80), ("class2", 70), ("class1", 90), ("class2", 75))
  val scoreRDD = sc.parallelize(scoreList, 1)
  // sum the scores for each class key
  val scores = scoreRDD.reduceByKey((v1, v2) => v1 + v2)
  scores.foreach(scorePair => println(scorePair._1 + ":" + scorePair._2))
}
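The expected output is class1:170 and class2:145. Unlike groupByKey, reduceByKey merges values within each partition before the shuffle (a map-side combine), so far less data crosses the network.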
6. sortByKey: sort the student scores
def sortByKey(): Unit = {
  val conf = new SparkConf().setAppName("sortByKey").setMaster("local")
  val sc = new SparkContext(conf)
  val scoreList = Array((80, "zhangsan"), (70, "lisi"), (90, "wangwu"), (75, "tom"))
  val scoreRdd = sc.parallelize(scoreList, 1)
  // ascending = false sorts in descending order; the default is ascending (smallest first)
  val sortRdd = scoreRdd.sortByKey(ascending = false)
  sortRdd.foreach(score => println(score._1 + ":" + score._2))
}
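sortByKey orders a pair RDD by its key, which is why the score sits in the key position above. If the data were keyed the other way around, RDD.sortBy can order by an arbitrary field; a minimal sketch, assuming hypothetical (name, score) data:

val byName = sc.parallelize(Array(("zhangsan", 80), ("lisi", 70), ("wangwu", 90)))
val sorted = byName.sortBy(pair => pair._2, ascending = false) // sort by the score in the value position
sorted.foreach(pair => println(pair._1 + ":" + pair._2))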
7. join: print each student's score
def join(): Unit = {
  val conf = new SparkConf().setAppName("join").setMaster("local")
  val sc = new SparkContext(conf)
  val nameList = Array((1, "zhangsan"), (2, "lisi"), (3, "wangwu"), (4, "tom"))
  val scoreList = Array((1, 100), (2, 80), (3, 70))
  val nameRDD = sc.parallelize(nameList)
  val scoreRDD = sc.parallelize(scoreList)
  // inner join on the student id key: each result value is a (name, score) pair
  val nameScoreRDD = nameRDD.join(scoreRDD)
  nameScoreRDD.foreach(nameScore => println(nameScore._2._1 + ":" + nameScore._2._2))
}
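join is an inner join on the key, so only ids present in both RDDs survive: tom (id 4) has no score and is dropped. To keep unmatched records, Spark also offers leftOuterJoin, rightOuterJoin and fullOuterJoin; a minimal sketch with leftOuterJoin, where the score side becomes an Option:

val withAll = nameRDD.leftOuterJoin(scoreRDD)
withAll.foreach(pair => println(pair._2._1 + ":" + pair._2._2.getOrElse("no score")))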
8. cogroup: print each student's scores
def cogroup(): Unit = {
  val conf = new SparkConf().setAppName("cogroup").setMaster("local")
  val sc = new SparkContext(conf)
  val nameList = Array((1, "zhangsan"), (2, "lisi"), (3, "wangwu"), (4, "tom"))
  val scoreList = Array((1, 100), (2, 80), (3, 70), (1, 90), (2, 70), (3, 78))
  val nameRDD = sc.parallelize(nameList)
  val scoreRDD = sc.parallelize(scoreList)
  // group both RDDs by key; the result carries an Iterable from each side, in no particular order
  val nameScoreRDD = nameRDD.cogroup(scoreRDD)
  nameScoreRDD.foreach(nameScore => {
    println("id: " + nameScore._1)
    println("name: ")
    nameScore._2._1.foreach(item => println(item))
    println("score: ")
    nameScore._2._2.foreach(item => println(item))
  })
}
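Unlike join, cogroup keeps every key that appears in either RDD and gathers all matching values on each side, yielding an RDD[(Int, (Iterable[String], Iterable[Int]))] here. Each student therefore lists all of their scores at once, and tom (id 4) still shows up, with an empty score iterable.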