An important feature of Spark is that transformations are lazy: they only build up the derived RDD lineage on the driver, and the computation is only submitted to the worker nodes when an action is executed.
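A minimal sketch of this laziness (assuming an existing SparkContext named sc; the variable names are illustrative):

val numbers = sc.parallelize(Array(1, 2, 3, 4, 5))
// map is a transformation: it only records the lineage on the driver, no job runs yet
val doubled = numbers.map(_ * 2)
// count is an action: only here does Spark actually submit a job to the workers
println(doubled.count())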
This post walks through several common transformation operators: map, filter, flatMap, reduceByKey, sortByKey, groupByKey, and join. (The Scala listing below does not exercise reduceByKey or sortByKey; a sketch of both follows it.)
Scala version
package com.chen.spark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object transformation {
  def main(args: Array[String]) {
    // map()
    // filter()
    // flatmap()
    // groupbykey()
    join()
  }
  def map() {
    /**
     * Multiply each element of the array by 2.
     */
    val conf = new SparkConf()
      .setAppName("transformation_map")
      .setMaster("local")
    val sc = new SparkContext(conf)
    val numArray = Array(1, 2, 3, 4, 5)
    val arrayRDD = sc.parallelize(numArray, 1)
    val multiplearrayRDD = arrayRDD.map(num => num * 2)
    multiplearrayRDD.foreach(num => println(num))
    // In Scala, always stop the SparkContext explicitly, or errors are thrown on exit
    sc.stop()
  }
  def filter() {
    /**
     * Keep only the even numbers.
     */
    val conf = new SparkConf()
      .setAppName("transformation_filter")
      .setMaster("local")
    val sc = new SparkContext(conf)
    val numArray = Array(1, 2, 3, 4, 5)
    val arrayRDD = sc.parallelize(numArray, 1)
    val evenRDD = arrayRDD.filter(num => num % 2 == 0)
    evenRDD.foreach(num => println(num))
    // In Scala, always stop the SparkContext explicitly, or errors are thrown on exit
    sc.stop()
  }
  def flatmap() {
    /**
     * Split each string into words and flatten the results into a single RDD.
     */
    val conf = new SparkConf()
      .setAppName("transformation_flatmap")
      .setMaster("local")
    val sc = new SparkContext(conf)
    val stringlist = Array("hello you", "hello world", "hello")
    val stringRDD = sc.parallelize(stringlist)
    val flat_string = stringRDD.flatMap(x => x.split(" "))
    flat_string.foreach(word => println(word))
    sc.stop()
  }
  def groupbykey() {
    val conf = new SparkConf()
      .setAppName("transformation_group")
      .setMaster("local")
    val sc = new SparkContext(conf)
    val scorelist = Array(("class1", 20), ("class2", 50), ("class1", 30))
    val scoreRDD = sc.parallelize(scorelist, 1)
    // groupByKey yields (key, Iterable[value]) pairs
    val scoreRDD_group = scoreRDD.groupByKey()
    scoreRDD_group.foreach { score =>
      println(score._1)
      score._2.foreach(singlescore => println(singlescore))
    }
    sc.stop()
  }
  def join() {
    val conf = new SparkConf()
      .setAppName("transformation_join")
      .setMaster("local")
    val sc = new SparkContext(conf)
    val scorelist = Array(("class1", 20), ("class2", 50), ("class1", 30))
    val namelist = Array(("class1", "grade1"), ("class2", "grade2"))
    val scoreRDD = sc.parallelize(scorelist)
    val nameRDD = sc.parallelize(namelist)
    // join pairs up values sharing a key: (key, (score, grade))
    val joinRDD = scoreRDD.join(nameRDD)
    joinRDD.foreach { data =>
      println("class: " + data._1)
      println("score: " + data._2._1)
      println("grade: " + data._2._2)
      println("#######################")
    }
    sc.stop()
  }
}
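As noted above, the Scala listing does not exercise reduceByKey or sortByKey. Here is a minimal sketch of both following the same pattern; the object name ReduceAndSort and the app name are placeholders of ours, not part of the original program:

package com.chen.spark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object ReduceAndSort {
  def main(args: Array[String]) {
    val conf = new SparkConf()
      .setAppName("transformation_reduce_sort")
      .setMaster("local")
    val sc = new SparkContext(conf)
    val scorelist = Array(("class1", 20), ("class2", 50), ("class1", 30))
    val scoreRDD = sc.parallelize(scorelist, 1)
    // reduceByKey merges all values per key with the given function: ("class1", 50), ("class2", 50)
    val summed = scoreRDD.reduceByKey(_ + _)
    // sortByKey orders the pairs by key; pass false for descending order
    val sorted = summed.sortByKey()
    sorted.foreach(pair => println(pair._1 + ": " + pair._2))
    sc.stop()
  }
}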
Python version
# -*- coding:UTF-8 -*-
from pyspark import SparkConf
from pyspark import SparkContext

def CreateSparkContext():
    sparkConf = SparkConf().setAppName("transformation_py").set("spark.ui.showConsoleProgress", "false")
    sc = SparkContext(conf=sparkConf)  # unlike Scala, conf must be passed as a keyword argument here
    print("master = " + sc.master)
    return sc
def map(sc):
    # multiply each element of the list by 2
    number = [1, 2, 3, 4, 5]
    numRDD = sc.parallelize(number)
    multiple_num = numRDD.map(lambda num: num * 2).collect()
    print(multiple_num)
def filter(sc):
    # keep only the even numbers
    number = [1, 2, 3, 4, 5]
    numRDD = sc.parallelize(number)
    even_num = numRDD.filter(lambda num: num % 2 == 0).collect()
    print(even_num)
def flatmap(sc):
    # split each string into words and flatten them into one list
    stringlist = ["hello you", "hello world", "hello"]
    stringRDD = sc.parallelize(stringlist)
    flat_string = stringRDD.flatMap(lambda x: x.split()).collect()
    print(flat_string)
def groupbykey(sc):
    scoredataRDD = sc.parallelize([("class1", 30), ("class2", 50), ("class1", 60)])
    scoredata = scoredataRDD.groupByKey().collect()
    # note: each value is an iterable, not a list; wrap it in sorted()/list() to print the contents
    print(scoredata[0][0], sorted(scoredata[0][1]))
    print(scoredata[1][0], list(scoredata[1][1]))
def reducebykey(sc):
    scoredataRDD = sc.parallelize([("class1", 30), ("class2", 50), ("class1", 60)])
    scoredata = scoredataRDD.reduceByKey(lambda x, y: x + y).collect()
    print(scoredata[0][0], scoredata[0][1])
    print(scoredata[1][0], scoredata[1][1])
if __name__ == "__main__":
    sc = CreateSparkContext()
    # map(sc)
    # filter(sc)
    # flatmap(sc)
    # groupbykey(sc)
    reducebykey(sc)
    sc.stop()