package com.lhj.www
import org.apache.spark.{SparkContext, SparkConf}
object Test {

  def main(args: Array[String]): Unit = {
    val sc = sparkContext("My App")
    sc.setLogLevel("WARN")
    // flatMapTransformation(sc)
    // mapTransformation(sc)
    // reduceTransformation(sc)
    // reduceByKeyTransformation(sc) // word count: values of identical keys are summed, reduceByKey(_+_)
    // filterTransformation(sc)
    // joinTransformation(sc)        // like an Oracle join; result: (key,(a.value,b.value))
    // cogroupTransformation(sc)     // result is (k1,(all v1s, all v2s)), e.g. (1,(CompactBuffer(a1),CompactBuffer(90, 88)))
    // countByKeyTransformation(sc)  // like Oracle's GROUP BY with COUNT: (key, number of occurrences)
    // groupByKeyTransformation(sc)  // like Oracle's GROUP BY: (key,CompactBuffer(a,b,c)) with the grouped values
    sc.stop()
  }
  def sparkContext(appname: String): SparkContext = {
    val conf = new SparkConf().setAppName(appname).setMaster("local")
    val sc = new SparkContext(conf)
    sc
  }
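  /**
   * flatMap splits each line on spaces and flattens the pieces into a single RDD.
   * Expected output: 1, a1, 2, a2, 3, a3 (one token per line).
   */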
  def flatMapTransformation(sc: SparkContext): Unit = {
    val data = Array("1 a1", "2 a2", "3 a3")
    val dataRdd = sc.parallelize(data)
    val flatMapped = dataRdd.flatMap(_.split(" "))
    flatMapped.collect.foreach(println)
  }
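  /**
   * map pairs every element with the count 1.
   * Expected output: (a,1), (b,1), (c,1) (one pair per line).
   */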
  def mapTransformation(sc: SparkContext): Unit = {
    val data = Array("a", "b", "c")
    val dataRdd = sc.parallelize(data)
    val mapped = dataRdd.map((_, 1))
    mapped.collect.foreach(println)
  }
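  /** reduce is an action: it sums 1 to 100 and returns the result (5050) to the driver. */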
  def reduceTransformation(sc: SparkContext): Unit = {
    val dataRdd = sc.parallelize(1 to 100)
    val reduced = dataRdd.reduce(_ + _)
    println(reduced)
  }
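  /** filter keeps only the elements matching the predicate; here, the numbers 50 to 100. */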
  def filterTransformation(sc: SparkContext): Unit = {
    val dataRDD = sc.parallelize(1 to 100)
    // val filtered = dataRDD.filter(x => x % 2 == 0)
    // val filtered = dataRDD.filter(x => x >= 50)
    val filtered = dataRDD.filter(_ >= 50)
    filtered.collect.foreach(println)
  }
  // Values of identical keys are combined, e.g. summed for word count:
  // hello: 4  -- "hello" occurred 4 times
  // spark: 2  -- "spark" occurred 2 times
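  // Assumed contents of aaa.txt (the file is not shown in the original post);
  // these two lines would produce the counts above:
  //   hello spark
  //   hello hello hello spark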
  def reduceByKeyTransformation(sc: SparkContext): Unit = {
    val data = sc.textFile("aaa.txt")
    val reduceByKeyed = data.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
    reduceByKeyed.collect.foreach(x => println(x._1 + ": " + x._2))
  }
  /**
   * Students a, b, and c each took several exams; groupByKey collects every
   * score for the same student:
   * (a,CompactBuffer(100, 95, 99))
   * (b,CompactBuffer(95, 88))
   * (c,CompactBuffer(97))
   */
  def groupByKeyTransformation(sc: SparkContext): Unit = {
    val nameAndScore = Array(("a", 100), ("b", 95), ("c", 97), ("a", 95), ("b", 88), ("a", 99))
    val dataRDD = sc.parallelize(nameAndScore)
    val groupByKeyed = dataRDD.groupByKey()
    groupByKeyed.collect.foreach(println)
  }
  /**
   * The name and score tables are joined on id; since join is an inner join,
   * id 4 (which has no score row) is dropped. Result set:
   * (1,(a1,90))
   * (3,(a3,92))
   * (2,(a2,95))
   * @param sc the SparkContext
   */
  def joinTransformation(sc: SparkContext): Unit = {
    val name = Array((1, "a1"), (2, "a2"), (3, "a3"), (4, "a4"))
    val score = Array((1, 90), (2, 95), (3, 92))
    val table1 = sc.parallelize(name)
    val table2 = sc.parallelize(score)
    val result = table1.join(table2)
    result.collect.foreach(println)
  }
  /**
   * name is the name table; score stores the score of each exam.
   * Unlike join, cogroup keeps every key from both sides, so id 4 appears
   * with an empty score buffer. Result:
   * (4,(CompactBuffer(a4),CompactBuffer()))
   * (1,(CompactBuffer(a1),CompactBuffer(90, 88)))
   * (3,(CompactBuffer(a3),CompactBuffer(92, 99)))
   * (2,(CompactBuffer(a2),CompactBuffer(95, 93)))
   */
  def cogroupTransformation(sc: SparkContext): Unit = {
    val name = Array((1, "a1"), (2, "a2"), (3, "a3"), (4, "a4"))
    val score = Array((1, 90), (2, 95), (3, 92), (1, 88), (2, 93), (3, 99))
    val table1 = sc.parallelize(name)
    val table2 = sc.parallelize(score)
    val result = table1.cogroup(table2)
    result.collect.foreach(println)
  }
  /**
   * countByKey is an action: it returns a Map of (key, number of occurrences)
   * to the driver, like Oracle's GROUP BY with COUNT. Each id occurs twice:
   * (1,2)
   * (3,2)
   * (2,2)
   */
  def countByKeyTransformation(sc: SparkContext): Unit = {
    val score = Array((1, 90), (2, 95), (3, 92), (1, 88), (2, 93), (3, 99))
    val dataRDD = sc.parallelize(score)
    val result = dataRDD.countByKey()
    result.foreach(println)
  }
}
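// A minimal way to try this locally (assuming the project is packaged with a
// Spark 2.x dependency; the jar path below is hypothetical):
//   spark-submit --class com.lhj.www.Test --master local target/test.jar
// Uncomment one of the transformation calls in main before packaging.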