package mby00
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
// Tumbling window
object TumblingWindowTest {
// main
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName("Tumbling Window Test")
val ssc = new StreamingContext(conf,Seconds(1))
ssc.sparkContext.setLogLevel("OFF")
val ds = ssc.socketTextStream("SparkOnStandalone",8888)
ds
.flatMap(_.split(" "))
.map((_,1))
.window(Seconds(4))
.print()
ssc.start()
ssc.awaitTermination()
}
}
// window(Seconds(5))
/**-------------------------------------------
Time: 1575351312000 ms
-------------------------------------------
(ddd,1)
-------------------------------------------
Time: 1575351313000 ms
-------------------------------------------
(ddd,1)
-------------------------------------------
Time: 1575351314000 ms
-------------------------------------------
(ddd,1)
-------------------------------------------
Time: 1575351315000 ms
-------------------------------------------
(ddd,1)
-------------------------------------------
Time: 1575351316000 ms
-------------------------------------------
(ddd,1)
*/
// window(Seconds(4))
/**
-------------------------------------------
Time: 1575351458000 ms
-------------------------------------------
(mmm,1)
-------------------------------------------
Time: 1575351459000 ms
-------------------------------------------
(mmm,1)
-------------------------------------------
Time: 1575351460000 ms
-------------------------------------------
(mmm,1)
-------------------------------------------
Time: 1575351461000 ms
-------------------------------------------
(mmm,1)
*/
package mby00
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
// Sliding window
// ok
object SlidingWindowTest {
// main
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName("Sliding Window Test")
val ssc = new StreamingContext(conf,Seconds(1))
ssc.sparkContext.setLogLevel("OFF")
val ds = ssc.socketTextStream("SparkOnStandalone",8888)
ds
.flatMap(_.split(" "))
.map((_,1))
.window(Seconds(5),Seconds(1))
.print()
ssc.start()
ssc.awaitTermination()
}
}
// .window(Seconds(5),Seconds(2))
/**Time: 1575352459000 ms
-------------------------------------------
(bbbb,1)
-------------------------------------------
Time: 1575352461000 ms
-------------------------------------------
(bbbb,1)*/
// .window(Seconds(5),Seconds(1))
/**
//Time: 1575352595000 ms
//-------------------------------------------
//(mby,1)
//
//-------------------------------------------
//Time: 1575352596000 ms
//-------------------------------------------
//(mby,1)
//
//-------------------------------------------
//Time: 1575352597000 ms
//-------------------------------------------
//(mby,1)
//
//-------------------------------------------
//Time: 1575352598000 ms
//-------------------------------------------
//(mby,1)
//
//-------------------------------------------
//Time: 1575352599000 ms
//-------------------------------------------
//(mby,1)
*/
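Note: window(Seconds(4)) in TumblingWindowTest defaults the slide interval to the batch interval (1 second), so that window actually slides every second rather than tumbling; for a true tumbling window the slide interval must equal the window length. A minimal sketch of a windowed word count using reduceByKeyAndWindow, assuming the same socket source as above (Seconds(4), Seconds(4) gives a tumbling window; a smaller slide such as Seconds(1) gives a sliding one):
```scala
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object WindowWordCountSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("Window Word Count Sketch")
    val ssc = new StreamingContext(conf, Seconds(1))
    ssc.sparkContext.setLogLevel("OFF")
    ssc.socketTextStream("SparkOnStandalone", 8888)
      .flatMap(_.split(" "))
      .map((_, 1))
      // window length 4s, slide 4s -> tumbling; pass e.g. Seconds(1) as the slide for a sliding window
      .reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(4), Seconds(4))
      .print()
    ssc.start()
    ssc.awaitTermination()
  }
}
```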
package mby00
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scala.collection.mutable.ListBuffer
// Hot-selling leaderboard example
object HotBoardTest {
// main
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName("Hot Board Test")
val ssc = new StreamingContext(conf,Seconds(1))
ssc.sparkContext.setLogLevel("OFF")
val ds = ssc.socketTextStream("SparkOnStandalone",8888)
// ok
/*ds
.map(line => {
val arr = line.split(" ")
val productName = arr(2)
val categoryId = arr(4)
((productName,categoryId),1)
})
.print()
/**((羽绒服,B),1)*/*/
/*// ok
ds
.map(line => {
val arr = line.split(" ")
val productName = arr(2)
val categoryId = arr(4)
((productName,categoryId),1)
})
.window(Seconds(5))
.print()
/** -------------------------------------------
((羽绒服,B),1)
-------------------------------------------
Time: 1575354081000 ms
-------------------------------------------
((羽绒服,B),1)
-------------------------------------------
Time: 1575354082000 ms
-------------------------------------------
((羽绒服,B),1)
-------------------------------------------
Time: 1575354083000 ms
-------------------------------------------
((羽绒服,B),1)
-------------------------------------------
Time: 1575354084000 ms
-------------------------------------------
((羽绒服,B),1)
-------------------------------------------
Time: 1575354085000 ms
---------------*/*/
ds
.map(line =>{
val arr = line.split(" ")
val productName = arr(2)
val categoryId = arr(4)
((productName,categoryId),1)
})
.window(Seconds(5))
.groupByKey()
.print()
/**
((羽绒服,B),ArrayBuffer(1))
-------------------------------------------
Time: 1575354384000 ms
-------------------------------------------
((羽绒服,B),ArrayBuffer(1))
-------------------------------------------
Time: 1575354385000 ms
-------------------------------------------
((羽绒服,B),ArrayBuffer(1))
-------------------------------------------
Time: 1575354386000 ms
-------------------------------------------
((羽绒服,B),ArrayBuffer(1))
-------------------------------------------
Time: 1575354387000 ms
-------------------------------------------
((羽绒服,B),ArrayBuffer(1))*/
/**-------------------------------------------
((iphone11,A),ArrayBuffer(1))
-------------------------------------------
Time: 1575354548000 ms
-------------------------------------------
((iphone11,A),ArrayBuffer(1))
-------------------------------------------
Time: 1575354549000 ms
-------------------------------------------
((iphone11,A),ArrayBuffer(1))
-------------------------------------------
Time: 1575354550000 ms
-------------------------------------------
((iphone11,A),ArrayBuffer(1))
-------------------------------------------
Time: 1575354551000 ms
-------------------------------------------
((iphone11,A),ArrayBuffer(1))*/
// Within one tumbling-window interval, enter the following lines in quick succession:
// 101 1 iphone11 4999.0 A
//101 1 iphone11 4999.0 A
//101 1 iphone11 4999.0 A
/**Time: 1575355044000 ms
-------------------------------------------
((iphone11,A),ArrayBuffer(1, 1, 1))
-------------------------------------------
Time: 1575355044000 ms
-------------------------------------------
((iphone11,A),ArrayBuffer(1, 1, 1))
-------------------------------------------
Time: 1575355045000 ms
-------------------------------------------
((iphone11,A),ArrayBuffer(1, 1, 1))
-------------------------------------------
Time: 1575355045000 ms
-------------------------------------------
((iphone11,A),ArrayBuffer(1, 1, 1))
-------------------------------------------
Time: 1575355046000 ms
-------------------------------------------
((iphone11,A),ArrayBuffer(1, 1, 1))*/
// ok
/*ds
.map(line =>{
val arr = line.split(" ")
val productName = arr(2)
val categoryId = arr(4)
((productName,categoryId),1)
})
.window(Seconds(5))
.groupByKey()
.map(t2 =>{
((t2._1._1,t2._1._2),t2._2.size)
})
.print()*/
/**
((iphone11,A),ArrayBuffer(1))
-------------------------------------------
Time: 1575355305000 ms
-------------------------------------------
((iphone11,A),1)
-------------------------------------------
Time: 1575355306000 ms
-------------------------------------------
((iphone11,A),ArrayBuffer(1, 1, 1))
-------------------------------------------
Time: 1575355306000 ms
-------------------------------------------
((iphone11,A),3)
-------------------------------------------
Time: 1575355307000 ms
-------------------------------------------
((iphone11,A),ArrayBuffer(1, 1, 1))
-------------------------------------------
Time: 1575355307000 ms
-------------------------------------------
((iphone11,A),3)
-------------------------------------------
Time: 1575355308000 ms
-------------------------------------------
((iphone11,A),ArrayBuffer(1, 1, 1))
-------------------------------------------
Time: 1575355308000 ms
-------------------------------------------
((iphone11,A),3)
-------------------------------------------
Time: 1575355309000 ms
-------------------------------------------
((iphone11,A),ArrayBuffer(1, 1, 1))
-------------------------------------------
Time: 1575355309000 ms
-------------------------------------------
((iphone11,A),3)
-------------------------------------------
Time: 1575355310000 ms
-------------------------------------------
((iphone11,A),ArrayBuffer(1, 1))
-------------------------------------------
Time: 1575355310000 ms
-------------------------------------------
((iphone11,A),2)*/
ds
.map(line => {
val arr = line.split(" ")
val productName = arr(2)
val categoryId = arr(4)
((productName, categoryId), 1)
})
.window(Seconds(60))
.groupByKey() //(iphone11,A) [1,1]
.map(t2 => ((t2._1._1, t2._1._2), t2._2.size)) //((iphone11, A), 2)
//((羽绒服,B),2)
//((mi9,A),1)
.transform(rdd => {
  val groupByRDD = rdd.groupBy(_._1._2) //(A,[((iphone11, A), 2),((mi9,A),1)])
  val list = ListBuffer[(String, (String, Int))]()
  // collect() brings the grouped data back to the driver; calling foreach directly on the
  // RDD would run on the executors and leave this driver-side ListBuffer empty
  groupByRDD.collect().foreach(t2 => {
    val categoryId = t2._1
    val products = t2._2
    var maxCount = 0
    var pName = "" // e.g. (A,(iphone11,2))
    products.foreach(n => {
      val productName = n._1._1
      val count = n._2
      if (count > maxCount) {
        maxCount = count
        pName = productName
      }
    })
    list += ((categoryId, (pName, maxCount)))
  })
  ssc.sparkContext.makeRDD(list)
})
.map(t2 => (t2._1, t2._2._1)) // DStream[(categoryId, productName)]
.print()
ssc.start()
ssc.awaitTermination()
}
}
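The driver-side ListBuffer in the transform above can also be avoided entirely by expressing the per-category maximum with RDD operations alone. A minimal sketch of that alternative, assuming the same element shape ((productName, categoryId), count) produced by the window + groupByKey + map steps in HotBoardTest (the object and method names here are made up for illustration):
```scala
import org.apache.spark.rdd.RDD

object HotBoardRddOnly {
  // Returns one (categoryId, (productName, maxCount)) record per category.
  def topProductPerCategory(rdd: RDD[((String, String), Int)]): RDD[(String, (String, Int))] = {
    rdd
      // re-key by category so the reduction happens per category
      .map { case ((productName, categoryId), count) => (categoryId, (productName, count)) }
      // keep, for each category, the product with the highest count
      .reduceByKey((a, b) => if (a._2 >= b._2) a else b)
  }
}
```
It could replace the transform body above, for example .transform(rdd => HotBoardRddOnly.topProductPerCategory(rdd)), and needs no collect back to the driver.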
package com.baizhi.mby00
import org.apache.spark.{SparkConf, SparkContext}
// Create an RDD from a collection --- approach 1 (parallelize)
// ok
object CreateRDDWithCollectionParallelizeTest {
// main
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName("create rdd with collection parallelize")
val sc = new SparkContext(conf)
val rdd = sc.parallelize(List("Hello Scala","Hello Spark","Hello Hello Hello"),2)
// ok
/* rdd
.flatMap(_.split(" "))
.foreach(println)*/
/** Hello
Scala
Hello
Spark
Hello
Hello
Hello*/
// ok
/*rdd
.flatMap(_.split(" "))
.foreach(print)*/
/** HelloHelloSparkScalaHelloHelloHello*/
// ok
/*rdd
.flatMap(_.split(" "))
.map((_,1))
.foreach(println)*/
/**(Hello,1)
(Hello,1)
(Scala,1)
(Spark,1)
(Hello,1)
(Hello,1)
(Hello,1)*/
// ok
/*rdd
.flatMap(_.split(" "))
.map((_,1))
.foreach(print)*/
/**(Hello,1)(Scala,1)(Hello,1)(Spark,1)(Hello,1)(Hello,1)(Hello,1)*/
// ok
/*rdd
.flatMap(_.split(" "))
.map((_,1))
.groupByKey()
.foreach(println)*/
/**(Spark,CompactBuffer(1))
(Hello,CompactBuffer(1, 1, 1, 1, 1))
(Scala,CompactBuffer(1))*/
// ok
/* rdd
.flatMap(_.split(" "))
.map((_,1))
.groupByKey()
.foreach(print)*/
/**(Spark,CompactBuffer(1))(Hello,CompactBuffer(1, 1, 1, 1, 1))(Scala,CompactBuffer(1))*/
// ok
/*rdd
.flatMap(_.split(" "))
.map((_,1))
.groupByKey()
.map(t2 => (t2._1,t2._2.size))
.foreach(println)*/
/**(Spark,1)
(Hello,5)
(Scala,1)*/
// ok
/*rdd
.flatMap(_.split(" "))
.map((_,1))
.groupByKey()
.map(t2 =>{
val word = t2._1
val count = t2._2.size
(word,word,count)
})
.foreach(println)*/
/**(Hello,Hello,5)
(Spark,Spark,1)
(Scala,Scala,1)*/
sc.stop()
}
}
package com.baizhi.mby00
import org.apache.spark.{SparkConf, SparkContext}
// Create an RDD from a collection --- approach 2 (makeRDD)
// ok
object CreateRDDWithCollectionMakeRDDTest {
// main
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName("create RDD with collection makeRDD test")
val ss = new SparkContext(conf)
val rdd = ss.makeRDD(List("Hello Spark","Hello Hadoop","Hello Hello Hello"))
rdd
.flatMap(_.split(" "))
.map((_,1))
.groupByKey()
.map(t =>{
val word = t._1
val count = t._2.size
(word,word,count,count)
})
.foreach(println)
/**(Spark,Spark,1,1)
(Hello,Hello,5,5)
(Hadoop,Hadoop,1,1)*/
ss.stop()
}
}
package mby00
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.ListBuffer
// map: apply a function to each element of the source RDD and return a new RDD
// ok
object MapTest {
// main
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName("map test")
val sc = new SparkContext(conf)
val sourceRDD = sc.makeRDD(ListBuffer("a","b","c"))
sourceRDD
.map(str =>(str,str))
.foreach(println)
/** (c,c)
(b,b)
(a,a)*/
sourceRDD
.map(str =>{
(str,1)
})
.foreach(println)
/**(a,1)
(c,1)
(b,1)*/
sc.stop()
}
}
First, understand the parameter of the split method:
split
public String[] split(String regex): splits this string around matches of the given regular expression.
Then be clear about what the regular expression means:
\\s matches whitespace characters such as spaces, carriage returns, and line feeds,
and + means "one or more", so split("\\s+") splits on any run of whitespace.
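A quick sketch of the difference in plain Scala (the sample string is made up for illustration; no Spark needed):
```scala
object SplitRegexDemo {
  def main(args: Array[String]): Unit = {
    val line = "Hello   Scala\tSpark" // several spaces and a tab between the words
    // split(" ") only splits on single spaces: it yields empty strings for the extra
    // spaces and does not split on the tab
    println(line.split(" ").toList)
    // split("\\s+") splits on any run of whitespace: List(Hello, Scala, Spark)
    println(line.split("\\s+").toList)
  }
}
```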
package mby00
import org.apache.spark.{SparkConf, SparkContext}
// flatMap: expand each element of the source RDD into 0..n elements and return a new RDD
// How to test
// Run main directly
// ok
object FlatMapTest {
// main
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName("flatMap test")
val sc = new SparkContext(conf)
val sourceRDD = sc.parallelize(Vector("Hello Scala","Hello Hello Hello"))
sourceRDD
.flatMap(line => line.split(" "))
.foreach(println)
/**Hello
Scala
Hello
Hello
Hello*/
println("LLLLLLLLLLL")
sourceRDD
.flatMap(_.split("\\s+"))
.foreach(println)
/**Hello
Scala
Hello
Hello
Hello*/
sourceRDD
.flatMap(line =>{
line.split("\\s+")
})
.foreach(println)
/**Hello
Scala
Hello
Hello
Hello*/
sc.stop()
}
}
package mby00
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.ListBuffer
// mapPartitions: apply a function to each partition of the source RDD and return a new RDD
// How to test
// Run main directly
// ok
object MapPartitionsTest {
// main
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName("mapPartitions test")
val sc = new SparkContext(conf)
val sourceRDD = sc.parallelize(Vector("Hello Scala","Hello Hello Hello"),2)
sourceRDD
.mapPartitions(iter => {
var lb = new ListBuffer[(String,Int)]()
while (iter.hasNext){ // iterator over the current partition's data
val line = iter.next()
lb.+=((line,1))
}
lb.iterator
})
.foreach(println)
/**(Hello Hello Hello,1)
(Hello Scala,1)*/
sc.stop()
}
}
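The ListBuffer above works, but the same per-partition transform can also be written by mapping the partition's iterator directly, which stays lazy and avoids buffering the whole partition. A minimal sketch with the same input data as MapPartitionsTest (the object name is made up):
```scala
import org.apache.spark.{SparkConf, SparkContext}

object MapPartitionsIteratorDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("mapPartitions iterator demo")
    val sc = new SparkContext(conf)
    sc.parallelize(Vector("Hello Scala", "Hello Hello Hello"), 2)
      // map the partition's iterator lazily instead of collecting it into a ListBuffer
      .mapPartitions(iter => iter.map(line => (line, 1)))
      .foreach(println)
    sc.stop()
  }
}
```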
package mby00
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.ListBuffer
// mapPartitionsWithIndex: apply a function to each partition of the source RDD together with its partition index, returning a new RDD
// How to test
// Run main directly
// ok
object MapPartitionsWithIndexTest {
// main
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName("mapPartitionsWithIndex test")
val sc = new SparkContext(conf)
val sourceRDD = sc.makeRDD(Vector("Hello Scala","Hello Hello Hello"))
println("1111111111111111111111111111111")
sourceRDD
.mapPartitionsWithIndex((index,iterator) =>{
val lb = new ListBuffer[(String,Int)]()
while (iterator.hasNext){
val line = iterator.next()
lb.+=((line,index)) // emit (line, partition index) for each element
}
lb.iterator
})
.foreach(println)
println("22222222222222222")
val sourceRDD1 = sc.parallelize(Vector("Hello Scala","Hello Hello Hello","Hadoop"),2)
sourceRDD1
.mapPartitionsWithIndex((index,it) =>{
val lb = new ListBuffer[(String,Int)]()
while (it.hasNext){
val line = it.next()
lb.+=((line,index))
}
lb.iterator
})
.foreach(println)
// numSlices: 2
/**(Hello Scala,0)
(Hello Hello Hello,1)
(Hadoop,1)*/
println("3333333333333333333333333333333333")
val sourceRDD2 = sc.parallelize(Vector("Hello Scala","Hello Hello Hello","Hadoop"),3)
sourceRDD2
.mapPartitionsWithIndex((index,it) =>{
val lb = new ListBuffer[(String,Int)]()
while (it.hasNext){
val line = it.next()
lb.+=((line,index))
}
lb.iterator
})
.foreach(println)
/**(Hello Hello Hello,1)
(Hadoop,2)
(Hello Scala,0)*/
println("444444444444444444444444444444")
val sourceRDD4 = sc.parallelize(Vector("Hello Scala","Hello Hello Hello","Hadoop"),5)
sourceRDD4
.mapPartitionsWithIndex((index,it) =>{
val lb = new ListBuffer[(String,Int)]()
while (it.hasNext){
val line = it.next()
lb.+=((line,index))
}
lb.iterator
})
.foreach(println)
/**(Hello Scala,1)
(Hello Hello Hello,3)
(Hadoop,4)*/
sc.stop()
}
}
package mby00
import org.apache.spark.{SparkConf, SparkContext}
// sample: take a random sample of the source RDD's elements and return a new RDD
// How to test
// Run main directly
// ok
object SampleTest {
// main
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName("sample test")
val sc = new SparkContext(conf)
val sourceRDD = sc.makeRDD(List(1,2,3,4,5,6,7))
println("111111111111111111111111")
// withReplacement -- whether an element may be sampled more than once
// fraction --- the probability of selecting each element (without replacement),
//              or the expected number of times each element is selected (with replacement)
// seed --- used internally to generate the random numbers
sourceRDD
.sample(false,0.5D) // without replacement: each element is selected with probability 0.5
.foreach(println)
/**2
3
6
7
5*/
println("2222222222222222222222222222")
sourceRDD
.sample(true,1.8) // with replacement: each element is expected to be selected about 1.8 times
.foreach(println)
/** 6
4
1
2
3
3
3
7
7*/
sc.stop()
}
}
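The seed mentioned above makes sampling reproducible: the same seed over the same RDD yields the same sample. A minimal sketch (the seed value 42L is arbitrary):
```scala
import org.apache.spark.{SparkConf, SparkContext}

object SampleSeedDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("sample seed demo")
    val sc = new SparkContext(conf)
    val sourceRDD = sc.makeRDD(List(1, 2, 3, 4, 5, 6, 7))
    // with an explicit seed, both calls select exactly the same elements
    println(sourceRDD.sample(false, 0.5, 42L).collect().toList)
    println(sourceRDD.sample(false, 0.5, 42L).collect().toList)
    sc.stop()
  }
}
```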
package mby00
import org.apache.spark.{SparkConf, SparkContext}
// union: merge the contents of the source RDD with another RDD and return a new RDD (duplicates are kept)
// How to test
// Run main directly
// ok
object UnionTest {
// main
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName(" union test")
val sc = new SparkContext(conf)
val sourceRRD = sc.makeRDD(List(1,2,3,4,5,6,7),1)
val rDD = sc.makeRDD(List(5,6,7,8,9))
println("111111111111111111111")
sourceRRD
.union(rDD)
.sortBy(e => e,true,1)
.foreach(println)
/**1
2
3
4
5
5
6
6
7
7
8
9*/
println("2222222222222222222222")
sourceRRD
.union(rDD)
.foreach(println)
/**1
2
3
4
5
6
7
5
7
6
8
9*/
sc.stop()
}
}
package mby00
import org.apache.spark.{SparkConf, SparkContext}
// intersection: take the intersection of the source RDD and another RDD and return a new RDD
// How to test
// Run main directly
// ok
object IntersectionTest {
// main
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName("intersection test")
val sc = new SparkContext(conf)
val sourceRDD = sc.makeRDD(List(1,2,3,4))
val rDD = sc.makeRDD(List(1,2))
println("111111111111111111111111111")
sourceRDD
.intersection(rDD)
.foreach(println)
/**2
1*/
sourceRDD
.intersection(rDD)
.sortBy(e => e,true,1)
.foreach(println)
/** 1
2*/
sc.stop()
}
}
package mby00
import org.apache.spark.{SparkConf, SparkContext}
// distinct: remove duplicate elements from the source RDD and return a new RDD
// How to test
// Run main directly
// ok
object DistinctTest {
// main
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName(" distinct test")
val sc = new SparkContext(conf)
val sourceRdd = sc.makeRDD(List(1,2,3,4,5,6,6,7,8),2)
println("11111111111111111111")
sourceRdd
.distinct()
.foreach(println)
/**1
3
7
5
4
6
8
2*/
println("22222222222222222222222222")
sourceRdd
.distinct(1)
.sortBy(e => e,true,1)
.foreach(println)
/**1
2
3
4
5
6
7
8*/
sc.stop()
}
}
package mby00
import org.apache.spark.{SparkConf, SparkContext}
// groupByKey: group the values of a (K,V) RDD by key, returning (K, Iterable<V>)
// How to test
// Run main directly
// ok
object GroupByKeyTest {
// main
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName("groupByKey test")
val sc = new SparkContext(conf)
println("111111111111111")
// Regroup the source RDD's (K,V) pairs, returning (K, Iterable<V>) for each distinct key
// If no partition count is given, the child RDD has as many partitions as the parent RDD
// If a partition count is given, the given value takes precedence
val sourceRDD = sc.makeRDD(List(("Hello",1),("Scala",1),("Hello",1),("Spark",1)),2)
sourceRDD
.groupByKey(4) // a wide-dependency (shuffle) operator
.foreach(println)
/**(Hello,CompactBuffer(1, 1))
(Spark,CompactBuffer(1))
(Scala,CompactBuffer(1))*/
sc.stop()
}
}
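A minimal sketch verifying the partition-count rules stated above, using getNumPartitions (same input data as GroupByKeyTest):
```scala
import org.apache.spark.{SparkConf, SparkContext}

object GroupByKeyPartitionsDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("groupByKey partitions demo")
    val sc = new SparkContext(conf)
    val sourceRDD = sc.makeRDD(List(("Hello", 1), ("Scala", 1), ("Hello", 1), ("Spark", 1)), 2)
    println(sourceRDD.groupByKey().getNumPartitions)  // 2 -- inherited from the parent RDD
    println(sourceRDD.groupByKey(4).getNumPartitions) // 4 -- the explicit count wins
    sc.stop()
  }
}
```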
package mby00
import org.apache.spark.{Partitioner, SparkConf, SparkContext}
// reduceByKey: merge the values of each key with the given function, returning a new (K,V) RDD
// How to test
// Run main directly
// ok
object ReduceByKeyTest {
// main
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName("reduceByKey test")
val sc = new SparkContext(conf)
val sourceRDD = sc.makeRDD(List(("Hello",1),("Scala",1),("Hello",1),("Spark",1)),5)
println("1111111111111111111111")
// Take the source (K,V) RDD and return a new (K,V) RDD
// in which each key's values are aggregated with the given function
// Approach 1
val rdd = sourceRDD
.reduceByKey(
new Partitioner { // custom partitioning rule
override def numPartitions: Int = 3
override def getPartition(key: Any): Int = {
val k = key.asInstanceOf[String]
if(k.startsWith("H")) 0
else if(k.startsWith("S")) 1
else 2
}
},
(v1,v2) => v1 + v2
)
// .foreach(println)
/**(Spark,1)
(Scala,1)
(Hello,2)*/
println(rdd.getNumPartitions) // 3 --- get the number of partitions
println("2222222222222222222")
// Approach 2
sourceRDD
.reduceByKey(_+_,2)
.foreach(println)
/**(Spark,1)
(Hello,2)
(Scala,1)*/
println("33333333333333333333333333")
// Approach 3
sourceRDD
.reduceByKey(_+_)
.foreach(println)
/**(Spark,1)
(Hello,2)
(Scala,1)*/
println("44444444444444444")
val rdd1 = sourceRDD
.reduceByKey(_+_)
println(rdd1.getNumPartitions) // 5
sc.stop()
}
}
package mby00
import org.apache.spark.{SparkConf, SparkContext}
// aggregateByKey: aggregate values by key --- first within each partition, then across partitions
// How to test
// Run main directly
// Note: the zero value (1 below) is applied once per key per partition, so how the pairs
// are split across partitions changes the result (compare the two runs below)
object AggregateByKeyTest {
// main
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName("aggregate by key test")
val sc = new SparkContext(conf)
val sourceRDD = sc.makeRDD(List(("Hello",1),("Hello",1),("Spark",1),("Scala",1)),2)
sourceRDD
.aggregateByKey(1)(
(acc, value) => acc + value, // aggregation within a partition (acc starts at the zero value 1)
(p1,p2) => p1 + p2 // aggregation across partitions
)
.foreach(println)
/**(Spark,2)
(Hello,3)
(Scala,2)*/
val sourceRDD13 = sc.makeRDD(List(("Hello", 1), ("Spark", 1), ("Hello", 1), ("Scala", 1)), 2)
sourceRDD13
.aggregateByKey(1)(
(acc, value) => acc + value, // aggregation within a partition
(p1,p2) => p1 + p2 // aggregation across partitions
)
.foreach(println)
/**(Spark,2)
(Hello,4)
(Scala,2)*/
sc.stop()
}
}
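To see why the two runs above print (Hello,3) and (Hello,4), one can print which partition each pair lands in: the zero value is added once per key per partition. A minimal sketch using mapPartitionsWithIndex with the same two input lists:
```scala
import org.apache.spark.{SparkConf, SparkContext}

object AggregateByKeyPartitionsDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("aggregateByKey partitions demo")
    val sc = new SparkContext(conf)
    // first list: both ("Hello",1) land in partition 0 -> 1 (zero) + 1 + 1 = 3
    sc.makeRDD(List(("Hello", 1), ("Hello", 1), ("Spark", 1), ("Scala", 1)), 2)
      .mapPartitionsWithIndex((index, it) => it.map(kv => (index, kv)))
      .foreach(println)
    // second list: the two ("Hello",1) land in different partitions -> (1 + 1) + (1 + 1) = 4
    sc.makeRDD(List(("Hello", 1), ("Spark", 1), ("Hello", 1), ("Scala", 1)), 2)
      .mapPartitionsWithIndex((index, it) => it.map(kv => (index, kv)))
      .foreach(println)
    sc.stop()
  }
}
```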
package mby00
import org.apache.spark.{SparkConf, SparkContext}
// sortByKey: called on a (K,V) RDD, sorts by key and returns a new RDD
// How to test
// Run main directly
// ok
object SortByKeyTest {
// main
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName("sortByKey test")
val sc = new SparkContext(conf)
val sourceRDD = sc.parallelize(List(("b",1),("a",1),("c",1),("a",1)),2)
println("1111111111111111111111111111")
sourceRDD
.sortByKey(false,1) // sort order (ascending = false) and the number of partitions after sorting
.foreach(println)
/**(c,1)
(b,1)
(a,1)
(a,1)*/
println("2222222222222222222222222222")
sourceRDD
.sortByKey(true,1)
.foreach(println)
/**(a,1)
(a,1)
(b,1)
(c,1)*/
println("33333333333333333333333")
sourceRDD
.sortBy(t2 => t2._1,true,1)
.foreach(println)
/**(a,1)
(a,1)
(b,1)
(c,1)*/
println("4444444444444444444")
sourceRDD
.sortBy(t2 => t2._1,false,1)
.foreach(println)
/**(c,1)
(b,1)
(a,1)
(a,1)*/
sc.stop()
}
}
package mby00
import org.apache.spark.{SparkConf, SparkContext}
// join: join two RDDs (K,V) and (K,W) by key, returning (K,(V,W))
// How to test
// Run main directly
// ok
object JoinTest {
// main
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName("join test")
val sc = new SparkContext(conf)
val sourceRDD1 = sc.parallelize(List((1,"zs"),(2,"ls"),(3,"ww"),(4,"zl")),2)
val sourceRDD2 = sc.parallelize(List((1,10),(2,20),(3,30),(5,50)),2)
println("111111111111111111111111")
sourceRDD1
.join(sourceRDD2,2) // equivalent to an inner join
.foreach(t2 =>println(t2._1 +"\t" + t2._2._1 + "\t" + t2._2._2) )
/**2 ls 20
1 zs 10
3 ww 30*/
println("22222222222222222222")
sourceRDD1
.join(sourceRDD2,2) // equivalent to an inner join
.foreach(println)
/**(1,(zs,10))
(3,(ww,30))
(2,(ls,20))*/
println("3333333333333333333333")
sourceRDD1
.leftOuterJoin(sourceRDD2,2) // left outer join
.foreach(println)
/**(4,(zl,None))
(2,(ls,Some(20)))
(1,(zs,Some(10)))
(3,(ww,Some(30)))*/
println("4444444444444444444444444444")
sourceRDD1
.leftOuterJoin(sourceRDD2,2)
.foreach(t2 =>println(t2._1 + "\t" + t2._2._1 + "\t" + t2._2._2))
/**4 zl None
2 ls Some(20)
1 zs Some(10)
3 ww Some(30)*/
println("55555555555555555555555555555555555")
sourceRDD1
.rightOuterJoin(sourceRDD2,2) // equivalent to a right outer join
.foreach(println)
/**(2,(Some(ls),20))
(1,(Some(zs),10))
(3,(Some(ww),30))
(5,(None,50))*/
println("666666666666666666666666666666")
sourceRDD1
.rightOuterJoin(sourceRDD2,2)
.foreach(t2 => println(t2._1 + "\t" + t2._2._1 + "\t" + t2._2._2))
/**1 Some(zs) 10
3 Some(ww) 30
5 None 50
2 Some(ls) 20*/
sc.stop()
}
}
package mby00
import org.apache.spark.{SparkConf, SparkContext}
// cogroup(otherDataset, [numPartitions]):
// co-group two RDDs (K,V) and (K,W) by key,
// returning a new RDD of (K, (Iterable<V>, Iterable<W>))
// How to test
// Run main directly
// ok
object CogroupTest {
// main
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName(" cogroup test")
val sc = new SparkContext(conf)
val sourceRDD1 = sc.parallelize(List(("b",1),("a",1),("c",1),("a",1)),2)
val sourceRDD2 = sc.parallelize(List(("e",1),("a",1),("c",1),("a",1)),2)
println("11111111111111111111111111111111111111")
sourceRDD1
.cogroup(sourceRDD2)
.foreach(println)
/**(e,(CompactBuffer(),CompactBuffer(1)))
(b,(CompactBuffer(1),CompactBuffer()))
(a,(CompactBuffer(1, 1),CompactBuffer(1, 1)))
(c,(CompactBuffer(1),CompactBuffer(1)))*/
println("2222222222222222222222222222222222222222222")
sourceRDD1
.cogroup(sourceRDD2)
.foreach(t2 =>println(t2._1 +"\t"+ t2._2._1 + "\t" + t2._2._2))
/**b CompactBuffer(1) CompactBuffer()
e CompactBuffer() CompactBuffer(1)
a CompactBuffer(1, 1) CompactBuffer(1, 1)
c CompactBuffer(1) CompactBuffer(1)*/
sc.stop()
}
}
package mby00
import org.apache.spark.{SparkConf, SparkContext}
// repartition: redistribute the source RDD into the given number of partitions (always a full shuffle)
// How to test
// Run main directly
// ok
object RePartitionTest {
// main
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName("rePartition test")
val sc = new SparkContext(conf)
val sourceRDD = sc.makeRDD(List("a","b","c","d"),4)
val rDD = sourceRDD
.repartition(5) // parent RDD: 4 partitions ---> child RDD: 5 partitions
println(rDD.getNumPartitions) // 5
rDD.foreach(println)
/** a
b
c
d*/
sc.stop()
}
}
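repartition always performs a full shuffle; when only reducing the number of partitions, coalesce can avoid the shuffle. A minimal sketch of the contrast (same input data as RePartitionTest):
```scala
import org.apache.spark.{SparkConf, SparkContext}

object CoalesceVsRepartitionDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("coalesce vs repartition demo")
    val sc = new SparkContext(conf)
    val sourceRDD = sc.makeRDD(List("a", "b", "c", "d"), 4)
    // coalesce(2) merges existing partitions without a shuffle
    println(sourceRDD.coalesce(2).getNumPartitions)    // 2
    // repartition(2) is coalesce(2, shuffle = true): same count, but with a full shuffle
    println(sourceRDD.repartition(2).getNumPartitions) // 2
    sc.stop()
  }
}
```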
package mby00
import org.apache.spark.{Partitioner, SparkConf, SparkContext}
// repartitionAndSortWithinPartitions: repartition the RDD and sort the data within each partition
// How to test
// Run main directly
// ok
object RepartitionAndSortWithinPartitions {
// main
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName(" repartitionAndSortWithinPartitions")
val sc = new SparkContext(conf)
val sourceRDD = sc.parallelize(List(("b", 1), ("e", 2), ("a", 1), ("c", 1), ("a", 3), ("a", 2)), 2)
sourceRDD
.repartitionAndSortWithinPartitions(new Partitioner {
override def numPartitions: Int = 4
override def getPartition(key: Any): Int = {
val k = key.asInstanceOf[String]
if (k.startsWith("a") || k.startsWith("e")) 0
else if (k.startsWith("b")) 1
else if (k.startsWith("c")) 2
else 3
}
})
.foreach(println)
/**(c,1)
(a,1)
(b,1)
(a,3)
(a,2)
(e,2)*/
sc.stop()
}
}