1.SparkTA:
import org.apache.spark.{SparkConf, SparkContext}
object SparkTA {
def main(args: Array[String]): Unit = {
val conf=new SparkConf().setAppName("SparkTA").setMaster("local[2]")
val sc=new SparkContext(conf)
val rdd1=sc.parallelize(List(7,8,9,4,12,11,3))
//sort lexicographically: the key x+" " is a String, so the values are compared as strings
val rdd3=rdd1.map(_*2).sortBy(x=>x+" ",true)
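//e.g. the doubled values 14,16,18,8,24,22,6 come back as 14,16,18,22,24,6,8 because "22" < "6" when compared as strings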
// println(rdd3.collect().toBuffer)
//split each element on "," and flatten (note: "uu oo" is space-separated, so it stays as one element)
val rdd4=sc.parallelize(Array("jj,ll,kk","uu oo","pp"))
val rdd5=rdd4.flatMap(_.split(",")).collect()
// println(rdd5.toBuffer)
//split out every word from the nested lists
val rdd6=sc.parallelize(List(List("a b c","a e f"),List("e f g","g j"),List("u o")))
val rdd7=rdd6.flatMap(_.flatMap(_.split(" "))).collect()
println(rdd7.toBuffer)
// union the two RDDs, deduplicate, and sort in descending order
val rdd8=sc.parallelize(List(1,2,3))
val rdd9=sc.parallelize(List(3,4,5))
val rdd10 =rdd8.union(rdd9).distinct().sortBy(x=>x,false).collect()
// println(rdd10.toBuffer)
//intersection
val rdd11=rdd8.intersection(rdd9).collect()
// println(rdd11.toBuffer)
//intersection of tuples: only tuples that match exactly are kept
val rdd12=sc.parallelize(List(("chang",1),("ma",2),("shi",3)))
val rdd13=sc.parallelize(List(("jj",1),("tom",2),("shi",3),("chang",2),("chang",3)))
val rdd14=rdd12.intersection(rdd13).collect()
// println(rdd14.toBuffer)
//join
val rdd15=rdd12.join(rdd13).collect()
println(rdd15.toBuffer)
//leftOuterJoin keeps every key from the left side
val rdd16=rdd12.leftOuterJoin(rdd13).collect()
println(rdd16.toBuffer)
//rightOuterJoin keeps every key from the right side
val rdd17=rdd12.rightOuterJoin(rdd13).collect()
println(rdd17.toBuffer)
//groupByKey
val rdd18=rdd12.union(rdd13)
//compute its word count (sum the values for each key)
// val rdd19=rdd18.groupByKey().map(t=>(t._1,t._2.sum)).collect()
val rdd19=rdd18.groupByKey().mapValues(_.sum).collect()
//reduceByKey(_+_) is more efficient because it combines values locally before the shuffle
val rdd20=rdd18.reduceByKey(_+_).collect()
println(rdd19.toBuffer)
println(rdd20.toBuffer)
//cogroup
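//cogroup returns (key, (Iterable of values from rdd12, Iterable of values from rdd13)) for every key that appears in either RDD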
val rdd21=rdd12.cogroup(rdd13)
val res=rdd21.map(t=>(t._1,t._2._1.sum+t._2._2.sum)).collect()
println(res.toBuffer)
val res1=rdd21.mapValues(t=>(t._1.sum+t._2.sum)).collect()
println(res1.toBuffer)
//Cartesian product
val rdd22=sc.parallelize(List("jay","jj"))
val rdd23=sc.parallelize(List("Ma","chang","tom"))
val rdd24=rdd22.cartesian(rdd23).collect()
println(rdd24.toBuffer)
println("=======================================================================================================")
//Action operators
val rdd25=sc.parallelize(List(1,6,5,4,3,2),2)
//collect
val result=rdd25.collect()
println(result.toBuffer)
//reduce
val result1=rdd25.reduce(_+_)
//count
val result2=rdd25.count()
println(result2)
//top: sort, then take the n largest elements
val result3=rdd25.top(2)
println(result3.toBuffer)
//take: take the first two elements
val result4=rdd25.take(2)
println(result4.toBuffer)
//first =take(1)
val result5=rdd25.first()
println(result5)
//takeOrdered: sort in ascending order and take the first three
val result6=rdd25.takeOrdered(3)
println(result6.toBuffer)
}
}
2.Monte Carlo Method:
import org.apache.spark.{SparkConf, SparkContext}
object MySparkPi {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("MySpqrkPi").setMaster("local[2]")
val sc = new SparkContext(conf)
val s = 100
val n = 1000 * s
val count = sc.parallelize(1 to n,s).map({i=>
//random is a def, so every reference returns a fresh value in [0, 1)
def random:Double = java.lang.Math.random()
println(random)
val x = random * 2 - 1
val y = random * 2 - 1
println(x+":"+y)
if(x * x + y * y<1) 1 else 0
}).reduce(_ + _)
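//each point (x, y) is uniform in the square [-1,1] x [-1,1]; the chance it lands inside the unit circle
//is (circle area)/(square area) = Pi/4, so Pi is approximately 4.0*count/n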
println(4.0*count/n)
}
}
3.PageRank:
import org.apache.spark.{SparkConf, SparkContext}
object SparkPageRank {
def main(args: Array[String]): Unit = {
val conf=new SparkConf().setAppName("SparkPageRank").setMaster("local[2]")
val sc=new SparkContext(conf)
val iters=30
val lines=sc.textFile("c://pagerank.txt")
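//assumed input format: one space-separated "source target" edge per line, e.g.
//1 2
//1 3
//2 1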
//build the adjacency list, e.g. (1,(2,3,4,5)) (2,(1,5))
val links=lines.map{S=>
val parts=S.split(" ")
(parts(0),parts(1))
}.distinct().groupByKey().cache()
println(links.collect().toBuffer)
var ranks=links.mapValues(v=>1.0)//(2,1.0), (3,1.0), (1,1.0)
println(ranks.collect().toBuffer)
for (i <- 1 to iters){
//after the join: (1,((2,3,4,5),1.0)) (2,((1,5),1.0))
val contribs=links.join(ranks).values.flatMap{ case(urls,ranks)=>
val size =urls.size
urls.map(t=>(t,ranks/size))
}
ranks=contribs.reduceByKey(_+_).mapValues(0.15+0.85*_)
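//standard PageRank update with damping factor 0.85: newRank = 0.15 + 0.85 * (sum of received contributions)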
}
val output=ranks.collect()
output.foreach(tup=>println(tup._1 + " rank: "+tup._2))
sc.stop()
}
}
4.Advanced Operators:
import org.apache.spark.{SparkConf, SparkContext}
object SuperFunc {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("SparkTA").setMaster("local[2]")
val sc = new SparkContext(conf)
val rdd=sc.parallelize(List(1,2,3,4,5,6,7,8,9),2)
println(rdd.partitions.length)
//pull out each partition's elements together with its partition index
val func=(index:Int,iter:Iterator[(Int)])=>{
iter.toList.map(x=>"[partionID: "+index+" value: "+x+"]").iterator
}
val res=rdd.mapPartitionsWithIndex(func).collect()
println(res.toBuffer)
//aggregate
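//aggregate(zeroValue)(seqOp,combOp): seqOp folds the elements of each partition starting from zeroValue,
//and combOp then merges the per-partition results (zeroValue is used again as the initial value for combOp)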
val rdd1=sc.parallelize(Array(1,2,3,4,5,6,7,8),2)
val res2=rdd1.mapPartitionsWithIndex(func).collect()
val res1=rdd1.aggregate(0)(_+_,_+_)
println(res1)
println(res2.toBuffer)
//take the max of each partition, then add the per-partition maxima
val res3=rdd1.aggregate(0)(math.max(_,_),_+_)
println(res3)
//take the min of each partition and add them up (the zeroValue 10000 also takes part in the final combine)
val res4=rdd1.aggregate(10000)(math.min(_,_),_+_)
println(res4)
val rdd2=sc.parallelize(List("a","b","c","d","e"),2)
val res5=rdd2.aggregate("@")(_+_,_+_)
println(res5)
//aggregateByKey: for each key, first do a local per-partition operation, then a global merge
val rdd3=sc.parallelize(List(("a",1),("a",10),("b",2),("c",5),("d",7),("a",9),("c",7),("e",8)),2)
val res6=rdd3.aggregateByKey(0)(_+_,_+_).collect()
println(res6.toBuffer)
//another option: take the largest value for each key in each partition, then add them up
//combineByKey: x=>x is applied to the first value of each key in each partition
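//combineByKey(createCombiner,mergeValue,mergeCombiners):
//  createCombiner runs on the first value of each key within a partition,
//  mergeValue folds the remaining values of that key within the partition,
//  mergeCombiners merges the per-partition combiners across partitions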
val rdd4=sc.textFile("c://student.txt").flatMap(_.split(" ")).map((_,1)).combineByKey(x=>x+10,(a:Int,b:Int)=>a+b,(m:Int,n:Int)=>m+n).collect()
val rdd5=sc.textFile("c://student.txt").flatMap(_.split(" ")).map((_,1)).combineByKey(x=>x,(a:Int,b:Int)=>a+b,(m:Int,n:Int)=>m+n).collect()
println(rdd4.toBuffer)
println(rdd5.toBuffer)
// repartition always triggers a shuffle
val rdd6=sc.parallelize(List(1,2,3,4,5,6,7,8,9),2)
val res7=rdd6.mapPartitionsWithIndex(func).collect()
println(res7.toBuffer)
val rdd7=rdd6.repartition(3)
println(rdd7.partitions.length)
println(rdd7.mapPartitionsWithIndex(func).collect().toBuffer)
// coalesce (with shuffle=true it can increase the number of partitions, here from 2 to 4)
val rdd8=rdd6.coalesce(4,true)
println(rdd8.partitions.length)
// countByKey: the value is the number of records that have each key
val rdd9=sc.parallelize(List(("a",1),("b",2),("c",2),("c",1),("d",3),("f",4)))
println(rdd9.count())
println(rdd9.countByKey())
println(rdd9.countByValue())
// collectAsMap collects the result into a Map on the driver
val rdd10=sc.parallelize(List(("a",1),("b",2)))
println(rdd10.collect().toBuffer)
println(rdd10.collectAsMap())
// filterByRange: filter by key range
val res8=rdd9.filterByRange("a","c").collect()
println(res8.toBuffer)
// flatMapValues
val rdd11=sc.parallelize(List(("tom","5,6"),("jerry","4,7")))
val res9=rdd11.flatMapValues(_.split(",")).reduceByKey(_+_).collect()
println(res9.toBuffer)
// foldByKey
val rdd12=sc.parallelize(List("ma","shi","chang","cheng","jay"))
val rdd13=rdd12.map(x=>(x.length,x))
val res10=rdd13.foldByKey("")(_+_).collect()
println(res10.toBuffer)
// foreachPartition
val rdd14=sc.parallelize(List(1,2,3,4,5,6,7,8,9),4)
rdd14.foreachPartition(x=>println(x.reduce(_+_)))
val rdd15=sc.parallelize(List(1,2,3,4,5,6,7,8,9),3)
// exercise: print out the data in each partition's iterator (a sketch follows below)
// rdd15.foreachPartition()
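//one possible sketch for the exercise above, assuming the goal is simply to print every element of each partition:
rdd15.foreachPartition(it=>it.foreach(println))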
//keyBy
val rdd16=rdd12.keyBy(_.length)
println(rdd16.collect().toBuffer)
//keys,values
val rdd17=rdd16.keys.collect()
println(rdd17.toBuffer)
val rdd18=rdd16.values.collect()
println(rdd18.toBuffer)
}
}
5.IpLocation (finding the provinces with the most clicks):
import org.apache.spark.{SparkConf, SparkContext}
object IpLocation {
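//ip2Long packs the four dotted-decimal octets of an IPv4 address into a single Long:
//the running value is shifted left 8 bits before OR-ing in the next octet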
def ip2Long(ip: String): Long = {
val fragments = ip.split("[.]")
var ipNum = 0L
for (i <- 0 until fragments.length){
ipNum = fragments(i).toLong | ipNum << 8L
}
ipNum
}
def binarySearch(lines: Array[(String, String, String)], ip: Long) : Int = {
var low = 0
var high = lines.length - 1
while (low <= high) {
val middle = (low + high) / 2
if ((ip >= lines(middle)._1.toLong) && (ip <= lines(middle)._2.toLong))
return middle
if (ip < lines(middle)._1.toLong)
high = middle - 1
else {
low = middle + 1
}
}
-1
}
def main(args: Array[String]): Unit = {
val conf=new SparkConf().setAppName("IpLocation").setMaster("local[2]")
val sc=new SparkContext(conf)
val ipRulesRDD=sc.textFile("c://ip.txt").map(lines=>{
val fields=lines.split("\\|")// |是正则的关键字需要进行转义
val start_ip=fields(2)
val end_ip=fields(3)
val province=fields(6)
(start_ip,end_ip,province)
})
// println(IpRulesRDD.collect().toBuffer)
val ipRules=ipRulesRDD.collect()//load all of the rules onto the driver
val ipRulesBroadcast=sc.broadcast(ipRules)
val ipRdd=sc.textFile("c://20090121000132.394251.http.format").map(lines=>{
val fields=lines.split("\\|")
fields(1)
})
val result=ipRdd.map(ip=>{
val ipNum=ip2Long(ip)
val index=binarySearch(ipRules,ipNum)
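//note: this assumes every ip matches some rule; binarySearch returns -1 for unmatched ips, which would make the lookup below fail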
val info=ipRulesBroadcast.value(index)
info
}).map(t=>(t._3,1)).reduceByKey(_+_).sortBy(_._2,false).take(2)
println(result.toBuffer)
sc.stop()
}
}
6.Base Station (finding the base station where each user stayed the longest):
(1)
import org.apache.spark.{SparkConf, SparkContext}
object ULC {
def main(args: Array[String]): Unit = {
val conf=new SparkConf().setAppName("ULC").setMaster("local[2]")
val sc=new SparkContext(conf)
// sc.textFile("c://jztj.txt").map(_.split(",")).map(x=>(x(0),x(1),x(2),x(3)))
val rdd=sc.textFile("c://jztl.txt").map(lines=>{
val fields=lines.split(",")
val et=fields(3)
val time=fields(1)
val timeLong=if (et=="1") -time.toLong else time.toLong
(fields(0)+"_"+fields(2),timeLong)
})
// println(rdd.collect().toBuffer)
val res=rdd.groupBy(_._1).mapValues(_.foldLeft(0L)(_+_._2))
println(res.collect().toBuffer)
//groupBy result: ArrayBuffer((18816888888_1785EALDKAMMSXCSDOKO22,CompactBuffer((18816888888_1785EALDKAMMSXCSDOKO22,-20170729082500), (18816888888_1785EALDKAMMSXCSDOKO22,20170729102500), (18816888888_1785EALDKAMMSXCSDOKO22,-20170729222500), (18816888888_1785EALDKAMMSXCSDOKO22,20170729235800))), (18888888888_1555EALDKAMMSXCSDOKL00,CompactBuffer((18888888888_1555EALDKAMMSXCSDOKL00,-20170729222100), (18888888888_1555EALDKAMMSXCSDOKL00,20170729235900))), (18888888888_1785EALDKAMMSXCSDOKO22,CompactBuffer((18888888888_1785EALDKAMMSXCSDOKO22,-20170729092400), (18888888888_1785EALDKAMMSXCSDOKO22,20170729132900), (18888888888_1785EALDKAMMSXCSDOKO22,-20170729132500), (18888888888_1785EALDKAMMSXCSDOKO22,20170729202500))))
//res result: (18816888888_1785EALDKAMMSXCSDOKO22,33300)
val rdd1=res.map(t=>{
val phone_lac=t._1
val phone=phone_lac.split("_")(0)
val lac=phone_lac.split("_")(1)
val tltime=t._2
(phone,lac,tltime)
})
// println(rdd1.collect().toBuffer)
val rdd2=rdd1.groupBy(_._1)
println(rdd2.collect().toBuffer)
val rdd3=rdd2.mapValues(it=>{
it.toList.sortBy(_._3).reverse.take(1)
})
println(rdd3.collect().toBuffer)
}
}
(2)
import org.apache.spark.{SparkConf, SparkContext}
object ULC2 {
def main(args: Array[String]): Unit = {
val conf=new SparkConf().setAppName("ULC2").setMaster("local[2]")
val sc=new SparkContext(conf)
// sc.textFile("c://jztj.txt").map(_.split(",")).map(x=>(x(0),x(1),x(2),x(3)))
val rdd=sc.textFile("c://jztl.txt").map(lines=> {
val fields = lines.split(",")
val et = fields(3)
val time = fields(1)
val timeLong = if (et == "1") -time.toLong else time.toLong
((fields(0),fields(2)),timeLong)
} )
val rdd1=rdd.reduceByKey(_+_).map(t=>{
val phone=t._1._1
val lac=t._1._2
val tltime=t._2
(lac,(phone,tltime))
})
val rdd2=sc.textFile("c://lac_info.txt").map(lines=>{
val fields=lines.split(",")
(fields(0),(fields(1),fields(2)))
})
val rdd3=rdd1.join(rdd2).map(t=>{
val lac= t._1
val phone=t._2._1._1
val tltime=t._2._1._2
val jd=t._2._2._1
val wd=t._2._2._2
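//jd/wd are assumed to be the longitude/latitude of the base station read from lac_info.txt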
(phone,lac,tltime,jd,wd)
})
println(rdd3.collect().toBuffer)
//sort and take the top 1
val rdd4=rdd3.groupBy(_._1)
println(rdd4.collect().toBuffer)
val rdd5=rdd4.mapValues(it=>{
it.toList.sortBy(_._3).reverse.take(1)
})
rdd5.saveAsTextFile("c://jzres")
sc.stop()
}
}
7.Custom Partitioner:
import java.net.URL
import org.apache.spark.{Partitioner, SparkConf, SparkContext}
import scala.collection.mutable
object URLCount {
def main(args: Array[String]): Unit = {
val arr=Array("sport.sina.cn","game.sina.cn","car.sina.cn")
val conf=new SparkConf().setAppName("URLCount").setMaster("local[2]")
val sc=new SparkContext(conf)
val rdd1=sc.textFile("c://url.log").map(lines=>{
val fields=lines.split("\t")
(fields(1),1)
})
val rdd2=rdd1.reduceByKey(_+_)
// println(rdd2.collect().toBuffer)
val rdd3=rdd2.map(t=>{
val url=t._1
val count=t._2
//get the host (domain) from the url
val host=new URL(url).getHost
(host,(url,count))
})
// rdd3.repartition(3).saveAsTextFile("c://out123456")
// println(rdd3.collect().toBuffer)
val forum=rdd3.map(_._1).distinct().collect()
val np=new NewPartition(forum)
rdd3.partitionBy(np).saveAsTextFile("c://888888")
// println(forum.toBuffer)
// for(bk<-arr){
// val rdd=rdd3.filter(_._1==bk)
// val res=rdd.sortBy(_._3,false).take(2)
// println(res.toBuffer)
// }
// val sportRDD=rdd3.filter(_._1=="sport.sina.cn")
// val sortRDD=sportRDD.sortBy(_._3,false).take(2)
// println(sortRDD.toBuffer)
println(rdd3.collect().toBuffer)
// //mini exercise: keep only the domain, the url, and the count
// val rdd4=rdd3.groupBy(_._1).mapValues(it=>{
// it.toList.sortBy(_._3).reverse.take(2)
// })
// println(rdd4.collect().toBuffer)
sc.stop()
}
}
class NewPartition(forum:Array[String]) extends Partitioner{
val partMap=new mutable.HashMap[String,Int]()
var count=0
for (i<-forum){
partMap +=(i->count)
count+=1
//Map((sport->0),(car->1),(game->2))
}
override def numPartitions = forum.length
override def getPartition(key: Any) = {
partMap.getOrElse(key.toString,0)
}
}
8.User-Defined Sort:
import org.apache.spark.{SparkConf, SparkContext}
object UserDefinedSort {
def main(args: Array[String]): Unit = {
val conf=new SparkConf().setAppName("UserDefinedSort").setMaster("local[2]")
val sc=new SparkContext(conf)
val sortRdd=sc.parallelize(List(("Apple",6,5288),("Sumsung",6,5888),("Mi",5,2499),("LG",4,4888),("SONY",3,3799),("HUAWEI",5,2399),("Nokia",2,199)))
// sortRdd.sortBy(_._2,_._3)
val rdd=sortRdd.sortBy(x=>Phone(x._2,x._3),false)
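//Phone extends Ordered[Phone], so sortBy picks up the implicit Ordering[Phone]; "false" sorts in descending order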
println(rdd.collect().toBuffer)
sc.stop()
}
}
case class Phone(val size:Int,val price:Int) extends Ordered[Phone] with Serializable{
override def compare(that: Phone) = {
if (this.size==that.size){
this.price-that.price
}else{
this.size-that.size
}
}
}