package url
import java.net.URL
import org.apache.spark.{Partitioner, SparkConf, SparkContext}
import scala.collection.mutable
/**
 * Finds the five most-visited URLs of every host in a tab-separated access log.
 *
 * Pipeline:
 *  1. read the log and count the visits per URL,
 *  2. key every (url, count) pair by the host of the URL,
 *  3. repartition with [[MyPartitioner2]], built from the distinct hosts found in the
 *     data, so that every host lands in a partition of its own,
 *  4. keep the five highest counts of each partition and save them as text.
 *
 * The entry point is an explicit `main` instead of `scala.App`: the Spark
 * documentation states that subclasses of `scala.App` may not work correctly.
 *
 * Optional arguments: `args(0)` is the input path (default `e:/url.log`) and
 * `args(1)` is the output directory (default `e:/ZDYPar.log`).
 */
object UrlCompute2 {

  /** How many URLs are kept per partition (i.e. per host). */
  private val TopN = 5

  def main(args: Array[String]): Unit = {
    val inputPath = args.lift(0).getOrElse("e:/url.log")
    val outputPath = args.lift(1).getOrElse("e:/ZDYPar.log")

    val conf = new SparkConf().setAppName("url").setMaster("local[2]")
    val sc = new SparkContext(conf)
    try {
      // One (url, 1) pair per log line. The URL is the second tab-separated field;
      // lines without a second field are skipped instead of failing the whole job.
      val visits = sc.textFile(inputPath)
        .map(_.split("\t"))
        .filter(_.length > 1)
        .map(fields => (fields(1), 1))

      // Sum the visits of every URL, e.g.
      // (http://java.bw.cn/java/course/javaeeadvanced.shtml,477)
      val visitsPerUrl = visits.reduceByKey(_ + _)

      // Re-key by host: (host, (url, count)). java.net.URL extracts the host,
      // e.g. java.bw.cn; URLs it cannot parse are dropped rather than aborting the job.
      val byHost = visitsPerUrl.flatMap { case (url, count) =>
        scala.util.Try(new URL(url).getHost).toOption
          .map(host => (host, (url, count)))
          .toList
      }

      // The distinct hosts are collected to the driver because the partitioner needs
      // a plain local list, e.g. List(java.bw.cn, php.bw.cn, net.bw.cn).
      val hostList = byHost.map(_._1).distinct().collect().toList

      // One partition per host; inside every partition sort by count and keep the
      // TopN largest entries.
      val topPerHost = byHost
        .partitionBy(new MyPartitioner2(hostList))
        .mapPartitions(it => it.toList.sortBy(_._2._2).reverse.take(TopN).iterator)

      topPerHost.saveAsTextFile(outputPath)
    } finally {
      // Always release the context, even when a stage fails.
      sc.stop()
    }
  }
}
/**
 * Partitioner that gives every known host a partition of its own and adds one
 * trailing overflow partition for keys that are not in `hosts`.
 *
 * Spark requires `getPartition` to return a value in `0 until numPartitions`, so
 * the overflow partition is counted in `numPartitions`.
 *
 * @param hosts host names that get a dedicated partition; repeated names are
 *              ignored after their first occurrence
 */
class MyPartitioner2(val hosts: List[String]) extends Partitioner {
  // host -> partition index, assigned in first-seen order starting at 0.
  val hostMap = new mutable.HashMap[String, Int]()
  // Running counter used while filling hostMap; equals hostMap.size afterwards.
  var parNum = 0
  // Skipping hosts that are already mapped keeps the indices dense (no empty
  // partitions in between) when `hosts` contains duplicates.
  for (host <- hosts if !hostMap.contains(host)) {
    hostMap += (host -> parNum)
    parNum = parNum + 1
  }

  /** Index of the overflow partition: the first index after all host partitions. */
  private def overflowPartition: Int = hostMap.size

  /** One partition per distinct host plus the overflow partition. */
  override def numPartitions: Int = hostMap.size + 1

  /**
   * Returns the partition of `key`.
   *
   * @param key the record key; its `toString` is looked up in `hostMap`
   * @return the index assigned to the host, or the overflow partition when the
   *         key is `null` or not a known host
   */
  override def getPartition(key: Any): Int = key match {
    case null => overflowPartition
    case known => hostMap.getOrElse(known.toString, overflowPartition)
  }
}
e:/url.log 文件的部分内容示例(每行两个字段:时间戳和 URL,字段之间以制表符分隔):
20160321101954 http://java.bw.cn/java/course/javaeeadvanced.shtml
20160321101954 http://java.bw.cn/java/course/javaee.shtml
20160321101954 http://java.bw.cn/java/course/android.shtml
20160321101954 http://java.bw.cn/java/video.shtml
20160321101954 http://java.bw.cn/java/teacher.shtml
20160321101954 http://java.bw.cn/java/course/android.shtml
20160321101954 http://php.bw.cn/php/teacher.shtml
20160321101954 http://net.bw.cn/net/teacher.shtml
20160321101954 http://java.bw.cn/java/course/hadoop.shtml
20160321101954 http://java.bw.cn/java/course/base.shtml
20160321101954 http://net.bw.cn/net/course.shtml
20160321101954 http://php.bw.cn/php/teacher.shtml
20160321101954 http://net.bw.cn/net/video.shtml
20160321101954 http://java.bw.cn/java/course/base.shtml
20160321101954 http://net.bw.cn/net/teacher.shtml
20160321101954 http://java.bw.cn/java/video.shtml
20160321101954 http://java.bw.cn/java/video.shtml
20160321101954 http://net.bw.cn/net/video.shtml
20160321101954 http://net.bw.cn/net/course.shtml
20160321101954 http://java.bw.cn/java/course/javaee.shtml
20160321101954 http://java.bw.cn/java/course/android.shtml
20160321101955 http://php.bw.cn/php/course.shtml
20160321101955 http://net.bw.cn/net/teacher.shtml
输出结果: