数据
www.baidu.com,url5
www.baidu.com,url5
www.baidu.com,url5
www.baidu.com,url5
www.baidu.com,url5
www.google.com,url8
www.google.com,url2
www.google.com,url1
www.twitter.com,url10
www.twitter.com,url10
www.twitter.com,url10
www.twitter.com,url10
- 方式一
// Approach 1: count every (domain, url) pair, group the counts by domain and
// keep the topN most visited URLs of each domain.
// Each input line is expected to look like "domain,url".
val lines = sc.textFile("/ruozedata-spark-core/topn/site.log")
// Number of most-visited URLs to keep for every domain.
val topN = 2
val rdd = lines
  .map { line =>
    val splits = line.split(",")
    val domain = splits(0)
    val url = splits(1)
    ((domain, url), 1)
  }
  .reduceByKey(_ + _)
  .groupBy(_._1._1)
  .mapValues { x =>
    // toList materializes every (url, count) pair of one domain in memory,
    // so it can easily cause an OOM for a domain with many distinct URLs.
    x.toList
      .sortBy(-_._2)
      // Truncate per domain: the top-N has to be taken inside each group,
      // not on the RDD of domains.
      .take(topN)
      .map { case ((_, url), count) => (url, count) }
  }
  // Bring the (domain, top-N list) pairs of all domains back to the driver.
  .collect()
其中直接x.toList容易引起OOM
- 方式二
// Approach 2: collect the distinct domains on the driver, then run one
// filter + reduceByKey + sortBy job per domain.
val lines = sc.textFile("/ruozedata-spark-core/topn/site.log")
// Number of most-visited URLs to keep for every domain.
val topN = 2
// processRDD is scanned once for the distinct domains and once more for every
// domain below, so it is cached instead of being re-read from the file each time.
val processRDD = lines.map(x => {
  val splits = x.split(",")
  val domain = splits(0)
  val url = splits(1)
  ((domain, url), 1)
}).cache()
val domains = processRDD.map(_._1._1).distinct().collect()
// map instead of foreach: the per-domain top-N is returned as
// (domain, Array[((domain, url), count)]) instead of being thrown away.
domains.map(x => {
  val top = processRDD.filter(_._1._1 == x).reduceByKey(_ + _)
    .sortBy(-_._2).take(topN)
  (x, top)
})
虽然做了distinct,但是域名如果多了,UI还是会爆掉(每个域名都会触发一次sortBy,作业会很多)
- 方式三:使用分区
/**
 * Partitioner that routes a `(domain, url)` key to the partition whose id is
 * the index of its domain inside `domains`.
 *
 * @param domains domains to partition by; the array length is the partition count
 */
class TopnPartitioner(domains: Array[String]) extends Partitioner {

  // domain -> partition id lookup, filled once when the partitioner is built
  val map = mutable.HashMap[String, Int]()
  domains.zipWithIndex.foreach { case (domain, idx) =>
    map.update(domain, idx)
  }

  /** One partition per entry of `domains`. */
  override def numPartitions: Int = domains.length

  /**
   * Looks up the partition id for the domain part of `key`.
   *
   * @param key the record key, cast to a `(domain, url)` tuple
   * @return the index registered for the key's domain; the lookup throws if
   *         the domain was not part of `domains`
   */
  override def getPartition(key: Any): Int = {
    val pair = key.asInstanceOf[(String, String)]
    map(pair._1)
  }
}
// Approach 3: use TopnPartitioner while reducing, then sort each partition
// locally and keep its first topN entries.
val topN = 2
val lines = sc.textFile("/ruozedata-spark-core/topn/site.log")
val processRDD = lines.map { line =>
  val fields = line.split(",")
  val domain = fields(0)
  val url = fields(1)
  ((domain, url), 1)
}
val domains = processRDD.map { case ((domain, _), _) => domain }.distinct().collect()
// Sum the counts per (domain, url); TopnPartitioner decides the target partition.
val result = processRDD.reduceByKey(new TopnPartitioner(domains), _ + _)
result.mapPartitions { partition =>
  // The whole partition is materialized as a List before it is sorted.
  val sortedDesc = partition.toList.sortBy { case (_, count) => -count }
  sortedDesc.take(topN).iterator
}.collect()
UI的情况有所好转,但是partition.toList需要先把整个分区的数据全部加载到内存中,才能完成排序和take操作,分区数据量大时仍然容易OOM
- 方式四:较优的解决方案
// Approach 4: same partitioning as approach 3, but each partition is streamed
// through a sorted set that never holds more than topN + 1 elements, so the
// partition is never materialized as a whole.
val topN = 2
val lines = sc.textFile("/ruozedata-spark-core/topn/site.log")
val processRDD = lines.map(x => {
  val splits = x.split(",")
  val domain = splits(0)
  val url = splits(1)
  ((domain, url), 1)
})
val domains = processRDD.map(_._1._1).distinct().collect()
val result = processRDD.reduceByKey(new TopnPartitioner(domains), _ + _)
result.mapPartitions(partition => {
  // NOTE(review): TopOrdering is defined outside this snippet; presumably it
  // orders by count descending - confirm. A TreeSet treats elements that
  // compare as equal as duplicates, so TopOrdering must also break ties
  // (e.g. on the url), otherwise entries with the same count are dropped.
  val treeSet = new mutable.TreeSet[((String, String), Int)]()(new TopOrdering())
  partition.foreach(x => {
    treeSet.add(x)
    if (treeSet.size > topN) {
      // Evict the element that sorts last under TopOrdering, in place,
      // instead of rebuilding the set with dropRight(1) on every overflow.
      treeSet.remove(treeSet.last)
    }
  })
  treeSet.iterator
})