import java.net.URL
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Finds the three most-clicked URLs for each subject (host) in a click log.
 *
 * Originally created by root on 2016/5/16.
 */
object UrlCount {

  /**
   * Runs the job in local mode and prints, for every host, its three URLs with the
   * highest hit counts as `(host, List((host, url, count), ...))` pairs.
   *
   * @param args optional; when present, `args(0)` overrides the default input log path
   */
  def main(args: Array[String]): Unit = {
    // Default keeps the original hard-coded location; a first argument overrides it.
    val inputPath = args.headOption.getOrElse("D:\\itcast-大数据\\day29\\itcast.log")
    val topN = 3

    val conf = new SparkConf().setAppName("UrlCount").setMaster("local[2]") // local[2]: two local threads
    val sc = new SparkContext(conf)
    try {
      // Split each tab-separated record and emit (url, 1); the URL is the second field.
      // Records without a second field are dropped instead of failing the whole job.
      val urlOnes = sc.textFile(inputPath)
        .map(_.split("\t"))
        .filter(_.length > 1)
        .map(fields => (fields(1), 1))

      // Total hits per URL.
      val urlCounts = urlOnes.reduceByKey(_ + _)

      // Attach the host so URLs can be grouped by subject.
      val hostUrlCounts = urlCounts.map { case (url, count) =>
        (new URL(url).getHost, url, count)
      }

      // Per host, keep the topN entries ordered by hit count, highest first.
      val topPerHost = hostUrlCounts.groupBy(_._1).mapValues { hits =>
        hits.toList.sortBy(_._3)(Ordering[Int].reverse).take(topN)
      }

      // Print the per-host ranking (the original printed the raw URL counts, so the
      // lazy grouping/ranking stages were never evaluated).
      println(topPerHost.collect().toBuffer)
    } finally {
      // Always release the SparkContext, even when the job fails.
      sc.stop()
    }
  }
}
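// 参考数据 (reference data; the job splits each record on a tab and reads the URL from the second field):
//   20160321101954
//   http://java.itcast.cn/java/course/javaeeadvanced.shtml
// spark对url的提取与统计 (article title: URL extraction and statistics with Spark)
// 最新推荐文章于 2024-06-04 01:02:11 发布 (blog page footer: latest recommended article published 2024-06-04 01:02:11)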