Test 1: count hits per URL, then re-key each record by its host
package webcount
import java.net.URL
import org.apache.spark.{SparkConf, SparkContext}
object UrlCountPartition {
  /** Reads a tab-separated access log, counts hits per URL, and re-keys each
    * record by the URL's host. Results are printed to stdout.
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("AdvURLCount").setMaster("local[2]")
    val sc = new SparkContext(conf)

    // Split each tab-separated line and emit a (url, 1) pair.
    val urlOnes = sc.textFile("d://urlcount.log").map { line =>
      val fields = line.split("\t")
      (fields(1), 1)
    }

    // Sum the hit count for every URL.
    val urlCounts = urlOnes.reduceByKey(_ + _)

    // Re-key each record by its host: (host, (url, count)).
    val byHost = urlCounts.map { case (url, cnt) =>
      val host = new URL(url).getHost
      (host, (url, cnt))
    }

    println(byHost.collect().toBuffer)
  }
}
ArrayBuffer((php.itcast.cn,(http://php.itcast.cn/php/course.shtml,459)), (java.itcast.cn,(http://java.itcast.cn/java/course/base.shtml,543)),
(java.itcast.cn,(http://java.itcast.cn/java/video.shtml,496)), (java.itcast.cn,(http://java.itcast.cn/java/course/android.shtml,501)),
(net.itcast.cn,(http://net.itcast.cn/net/video.shtml,521)), (java.itcast.cn,(http://java.itcast.cn/java/course/hadoop.shtml,506)),
(net.itcast.cn,(http://net.itcast.cn/net/course.shtml,521)), (java.itcast.cn,(http://java.itcast.cn/java/course/cloud.shtml,1028)),
(php.itcast.cn,(http://php.itcast.cn/php/video.shtml,490)),
(java.itcast.cn,(http://java.itcast.cn/java/teacher.shtml,482)),
(php.itcast.cn,(http://php.itcast.cn/php/teacher.shtml,464)),
(net.itcast.cn,(http://net.itcast.cn/net/teacher.shtml,512)),
(java.itcast.cn,(http://java.itcast.cn/java/course/javaee.shtml,1000)),
(java.itcast.cn,(http://java.itcast.cn/java/course/javaeeadvanced.shtml,477)))
Test 2: list the distinct hosts (the future partition keys)
package webcount
import java.net.URL
import org.apache.spark.{Partitioner, SparkConf, SparkContext}
object UrlCountPartition {
  /** Same pipeline as before, but instead of printing every record it prints
    * only the distinct host names — these will become the partition keys.
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("AdvURLCount").setMaster("local[2]")
    val sc = new SparkContext(conf)

    // Split each tab-separated line and emit a (url, 1) pair.
    val urlOnes = sc.textFile("d://urlcount.log").map { line =>
      val fields = line.split("\t")
      (fields(1), 1)
    }

    // Sum hits per URL, then re-key by host: (host, (url, count)).
    val byHost = urlOnes.reduceByKey(_ + _).map { case (url, cnt) =>
      val host = new URL(url).getHost
      (host, (url, cnt))
    }

    // Distinct host names only.
    val hosts = byHost.map { case (host, _) => host }.distinct()
    // byHost.repartition(3).saveAsTextFile("d://out1")
    println(hosts.collect().toBuffer)
  }
}
//class HostPartitioer extends Partitioner{
// override def numPartitions: Int =
//
// override def getPartition(key: Any): Int =
//}
ArrayBuffer(net.itcast.cn, java.itcast.cn, php.itcast.cn)
Test 3: custom partitioner — route each host's records to its own partition/output file
package webcount
import java.net.URL
import org.apache.spark.{Partitioner, SparkConf, SparkContext}
import scala.collection.mutable
object UrlCountPartition {
  /** Builds the (host, (url, count)) records, then repartitions them with a
    * custom [[HostPartitioer]] so that every host lands in its own partition,
    * and writes one output file per partition to d://out2.
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("AdvURLCount").setMaster("local[2]")
    val sc = new SparkContext(conf)

    // Split each tab-separated line and emit a (url, 1) pair.
    val urlOnes = sc.textFile("d://urlcount.log").map { line =>
      val fields = line.split("\t")
      (fields(1), 1)
    }

    // Sum hits per URL, then re-key by host: (host, (url, count)).
    val byHost = urlOnes.reduceByKey(_ + _).map { case (url, cnt) =>
      val host = new URL(url).getHost
      (host, (url, cnt))
    }

    // Collect the distinct hosts to the driver; each defines one partition.
    val hosts = byHost.map { case (host, _) => host }.distinct().collect()
    val partitioner = new HostPartitioer(hosts)

    byHost.partitionBy(partitioner).saveAsTextFile("d://out2")
  }
}
/** Spark partitioner that assigns each host its own partition.
  *
  * NOTE(review): the class name is misspelled ("Partitioer" — should be
  * "Partitioner"); kept unchanged for source compatibility with callers.
  *
  * @param ins distinct host names; the record keyed by the i-th host goes to
  *            partition i
  */
class HostPartitioer(ins: Array[String]) extends Partitioner {
  // Immutable host -> partition-index lookup. Replaces the original
  // mutable.HashMap + var counter + for-loop build-up with one expression.
  private val partMap: Map[String, Int] = ins.zipWithIndex.toMap

  override def numPartitions: Int = ins.length

  // Unknown hosts fall back to partition 0, matching the original behavior.
  override def getPartition(key: Any): Int =
    partMap.getOrElse(key.toString, 0)
}
Test 4: within each host partition, keep only the top-3 URLs by hit count
package webcount
import java.net.URL
import org.apache.spark.{Partitioner, SparkConf, SparkContext}
import scala.collection.mutable
object UrlCountPartition {
  /** Full pipeline: count hits per URL, partition by host with a custom
    * partitioner, then keep only the top-3 URLs per partition (per host)
    * and write the result to d://out3.
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("AdvURLCount").setMaster("local[2]")
    val sc = new SparkContext(conf)

    // Split each tab-separated line and emit a (url, 1) pair.
    val rdd1 = sc.textFile("d://urlcount.log").map(line => {
      val f = line.split("\t")
      (f(1), 1)
    })

    // Sum the hit count for every URL.
    val rdd2 = rdd1.reduceByKey(_ + _)

    // Re-key each record by its host: (host, (url, count)).
    val rdd3 = rdd2.map(t => {
      val url = t._1
      val host = new URL(url).getHost
      (host, (url, t._2))
    })

    // Collect the distinct hosts to the driver; each defines one partition.
    val ins = rdd3.map(_._1).distinct().collect()
    val hostPartitioner = new HostPartitioer(ins)

    // One partition per host; take the top-3 URLs by count inside each.
    // NOTE: toList materializes the whole partition in memory — acceptable
    // here because each host's record set is small.
    val rdd4 = rdd3.partitionBy(hostPartitioner).mapPartitions(it => {
      // Sort descending directly instead of sortBy(...).reverse — same
      // result, one full-list reversal fewer.
      it.toList.sortBy(_._2._2)(Ordering[Int].reverse).take(3).iterator
    })

    rdd4.saveAsTextFile("d://out3")
  }
}
/** Spark partitioner that assigns each host its own partition.
  *
  * NOTE(review): the class name is misspelled ("Partitioer" — should be
  * "Partitioner"); kept unchanged for source compatibility with callers.
  *
  * @param ins distinct host names; the record keyed by the i-th host goes to
  *            partition i
  */
class HostPartitioer(ins: Array[String]) extends Partitioner {
  // Immutable host -> partition-index lookup. Replaces the original
  // mutable.HashMap + var counter + for-loop build-up with one expression.
  private val partMap: Map[String, Int] = ins.zipWithIndex.toMap

  override def numPartitions: Int = ins.length

  // Unknown hosts fall back to partition 0, matching the original behavior.
  override def getPartition(key: Any): Int =
    partMap.getOrElse(key.toString, 0)
}