/*
 * Task:
 * http.log contains access records; the second field is the client IP address, e.g.
 *   20090121000132095572000|125.213.100.123|show.51.com|...
 * ip.txt contains IP range metadata:
 *   startIP|endIP|startIP(as Long)|endIP(as Long)|continent|country|province|...
 *   1.0.1.0|1.0.3.255|16777472|16778239|亚洲|中国|福建|福州||电信|350100|China|CN|119.306239|26.075302
 * Requirement: count the number of accesses per province.
 */
package scalaBase.day12
import org.apache.avro.TestAnnotation
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Counts HTTP accesses per province by mapping each request's IP address
 * onto an IP-range rule table via binary search.
 *
 * The rule table (ip.txt) is small, so it is collected to the driver,
 * sorted, and broadcast to every executor — each task then resolves IPs
 * with a local lookup instead of a shuffle join.
 */
object IpSearch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("ipSearch").setMaster("local")
    val sc = new SparkContext(conf)

    // Parse ip.txt into (startIpAsLongString, endIpAsLongString, province).
    val rules: RDD[String] = sc.textFile("D://MyBigData/Aspark/data/ipsearch/ip.txt")
    val ranges: RDD[(String, String, String)] = rules.map { line =>
      val f = line.split("\\|")
      (f(2), f(3), f(6)) // fields 2/3 are the numeric range bounds, 6 is the province
    }

    // Binary search requires numeric ordering by range start; sort explicitly
    // instead of relying on the file already being sorted.
    val sortedRanges = ranges.collect().sortBy(_._1.toLong)
    val broad: Broadcast[Array[(String, String, String)]] = sc.broadcast(sortedRanges)

    val perProvince: RDD[(String, Int)] = sc
      .textFile("D://MyBigData/Aspark/data/ipsearch/http.log")
      .map { line =>
        val f = line.split("\\|")
        val longIp = ipToLong(f(1)) // second field is the client IP
        binarSeach(broad.value, longIp)
      }
      // Drop IPs that fall in no range: the unguarded ipInfo(-1) lookup
      // would otherwise throw ArrayIndexOutOfBoundsException.
      .filter(_ >= 0)
      .map(idx => (broad.value(idx)._3, 1))

    val res: Array[(String, Int)] = perProvince.reduceByKey(_ + _).collect()
    println(res.toBuffer)
    sc.stop() // release the local Spark context
  }

  /**
   * Binary search over ranges sorted by numeric start bound.
   *
   * @param arr rule table entries (startLongString, endLongString, province)
   * @param ip  IP address converted to its Long representation
   * @return index of the range containing `ip`, or -1 when no range matches
   */
  def binarSeach(arr: Array[(String, String, String)], ip: Long): Int = {
    var lo = 0
    var hi = arr.length - 1
    var found = -1
    while (found < 0 && lo <= hi) {
      val mid = lo + (hi - lo) / 2 // overflow-safe midpoint
      val (startStr, endStr, _) = arr(mid)
      if (ip < startStr.toLong) hi = mid - 1
      else if (ip > endStr.toLong) lo = mid + 1
      else found = mid
    }
    found
  }

  /**
   * Converts a dotted-quad IP string (e.g. "1.0.1.0") to its Long value:
   * each octet shifts the accumulator left 8 bits and is OR-ed in.
   */
  def ipToLong(ip: String): Long =
    ip.split("\\.").foldLeft(0L)((acc, octet) => (acc << 8) | octet.toLong)
}