主程序
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Spark job that counts access-log records per province.
 *
 * It loads a table of IP ranges, broadcasts it to the executors, resolves the
 * IP of every access-log line to a province through [[MyUtils.binarySearch]],
 * sums the hits per province, sorts them in descending order, prints them and
 * hands every partition to [[MyUtils.ipLocationData2MySQL]].
 */
object ProvinceAndIpDemo {

  // Input locations used when no paths are supplied on the command line.
  // To read from HDFS instead, pass HDFS paths (the Hadoop cluster must be running).
  private val DefaultIpRulesPath = "E:\\study\\Date\\ip.txt"
  private val DefaultAccessLogPath = "E:\\study\\Date\\access.log"

  /**
   * Entry point.
   *
   * @param args optional overrides: `args(0)` is the IP rules file and
   *             `args(1)` is the access log; missing entries fall back to the
   *             default paths above
   */
  def main(args: Array[String]): Unit = {
    val ipRulesPath = if (args.length > 0) args(0) else DefaultIpRulesPath
    val accessLogPath = if (args.length > 1) args(1) else DefaultAccessLogPath

    val conf = new SparkConf().setMaster("local[*]").setAppName("ProvinceAndIpDemo")
    val sc = new SparkContext(conf)

    // The context is stopped in `finally` so a failing stage does not leave it running.
    try {
      // Parse each '|'-separated rule line; fields 2, 3 and 6 are used as the
      // range start, range end and province name.
      val partIpRules: RDD[(Long, Long, String)] = sc.textFile(ipRulesPath).map { line =>
        val fields = line.split("[|]")
        (fields(2).toLong, fields(3).toLong, fields(6))
      }

      // Pull the whole rule table to the driver and broadcast it to the executors.
      // NOTE(review): the lookup below is a binary search, so the rules are
      // presumably already sorted by start address in the file - confirm.
      val allIpRules: Array[(Long, Long, String)] = partIpRules.collect()
      val broadCastIp: Broadcast[Array[(Long, Long, String)]] = sc.broadcast(allIpRules)

      // Map every access-log line to (province, 1); field 1 of the line is the IP.
      val provinceAndOne: RDD[(String, Int)] = sc.textFile(accessLogPath).map { line =>
        val ipLong = MyUtils.ip2Long(line.split("[|]")(1))
        val rulesOnExecutor: Array[(Long, Long, String)] = broadCastIp.value
        val index = MyUtils.binarySearch(rulesOnExecutor, ipLong)
        // An index of -1 means no rule covers this address.
        val province = if (index != -1) rulesOnExecutor(index)._3 else "未知省份"
        (province, 1)
      }

      // Aggregate the counts per province.
      val reduced: RDD[(String, Int)] = provinceAndOne.reduceByKey(_ + _)

      // Sort by count, largest first. The result feeds two actions below, so it
      // is cached to avoid computing the sort stage twice.
      val result: RDD[(String, Int)] = reduced.sortBy(_._2, ascending = false).cache()

      // Print to the console.
      result.collect().foreach(println)

      // Write to the database, one connection per partition.
      result.foreachPartition(MyUtils.ipLocationData2MySQL)
    } finally {
      // Release Spark resources.
      sc.stop()
    }
  }
}
工具代码
import java.sql.DriverManager
/** Helpers for the IP-to-province job: address conversion, range lookup and MySQL export. */
object MyUtils {

  /**
   * Converts a dotted IP address string into a number.
   *
   * Every '.'-separated fragment is parsed as a number and merged into the
   * result by shifting the accumulator left by 8 bits and OR-ing the fragment
   * in, so "1.2.3.4" becomes 16909060.
   *
   * @param ip dotted address such as "192.168.0.1"
   * @return the numeric form of the address
   * @throws NumberFormatException if a fragment is not a valid number
   */
  def ip2Long(ip: String): Long =
    ip.split("[.]").foldLeft(0L) { (acc, fragment) =>
      fragment.toLong | (acc << 8)
    }

  /**
   * Binary search for the range that contains `ip`.
   *
   * @param lines ranges as (start, end, label) with inclusive bounds; the
   *              search only works if they are ordered by ascending start
   * @param ip    numeric address to look up
   * @return index of the range containing `ip`, or -1 when no range matches
   */
  def binarySearch(lines: Array[(Long, Long, String)], ip: Long): Int = {
    var low = 0
    var high = lines.length - 1
    var found = -1
    while (found == -1 && low <= high) {
      // Unsigned shift keeps the midpoint correct even if low + high overflows Int.
      val middle = (low + high) >>> 1
      val range = lines(middle)
      if (ip < range._1) high = middle - 1
      else if (ip > range._2) low = middle + 1
      else found = middle
    }
    found
  }

  /**
   * Inserts every (province, count) pair of the iterator into the MySQL table
   * `ip_local`, one row per pair.
   *
   * No connection is opened when the iterator is empty. The statement and the
   * connection are closed even if an insert fails.
   *
   * @param it pairs to persist; consumed by this call
   * @throws java.sql.SQLException if connecting or inserting fails
   */
  def ipLocationData2MySQL(it: Iterator[(String, Int)]): Unit = {
    // Empty partitions have nothing to write, so skip the connection entirely.
    if (it.hasNext) {
      // NOTE(review): URL and credentials are hard-coded; consider moving them to configuration.
      val conn = DriverManager.getConnection("jdbc:mysql://hadoop01:3306/xiaoniu?characterEncoding=UTF-8", "root", "123456")
      try {
        val statement = conn.prepareStatement("insert into ip_local values(?,?)")
        try {
          it.foreach { case (province, count) =>
            statement.setString(1, province)
            statement.setInt(2, count)
            // executeUpdate is implemented by every JDBC driver, whereas
            // executeLargeUpdate is optional and may throw UnsupportedOperationException.
            statement.executeUpdate()
          }
        } finally {
          statement.close()
        }
      } finally {
        conn.close()
      }
    }
  }
}