Rewriting ipCount with Spark SQL (I)

package com.ws.sparksql
import com.ws.spark.IpFromUtils
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

/**
  * Count how many times each IP's home province appears in the access log.
  */
object SqlIpFromCount {

  def main(args: Array[String]): Unit = {

    val sparkSession = SparkSession.builder().appName("SqlIpFromCount").master("local[*]").getOrCreate()

    import sparkSession.implicits._

    // Load the IP range rules (fields are split by '|': start_num, end_num, province)
    val rulesData: Dataset[String] = sparkSession.read.textFile("E:\\bigData\\testdata\\ip.txt")

    val rules: Dataset[(Long, Long, String)] = rulesData.map(l => {
      val fields = l.split("[|]")
      val beginNum = fields(2).toLong
      val endNum = fields(3).toLong
      val province = fields(6)
      (beginNum, endNum, province)
    })

    val rulesDataFrame = rules.toDF("start_num","end_num","province")

    rulesDataFrame.createTempView("t_rules")

    // A broadcast variable would be an alternative; it can only be created via the SparkContext
    //val broadCast: Broadcast[Array[(Long, Long, String)]] = sparkSession.sparkContext.broadcast(rules.collect())

    // Load the access log
    val data: Dataset[String] = sparkSession.read.textFile("E:\\bigData\\testdata\\access.log")

    val ipNum: Dataset[Long] = data.map(l => {
      val fields = l.split("[|]")
      val ip = fields(1)
      // Convert the dotted-quad IP string to its decimal (Long) form
      val ipNum = IpFromUtils.ipToLong(ip)
      ipNum
    })

    val ipNumDataFrame: DataFrame = ipNum.toDF("ip_num")

    ipNumDataFrame.createTempView("t_ips")

    val result = sparkSession.sql("select province , count(*) as times from t_rules right join t_ips on (ip_num >= start_num and ip_num <= end_num) group by province order by times desc")

    result.show()

    sparkSession.stop()
  }
}
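
The code above relies on IpFromUtils.ipToLong to turn a dotted-quad IP into a number that can be compared against the rule ranges. The utility class itself is not shown in this post, so the following is only a minimal sketch of the usual octet-shifting conversion it presumably performs:

// Assumed implementation of the IP-to-Long conversion (not taken from the original
// IpFromUtils source): shift each octet into a 32-bit value, e.g. "1.0.1.0" -> 16777472.
object IpFromUtils {
  def ipToLong(ip: String): Long = {
    ip.split("[.]").foldLeft(0L)((num, octet) => (num << 8) | octet.toLong)
  }
}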

Result:

+--------+-----+
|province|times|
+--------+-----+
|      陕西| 1824|
|      北京| 1535|
|      重庆|  868|
|      河北|  383|
|      云南|  126|
+--------+-----+
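
The commented-out broadcast line hints at an alternative to the non-equi SQL join, which Spark can only execute as a costly nested-loop join: collect the rules to the driver, broadcast them, and resolve each IP with a binary-search UDF. A hedged sketch that slots into the same main method, assuming the rules file is sorted by start_num and fits in driver memory:

    import org.apache.spark.sql.functions.udf

    // Collect the rules to the driver and broadcast them to every executor
    val rulesArray: Array[(Long, Long, String)] = rules.collect()
    val broadcastRules = sparkSession.sparkContext.broadcast(rulesArray)

    // Binary search over the broadcast rules; assumes they are sorted by start_num
    val provinceOf = udf((ipNum: Long) => {
      val rs = broadcastRules.value
      var lo = 0
      var hi = rs.length - 1
      var province = "unknown"
      while (lo <= hi && province == "unknown") {
        val mid = (lo + hi) / 2
        if (ipNum < rs(mid)._1) hi = mid - 1
        else if (ipNum > rs(mid)._2) lo = mid + 1
        else province = rs(mid)._3
      }
      province
    })

    val resultViaUdf = ipNumDataFrame
      .select(provinceOf($"ip_num").as("province"))
      .groupBy("province")
      .count()
      .orderBy($"count".desc)

    resultViaUdf.show()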