spark-groupBy(3)

需求:从日志文件 apache.log 中统计每个小时(时间段)的访问量

package com.atguigu.bigdata.spark.core.operator.transform.test

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

import java.text.SimpleDateFormat
import java.util.Date

/**
 * Demonstrates the `groupBy` operator by counting, for each hour of the day,
 * how many lines of an access log fall into that hour.
 *
 * Input: the text file `datas/apache.log`. Each line is split on single spaces
 * and the field at index 3 is parsed as a `dd/MM/yyyy:HH:mm:ss` timestamp.
 * Output: one `(hour, count)` pair per distinct hour, printed to stdout.
 *
 * A line with fewer than four fields, or with a timestamp that does not match
 * the pattern, makes the job fail with the underlying exception.
 */
object RDD_groupBy_03 {
  /**
   * Entry point: builds a local SparkContext, runs the per-hour count and prints it.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {

    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("Operator")
    val sc = new SparkContext(sparkConf)

    // try/finally guarantees the context is stopped even when reading or
    // parsing the log throws.
    try {
      // Operator: groupBy - groups the data by the given rule. The number of
      // partitions is unchanged by default, but the records are redistributed
      // across partitions; this redistribution is called a shuffle. In the
      // extreme case all records may end up in the same partition.
      // Requirement: get the number of requests per hour from the log file.

      val rdd = sc.textFile("datas/apache.log")
      val timeRDD: RDD[(String, Iterable[(String, Int)])] = rdd.mapPartitions(
        lines => {
          // Build the formatters once per partition instead of once per line;
          // they are loop-invariant and each partition's iterator is consumed
          // by a single task.
          val inputFormat = new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss")
          val hourFormat = new SimpleDateFormat("HH")
          lines.map(
            line => {
              // Split the line on spaces.
              val datas = line.split(" ")
              // The timestamp is the field at index 3.
              val time = datas(3)
              // Parse the timestamp, then re-format it keeping only the hour.
              val date: Date = inputFormat.parse(time)
              val hour: String = hourFormat.format(date)
              (hour, 1)
            }
          )
        }
      ).groupBy(_._1)

      // The size of each group is the request count for that hour.
      // NOTE(review): groupBy is kept because this example exists to show it;
      // for a plain count, reduceByKey(_ + _) would avoid materialising every
      // group.
      timeRDD.map {
        case (hour, iter) => (hour, iter.size)
      }.collect().foreach(println)
    } finally {
      sc.stop()
    }

  }
}

数据源:datas/apache.log(每行以空格分隔,第 4 个字段为访问时间,格式为 dd/MM/yyyy:HH:mm:ss)

92.100.97.83 - - 20/05/2015:18:05:02 +0000 GET /projects/xdotool/xdotool.xhtml
92.100.97.83 - - 20/05/2015:18:05:54 +0000 GET /favicon.ico
23.229.67.14 - - 20/05/2015:18:05:05 +0000 GET /
99.146.78.102 - - 20/05/2015:18:05:40 +0000 GET /presentations/logstash-scale11x/images/ahhh___rage_face_by_samusmmx-d5g5zap.png
23.229.67.14 - - 20/05/2015:18:05:13 +0000 GET /blog/geekery/installing-windows-8-consumer-preview.html
190.107.140.178 - - 20/05/2015:18:05:49 +0000 GET /images/googledotcom.png
5.10.83.23 - - 20/05/2015:18:05:00 +0000 GET /files/blogposts/20070917/?C=D;O=A
66.249.73.135 - - 20/05/2015:18:05:46 +0000 GET /blog/tags/java
66.249.73.135 - - 20/05/2015:19:05:29 +0000 GET /blog/tags/screen
100.43.83.137 - - 20/05/2015:19:05:03 +0000 GET /blog/geekery/shell-shortcut-hacks.html
209.85.238.199 - - 20/05/2015:19:05:32 +0000 GET /?flav=rss20
46.105.14.53 - - 20/05/2015:19:05:57 +0000 GET /blog/tags/puppet?flav=rss20
157.55.33.17 - - 20/05/2015:19:05:24 +0000 GET /blog/geekery/screen-solaris-terminfo.html
91.232.96.8 - - 20/05/2015:19:05:26 +0000 GET /presentations/logstash-1/
91.232.96.8 - - 20/05/2015:19:05:31 +0000 GET /presentations/logstash-1/
91.232.96.8 - - 20/05/2015:19:05:09 +0000 GET /presentations/logstash-1/
91.232.96.8 - - 20/05/2015:19:05:01 +0000 GET /presentations/logstash-1/
91.232.96.8 - - 20/05/2015:19:05:31 +0000 GET /presentations/logstash-1/
157.56.92.151 - - 20/05/2015:19:05:50 +0000 GET /images/jordan-80.png
82.130.49.223 - - 20/05/2015:19:05:12 +0000 GET /projects/xdotool/
82.130.49.223 - - 20/05/2015:19:05:37 +0000 GET /reset.css
82.130.49.223 - - 20/05/2015:19:05:06 +0000 GET /style2.css
82.130.49.223 - - 20/05/2015:19:05:23 +0000 GET /favicon.ico
82.130.49.223 - - 20/05/2015:19:05:21 +0000 GET /images/jordan-80.png
82.130.49.223 - - 20/05/2015:19:05:12 +0000 GET /images/web/2009/banner.png
31.4.197.143 - - 20/05/2015:19:05:06 +0000 GET /articles/ssh-security/
31.4.197.143 - - 20/05/2015:19:05:31 +0000 GET /reset.css
31.4.197.143 - - 20/05/2015:19:05:28 +0000 GET /style2.css
31.4.197.143 - - 20/05/2015:19:05:39 +0000 GET /images/jordan-80.png
31.4.197.143 - - 20/05/2015:19:05:01 +0000 GET /images/web/2009/banner.png
31.4.197.143 - - 20/05/2015:19:05:31 +0000 GET /favicon.ico
128.118.108.67 - - 20/05/2015:19:05:20 +0000 GET /favicon.ico
84.215.206.35 - - 20/05/2015:19:05:57 +0000 GET /projects/xdotool/xdotool.xhtml
84.215.206.35 - - 20/05/2015:19:05:43 +0000 GET /favicon.ico
208.91.156.11 - - 20/05/2015:19:05:20 +0000 GET /files/logstash/logstash-1.3.2-monolithic.jar
66.249.73.135 - - 20/05/2015:19:05:09 +0000 GET /blog/tags/speed

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值