需求:从日志文件中统计每个时间段(按小时)的访问量
package com.atguigu.bigdata.spark.core.operator.transform.test
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import java.text.SimpleDateFormat
import java.util.Date
/**
 * Demonstrates the `groupBy` operator: counts the requests recorded in
 * `datas/apache.log` per hour of day and prints one `(hour, count)` pair per line.
 *
 * `groupBy` groups records by the given rule. The number of partitions stays the
 * same by default, but records are redistributed between partitions (a shuffle);
 * in the extreme case all records may land in a single partition.
 */
object RDD_groupBy_03 {

  import scala.util.Try

  /** Zero-based position of the timestamp among the space-separated fields of a log line. */
  private val TimeFieldIndex = 3

  /** Builds a formatter pinned to UTC so parse/format round-trips never hit a DST gap of the local zone. */
  private def utcFormat(pattern: String): SimpleDateFormat = {
    val format = new SimpleDateFormat(pattern)
    format.setTimeZone(java.util.TimeZone.getTimeZone("UTC"))
    format
  }

  /**
   * Extracts the two-digit hour ("00".."23") from one log line.
   *
   * @param line       raw log line, fields separated by single spaces
   * @param parser     formatter for the full timestamp field (`dd/MM/yyyy:HH:mm:ss`)
   * @param hourFormat formatter that renders only the hour (`HH`)
   * @return the hour, or `None` when the line has no timestamp field or the field cannot be parsed
   */
  private def extractHour(
      line: String,
      parser: SimpleDateFormat,
      hourFormat: SimpleDateFormat
  ): Option[String] =
    line
      .split(" ")
      .lift(TimeFieldIndex) // blank or short lines have no timestamp field
      .flatMap(time => Try(parser.parse(time)).toOption)
      .map(date => hourFormat.format(date))

  /**
   * Entry point: runs the hourly access count on a local Spark context.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("Operator")
    val sc = new SparkContext(sparkConf)
    try {
      val rdd = sc.textFile("datas/apache.log")

      // Build the formatters once per partition instead of once per line;
      // SimpleDateFormat is not thread-safe, so each task gets its own pair.
      val hourPairs: RDD[(String, Int)] = rdd.mapPartitions { lines =>
        val parser = utcFormat("dd/MM/yyyy:HH:mm:ss")
        val hourFormat = utcFormat("HH")
        lines
          .flatMap(line => extractHour(line, parser, hourFormat))
          .map(hour => (hour, 1))
      }

      // Group the (hour, 1) pairs by hour. This is the operator under study;
      // for a plain count `reduceByKey(_ + _)` would shuffle far less data.
      val timeRDD: RDD[(String, Iterable[(String, Int)])] = hourPairs.groupBy(_._1)

      timeRDD
        .map { case (hour, iter) => (hour, iter.size) }
        .collect()
        .foreach(println)
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }
}
数据源:apache.log(示例数据)
92.100.97.83 - - 20/05/2015:18:05:02 +0000 GET /projects/xdotool/xdotool.xhtml
92.100.97.83 - - 20/05/2015:18:05:54 +0000 GET /favicon.ico
23.229.67.14 - - 20/05/2015:18:05:05 +0000 GET /
99.146.78.102 - - 20/05/2015:18:05:40 +0000 GET /presentations/logstash-scale11x/images/ahhh___rage_face_by_samusmmx-d5g5zap.png
23.229.67.14 - - 20/05/2015:18:05:13 +0000 GET /blog/geekery/installing-windows-8-consumer-preview.html
190.107.140.178 - - 20/05/2015:18:05:49 +0000 GET /images/googledotcom.png
5.10.83.23 - - 20/05/2015:18:05:00 +0000 GET /files/blogposts/20070917/?C=D;O=A
66.249.73.135 - - 20/05/2015:18:05:46 +0000 GET /blog/tags/java
66.249.73.135 - - 20/05/2015:19:05:29 +0000 GET /blog/tags/screen
100.43.83.137 - - 20/05/2015:19:05:03 +0000 GET /blog/geekery/shell-shortcut-hacks.html
209.85.238.199 - - 20/05/2015:19:05:32 +0000 GET /?flav=rss20
46.105.14.53 - - 20/05/2015:19:05:57 +0000 GET /blog/tags/puppet?flav=rss20
157.55.33.17 - - 20/05/2015:19:05:24 +0000 GET /blog/geekery/screen-solaris-terminfo.html
91.232.96.8 - - 20/05/2015:19:05:26 +0000 GET /presentations/logstash-1/
91.232.96.8 - - 20/05/2015:19:05:31 +0000 GET /presentations/logstash-1/
91.232.96.8 - - 20/05/2015:19:05:09 +0000 GET /presentations/logstash-1/
91.232.96.8 - - 20/05/2015:19:05:01 +0000 GET /presentations/logstash-1/
91.232.96.8 - - 20/05/2015:19:05:31 +0000 GET /presentations/logstash-1/
157.56.92.151 - - 20/05/2015:19:05:50 +0000 GET /images/jordan-80.png
82.130.49.223 - - 20/05/2015:19:05:12 +0000 GET /projects/xdotool/
82.130.49.223 - - 20/05/2015:19:05:37 +0000 GET /reset.css
82.130.49.223 - - 20/05/2015:19:05:06 +0000 GET /style2.css
82.130.49.223 - - 20/05/2015:19:05:23 +0000 GET /favicon.ico
82.130.49.223 - - 20/05/2015:19:05:21 +0000 GET /images/jordan-80.png
82.130.49.223 - - 20/05/2015:19:05:12 +0000 GET /images/web/2009/banner.png
31.4.197.143 - - 20/05/2015:19:05:06 +0000 GET /articles/ssh-security/
31.4.197.143 - - 20/05/2015:19:05:31 +0000 GET /reset.css
31.4.197.143 - - 20/05/2015:19:05:28 +0000 GET /style2.css
31.4.197.143 - - 20/05/2015:19:05:39 +0000 GET /images/jordan-80.png
31.4.197.143 - - 20/05/2015:19:05:01 +0000 GET /images/web/2009/banner.png
31.4.197.143 - - 20/05/2015:19:05:31 +0000 GET /favicon.ico
128.118.108.67 - - 20/05/2015:19:05:20 +0000 GET /favicon.ico
84.215.206.35 - - 20/05/2015:19:05:57 +0000 GET /projects/xdotool/xdotool.xhtml
84.215.206.35 - - 20/05/2015:19:05:43 +0000 GET /favicon.ico
208.91.156.11 - - 20/05/2015:19:05:20 +0000 GET /files/logstash/logstash-1.3.2-monolithic.jar
66.249.73.135 - - 20/05/2015:19:05:09 +0000 GET /blog/tags/speed