使用 map 算子处理日志文件
package com.atguigu.bigdata.spark.core.operator.transform.test
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Demonstrates the RDD `map` operator: reads an Apache access log and prints
 * the URL requested on each line.
 *
 * Each log line is expected to be space-separated, for example:
 * {{{
 * 83.149.9.216 - - 17/05/2015:10:05:03 +0000 GET /presentations/logstash-monitorama-2013/images/kibana-search.png
 * }}}
 * so the requested URL is the seventh field (index 6).
 */
object RDD_map_02 {

  /**
   * Entry point. Runs a local Spark job over `datas/apache.log` and prints
   * the URL field of every well-formed line to standard output.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("Operator")
    val sc = new SparkContext(sparkConf)

    // Zero-based position of the request URL in a space-separated log line.
    // Kept as a local val so the closures below capture only this Int.
    val urlFieldIndex = 6

    try {
      // Operator under demonstration: map.
      // Requirement: extract the URL requested by the user from the log file.

      // Create the data source; the RDD yields the file one line at a time.
      val rdd = sc.textFile("datas/apache.log")

      // Turn each long line into the short string we care about.
      // Lines with fewer than seven fields (e.g. blank lines) are skipped,
      // because indexing them would throw and abort the whole job.
      val mapRDD: RDD[String] = rdd
        .map(line => line.split(" "))
        .filter(fields => fields.length > urlFieldIndex)
        .map(fields => fields(urlFieldIndex))

      mapRDD.collect().foreach(println)
    } finally {
      // Always release the SparkContext, even when the job fails.
      sc.stop()
    }
  }
}
数据源:datas/apache.log
38.99.236.50 - - 20/05/2015:21:05:52 +0000 GET /presentations/logstash-puppetconf-2012/images/kibana-logstash-downloads.png
38.99.236.50 - - 20/05/2015:21:05:53 +0000 GET /presentations/logstash-puppetconf-2012/images/kibana-chef-agent.png
38.99.236.50 - - 20/05/2015:21:05:22 +0000 GET /presentations/logstash-puppetconf-2012/images/kibana-chef-hits-6min-each.png
38.99.236.50 - - 20/05/2015:21:05:06 +0000 GET /presentations/logstash-puppetconf-2012/images/trollface.png
38.99.236.50 - - 20/05/2015:21:05:39 +0000 GET /presentations/logstash-puppetconf-2012/images/psychoaxe.jpg
38.99.236.50 - - 20/05/2015:21:05:48 +0000 GET /presentations/logstash-puppetconf-2012/images/stats-negative-min.png
38.99.236.50 - - 20/05/2015:21:05:42 +0000 GET /presentations/logstash-puppetconf-2012/images/logs.jpg
38.99.236.50 - - 20/05/2015:21:05:29 +0000 GET /presentations/logstash-puppetconf-2012/images/apache-negative-duration.png
38.99.236.50 - - 20/05/2015:21:05:31 +0000 GET /favicon.ico
66.249.73.135 - - 20/05/2015:21:05:11 +0000 GET /blog/tags/xsendevent
198.46.149.143 - - 20/05/2015:21:05:29 +0000 GET /blog/geekery/disabling-battery-in-ubuntu-vms.html?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+semicomplete%2Fmain+%28semicomplete.com+-+Jordan+Sissel%29
198.46.149.143 - - 20/05/2015:21:05:34 +0000 GET /blog/geekery/solving-good-or-bad-problems.html?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+semicomplete%2Fmain+%28semicomplete.com+-+Jordan+Sissel%29
82.165.139.53 - - 20/05/2015:21:05:15 +0000 GET /projects/xdotool/
100.43.83.137 - - 20/05/2015:21:05:01 +0000 GET /blog/tags/standards
63.140.98.80 - - 20/05/2015:21:05:28 +0000 GET /blog/tags/puppet?flav=rss20
63.140.98.80 - - 20/05/2015:21:05:50 +0000 GET /blog/geekery/solving-good-or-bad-problems.html?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+semicomplete%2Fmain+%28semicomplete.com+-+Jordan+Sissel%29
66.249.73.135 - - 20/05/2015:21:05:00 +0000 GET /?flav=atom
180.76.6.56 - - 20/05/2015:21:05:56 +0000 GET /robots.txt
46.105.14.53 - - 20/05/2015:21:05:15 +0000 GET /blog/tags/puppet?flav=rss20