1 Testing Data Reception
package streamingproject

import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

/*
 * Process the data produced to Kafka with Spark Streaming.
 */
object StatStreamingApp {
  def main(args: Array[String]): Unit = {
    if (args.length != 4) {
      println("Usage: <zkQuorum> <group> <topics> <numThreads>")
      System.exit(1)
    }
    val Array(zkQuorum, groupId, topics, numThreads) = args

    val sparkConf = new SparkConf().setAppName("StatStreamingApp").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(60))

    // Map each topic to the number of receiver threads consuming it.
    val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
    val messages = KafkaUtils.createStream(ssc, zkQuorum, groupId, topicMap)

    /*
     * Test 1: verify data reception by printing each batch's record count.
     */
    messages.map(_._2).count().print()

    ssc.start()
    ssc.awaitTermination()
  }
}
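To run this locally, supply the four arguments named in the usage string. A plausible invocation for a local test (the ZooKeeper quorum, consumer group, and topic name below are hypothetical; substitute your own):

localhost:2181 test-group streamingtopic 1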
Run result: each 60-second batch prints the number of records received in that batch.
2 Data Cleaning
Raw logs (fields are tab-separated: IP, access time, request, HTTP status code, referer):
43.198.187.56 2018-12-14 16:54:01 "GET /class/145.html HTTP/1.1" 404 https://cn.bing.com/search?q=Spark Streaming实战
187.113.192.198 2018-12-14 16:54:01 "GET /class/145.html HTTP/1.1" 200 -
90.186.43.111 2018-12-14 16:54:01 "GET /class/146.html HTTP/1.1" 200 -
56.198.113.187 2018-12-14 16:54:01 "GET /learn/821.html HTTP/1.1" 404 https://search.yahoo.com/search?p=Spark SQL实战
2.1 Date Parsing Utility Class: DateUtils.scala
package streamingproject.utils

import java.util.Date

import org.apache.commons.lang3.time.FastDateFormat

/*
 * Date/time utility class. FastDateFormat is used instead of
 * SimpleDateFormat because it is thread-safe.
 */
object DateUtils {
  // Format of the timestamps in the raw logs.
  val YYYYMMDDHHMMSS_FORMAT = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss")
  // Target format used by the cleaned records.
  val TARGET_FORMAT = FastDateFormat.getInstance("yyyyMMddHHmmss")

  def getTime(time: String): Long = {
    YYYYMMDDHHMMSS_FORMAT.parse(time).getTime
  }

  def parseToMinute(time: String): String = {
    TARGET_FORMAT.format(new Date(getTime(time)))
  }

  def main(args: Array[String]): Unit = {
    println(parseToMinute("2018-12-14 16:54:01")) // prints 20181214165401
  }
}
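Note that parseToMinute is somewhat misleadingly named: with the yyyyMMddHHmmss pattern it keeps second-level precision. If true minute granularity were wanted, a variant could look like the following sketch (MINUTE_FORMAT and parseToMinuteExact are hypothetical additions, not used elsewhere in this article):

  val MINUTE_FORMAT = FastDateFormat.getInstance("yyyyMMddHHmm")

  def parseToMinuteExact(time: String): String = {
    // "yyyyMMddHHmm" drops the seconds entirely.
    MINUTE_FORMAT.format(new Date(getTime(time)))
  }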
2.2 Data Cleaning Program
ClickLog.scala
package streamingproject.domain

/*
 * A cleaned log record: client IP, access time (yyyyMMddHHmmss),
 * course id (0 for non-/class pages), HTTP status code, and referer.
 */
case class ClickLog(ip: String, time: String, courseId: Int, statusCode: Int, referer: String)
StatStreamingApp.scala
package streamingproject

import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import streamingproject.domain.ClickLog
import streamingproject.utils.DateUtils

/*
 * Process the data produced to Kafka with Spark Streaming.
 */
object StatStreamingApp {
  def main(args: Array[String]): Unit = {
    if (args.length != 4) {
      println("Usage: <zkQuorum> <group> <topics> <numThreads>")
      System.exit(1)
    }
    val Array(zkQuorum, groupId, topics, numThreads) = args

    val sparkConf = new SparkConf().setAppName("StatStreamingApp").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(60))

    val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
    val messages = KafkaUtils.createStream(ssc, zkQuorum, groupId, topicMap)

    /*
     * Test 1: verify data reception.
     */
    //messages.map(_._2).count().print()

    /*
     * Test 2: data cleaning.
     */
    val logs = messages.map(_._2)
    val cleanData = logs.map(line => {
      // Fields are tab-separated, e.g. infos(2) = "GET /class/128.html HTTP/1.1",
      // from which the URL /class/128.html is extracted.
      val infos = line.split("\t")
      val url = infos(2).split(" ")(1)
      var courseId = 0
      // Only /class pages carry a course id, e.g. /class/128.html -> 128.
      if (url.startsWith("/class")) {
        val courseIdHTML = url.split("/")(2)
        courseId = courseIdHTML.substring(0, courseIdHTML.lastIndexOf(".")).toInt
      }
      ClickLog(infos(0), DateUtils.parseToMinute(infos(1)), courseId, infos(3).toInt, infos(4))
    }).filter(clickLog => clickLog.courseId != 0)
    cleanData.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
- Cleaning result:
ClickLog(90.56.201.198,20181214194501,112,500,-)
ClickLog(186.121.201.90,20181214194501,145,500,-)
ClickLog(198.43.201.156,20181214194501,146,500,-)
ClickLog(186.192.90.156,20181214194501,112,200,-)
ClickLog(43.186.156.201,20181214194501,145,200,-)
ClickLog(113.186.121.111,20181214194501,128,500,-)
ClickLog(56.132.198.121,20181214194501,112,500,-)
ClickLog(198.156.113.132,20181214194501,145,404,http://www.baidu.com/s?wd=Spark SQL实战)
ClickLog(90.111.43.201,20181214194501,128,200,-)
ClickLog(132.201.56.186,20181214194501,131,200,http://www.sougou.com/web?query=Spark SQL实战)
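A weakness of the cleaning logic above is that any malformed line (too few fields, a non-numeric status code) throws an exception and fails the task. A minimal defensive sketch, assuming the same tab-separated format (parseLine is an illustrative helper, not part of the original project):

import scala.util.Try

// Returns None for any line that cannot be parsed, instead of throwing.
def parseLine(line: String): Option[ClickLog] = Try {
  val infos = line.split("\t")
  val url = infos(2).split(" ")(1)
  val courseId =
    if (url.startsWith("/class")) {
      val courseIdHTML = url.split("/")(2)
      courseIdHTML.substring(0, courseIdHTML.lastIndexOf(".")).toInt
    } else 0
  ClickLog(infos(0), DateUtils.parseToMinute(infos(1)), courseId, infos(3).toInt, infos(4))
}.toOption

The cleaning step then becomes logs.flatMap(parseLine).filter(_.courseId != 0), silently dropping bad records instead of crashing the batch.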