// 第1关: QueueStream (Stage 1: QueueStream)
import java.text.SimpleDateFormat
import java.util.Date
import org.apache.spark.{HashPartitioner, SparkConf}
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scala.collection.mutable
object QueueStream {
// Entry point: builds a 1-second-batch Spark Streaming job that reads
// access-log lines from an in-memory RDD queue, reformats each record,
// and (per the exercise spec) writes each result line to MySQL via DBUtils.add.
def main(args: Array[String]) {
// Queue backing the stream; each RDD pushed into it becomes one batch.
// NOTE(review): mutable.SynchronizedQueue is deprecated in recent Scala
// versions; Spark examples use mutable.Queue with explicit synchronization —
// confirm against the project's Scala version.
val rddQueue = new mutable.SynchronizedQueue[RDD[String]]()
// local[2]: one thread for stream scheduling, one for task processing.
val conf = new SparkConf().setMaster("local[2]").setAppName("queueStream")
/********** Begin **********/
// 1. Initialize the StreamingContext with a 1-second batch interval.
val ssc = new StreamingContext(conf, Seconds(1))
// 2. Attach a DStream backed by the RDD queue.
val inputStream = ssc.queueStream(rddQueue)
/**
 * Input record format:
 * 100.143.124.29,1509116285000,'GET www/1 HTTP/1.0',https://www.baidu.com/s?wd=反叛的鲁鲁修,404
 * Fields, left to right: client IP, visit timestamp (epoch millis),
 * request line (method, start URL, HTTP version), target URL, status code.
 *
 * Raw fields are separated by a comma (ASCII ',').
 *
 * Requirements:
 * 1. Convert the timestamp to the format yyyy-MM-dd HH:mm:ss.
 * 2. Extract the start URL from the request line (space-delimited).
 * 3. Assemble the result line, e.g.:
 * Ip:124.132.29.10,visitTime:2019-04-22 11:08:33,startUrl:www/2,targetUrl:https://search.yahoo.com/search?p=反叛的鲁鲁修,statusCode:200
 * 4. Write each final line to MySQL by calling DBUtils.add(line), line: String.
 */
// 3. Clean/transform the queued records according to the spec above.
// NOTE(review): SimpleDateFormat is not thread-safe; it is captured by the
// map closure and presumably deserialized per task on executors, which
// avoids sharing — confirm no single instance is shared across threads.
val simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
val value = inputStream.map(x => {
// Split on commas; assumes exactly 5 well-formed fields — a malformed
// record or non-numeric timestamp throws here. TODO confirm input is clean.
val arrs = x.split(",")
val ip = arrs(0)
// Epoch milliseconds -> formatted timestamp string.
val time = simpleDateFormat.format(new Date(arrs(1).toLong))
// Request line looks like 'GET www/1 HTTP/1.0'; token 1 is the start URL.
val startUrl = arrs(2).split(" ")(1)
val targetUrl = arrs(3)
val statusCode = arrs(4)
val result = "Ip:" + ip + ",visitTime:" + time + ",startUrl:" + startUrl + ",targetUrl:" + targetUrl + ",statusCode:" + statusCode
result
})
// 4. Persist each result line to MySQL via DBUtils.add(line), line: String.
// NOTE(review): the foreach body (and the closing braces of main/object)
// continue past the visible portion of this file — truncated here.
value.foreachRDD(rdd => {
rdd.foreachPartition(it => {
it.foreach(line => {