Spark Streaming Project in Practice (3): Data Cleansing

1 Testing Data Reception

package streamingproject

import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

/*
 * Process the data produced to Kafka with Spark Streaming
 */
object StatStreamingApp {
  def main(args: Array[String]): Unit = {

    if (args.length != 4) {
      println("Usage:<zkQuorum> <group> <topics> <numThreads>")
      System.exit(1)
    }

    val Array(zkQuorum, groupId, topics, numThreads) = args

    val sparkConf = new SparkConf().setAppName("StatStreamingApp").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(60)) // 60-second batch interval

    // map each topic to its receiver-thread count: Map(topic -> numThreads)
    val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
    val messages = KafkaUtils.createStream(ssc, zkQuorum, groupId, topicMap)

    /*
     * Test 1: verify data reception
     */
    messages.map(_._2).count().print() // print the record count of each batch

    ssc.start()
    ssc.awaitTermination()

  }
}
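To launch the job locally (for example from an IDE run configuration), the four program arguments might look like the line below; the ZooKeeper address, consumer group, and topic name are placeholder values, to be replaced with your own environment's:

localhost:2181 test-group streamingtopic 1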


Run result: the record count of each 60-second batch is printed to the console.

2 Data Cleansing

Raw log

43.198.187.56	2018-12-14 16:54:01	"GET /class/145.html HTTP/1.1"	404	https://cn.bing.com/search?q=Spark Streaming实战
187.113.192.198	2018-12-14 16:54:01	"GET /class/145.html HTTP/1.1"	200	-
90.186.43.111	2018-12-14 16:54:01	"GET /class/146.html HTTP/1.1"	200	-
56.198.113.187	2018-12-14 16:54:01	"GET /learn/821.html HTTP/1.1"	404	https://search.yahoo.com/search?p=Spark SQL实战
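Each line carries five tab-separated fields: client IP, access time, HTTP request, status code, and referer. As a minimal sketch (values taken from the first sample line above, with the tabs written out as \t escapes), this is how one line breaks apart; the same splitting logic appears in the cleansing program in section 2.2:

val line = "43.198.187.56\t2018-12-14 16:54:01\t\"GET /class/145.html HTTP/1.1\"\t404\thttps://cn.bing.com/search?q=Spark Streaming实战"

val infos = line.split("\t")
infos(0)               // 43.198.187.56       -> client IP
infos(1)               // 2018-12-14 16:54:01 -> access time
infos(2).split(" ")(1) // /class/145.html     -> requested URL
infos(3).toInt         // 404                 -> HTTP status code
infos(4)               // referer ("-" when absent)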

2.1 Date parsing utility class: DateUtils.scala

package streamingproject.utils
import java.util.Date
import org.apache.commons.lang3.time.FastDateFormat

/*
 * Date/time utility class
 */
object DateUtils {

  // format of the raw log timestamp, e.g. "2018-12-14 16:54:01"
  val YYYYMMDDHHMMSS_FORMAT = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss")

  // compact target format, e.g. "20181214165401"
  val TARGET_FORMAT = FastDateFormat.getInstance("yyyyMMddHHmmss")

  // parse a raw timestamp into epoch milliseconds
  def getTime(time: String): Long = {
    YYYYMMDDHHMMSS_FORMAT.parse(time).getTime
  }

  // reformat a raw timestamp into the compact target format
  def parseToMinute(time: String): String = {
    TARGET_FORMAT.format(new Date(getTime(time)))
  }

  def main(args: Array[String]): Unit = {
    println(parseToMinute("2018-12-14 16:54:01"))
  }

}
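Running the main method prints 20181214165401: the raw timestamp reformatted into the compact yyyyMMddHHmmss form. (Despite its name, parseToMinute keeps second-level precision.)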


2.2 Data cleansing program

  • ClickLog.scala
package streamingproject.domian

/*
 * A cleansed log record: client IP, formatted time, course id, status code, referer
 */
case class ClickLog(ip: String, time: String, courseId: Int, statusCode: Int, referer: String)

  • StatStreamingApp.scala
package streamingproject

import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import streamingproject.domian.ClickLog
import streamingproject.utils.DateUtils

/*
 * Process the data produced to Kafka with Spark Streaming
 */
object StatStreamingApp {
  def main(args: Array[String]): Unit = {

    if (args.length != 4) {
      println("Usage:<zkQuorum> <group> <topics> <numThreads>")
      System.exit(1)
    }

    val Array(zkQuorum, groupId, topics, numThreads) = args

    val sparkConf = new SparkConf().setAppName("StatStreamingApp").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(60)) // 60-second batch interval

    // map each topic to its receiver-thread count: Map(topic -> numThreads)
    val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap

    val messages = KafkaUtils.createStream(ssc, zkQuorum, groupId, topicMap)

    /*
     * Test 1: verify data reception
     */
    // messages.map(_._2).count().print()


    /*
     * Test 2: data cleansing
     */
    val logs = messages.map(_._2) // drop the Kafka key, keep the message value
    val cleanData = logs.map(line => {

      // e.g. infos(2) = "GET /class/128.html HTTP/1.1"
      //      url      = /class/128.html
      val infos = line.split("\t")
      val url = infos(2).split(" ")(1)

      // extract the course id from /class/<id>.html URLs; other URLs keep 0
      var courseId = 0
      if (url.startsWith("/class")) {
        val courseIdHTML = url.split("/")(2)
        courseId = courseIdHTML.substring(0, courseIdHTML.lastIndexOf(".")).toInt
      }

      ClickLog(infos(0), DateUtils.parseToMinute(infos(1)), courseId, infos(3).toInt, infos(4))

    }).filter(clickLog => clickLog.courseId != 0) // keep only course-page requests


    cleanData.print()

    ssc.start()
    ssc.awaitTermination()

  }
}
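One caveat about the map above: it assumes every record has exactly five tab-separated fields and a numeric status code, so a single malformed line would throw an exception and fail the whole batch. A defensive variant, shown here only as a sketch on top of the same ClickLog and DateUtils, wraps the parse in scala.util.Try and silently drops lines that cannot be parsed:

import scala.util.Try

val cleanData = logs.flatMap(line => Try {
  val infos = line.split("\t")
  val url = infos(2).split(" ")(1)
  // derive the course id only for /class/... URLs; everything else gets 0
  val courseId =
    if (url.startsWith("/class")) {
      val courseIdHTML = url.split("/")(2)
      courseIdHTML.substring(0, courseIdHTML.lastIndexOf(".")).toInt
    } else 0
  ClickLog(infos(0), DateUtils.parseToMinute(infos(1)), courseId, infos(3).toInt, infos(4))
}.toOption).filter(_.courseId != 0) // failed parses become None and are dropped by flatMap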

  • Cleansed output (only /class/... requests pass the courseId != 0 filter; the /learn/821.html request in the raw log is dropped)
ClickLog(90.56.201.198,20181214194501,112,500,-)
ClickLog(186.121.201.90,20181214194501,145,500,-)
ClickLog(198.43.201.156,20181214194501,146,500,-)
ClickLog(186.192.90.156,20181214194501,112,200,-)
ClickLog(43.186.156.201,20181214194501,145,200,-)
ClickLog(113.186.121.111,20181214194501,128,500,-)
ClickLog(56.132.198.121,20181214194501,112,500,-)
ClickLog(198.156.113.132,20181214194501,145,404,http://www.baidu.com/s?wd=Spark SQL实战)
ClickLog(90.111.43.201,20181214194501,128,200,-)
ClickLog(132.201.56.186,20181214194501,131,200,http://www.sougou.com/web?query=Spark SQL实战)