增量数据->flume->kafka->sparkStreaming->hbase

最新推荐文章于 2022-04-16 08:33:55 发布

code_xuan

最新推荐文章于 2022-04-16 08:33:55 发布

阅读量251

点赞数

分类专栏：大数据进阶文章标签：日志分析数据清洗+统计访问量 flume+kafka+spark+hbase

本文链接：https://blog.csdn.net/qq_25905159/article/details/103038406

版权

大数据进阶专栏收录该内容

20 篇文章 0 订阅

订阅专栏

*完整pom.xml在最后
模拟日志生成代码：

#coding=UTF-8

import random
import time

# Request paths (without the leading "/") that simulated visitors hit;
# sample_url() picks one of these for every generated log line.
url_paths=[
    "class/112.html",
    "class/128.html",
    "class/145.html",
    "class/130.html",
    "class/146.html",
    "class/131.html",
    "learn/821",
    "course/list"
]

# Pool of octet values; sample_ip() joins four distinct entries into a fake IPv4 address.
ip_slices=[132,156,124,10,29,167,143,187,30,46,55,63,72,87,98,168]

# Search-engine referer templates; sample_referer() fills the {query} placeholder.
http_referers=[
    "https://www.baidu.com/s?wd={query}",
    "https://www.sogou.com/web?query={query}",
    "https://cn.bing.com/search?q={query}",
    "https://www.so.com/s?q={query}"
]

# Search keywords substituted into the referer templates above.
search_keyword=[
    "spark sql实战",
    "hadoop 基础",
    "storm实战",
    "spark streaming实战"
]

# HTTP status codes (as strings) that a simulated request may report.
status_code=["200","404","500"]

def sample_status_code():
    """Return one status-code string drawn at random from ``status_code``."""
    (picked,) = random.sample(status_code, 1)
    return picked

def sample_referer():
    """Return a referer string for one simulated request.

    A uniform draw in [0, 1] decides the outcome: when it is above 0.2 the
    request has no referer and "-" is returned; otherwise a random search
    engine template from ``http_referers`` is filled with a random keyword
    from ``search_keyword``.
    """
    has_referer = random.uniform(0, 1) <= 0.2
    if not has_referer:
        return "-"
    engine_template = random.sample(http_referers, 1)[0]
    keyword = random.sample(search_keyword, 1)[0]
    return engine_template.format(query=keyword)

def sample_url():
    """Return one request path drawn at random from ``url_paths``."""
    chosen = random.sample(url_paths, 1)
    return chosen[0]

def sample_ip():
    """Build a fake dotted IPv4 address from four distinct values of ``ip_slices``."""
    octets = random.sample(ip_slices, 4)
    return ".".join(str(octet) for octet in octets)

def generate_log(count=10, log_path="/apps/apache-flume-1.9.0-bin/mytemp/streaming_access.log"):
    """Write ``count`` simulated access-log lines to ``log_path`` and echo them to stdout.

    Each line is tab separated:
    ``ip <TAB> yyyy-MM-dd HH:mm:ss <TAB> "GET /<url> HTTP/1.1" <TAB> status <TAB> referer``.
    All lines of one call share the same timestamp, taken once at the start.

    :param count: number of lines to generate; values below 1 produce an empty file.
    :param log_path: file to (re)write; defaults to the path the Flume exec source tails.
    """
    time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

    # "w+" truncates the file on every call, exactly as before. The with-block
    # guarantees the batch is flushed and the handle closed before returning,
    # instead of relying on garbage collection to do it.
    with open(log_path, "w+") as f:
        while count >= 1:
            query_log = "{ip}\t{local_time}\t\"GET /{url} HTTP/1.1\"\t{status_code}\t{refer}".format(
                url=sample_url(),
                ip=sample_ip(),
                refer=sample_referer(),
                status_code=sample_status_code(),
                local_time=time_str)
            print(query_log)
            f.write(query_log + "\n")
            count = count - 1

if __name__ == '__main__':
    # Run forever: rewrite the log file with a fresh batch of lines once per second.
    while True:
        generate_log()
        time.sleep(1)

编写配置文件，配置flume：
vim streaming_project.conf:
注意：组件名称中最好不要包含连字符“-”（本人踩坑经验）。

# Name the three components (source, sink, channel) of agent a1.
a1.sources= r1
a1.sinks= k1
a1.channels= c1

# Source: follow the simulated access log with tail -F.
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /apps/apache-flume-1.9.0-bin/mytemp/streaming_access.log
a1.sources.r1.shell= /bin/sh -c

# Channel: in-memory buffer between source and sink.
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# Sink: publish every event to the Kafka topic read by the Spark Streaming job.
# NOTE(review): brokerList / topic / batchSize / requiredAcks are the legacy key names;
# newer Flume releases document kafka.bootstrap.servers, kafka.topic, flumeBatchSize and
# kafka.producer.acks instead - confirm against the Flume version in use before renaming.
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.brokerList = 192.168.130.163:9092,192.168.130.167:9092,192.168.130.168:9092
a1.sinks.k1.topic = test_topic
a1.sinks.k1.batchSize = 5
# Key was misspelled "requireedAcks", so the setting was never picked up.
a1.sinks.k1.requiredAcks = 1

# Wire source and sink to the channel.
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

需求：
1.统计今天到目前为止各课程的访问量
2.统计今天到目前为止从搜索引擎过来的课程的访问量

业务处理：
实体类:
ClickLog.scala:

/**
  * One access-log record after cleaning.
  *
  * @param ip         client IP address taken from the raw log line
  * @param time       request time as reformatted by the cleaning step
  * @param courseId   numeric id of the visited course
  * @param statusCode HTTP status code of the request
  * @param refer      referer of the request
  */
case class ClickLog(
  ip: String,
  time: String,
  courseId: Int,
  statusCode: Int,
  refer: String
)

CourseClickCount.scala:

/**
  * Click counter for one course on one day.
  *
  * @param day_course  row key combining the day and the course id
  * @param click_count number of clicks to record for that key
  */
case class CourseClickCount(
  day_course: String,
  click_count: Long
)

CourseSearchClickCount.scala:

/**
  * Click counter for one course reached from a search engine.
  *
  * @param day_search_course row key combining the day, the search-engine host and the course id
  * @param click_count       number of clicks to record for that key
  */
case class CourseSearchClickCount(
  day_search_course: String,
  click_count: Long
)

工具类：
DateUtils.scala

/**
  * Date/time helpers that convert between the raw log timestamp format and
  * the compact format used by the rest of the pipeline.
  */
object DateUtils {
  /** Format of the timestamp found in the raw log lines. */
  val OLD_FORMAT: FastDateFormat = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss")

  /** Compact target format. */
  val TARGET_FORMAT: FastDateFormat = FastDateFormat.getInstance("yyyyMMddHHmmss")

  /**
    * Parses `time` with [[OLD_FORMAT]].
    *
    * @param time timestamp text in the raw log format
    * @return the value of `getTime` on the parsed date
    */
  def getTime(time: String): Long = OLD_FORMAT.parse(time).getTime

  /**
    * Re-renders a raw log timestamp with [[TARGET_FORMAT]].
    * Despite the method name, the target pattern keeps the seconds.
    *
    * @param time timestamp text in the raw log format
    * @return the same instant formatted as `yyyyMMddHHmmss`
    */
  def parseToMinute(time: String): String = {
    val epochMillis: Long = getTime(time)
    TARGET_FORMAT.format(new Date(epochMillis))
  }

  /** Manual smoke test: prints one converted timestamp. */
  def main(args: Array[String]): Unit = {
    val converted = parseToMinute("2018-9-6 13:58:01")
    println(converted)
  }
}

HBaseUtils.scala

/**
  * Thin helper around the HBase client: holds the shared configuration,
  * opens tables by name and writes single string cells.
  */
object HBaseUtils {

  val configuration = new Configuration()
  configuration.set("hbase.zookeeper.quorum", "192.168.130.163:2181")
  configuration.set("hbase.rootdir", "hdfs://192.168.130.161:8020/hbase")

  // Lazy: nothing in this object uses the admin handle, so no admin
  // connection is opened unless a caller actually asks for it.
  lazy val admin: HBaseAdmin = new HBaseAdmin(configuration)

  /**
    * Opens the table called `tableName`.
    *
    * A failure to open the table now propagates to the caller instead of being
    * printed and turned into a `null` return value, which only postponed the
    * failure to a NullPointerException at the first use of the table.
    * The caller owns the returned table and must `close()` it.
    *
    * @param tableName name of an existing HBase table
    * @return an open table handle
    */
  def getTable(tableName: String): HTable = new HTable(configuration, tableName)

  /**
    * Writes one string value into a single cell. Best effort: failures while
    * opening the table or writing are printed and not rethrown.
    *
    * @param tableName target table
    * @param rowKey    row key of the cell
    * @param cf        column family
    * @param column    column qualifier
    * @param value     value stored as the bytes of the string
    */
  def put(tableName: String, rowKey: String, cf: String, column: String, value: String): Unit = {
    val put = new Put(Bytes.toBytes(rowKey))
    // addColumn replaces the deprecated Put.add(family, qualifier, value).
    put.addColumn(Bytes.toBytes(cf), Bytes.toBytes(column), Bytes.toBytes(value))

    try {
      val table: HTable = getTable(tableName)
      // Always release the table, even when the write fails.
      try table.put(put)
      finally table.close()
    } catch {
      case scala.util.control.NonFatal(ex) =>
        println(ex)
    }
  }

  /**
    * Manual smoke test: writes one sample cell.
    * NOTE(review): this stores the string "2" in course_clickcount/info:click_count,
    * the same cell family the DAOs update with incrementColumnValue and read with
    * Bytes.toLong - presumably those expect an 8-byte long; confirm before running
    * this against a table the streaming job uses.
    */
  def main(args: Array[String]): Unit = {
    val tableName = "course_clickcount"
    val rowkey = "20180906_1"
    val cf = "info"
    val column = "click_count"
    val value = "2"
    HBaseUtils.put(tableName, rowkey, cf, column, value)
  }
}

HBase操作：
CourseClickCountDAO.scala:

/**
  * Data-access object for per-course click counts stored in HBase.
  */
object CourseClickCountDAO {
  val tableName:String = "course_clickcount"
  val cf:String = "info"
  val qualifer:String = "click_count"

  /**
    * Runs `body` against the counter table and always closes the table
    * afterwards, so repeated calls do not leak table handles.
    * Null-safe on close in case `getTable` hands back `null`.
    */
  private def withTable[A](body: HTable => A): A = {
    val table: HTable = HBaseUtils.getTable(tableName)
    try body(table)
    finally Option(table).foreach(_.close())
  }

  /**
    * Adds every `click_count` in `list` to the counter stored under its `day_course` row key.
    *
    * @param list increments to apply; an empty list does nothing and opens no table
    */
  def save(list:ListBuffer[CourseClickCount]): Unit = {
    if (list.nonEmpty) {
      withTable { table =>
        list.foreach { elem =>
          table.incrementColumnValue(
            Bytes.toBytes(elem.day_course),
            Bytes.toBytes(cf),
            Bytes.toBytes(qualifer),
            elem.click_count)
        }
      }
    }
  }

  /**
    * Reads the counter stored under a row key.
    *
    * @param day_course row key to look up
    * @return the stored count, or 0 when the cell does not exist
    */
  def count(day_course:String):Long =
    withTable { table =>
      val get = new Get(Bytes.toBytes(day_course))
      // Bytes.toBytes keeps the encoding consistent with save(); getBytes would use the platform charset.
      val value: Array[Byte] = table.get(get).getValue(Bytes.toBytes(cf), Bytes.toBytes(qualifer))
      Option(value).fold(0L)(bytes => Bytes.toLong(bytes))
    }

  /** Manual smoke test: prints three counters (the write is left disabled). */
  def main(args: Array[String]): Unit = {
    val list = new ListBuffer[CourseClickCount]
    list.append(CourseClickCount("20180906_8",8))
    list.append(CourseClickCount("20180906_4",3))
    list.append(CourseClickCount("20180906_2",2))

    //save(list)
    println(count("20180906_8")+":"+count("20180906_4")+":"+count("20180906_2"))
  }

}

CourseSearchClickCountDAO.scala:

/**
  * Data-access object for click counts of courses reached from a search engine.
  */
object CourseSearchClickCountDAO {
  val tableName = "course_search_clickcount"
  val cf = "info"
  val qualifer = "click_count"

  /**
    * Runs `body` against the counter table and always closes the table
    * afterwards, so repeated calls do not leak table handles.
    * Null-safe on close in case `getTable` hands back `null`.
    */
  private def withTable[A](body: HTable => A): A = {
    val table: HTable = HBaseUtils.getTable(tableName)
    try body(table)
    finally Option(table).foreach(_.close())
  }

  /**
    * Adds every `click_count` in `list` to the counter stored under its
    * `day_search_course` row key.
    *
    * @param list increments to apply; an empty list does nothing and opens no table
    */
  def save(list: ListBuffer[CourseSearchClickCount]) :Unit = {
    if (list.nonEmpty) {
      withTable { table =>
        list.foreach { elem =>
          table.incrementColumnValue(
            Bytes.toBytes(elem.day_search_course),
            Bytes.toBytes(cf),
            Bytes.toBytes(qualifer),
            elem.click_count)
        }
      }
    }
  }

  /**
    * Reads the counter stored under a row key.
    *
    * @param day_search_course row key to look up
    * @return the stored count, or 0 when the cell does not exist
    */
  def count(day_search_course:String):Long =
    withTable { table =>
      val get = new Get(Bytes.toBytes(day_search_course))
      // Bytes.toBytes keeps the encoding consistent with save(); getBytes would use the platform charset.
      val value: Array[Byte] = table.get(get).getValue(Bytes.toBytes(cf), Bytes.toBytes(qualifer))
      Option(value).fold(0L)(bytes => Bytes.toLong(bytes))
    }

  /** Manual smoke test: increments two sample counters and prints them. */
  def main(args: Array[String]): Unit = {
    val list = new ListBuffer[CourseSearchClickCount]
    list.append(CourseSearchClickCount("20180906_www.baidu.com_8", 8))
    list.append(CourseSearchClickCount("20180906_www.baidu.com_4", 3))

    save(list)
    println(count("20180906_www.baidu.com_8") + ":" + count("20180906_www.baidu.com_4"))
  }
}

主类：

package com.qwx.flumeAndKafka.main

import com.qwx.flumeAndKafka.dao.{CourseClickCountDAO, CourseSearchClickCountDAO}
import com.qwx.flumeAndKafka.utils.DateUtils
import com.qwx.flumeAndKafka.domain.{ClickLog, CourseClickCount, CourseSearchClickCount}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.dstream.{DStream, InputDStream, ReceiverInputDStream}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.mutable.ListBuffer

/**
  * Spark Streaming job: reads access-log lines from Kafka, keeps the visits to
  * course pages, and accumulates two counters in HBase -
  * clicks per day and course, and clicks per day, search-engine host and course.
  */
object SparkStreamingApp {
  import scala.util.Try

  Logger.getLogger("org").setLevel(Level.ERROR)

  /**
    * Parses one raw tab-separated log line.
    *
    * Returns `None` for lines that are not course visits (URL not under /class)
    * and for malformed lines (missing fields, non-numeric values, bad timestamp),
    * so one bad record is skipped instead of failing the whole batch.
    */
  private def parseLine(line: String): Option[ClickLog] = Try {
    val infos: Array[String] = line.split("\t")
    // infos(2) looks like: "GET /class/112.html HTTP/1.1" - the URL is the middle token.
    val url = infos(2).split(" ")(1)
    val courseId =
      if (url.startsWith("/class")) {
        val courseHtml: String = url.split("/")(2)
        courseHtml.substring(0, courseHtml.lastIndexOf(".")).toInt
      } else 0
    ClickLog(infos(0), DateUtils.parseToMinute(infos(1)), courseId, infos(3).toInt, infos(4))
  }.toOption.filter(_.courseId != 0)

  /**
    * Extracts the host from a referer such as `https://host/path`;
    * returns "" when the referer has no scheme/host/path structure (e.g. "-").
    */
  private def searchHost(refer: String): String = {
    val splits: Array[String] = refer.replaceAll("//", "/").split("/")
    if (splits.length > 2) splits(1) else ""
  }

  def main(args: Array[String]): Unit = {

    // Connection settings (could be taken from args instead):
    //val Array(zkQuorum, group, topics, numThreads) = args
    val zkQuorum = "hadoop005:2181"
    val group = "test"
    val topics = "test_topic"
    val numThreads = 1

    // setIfMissing keeps the previous default master but no longer overrides
    // a master supplied through spark-submit --master.
    val conf: SparkConf = new SparkConf()
      .setAppName(this.getClass.getSimpleName)
      .setIfMissing("spark.master", "spark://hadoop000:7077")
    val ssc = new StreamingContext(conf, Seconds(2))

    val topicMap: Map[String, Int] = topics.split(",").map((_, numThreads)).toMap
    // Kafka integration (receiver-based stream of (key, message) pairs).
    val messages: ReceiverInputDStream[(String, String)] = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap)
    val logs: DStream[String] = messages.map(_._2)
    // Sanity check: print how many messages arrived in each batch.
    logs.count().print()

    // Data cleaning: keep only well-formed course visits.
    val cleanData: DStream[ClickLog] = logs.flatMap(line => parseLine(line).toList)

    cleanData.print()

    // Requirement 1: clicks per course for the current day, keyed as yyyyMMdd_courseId.
    val timeAndOne: DStream[(String, Int)] = cleanData.map(x => (x.time.substring(0, 8) + "_" + x.courseId, 1))
    val reduceResData: DStream[(String, Int)] = timeAndOne.reduceByKey(_ + _)
    reduceResData.foreachRDD { rdd =>
      rdd.foreachPartition { partitionRecords =>
        // Skip empty partitions so no HBase table is opened for nothing.
        if (partitionRecords.nonEmpty) {
          val list = new ListBuffer[CourseClickCount]
          partitionRecords.foreach { case (dayCourse, clicks) =>
            list.append(CourseClickCount(dayCourse, clicks))
          }
          // Write to HBase.
          CourseClickCountDAO.save(list)
        }
      }
    }

    // Requirement 2: clicks per course coming from a search engine for the current day,
    // keyed as yyyyMMdd_host_courseId.
    val courseData: DStream[(String, Int, String)] =
      cleanData.map(x => (searchHost(x.refer), x.courseId, x.time))

    val wordsWithOne: DStream[(String, Int)] = courseData
      .filter(_._1 != "")
      .map { case (host, courseId, time) => (time.substring(0, 8) + "_" + host + "_" + courseId, 1) }
    val resData: DStream[(String, Int)] = wordsWithOne.reduceByKey(_ + _)
    resData.foreachRDD { rdd =>
      rdd.foreachPartition { partitionRecords =>
        if (partitionRecords.nonEmpty) {
          val list = new ListBuffer[CourseSearchClickCount]
          partitionRecords.foreach { case (daySearchCourse, clicks) =>
            list.append(CourseSearchClickCount(daySearchCourse, clicks))
          }
          CourseSearchClickCountDAO.save(list)
        }
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }
}

运行测试：
1.启动 zookeeper：

# Start the ZooKeeper server.
zkServer.sh start

2.启动hadoop：

# Start the Hadoop daemons.
start-all.sh

3.启动 kafka:

# -daemon already runs the broker in the background, so no trailing "&" is needed.
kafka-server-start.sh -daemon "$KAFKA_HOME/config/server.properties"

4.启动 flume

# --name must match the agent name used in the config file (a1);
# --conf-file points at the streaming_project.conf written above (adjust the directory to where you saved it).
bin/flume-ng agent --conf conf/ --name a1 --conf-file job/streaming_project.conf -Dflume.root.logger=INFO,console

5.运行日志生成器(centos7默认带有python环境)

# Run the log generator script.
python generate_log.py

6.运行spark程序

# spark-submit options must come before the application jar; anything after the jar is passed to main() as program arguments.
spark-submit --master spark://hadoop000:7077 --class com.qwx.flumeAndKafka.main.SparkStreamingApp ./sparkDemo-1.0-SNAPSHOT-jar-with-dependencies.jar

这里如果不想因为jar包引用而各种报错的话，就将项目依赖的所有jar包都打包。
打包方式见文章：
[https://blog.csdn.net/qq_25905159/article/details/103038736]

完整pom.xml:

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.qwx</groupId>
    <artifactId>sparkDemo</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <scala.version>2.11.8</scala.version>
        <spark.version>2.4.4</spark.version>
        <hadoop.version>2.9.2</hadoop.version>
        <encoding>UTF-8</encoding>
    </properties>

    <repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
        </repository>
    </repositories>

    <dependencies>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <!-- Hadoop artifacts share the version declared in the hadoop.version property. -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.38</version>
        </dependency>
        <dependency>
            <groupId>com.typesafe</groupId>
            <artifactId>config</artifactId>
            <version>1.2.0</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>redis.clients</groupId>
            <artifactId>jedis</artifactId>
            <version>2.8.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>2.0.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.scalikejdbc</groupId>
            <artifactId>scalikejdbc_2.11</artifactId>
            <version>2.5.0</version>
        </dependency>
        <dependency>
            <groupId>org.scalikejdbc</groupId>
            <artifactId>scalikejdbc-config_2.11</artifactId>
            <version>2.5.0</version>
        </dependency>
        <!-- 解析IP地址 -->
        <dependency>
            <groupId>com.ggstar</groupId>
            <artifactId>ipdatabase</artifactId>
            <version>1.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>3.14</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>3.14</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.2.0-cdh5.7.0</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.2.0-cdh5.7.0</version>
        </dependency>

    </dependencies>


    <build>
        <plugins>
            <!-- java编译插件-->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.5.1</version>
            </plugin>

            <!-- scala编译插件-->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                        <configuration>
                            <args>
                                <arg>-dependencyfile</arg>
                                <arg>${project.build.directory}/.scala_dependencies</arg>
                            </args>
                        </configuration>
                    </execution>
                </executions>
            </plugin>

            <!-- Builds the *-jar-with-dependencies.jar that is handed to spark-submit. -->
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <archive>
                        <manifest>
                            <!-- Entry point of this project (previously pointed at a class from another project). -->
                            <mainClass>com.qwx.flumeAndKafka.main.SparkStreamingApp</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
            </plugin>

        </plugins>
    </build>
</project>

code_xuan

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
增量数据->flume->kafka->sparkStreaming->hbase

模拟日志生成代码：#coding=UTF-8import randomimport timeurl_paths=[ "class/112.html", "class/128.html", "class/145.html", "class/130.html", "class/146.html", "class/131.html", ...
复制链接

扫一扫

专栏目录