1. Writing the HBase utility class
package HBaseDao;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;

import java.io.IOException;

/**
 * Utility class for HBase operations.
 */
public class HBaseUtils {

    HBaseAdmin admin = null;
    Configuration configuration = null;

    /*
     * Private constructor (singleton).
     */
    private HBaseUtils() {
        // HBaseConfiguration.create() loads hbase-default.xml / hbase-site.xml;
        // a plain new Configuration() would miss the HBase client defaults.
        configuration = HBaseConfiguration.create();
        configuration.set("hbase.zookeeper.quorum", "qyl01,qyl02,qyl03");
        configuration.set("hbase.rootdir", "hdfs:///hbase");
        try {
            admin = new HBaseAdmin(configuration);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    private static HBaseUtils instance = null;

    // synchronized: getInstance() is called from multiple executor task threads.
    public static synchronized HBaseUtils getInstance() {
        if (null == instance) {
            instance = new HBaseUtils();
        }
        return instance;
    }

    /*
     * Get an HTable handle for the given table name.
     * @param tableName table name
     * @return the HTable, or null if it could not be opened
     */
    public HTable getTable(String tableName) {
        HTable table = null;
        try {
            table = new HTable(configuration, tableName);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return table;
    }

    /**
     * Write a single cell to an HBase table.
     * @param tableName table name
     * @param rowkey    row key
     * @param cf        column family
     * @param column    column qualifier
     * @param value     value to write
     */
    public void put(String tableName, String rowkey, String cf, String column, String value) {
        HTable table = getTable(tableName);
        Put put = new Put(rowkey.getBytes());
        // addColumn replaces the Put.add(byte[], byte[], byte[]) overload,
        // which is deprecated in the 1.x client.
        put.addColumn(cf.getBytes(), column.getBytes(), value.getBytes());
        try {
            table.put(put);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /*
     * Smoke test: insert one record into HBase.
     */
    /*
    public static void main(String[] args) {
        String tableName = "course_clickcount";
        String rowkey = "20191111_188";
        String cf = "info";
        String column = "click_count";
        String value = "2";
        HBaseUtils.getInstance().put(tableName, rowkey, cf, column, value);
    }
    */
}
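The code above assumes the target table already exists. If it does not, a one-off helper along these lines can create it; this is a minimal sketch (the object name CreateCourseClickTable is illustrative, and the quorum, table, and column-family names mirror the assumptions above):

package HBaseDao

import org.apache.hadoop.hbase.{HColumnDescriptor, HTableDescriptor, HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.HBaseAdmin

/*
 * One-off helper: create the course_clickcount table if it is missing.
 * A sketch only -- adjust quorum/table/family names to your cluster.
 */
object CreateCourseClickTable {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "qyl01,qyl02,qyl03")
    val admin = new HBaseAdmin(conf)
    val table = TableName.valueOf("course_clickcount")
    if (!admin.tableExists(table)) {
      val desc = new HTableDescriptor(table)
      desc.addFamily(new HColumnDescriptor("info")) // column family used by the DAO
      admin.createTable(desc)
    }
    admin.close()
  }
}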
2. Writing the HBase DAO class
package HBaseDao

import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.util.Bytes

import scala.collection.mutable.ListBuffer

/*
 * DAO for the course click-count table in HBase.
 */
object ClickCourseCountDao {

  /*
   * HBase parameters -- adjust to your own setup.
   */
  val tableName = "course_clickcount"
  val cf = "info"
  val column = "clickcount"

  /*
   * Persist a batch of results by incrementing the per-course counters.
   */
  def save(list: ListBuffer[ClickCoursCount]): Unit = {
    val htable = HBaseUtils.getInstance().getTable(tableName)
    for (clk <- list) {
      htable.incrementColumnValue(
        clk.dayCourse.getBytes(),
        cf.getBytes(),
        column.getBytes(),
        clk.clickCount
      )
    }
  }

  /*
   * Read back the counter value stored under the given rowkey.
   */
  def count(dayCourse: String): Long = {
    val htable = HBaseUtils.getInstance().getTable(tableName)
    val get = new Get(dayCourse.getBytes())
    val value = htable.get(get).getValue(cf.getBytes(), column.getBytes())
    if (null == value) {
      0L
    } else {
      Bytes.toLong(value)
    }
  }

  def main(args: Array[String]): Unit = {
    val listbuffer = new ListBuffer[ClickCoursCount]
    /*
     * Insert test data: the three increments below should leave the
     * counter for rowkey 20191111_88 at 5.
     */
    listbuffer.append(ClickCoursCount("20191111_88", 1L))
    listbuffer.append(ClickCoursCount("20191111_88", 2L))
    listbuffer.append(ClickCoursCount("20191111_88", 2L))
    save(listbuffer)
    println(count("20191111_88"))
  }
}

/*
 * Entity class for a per-day, per-course click count.
 */
case class ClickCoursCount(dayCourse: String, clickCount: Long)
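One caveat: incrementColumnValue stores the counter as an 8-byte long, which is why count decodes it with Bytes.toLong. Writing the same cell through HBaseUtils.put (which stores the string bytes of the value) would break that decoding. A quick illustration of the difference (the object name EncodingCheck is just for the sketch):

import org.apache.hadoop.hbase.util.Bytes

/*
 * Counter cells are 8-byte longs, not strings.
 */
object EncodingCheck {
  def main(args: Array[String]): Unit = {
    val asCounter = Bytes.toBytes(2L) // 8 bytes -- what incrementColumnValue expects
    val asString  = "2".getBytes      // 1 byte  -- Bytes.toLong would throw on this
    println(s"${asCounter.length} vs ${asString.length}") // prints: 8 vs 1
  }
}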
3. Writing the Spark Streaming job: read from Kafka, write to HBase
package com.bonc.qyl.Spark

import HBaseDao.{ClickCoursCount, ClickCourseCountDao}
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.mutable.ListBuffer

/*
 * Flume + Kafka + Spark Streaming + HBase
 */
object ProjectStreaming {
  def main(args: Array[String]): Unit = {
    /*
     * In a real project the brokers and topics would be passed in as
     * arguments; that is omitted in this demo:
     * if (args.length != 2) {
     *   System.err.println("Usage ProjectStreaming: <brokers> <topics>")
     *   System.exit(1)
     * }
     */

    /*
     * Set up the streaming context.
     */
    System.setProperty("HADOOP_USER_NAME", "qyl")
    val conf = new SparkConf().setMaster("local[2]").setAppName("ProjectStreaming")
    val ssc = new StreamingContext(conf, Seconds(5))
    ssc.checkpoint("hdfs:///flume-kafka-direct")

    /*
     * Read data from Kafka.
     */
    val kafkaParams = Map[String, String](
      "metadata.broker.list" -> "qyl01:9092,qyl02:9092,qyl03:9092",
      "auto.offset.reset" -> "smallest")
    val topics = Set("flume-kafka-sparkStreaming-HBase1")
    val kafkaDStream: DStream[String] = KafkaUtils.createDirectStream
      [String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics).map(_._2)

    /*
     * Parse and filter. Records are tab-separated, e.g.:
     * 132.168.89.224  2018-07-13 05:53:02  "GET /class/145.html HTTP/1.1"  200  https://search.yahoo.com/search?p=Flink实战
     */
    val cleanData: DStream[ClickLog] = kafkaDStream.map { x =>
      val strArr = x.split("\t")
      val ip = strArr(0)
      val time = strArr(1).substring(0, 10).trim() // date part, e.g. 2018-07-13
      val refer = strArr(2).split(" ")(1)          // requested path from the request line, e.g. /class/145.html
      val status = strArr(3).toInt
      // strArr(4) is the HTTP referer; reduce it to its host part.
      val searchArr = strArr(4).replaceAll("//", "/").split("/")
      var searchUrl = ""
      if (searchArr.length > 2) {
        searchUrl = searchArr(1)
      } else {
        searchUrl = searchArr(0)
      }
      (ip, time, refer, status, searchUrl)
    }.filter(_._3.startsWith("/class")).map { x =>
      // Extract the course id from the path, e.g. /class/145.html -> 145
      val referStr = x._3.split("/")(2)
      val refer = referStr.substring(0, referStr.lastIndexOf("."))
      ClickLog(x._1, x._2, refer, x._4, x._5)
    }

    /*
     * Requirement: cumulative click count per course per day.
     */
    cleanData.map(x => (x.time + "_" + x.refer, 1)).reduceByKey(_ + _).foreachRDD { rdd =>
      rdd.foreachPartition { rddPartition =>
        val list = new ListBuffer[ClickCoursCount]
        rddPartition.foreach { pair =>
          list.append(ClickCoursCount(pair._1, pair._2))
        }
        /*
         * Write the batch to HBase.
         */
        ClickCourseCountDao.save(list)
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }
}

case class ClickLog(ip: String, time: String, refer: String, status: Int, searchUrl: String)
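The field extraction above can be sanity-checked outside the stream. A small sketch against the sample record (the object name ParseCheck is illustrative; note the fields are tab-separated):

object ParseCheck {
  def main(args: Array[String]): Unit = {
    // One sample record, tab-separated as produced by the Flume/Kafka pipeline.
    val line = "132.168.89.224\t2018-07-13 05:53:02\t\"GET /class/145.html HTTP/1.1\"\t200\thttps://search.yahoo.com/search?p=Flink"
    val arr = line.split("\t")
    val time = arr(1).substring(0, 10).trim()                // 2018-07-13
    val path = arr(2).split(" ")(1)                          // /class/145.html
    val last = path.split("/")(2)                            // 145.html
    val courseId = last.substring(0, last.lastIndexOf(".")) // 145
    println(s"rowkey = ${time}_$courseId")                   // rowkey = 2018-07-13_145
  }
}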
4. The pom.xml file
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.bonc.qyl.Spark</groupId>
    <artifactId>Kafka_SparkStreaming_Hbase</artifactId>
    <version>1.0-SNAPSHOT</version>
    <inceptionYear>2008</inceptionYear>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <encoding>UTF-8</encoding>
        <scala.version>2.11.8</scala.version>
        <spark.version>2.3.2</spark.version>
        <hadoop.version>2.7.7</hadoop.version>
        <mysql.version>5.1.46</mysql.version>
        <kafka.version>1.1.0</kafka.version>
        <junit.version>4.12</junit.version>
        <streaming.kafka.version>2.3.2</streaming.kafka.version>
        <scala.compat.version>2.11</scala.compat.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-graphx_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- Spark Streaming / Kafka integration -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
            <version>${streaming.kafka.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-flume_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>${mysql.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/junit/junit -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>${junit.version}</version>
            <scope>test</scope>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-client -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.2.6</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.2.6</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/com.101tec/zkclient -->
        <dependency>
            <groupId>com.101tec</groupId>
            <artifactId>zkclient</artifactId>
            <version>0.3</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.zookeeper/zookeeper -->
        <!-- Note: no <type>pom</type> here -- ZooKeeper ships as a regular jar,
             and declaring it as type pom would keep its classes off the classpath. -->
        <dependency>
            <groupId>org.apache.zookeeper</groupId>
            <artifactId>zookeeper</artifactId>
            <version>3.4.12</version>
        </dependency>
    </dependencies>
    <build>
        <pluginManagement>
            <plugins>
                <plugin>
                    <groupId>net.alchim31.maven</groupId>
                    <artifactId>scala-maven-plugin</artifactId>
                    <version>3.2.2</version>
                </plugin>
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <version>3.5.1</version>
                </plugin>
            </plugins>
        </pluginManagement>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <executions>
                    <execution>
                        <id>scala-compile-first</id>
                        <phase>process-resources</phase>
                        <goals>
                            <goal>add-source</goal>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                    <execution>
                        <id>scala-test-compile</id>
                        <phase>process-test-resources</phase>
                        <goals>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <executions>
                    <execution>
                        <phase>compile</phase>
                        <goals>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>