1. Writing the HBase utility class
package HBaseDao;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;

import java.io.IOException;

/**
 * Utility class for HBase operations.
 */
public class HBaseUtils {

    HBaseAdmin admin = null;
    Configuration configuration = null;

    /*
     * Private constructor (singleton).
     */
    private HBaseUtils() {
        // HBaseConfiguration.create() loads hbase-default.xml / hbase-site.xml;
        // a plain new Configuration() would miss the HBase client defaults.
        configuration = HBaseConfiguration.create();
        configuration.set("hbase.zookeeper.quorum", "qyl01,qyl02,qyl03");
        configuration.set("hbase.rootdir", "hdfs:///hbase");
        try {
            admin = new HBaseAdmin(configuration);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    private static HBaseUtils instance = null;

    // synchronized: getInstance() is called from multiple executor task threads.
    public static synchronized HBaseUtils getInstance() {
        if (null == instance) {
            instance = new HBaseUtils();
        }
        return instance;
    }

    /*
     * Get an HTable handle for the given table name.
     * @param tableName table name
     * @return the HTable, or null if it could not be opened
     */
    public HTable getTable(String tableName) {
        HTable table = null;
        try {
            table = new HTable(configuration, tableName);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return table;
    }

    /**
     * Write a single cell to an HBase table.
     * @param tableName table name
     * @param rowkey    row key
     * @param cf        column family
     * @param column    column qualifier
     * @param value     value to write
     */
    public void put(String tableName, String rowkey, String cf, String column, String value) {
        HTable table = getTable(tableName);
        Put put = new Put(rowkey.getBytes());
        // addColumn replaces the Put.add(byte[], byte[], byte[]) overload,
        // which is deprecated in the 1.x client.
        put.addColumn(cf.getBytes(), column.getBytes(), value.getBytes());
        try {
            table.put(put);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /*
     * Smoke test: insert one record into HBase.
     */
    /*
    public static void main(String[] args) {
        String tableName = "course_clickcount";
        String rowkey = "20191111_188";
        String cf = "info";
        String column = "click_count";
        String value = "2";
        HBaseUtils.getInstance().put(tableName, rowkey, cf, column, value);
    }
    */
}
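The code above assumes the target table already exists. If it does not, a one-off helper along these lines can create it; this is a minimal sketch (the object name CreateCourseClickTable is illustrative, and the quorum, table, and column-family names mirror the assumptions above):

package HBaseDao

import org.apache.hadoop.hbase.{HColumnDescriptor, HTableDescriptor, HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.HBaseAdmin

/*
 * One-off helper: create the course_clickcount table if it is missing.
 * A sketch only -- adjust quorum/table/family names to your cluster.
 */
object CreateCourseClickTable {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "qyl01,qyl02,qyl03")
    val admin = new HBaseAdmin(conf)
    val table = TableName.valueOf("course_clickcount")
    if (!admin.tableExists(table)) {
      val desc = new HTableDescriptor(table)
      desc.addFamily(new HColumnDescriptor("info")) // column family used by the DAO
      admin.createTable(desc)
    }
    admin.close()
  }
}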
2. Writing the HBase DAO class
package HBaseDao

import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.util.Bytes

import scala.collection.mutable.ListBuffer

/*
 * DAO for the course click-count table in HBase.
 */
object ClickCourseCountDao {

  /*
   * HBase parameters -- adjust to your own setup.
   */
  val tableName = "course_clickcount"
  val cf = "info"
  val column = "clickcount"

  /*
   * Persist a batch of results by incrementing the per-course counters.
   */
  def save(list: ListBuffer[ClickCoursCount]): Unit = {
    val htable = HBaseUtils.getInstance().getTable(tableName)
    for (clk <- list) {
      htable.incrementColumnValue(
        clk.dayCourse.getBytes(),
        cf.getBytes(),
        column.getBytes(),
        clk.clickCount
      )
    }
  }

  /*
   * Read back the counter value stored under the given rowkey.
   */
  def count(dayCourse: String): Long = {
    val htable = HBaseUtils.getInstance().getTable(tableName)
    val get = new Get(dayCourse.getBytes())
    val value = htable.get(get).getValue(cf.getBytes(), column.getBytes())
    if (null == value) {
      0L
    } else {
      Bytes.toLong(value)
    }
  }

  def main(args: Array[String]): Unit = {
    val listbuffer = new ListBuffer[ClickCoursCount]
    /*
     * Insert test data: the three increments below should leave the
     * counter for rowkey 20191111_88 at 5.
     */
    listbuffer.append(ClickCoursCount("20191111_88", 1L))
    listbuffer.append(ClickCoursCount("20191111_88", 2L))
    listbuffer.append(ClickCoursCount("20191111_88", 2L))
    save(listbuffer)
    println(count("20191111_88"))
  }
}

/*
 * Entity class for a per-day, per-course click count.
 */
case class ClickCoursCount(dayCourse: String, clickCount: Long)
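One caveat: incrementColumnValue stores the counter as an 8-byte long, which is why count decodes it with Bytes.toLong. Writing the same cell through HBaseUtils.put (which stores the string bytes of the value) would break that decoding. A quick illustration of the difference (the object name EncodingCheck is just for the sketch):

import org.apache.hadoop.hbase.util.Bytes

/*
 * Counter cells are 8-byte longs, not strings.
 */
object EncodingCheck {
  def main(args: Array[String]): Unit = {
    val asCounter = Bytes.toBytes(2L) // 8 bytes -- what incrementColumnValue expects
    val asString  = "2".getBytes      // 1 byte  -- Bytes.toLong would throw on this
    println(s"${asCounter.length} vs ${asString.length}") // prints: 8 vs 1
  }
}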
3. Writing the Spark Streaming job: read from Kafka, write to HBase
package com.bonc.qyl.Spark

import HBaseDao.{ClickCoursCount, ClickCourseCountDao}
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.mutable.ListBuffer

/*
 * Flume + Kafka + Spark Streaming + HBase
 */
object ProjectStreaming {
  def main(args: Array[String]): Unit = {
    /*
     * In a real project the brokers and topics would be passed in as
     * arguments; that is omitted in this demo:
     * if (args.length != 2) {
     *   System.err.println("Usage ProjectStreaming: <brokers> <topics>")
     *   System.exit(1)
     * }
     */

    /*
     * Set up the streaming context.
     */
    System.setProperty("HADOOP_USER_NAME", "qyl")
    val conf = new SparkConf().setMaster("local[2]").setAppName("ProjectStreaming")
    val ssc = new StreamingContext(conf, Seconds(5))
    ssc.checkpoint("hdfs:///flume-kafka-direct")

    /*
     * Read data from Kafka.
     */
    val kafkaParams = Map[String, String](
      "metadata.broker.list" -> "qyl01:9092,qyl02:9092,qyl03:9092",
      "auto.offset.reset" -> "smallest")
    val topics = Set("flume-kafka-sparkStreaming-HBase1")
    val kafkaDStream: DStream[String] = KafkaUtils.createDirectStream
      [String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics).map(_._2)

    /*
     * Parse and filter. Records are tab-separated, e.g.:
     * 132.168.89.224  2018-07-13 05:53:02  "GET /class/145.html HTTP/1.1"  200  https://search.yahoo.com/search?p=Flink实战
     */
    val cleanData: DStream[ClickLog] = kafkaDStream.map { x =>
      val strArr = x.split("\t")
      val ip = strArr(0)
      val time = strArr(1).substring(0, 10).trim() // date part, e.g. 2018-07-13
      val refer = strArr(2).split(" ")(1)          // requested path from the request line, e.g. /class/145.html
      val status = strArr(3).toInt
      // strArr(4) is the HTTP referer; reduce it to its host part.
      val searchArr = strArr(4).replaceAll("//", "/").split("/")
      var searchUrl = ""
      if (searchArr.length > 2) {
        searchUrl = searchArr(1)
      } else {
        searchUrl = searchArr(0)
      }
      (ip, time, refer, status, searchUrl)
    }.filter(_._3.startsWith("/class")).map { x =>
      // Extract the course id from the path, e.g. /class/145.html -> 145
      val referStr = x._3.split("/")(2)
      val refer = referStr.substring(0, referStr.lastIndexOf("."))
      ClickLog(x._1, x._2, refer, x._4, x._5)
    }

    /*
     * Requirement: cumulative click count per course per day.
     */
    cleanData.map(x => (x.time + "_" + x.refer, 1)).reduceByKey(_ + _).foreachRDD { rdd =>
      rdd.foreachPartition { rddPartition =>
        val list = new ListBuffer[ClickCoursCount]
        rddPartition.foreach { pair =>
          list.append(ClickCoursCount(pair._1, pair._2))
        }
        /*
         * Write the batch to HBase.
         */
        ClickCourseCountDao.save(list)
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }
}

case class ClickLog(ip: String, time: String, refer: String, status: Int, searchUrl: String)
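The field extraction above can be sanity-checked outside the stream. A small sketch against the sample record (the object name ParseCheck is illustrative; note the fields are tab-separated):

object ParseCheck {
  def main(args: Array[String]): Unit = {
    // One sample record, tab-separated as produced by the Flume/Kafka pipeline.
    val line = "132.168.89.224\t2018-07-13 05:53:02\t\"GET /class/145.html HTTP/1.1\"\t200\thttps://search.yahoo.com/search?p=Flink"
    val arr = line.split("\t")
    val time = arr(1).substring(0, 10).trim()                // 2018-07-13
    val path = arr(2).split(" ")(1)                          // /class/145.html
    val last = path.split("/")(2)                            // 145.html
    val courseId = last.substring(0, last.lastIndexOf(".")) // 145
    println(s"rowkey = ${time}_$courseId")                   // rowkey = 2018-07-13_145
  }
}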
4. The pom.xml file
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.bonc.qyl.Spark</groupId>
    <artifactId>Kafka_SparkStreaming_Hbase</artifactId>
    <version>1.0-SNAPSHOT</version>
    <inceptionYear>2008</inceptionYear>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <encoding>UTF-8</encoding>
        <scala.version>2.11.8</scala.version>
        <spark.version>2.3.2</spark.version>
        <hadoop.version>2.7.7</hadoop.version>
        <mysql.version>5.1.46</mysql.version>
        <kafka.version>1.1.0</kafka.version>
        <junit.version>4.12</junit.version>
        <streaming.kafka.version>2.3.2</streaming.kafka.version>
        <scala.compat.version>2.11</scala.compat.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-graphx_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- Spark Streaming / Kafka integration -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
            <version>${streaming.kafka.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-flume_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>${mysql.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/junit/junit -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>${junit.version}</version>
            <scope>test</scope>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-client -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.2.6</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.2.6</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/com.101tec/zkclient -->
        <dependency>
            <groupId>com.101tec</groupId>
            <artifactId>zkclient</artifactId>
            <version>0.3</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.zookeeper/zookeeper -->
        <!-- Note: no <type>pom</type> here -- ZooKeeper ships as a regular jar,
             and declaring it as type pom would keep its classes off the classpath. -->
        <dependency>
            <groupId>org.apache.zookeeper</groupId>
            <artifactId>zookeeper</artifactId>
            <version>3.4.12</version>
        </dependency>
    </dependencies>
    <build>
        <pluginManagement>
            <plugins>
                <plugin>
                    <groupId>net.alchim31.maven</groupId>
                    <artifactId>scala-maven-plugin</artifactId>
                    <version>3.2.2</version>
                </plugin>
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <version>3.5.1</version>
                </plugin>
            </plugins>
        </pluginManagement>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <executions>
                    <execution>
                        <id>scala-compile-first</id>
                        <phase>process-resources</phase>
                        <goals>
                            <goal>add-source</goal>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                    <execution>
                        <id>scala-test-compile</id>
                        <phase>process-test-resources</phase>
                        <goals>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <executions>
                    <execution>
                        <phase>compile</phase>
                        <goals>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>