日志项目在线分析每个城市的点击数
从kafka里面消费,由SparkStreaming来处理,存到hbase里面
问题:这里使用了 table.incrementColumnValue 方法,可以对指定列的值做原子自增
如果对于这种问题,我也想用put来做呢
1、使用updateStateByKey ,然后put到hbase,相同的列,版本保存最新的,
但是局限性,它每次会把历史所有的都会输出,5000个城市只有2个有变更,全要重新导入,浪费资源
2、使用mapWithState 只会把更新的输出,解决
package com.atguigu.online
import java.io.File
import java.text.SimpleDateFormat
import java.util.Properties
import com.atguigu.model.StartupReportLogs
import com.atguigu.utils.{HBaseUtils, JsonUtils, ZookeeperUtils}
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.hadoop.hbase.client.Table
import org.apache.hadoop.hbase.util.Bytes
import org.apache.kafka.clients.consumer
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{KafkaCluster, KafkaUtils}
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
* @author wade
* @create 2019-03-22 15:21
*/
object KafkaToSparkToHbase {

  /**
   * Consumes startup-report log JSON from Kafka with a direct stream, counts
   * clicks per city per day, and atomically increments the matching HBase
   * counter cell (`info:click_count`, row key `<city>_<yyyyMMdd>`).
   *
   * Offsets are read from and committed back to ZooKeeper so the stream can
   * resume where it left off after a restart.
   */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("wawa").setMaster("local[*]")
    val ssc = new StreamingContext(conf, Seconds(3))

    val params = Map(
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "hadoop103:9092,hadoop104:9092,hadoop105:9092",
      "zookeeper.connect" -> "hadoop103:2181,hadoop104:2181,hadoop105:2181",
      ConsumerConfig.GROUP_ID_CONFIG -> "bb"
    )
    val kafkaCluster = new KafkaCluster(params)

    // Resume from the offsets previously committed to ZooKeeper for this
    // consumer group / topic pair.
    val fromOffsets: Map[TopicAndPartition, Long] =
      ZookeeperUtils.getOffsetFromZookeeper(kafkaCluster, "bb", Set("log-analysis"))

    val idStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, String](
      ssc,
      params,
      fromOffsets,
      (message: MessageAndMetadata[String, String]) => message.message()
    )

    idStream.foreachRDD(rdd => {
      // Table is not serializable, so it must be created per partition on the
      // executor rather than captured in the closure from the driver. The
      // SimpleDateFormat is created here too: it is not thread-safe, so one
      // instance per partition/task is the safe choice.
      rdd.foreachPartition(partition => {
        val sdf = new SimpleDateFormat("yyyyMMdd")
        val table: Table = HBaseUtils.getHBaseTabel(new Properties())
        try {
          partition.foreach(record => {
            val startupReportLog: StartupReportLogs = JsonUtils.json2StartupLog(record)
            val date = sdf.format(startupReportLog.getActiveTimeInMs)
            // Row key: <city>_<yyyyMMdd> — one counter cell per city per day.
            val rowKey = s"${startupReportLog.getCity}_$date"
            // Server-side atomic increment: no read-modify-write race, unlike
            // a Get + Put of the same cell.
            table.incrementColumnValue(
              Bytes.toBytes(rowKey),
              Bytes.toBytes("info"),
              Bytes.toBytes("click_count"),
              1L
            )
          })
        } finally {
          // Close in finally so the table (and its underlying connection
          // resources) is released even if a record fails to parse or write.
          table.close()
        }
      })
    })

    // Register the offset commit so processed offsets are written back to
    // ZooKeeper after each batch.
    ZookeeperUtils.offsetToZookeeper(idStream, kafkaCluster, "bb")

    ssc.start()
    ssc.awaitTermination()
  }
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven descriptor for the data-processing module of the log-analysis project. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Inherits groupId/version from the parent aggregator POM. -->
<parent>
<artifactId>log-analysis</artifactId>
<groupId>com.atguigu</groupId>
<version>1.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<groupId>com.atguigu</groupId>
<artifactId>data-processing</artifactId>
<dependencies>
<!-- HBase APIs used by the streaming job (Table, Bytes, incrementColumnValue). -->
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>1.3.2.1</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>1.3.2.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming-kafka-0-8 -->
<!-- 0-8 integration matches the 0.8.x Kafka broker artifacts declared below. -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
<version>2.1.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>2.1.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.kafka/kafka -->
<!-- NOTE(review): kafka is 0.8.2.1 but kafka-clients below is 0.8.2.2 —
     confirm this version mismatch is intentional (ideally align both). -->
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka_2.11</artifactId>
<version>0.8.2.1</version>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>0.8.2.2</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-core</artifactId>
<version>2.8.2</version>
</dependency>
<!-- Hadoop client stack required by HBase and Spark for HDFS access. -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.2</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.7.2</version>
</dependency>
</dependencies>
</project>