1. Building the realtime module
(1) Step diagram
(2) Add the following jar dependencies to the realtime module's pom.xml:
<dependencies>
    <dependency>
        <groupId>com.study.gmall20200213</groupId>
        <artifactId>gmall20200213-common</artifactId>
        <version>1.0-SNAPSHOT</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_2.11</artifactId>
    </dependency>
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka-clients</artifactId>
        <version>0.10.2.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
    </dependency>
    <dependency>
        <groupId>redis.clients</groupId>
        <artifactId>jedis</artifactId>
        <version>2.9.0</version>
    </dependency>
    <dependency>
        <groupId>io.searchbox</groupId>
        <artifactId>jest</artifactId>
        <version>5.3.3</version>
    </dependency>
    <dependency>
        <groupId>net.java.dev.jna</groupId>
        <artifactId>jna</artifactId>
        <version>4.5.2</version>
    </dependency>
    <dependency>
        <groupId>org.codehaus.janino</groupId>
        <artifactId>commons-compiler</artifactId>
        <version>2.7.8</version>
    </dependency>
</dependencies>
Importing these dependencies is all that is needed. Note that the Spark artifacts carry no explicit version here; presumably their versions are managed by the parent project's dependencyManagement.
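The DauApp code below references GmallConstant.KAFKA_TOPIC_STARTUP from the common module, which is not listed in this note. A minimal sketch of what that constant class is assumed to contain (the topic name matches the one consumed later with kafka-console-consumer; the real class in the common module may well be Java rather than Scala):
object GmallConstant {
  // Hypothetical sketch; only the constant used in this note is shown.
  val KAFKA_TOPIC_STARTUP = "GMALL_STARTUP"
}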
2. Writing the realtime module code
(1) Create a scala directory under main and mark it as a Sources Root (it shows in blue in IDEA).
(2) Write the code.
DauApp.scala
package com.study.gmall20200213.realtime.app

import com.study.gmall20200213.realtime.util.MyKafkaUtil
import com.study.gmall20200213.common.constant.GmallConstant
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.{Seconds, StreamingContext}

object DauApp {
  def main(args: Array[String]): Unit = {
    // local[*]: run Spark locally using all available cores
    val sparkConf: SparkConf = new SparkConf().setAppName("dau_app").setMaster("local[*]")
    val ssc = new StreamingContext(sparkConf, Seconds(5))
    val inputDStream: InputDStream[ConsumerRecord[String, String]] = MyKafkaUtil.getKafkaStream(GmallConstant.KAFKA_TOPIC_STARTUP, ssc)
    inputDStream.foreachRDD { rdd =>
      println(rdd.map(_.value()).collect().mkString("\n"))
    }
    ssc.start()
    ssc.awaitTermination()
  }
}
MyKafkaUtil.scala
package com.study.gmall20200213.realtime.util

import java.util.Properties
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}

object MyKafkaUtil {
  private val properties: Properties = PropertiesUtil.load("config.properties")
  val broker_list = properties.getProperty("kafka.broker.list")

  // Kafka consumer configuration
  val kafkaParam = Map(
    "bootstrap.servers" -> broker_list, // addresses used to bootstrap the connection to the cluster
    "key.deserializer" -> classOf[StringDeserializer],
    "value.deserializer" -> classOf[StringDeserializer],
    // identifies which consumer group this consumer belongs to
    "group.id" -> "gmall_consumer_group",
    // used when there is no initial offset, or the current offset no longer exists on any server;
    // "latest" resets the offset to the latest one
    "auto.offset.reset" -> "latest",
    // if true, the consumer's offsets are committed automatically in the background,
    // which can lose data if Kafka goes down; if false, offsets must be maintained manually
    "enable.auto.commit" -> (true: java.lang.Boolean)
  )

  // Create a DStream that returns the received input data.
  // LocationStrategies: create consumers for the given topics and cluster addresses.
  // LocationStrategies.PreferConsistent: distribute partitions evenly across all Executors.
  // ConsumerStrategies: choose how Kafka consumers are created and configured on the Driver and Executors.
  // ConsumerStrategies.Subscribe: subscribe to a list of topics.
  def getKafkaStream(topic: String, ssc: StreamingContext): InputDStream[ConsumerRecord[String, String]] = {
    val dStream = KafkaUtils.createDirectStream[String, String](ssc, LocationStrategies.PreferConsistent, ConsumerStrategies.Subscribe[String, String](Array(topic), kafkaParam))
    dStream
  }
}
PropertiesUtil.scala
package com.study.gmall20200213.realtime.util

import java.io.InputStreamReader
import java.util.Properties

object PropertiesUtil {
  def main(args: Array[String]): Unit = {
    val properties: Properties = PropertiesUtil.load("config.properties")
    println(properties.getProperty("kafka.broker.list"))
  }

  def load(propertiesName: String): Properties = {
    val prop = new Properties()
    // load the properties file from the classpath as UTF-8
    prop.load(new InputStreamReader(Thread.currentThread().getContextClassLoader.getResourceAsStream(propertiesName), "UTF-8"))
    prop
  }
}
RedisUtil.scala
package com.study.gmall20200213.realtime.util

import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig}

object RedisUtil {
  var jedisPool: JedisPool = null

  def getJedisClient: Jedis = {
    if (jedisPool == null) {
      // println("creating a connection pool")
      val config = PropertiesUtil.load("config.properties")
      val host = config.getProperty("redis.host")
      val port = config.getProperty("redis.port")

      val jedisPoolConfig = new JedisPoolConfig()
      jedisPoolConfig.setMaxTotal(100)              // maximum number of connections
      jedisPoolConfig.setMaxIdle(20)                // maximum idle connections
      jedisPoolConfig.setMinIdle(20)                // minimum idle connections
      jedisPoolConfig.setBlockWhenExhausted(true)   // whether to wait when the pool is exhausted
      jedisPoolConfig.setMaxWaitMillis(500)         // maximum wait time when busy, in milliseconds
      jedisPoolConfig.setTestOnBorrow(true)         // validate each connection when it is borrowed
      jedisPool = new JedisPool(jedisPoolConfig, host, port.toInt)
    }
    // println(s"jedisPool.getNumActive = ${jedisPool.getNumActive}")
    // println("got a connection")
    jedisPool.getResource
  }
}
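As a quick sanity check (not part of the original note), RedisUtil could be exercised like this, assuming the Redis host and port in config.properties are reachable:
object RedisUtilTest {
  def main(args: Array[String]): Unit = {
    val jedis = RedisUtil.getJedisClient
    jedis.sadd("dau:test", "mid_1")        // add a member to a test set
    println(jedis.smembers("dau:test"))    // print the set contents
    jedis.close()                          // return the connection to the pool
  }
}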
config.properties
# Kafka configuration
kafka.broker.list=hadoop5:9092,hadoop6:9092,hadoop7:9092
# Redis configuration
redis.host=hadoop5
redis.port=6379
Run the PropertiesUtil entry class (its main method); the console prints the broker list.
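With the config.properties above on the classpath, the printed value should simply be the configured broker list:
hadoop5:9092,hadoop6:9092,hadoop7:9092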
Next, move the log4j configuration file from the gamll0315-logger submodule into the resources directory of the gmall0315-realtime submodule (presumably so that Spark's own logging stays at the error level and does not flood the console).
log4j.properties
# Output to the console.
# "log4j.appender" is fixed syntax; the trailing ".atguigu.MyConsole" is a custom appender name.
log4j.appender.atguigu.MyConsole=org.apache.log4j.ConsoleAppender
# there are two possible console targets (System.out and System.err)
log4j.appender.atguigu.MyConsole.target=System.out
# layout
log4j.appender.atguigu.MyConsole.layout=org.apache.log4j.PatternLayout
# custom pattern, for output such as: logger.info(jsonObject.toJSONString())
log4j.appender.atguigu.MyConsole.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %10p (%c:%M) - %m%n
# file output (rolled daily)
log4j.appender.atguigu.File=org.apache.log4j.DailyRollingFileAppender
# output file path and file name
log4j.appender.atguigu.File.file=/applog/gamll0315-logger/log/app.log
# date suffix: a new log file is started each day, and the old log is kept under a renamed file
log4j.appender.atguigu.File.DatePattern='.'yyyy-MM-dd
# layout: one log record per line, nothing else
log4j.appender.atguigu.File.layout=org.apache.log4j.PatternLayout
log4j.appender.atguigu.File.layout.ConversionPattern=%m%n
# log level
# rootLogger applies to all classes
log4j.rootLogger=error,atguigu.MyConsole
# log levels (from low to high): trace debug info warn error fatal
# the lower the configured level, the more output is produced
Next, start the programs: first the JsonMocker data simulator, then DauApp.
Let's first run a Kafka console consumer to check:
[root@flink102 kafka-2.11]# bin/kafka-console-consumer.sh --bootstrap-server Flink102:9092 --topic GMALL_STARTUP
Then start JsonMocker, and the data starts coming in.
Check the data log files:
[root@flink102 applog]# ll
total 0
drwxr-xr-x 3 root root 17 Mar 16 11:10 gamll0315-logger
drwxr-xr-x 2 root root 49 Mar 16 11:07 gmall0315
[root@flink102 applog]# cd gamll0315-logger/
[root@flink102 gamll0315-logger]# ll
total 0
drwxr-xr-x 2 root root 21 Mar 16 16:10 log
[root@flink102 gamll0315-logger]# cd log/
[root@flink102 log]# ll
total 2040
// today's data (March 16) has already been updated
-rw-r--r-- 1 root root 2086424 Mar 16 16:10 app.log
[root@flink102 log]#
To inspect the log data, run:
[root@flink102 log]# tail -20f app.log
Check the current processes:
[root@flink102 ~]# jps -l
1745 org.apache.zookeeper.server.quorum.QuorumPeerMain
1781 kafka.Kafka
2344 kafka.tools.ConsoleConsumer
2074 gamll0315-logger-0.0.1-SNAPSHOT.jar
2811 sun.tools.jps.Jps
[root@flink102 ~]#
Restart: start JsonMocker first, then DauApp. If DauApp fails at startup complaining that no Spark master URL has been set (which happens whenever setMaster("local[*]") is missing from the SparkConf, for example when it is mistyped as a second setAppName), it can also be fixed from the run configuration instead of the code:
Open Run > Edit Configurations in the menu bar, select the DauApp configuration on the left, and in the VM options field on the right enter -Dspark.master=local[*]; click Apply, then OK, and the program runs normally.
Note: enter -Dspark.master=local[*] without any spaces inside it, otherwise the JVM does not parse it as a single property and the master is still not set.
The output below shows that data is arriving, so the pipeline is now wired through end to end.
3. Writing the business code
(1) Create the case class
Startuplog.scala
package com.study.gmall0315.realtime.bean

case class Startuplog(mid: String,
                      uid: String,
                      appid: String,
                      area: String,
                      os: String,
                      ch: String,
                      logType: String,
                      vs: String,
                      var logDate: String,
                      var logHour: String,
                      var logHourMinute: String,
                      var ts: Long)
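For reference, a startup record of the kind JsonMocker produces, which fastjson maps onto this case class, would look roughly like the line below (the field values are made up for illustration; logDate, logHour and logHourMinute are filled in later from ts):
{"mid":"mid_245","uid":"283","appid":"gmall0315","area":"beijing","os":"android","ch":"appstore","logType":"startup","vs":"1.1.2","ts":1584345600000}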
DauApp.scala
package com.study.gmall0315.realtime.app

import java.text.SimpleDateFormat
import java.util.Date
import com.alibaba.fastjson.JSON
import com.study.gmall0315.common.constant.GmallConstant
import com.study.gmall0315.realtime.bean.Startuplog
import com.study.gmall0315.realtime.util.{MyKafkaUtil, RedisUtil}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import redis.clients.jedis.Jedis

object DauApp {
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setAppName("dau_app").setMaster("local[*]")
    val ssc = new StreamingContext(sparkConf, Seconds(5))
    val inputDStream: InputDStream[ConsumerRecord[String, String]] = MyKafkaUtil.getKafkaStream(GmallConstant.KAFKA_TOPIC_STARTUP, ssc)

    /* inputDStream.foreachRDD { rdd =>
         println(rdd.map(_.value()).collect().mkString("\n"))
       }
     */

    // transformation: parse the JSON and fill in the date fields
    val startuplogStream: DStream[Startuplog] = inputDStream.map { record =>
      val jsonStr = record.value()
      val startuplog = JSON.parseObject(jsonStr, classOf[Startuplog])

      val date = new Date(startuplog.ts) // convert the timestamp into a formatted String
      val dateStr: String = new SimpleDateFormat("yyyy-MM-dd HH:mm").format(date)
      val dateArr: Array[String] = dateStr.split(" ")
      startuplog.logDate = dateArr(0)
      startuplog.logHour = dateArr(1).split(":")(0)
      startuplog.logHourMinute = dateArr(1)
      startuplog
    }

    // use Redis for dedup filtering
    // save the mids to Redis
    startuplogStream.foreachRDD { rdd =>
      /**
        * driver side
        * redis type: set
        * key: dau:2020-03-16   value: mids
        */
      rdd.foreachPartition { startuplogItr =>
        // executor side
        val jedis: Jedis = RedisUtil.getJedisClient
        for (startuplog <- startuplogItr) {
          val key = "dau:" + startuplog.logDate
          val value = startuplog.mid
          jedis.sadd(key, value)
        }
        jedis.close()
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
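Note that the code above only writes each mid into the day's Redis set; the dedup filter itself (dropping mids already seen today) is not implemented yet. As an illustration only (not the final course code), one common way is to filter inside each partition with sismember before the save step:
    // Illustrative sketch: keep only mids not yet recorded in Redis for the day.
    val filteredDStream: DStream[Startuplog] = startuplogStream.mapPartitions { startuplogItr =>
      val jedis: Jedis = RedisUtil.getJedisClient          // one connection per partition, executor side
      val filtered = startuplogItr.filter { startuplog =>
        val key = "dau:" + startuplog.logDate
        !jedis.sismember(key, startuplog.mid)              // true only for first-seen mids
      }.toList                                             // materialize before closing the connection
      jedis.close()
      filtered.iterator
    }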
(2) Preparation on the virtual machine
Start Redis first:
[root@flink102 applog]# service redisd start
[root@flink102 applog]# redis-cli
127.0.0.1:6379>
// flushall clears the data in all Redis databases
127.0.0.1:6379> flushall
OK
// list all keys; an empty result confirms the flush worked and no data is left
127.0.0.1:6379> keys *
(empty list or set)
(3) Run the code in IDEA
Start the JsonMocker data simulator first, then DauApp.
Check the result in the Redis client:
127.0.0.1:6379> keys *
1) "dau:2020-03-17"
127.0.0.1:6379>
To display the members of the set, run smembers followed by the key, as follows:
127.0.0.1:6379> smembers dau:2020-03-17
1) "mid_245"
2) "mid_421"
3) "mid_177"
4) "mid_479"
5) "mid_401"
6) "mid_1"
7) "mid_288"
8) "mid_133"
9) "mid_454"
10) "mid_241"
11) "mid_385"
12) "mid_17"
13) "mid_238"
14) "mid_486"
15) "mid_152"
16) "mid_99"
17) "mid_49"
18) "mid_27"
19) "mid_31"
20) "mid_215"
21) "mid_194"
22) "mid_402"
23) "mid_95"
24) "mid_268"
25) "mid_98"
26) "mid_111"
27) "mid_162"
28) "mid_403"
29) "mid_451"
30) "mid_335"
31) "mid_70"
32) "mid_300"
33) "mid_463"
34) "mid_466"
35) "mid_91"
36) "mid_353"
37) "mid_433"
38) "mid_462"
39) "mid_405"
40) "mid_58"
41) "mid_382"
42) "mid_469"
43) "mid_370"
44) "mid_281"
45) "mid_131"
46) "mid_442"
..........
..........
498) "mid_118"
499) "mid_183"
500) "mid_446"
127.0.0.1:6379>
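Since each mid is stored at most once in the set, the day's DAU count can be read directly with scard (for the listing above, 500 members):
127.0.0.1:6379> scard dau:2020-03-17
(integer) 500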