Scala_Spark - E-Commerce Platform Offline Analysis Project - Data Generation and Consumption Tests Before Requirement 7
Module 4: Real-Time Ad Traffic Statistics
Tech stack: Spark Streaming, Kafka cluster
kafka.broker.list=node01:9092,node02:9092,node03:9092
kafka.topics=AdRealTimeLog0308
(I) Execution Steps
1) Verify that locally produced data can be sent to Kafka
- Start the ZooKeeper cluster
  Run the following on all three servers to start ZooKeeper:
  cd /export/servers/zookeeper-3.4.5-cdh5.14.0
  bin/zkServer.sh start
  Verify the QuorumPeerMain process is running
- Start the Kafka cluster (default port 9092)
  Start the Kafka service on all three machines:
  [root@node01 servers]# cd /export/servers/kafka_2.11-1.0.0/
  Foreground start: bin/kafka-server-start.sh config/server.properties
  Background start: nohup bin/kafka-server-start.sh config/server.properties > /dev/null 2>&1 &
- Start a consumer on node01
  With all configuration in place:
  [root@node01 ~]# kafka-console-consumer.sh --zookeeper node01:2181 --topic AdRealTimeLog0308
  Using the ConsoleConsumer with old consumer is deprecated and will be removed in a future major release. Consider using the new consumer by passing [bootstrap-server] instead of [zookeeper].
  // waiting to consume
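  As the deprecation warning suggests, the same check also works with the new consumer by passing the broker list instead of ZooKeeper (equivalent command, shown for reference):
  [root@node01 ~]# kafka-console-consumer.sh --bootstrap-server node01:9092,node02:9092,node03:9092 --topic AdRealTimeLog0308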
- Run MockRealTimeData.scala locally to simulate producing real-time data
  // The following appears on node01: cluster-side consumption succeeded, so producing data to Kafka works
  1573145438221 3 3 93 5
  1573145438221 6 6 87 17
  1573145438221 0 0 10 16
  1573145438221 7 7 11 15
  1573145438221 0 0 8 18
  1573145438221 0 0 97 1
2) Verify that data in the Kafka cluster can be consumed locally in IDEA
- With the test above passing, run AdverStat.scala in IDEA while MockRealTimeData.scala keeps producing data to Kafka
  // Local console output: local consumption succeeded, so consuming data from Kafka works
  0000-00-00 00:00:01,200 WARN --- [ main] org.apache.spark.streaming.kafka010.KafkaUtils (line: 66) : overriding receive.buffer.bytes to 65536 see KAFKA-3135
  1573150923605 1 1 93 2
  1573150923605 7 7 19 0
  1573150923605 2 2 72 0
  1573150923605 1 1 3 3
  1573150923605 1 1 44 0
  1573150923605 7 7 65 15
  1573150923605 6 6 64 11
  1573150923605 9 9 20 2
(II) IDEA Code
1) Implementation
1. Main program: AdverStat.scala
import commons.conf.ConfigurationManager
import commons.constant.Constants
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
 * AdverStat.scala
 *
 * Module 4: Real-Time Ad Traffic Statistics
 * Data generation and consumption tests before Requirement 7
 *
 * Tech stack: Spark Streaming, Kafka
 */
object AdverStat {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf().setAppName("adverstat").setMaster("local[*]")
val sparkSession = SparkSession.builder().config(sparkConf).enableHiveSupport().getOrCreate()
// The standard approach would be val streamingContext = StreamingContext.getActiveOrCreate(checkpointDir, func) (see the sketch after this object)
val streamingContext = new StreamingContext(sparkSession.sparkContext,Seconds(5))
// Kafka connection info
val kafka_brokers = ConfigurationManager.config.getString(Constants.KAFKA_BROKERS) // node01:9092,node02:9092,node03:9092
val kafka_topics = ConfigurationManager.config.getString(Constants.KAFKA_TOPICS) // kafka.topics=AdRealTimeLog0308
// Kafka consumer configuration
val kafkaParam = Map(
"bootstrap.servers" -> kafka_brokers,
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "group1",
// auto.offset.reset
// latest: if the group already has a committed offset, use it; otherwise start from the newest data
// earliest: if the group already has a committed offset, use it; otherwise start from the oldest data
// none: if the group already has a committed offset, use it; otherwise throw an error
"auto.offset.reset" -> "latest",
"enable.auto.commit" -> (false: java.lang.Boolean)
)
// Create the DStream
// Every record consumed from Kafka is a message containing a key-value pair
val adRealTimeDStream = KafkaUtils.createDirectStream[String,String](
streamingContext,
// Distribute Kafka partitions evenly across executors
// (the three location strategies are PreferConsistent, PreferBrokers and PreferFixed)
LocationStrategies.PreferConsistent,
// Consumer subscription
ConsumerStrategies.Subscribe[String,String](Array(kafka_topics),kafkaParam)
)
// Take the value of every record in the DStream
// adReadTimeValueDStream: DStream[RDD RDD RDD ...] RDD[String]
// String: timestamp province city userid adid
val adReadTimeValueDStream = adRealTimeDStream.map(item => item.value())
// adRealTimeFilterDStream holds all real-time records whose users are not on the blacklist
val adRealTimeFilterDStream = adReadTimeValueDStream.transform{
logRDD =>
// blackListArray: Array[AdBlacklist]  AdBlacklist: userId
val blackListArray = AdBlacklistDAO.findAll() // connects to the database through the MySQL pool
// userIdArray: Array[Long]  [userId1, userId2, ...]
val userIdArray = blackListArray.map(item => item.userid)
// Filter out records from users already on the blacklist
logRDD.filter{
// log: timestamp province city userid adid
case log =>
val logSplit = log.split(" ")
val userId = logSplit(3).toLong
!userIdArray.contains(userId)
}
}
adRealTimeFilterDStream.foreachRDD(rdd=>rdd.foreach(println(_)))
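// Side note (illustrative, not part of this test): with enable.auto.commit=false the consumed
// offsets are never committed automatically. A common spark-streaming-kafka-0-10 pattern is to
// commit them after each batch has been processed, e.g.:
// import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges}
// adRealTimeDStream.foreachRDD { rdd =>
//   val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
//   adRealTimeDStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
// }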
streamingContext.start()
streamingContext.awaitTermination()
}
}
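As the comment in main notes, a production version would create the StreamingContext through a checkpoint so the job can recover after a restart. A minimal sketch of that pattern, assuming a reachable checkpoint directory (the path below is hypothetical):
// Sketch only: checkpoint-based recovery with StreamingContext.getActiveOrCreate
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object AdverStatCheckpointSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical checkpoint location; any reliable HDFS or local path works
    val checkpointDir = "hdfs://node01:8020/checkpoint/adverstat"

    // Invoked only when no checkpoint exists yet
    def createFunc(): StreamingContext = {
      val sparkConf = new SparkConf().setAppName("adverstat").setMaster("local[*]")
      val ssc = new StreamingContext(sparkConf, Seconds(5))
      ssc.checkpoint(checkpointDir)
      // ... build the Kafka DStream and processing logic here, exactly as in AdverStat ...
      ssc
    }

    // Reuses an active context, recovers from the checkpoint, or creates a new one
    val streamingContext = StreamingContext.getActiveOrCreate(checkpointDir, createFunc _)
    streamingContext.start()
    streamingContext.awaitTermination()
  }
}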
2. Simulated real-time data producer: MockRealTimeData.scala
/*
* MockRealTimeData.scala
*/
import java.util.Properties
import commons.conf.ConfigurationManager
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}
import scala.collection.mutable.ArrayBuffer
import scala.util.Random
object MockRealTimeData {
/**
 * Simulated data
 * timestamp: current time in milliseconds
 * userId: 0 - 99
 * province and city share the same ID: 0 - 9
 * adid: 0 - 19
 * ((0L,"北京","北京"),(1L,"上海","上海"),(2L,"南京","江苏省"),(3L,"广州","广东省"),(4L,"三亚","海南省"),(5L,"武汉","湖北省"),(6L,"长沙","湖南省"),(7L,"西安","陕西省"),(8L,"成都","四川省"),(9L,"哈尔滨","东北省"))
 * Format: timestamp province city userid adid
 * i.e. at some point in time, in some province and city, some user clicked some ad
 */
def generateMockData(): Array[String] = {
val array = ArrayBuffer[String]()
val random = new Random()
// Simulated real-time data:
// timestamp province city userid adid
for (i <- 0 to 50) {
val timestamp = System.currentTimeMillis()
val province = random.nextInt(10)
val city = province
val adid = random.nextInt(20)
val userid = random.nextInt(100)
// Assemble one record
array += timestamp + " " + province + " " + city + " " + userid + " " + adid
}
array.toArray
}
def createKafkaProducer(broker: String): KafkaProducer[String, String] = {
// Create the configuration object
val prop = new Properties()
// Add configuration entries
prop.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, broker)
prop.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")
prop.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")
// Create a Kafka producer from the configuration
new KafkaProducer[String, String](prop)
}
def main(args: Array[String]): Unit = {
// Read the Kafka settings from the configuration file commerce.properties
val broker = ConfigurationManager.config.getString("kafka.broker.list")
val topic = ConfigurationManager.config.getString("kafka.topics")
// Create the Kafka producer
val kafkaProducer = createKafkaProducer(broker)
while (true) {
// Generate random real-time data and send it to the Kafka cluster through the producer
for (item <- generateMockData()) {
kafkaProducer.send(new ProducerRecord[String, String](topic, item))
}
Thread.sleep(5000)
}
}
}
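Each generated record is a space-separated string in the format timestamp province city userid adid. A small sketch (the case class and parser here are illustrative, not part of the project) of turning one record back into typed fields on the consumer side:
// Illustrative only: parse one simulated record back into typed fields
case class AdClickRecord(timestamp: Long, province: Int, city: Int, userid: Long, adid: Long)

object ParseExample {
  def parse(line: String): AdClickRecord = {
    val f = line.split(" ")
    AdClickRecord(f(0).toLong, f(1).toInt, f(2).toInt, f(3).toLong, f(4).toLong)
  }

  def main(args: Array[String]): Unit = {
    // Prints: AdClickRecord(1573145438221,3,3,93,5)
    println(parse("1573145438221 3 3 93 5"))
  }
}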
3. Database interaction methods (DAOs)
import java.sql.ResultSet
import commons.pool.{CreateMySqlPool, QueryCallback}
import scala.collection.mutable.ArrayBuffer
/**
 * Requirement 7: helper methods for real-time ad blacklist statistics
 */
/**
 * Ad blacklist DAO
 */
object AdBlacklistDAO {
/**
 * Insert a batch of blacklisted users
 *
 * @param adBlacklists
 */
def insertBatch(adBlacklists: Array[AdBlacklist]) {
// Batch insert
val sql = "INSERT INTO ad_blacklist VALUES(?)"
val paramsList = new ArrayBuffer[Array[Any]]()
// Add each userId to paramsList
for (adBlacklist <- adBlacklists) {
val params: Array[Any] = Array(adBlacklist.userid)
paramsList += params
}
// Get the singleton connection-pool object
val mySqlPool = CreateMySqlPool()
// Borrow a client from the pool
val client = mySqlPool.borrowObject()
// Execute the batch insert
client.executeBatch(sql, paramsList.toArray)
// Return the client to the pool when done
mySqlPool.returnObject(client)
}
/**
 * Query all blacklisted users
 *
 * @return
 */
def findAll(): Array[AdBlacklist] = {
// Query every row in the blacklist table
val sql = "SELECT * FROM ad_blacklist"
val adBlacklists = new ArrayBuffer[AdBlacklist]()
// Get the singleton connection-pool object
val mySqlPool = CreateMySqlPool()
// Borrow a client from the pool
val client = mySqlPool.borrowObject()
// Run the query and collect every userid into the array via the callback
client.executeQuery(sql, null, new QueryCallback {
override def process(rs: ResultSet): Unit = {
while (rs.next()) {
val userid = rs.getInt(1).toLong
adBlacklists += AdBlacklist(userid)
}
}
})
// Return the client to the pool when done
mySqlPool.returnObject(client)
adBlacklists.toArray
}
}
/**
 * Per-user ad click count DAO
 *
 */
object AdUserClickCountDAO {
def updateBatch(adUserClickCounts: Array[AdUserClickCount]) {
// Get the singleton connection-pool object
val mySqlPool = CreateMySqlPool()
// Borrow a client from the pool
val client = mySqlPool.borrowObject()
// First split the click counts into rows to insert and rows to update
val insertAdUserClickCounts = ArrayBuffer[AdUserClickCount]()
val updateAdUserClickCounts = ArrayBuffer[AdUserClickCount]()
val selectSQL = "SELECT count(*) FROM ad_user_click_count WHERE date=? AND userid=? AND adid=? "
for (adUserClickCount <- adUserClickCounts) {
val selectParams: Array[Any] = Array(adUserClickCount.date, adUserClickCount.userid, adUserClickCount.adid)
// Check whether this click-count record already exists in ad_user_click_count
client.executeQuery(selectSQL, selectParams, new QueryCallback {
override def process(rs: ResultSet): Unit = {
// If a row is found and the count is greater than 0, treat it as an update
if (rs.next() && rs.getInt(1) > 0) {
updateAdUserClickCounts += adUserClickCount
} else {
insertAdUserClickCounts += adUserClickCount
}
}
})
}
// Batch insert
val insertSQL = "INSERT INTO ad_user_click_count VALUES(?,?,?,?)"
val insertParamsList: ArrayBuffer[Array[Any]] = ArrayBuffer[Array[Any]]()
// Add every row to insert to the parameter list
for (adUserClickCount <- insertAdUserClickCounts) {
insertParamsList += Array[Any](adUserClickCount.date, adUserClickCount.userid, adUserClickCount.adid, adUserClickCount.clickCount)
}
// Execute the batch insert
client.executeBatch(insertSQL, insertParamsList.toArray)
// Batch update
// clickCount=clickCount + ? : this UPDATE accumulates the count
val updateSQL = "UPDATE ad_user_click_count SET clickCount=clickCount + ? WHERE date=? AND userid=? AND adid=?"
val updateParamsList: ArrayBuffer[Array[Any]] = ArrayBuffer[Array[Any]]()
// Add every row to update to the parameter list
for (adUserClickCount <- updateAdUserClickCounts) {
updateParamsList += Array[Any](adUserClickCount.clickCount, adUserClickCount.date, adUserClickCount.userid, adUserClickCount.adid)
}
// Execute the batch update
client.executeBatch(updateSQL, updateParamsList.toArray)
// Return the client to the pool when done
mySqlPool.returnObject(client)
}
/**
 * Look up a user's ad click count by composite key
 *
 * @param date   date
 * @param userid user id
 * @param adid   ad id
 * @return
 */
def findClickCountByMultiKey(date: String, userid: Long, adid: Long): Int = {
// Get the singleton connection-pool object
val mySqlPool = CreateMySqlPool()
// Borrow a client from the pool
val client = mySqlPool.borrowObject()
val sql = "SELECT clickCount FROM ad_user_click_count " +
"WHERE date=? " +
"AND userid=? " +
"AND adid=?"
var clickCount = 0
val params = Array[Any](date, userid, adid)
// Query the specified user's click count under the given conditions and store the result in clickCount
client.executeQuery(sql, params, new QueryCallback {
override def process(rs: ResultSet): Unit = {
if (rs.next()) {
clickCount = rs.getInt(1)
}
}
})
// Return the client to the pool when done
mySqlPool.returnObject(client)
clickCount
}
}
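The select-then-split logic in updateBatch issues one query per record before writing. With MySQL, a single upsert could fold both paths into one batch; a hedged sketch, assuming a UNIQUE KEY exists on (date, userid, adid), which is not shown in this section:
// Sketch only: upsert alternative, assuming UNIQUE KEY (date, userid, adid) on ad_user_click_count
val upsertSQL = "INSERT INTO ad_user_click_count (date, userid, adid, clickCount) VALUES(?,?,?,?) " +
  "ON DUPLICATE KEY UPDATE clickCount = clickCount + VALUES(clickCount)"
// Each parameter row is Array[Any](date, userid, adid, clickCount); run once via client.executeBatch(upsertSQL, params)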
/**
 * Ad real-time statistics DAO
 *
 * @author Administrator
 *
 */
object AdStatDAO {
def updateBatch(adStats: Array[AdStat]) {
// Get the singleton connection-pool object
val mySqlPool = CreateMySqlPool()
// Borrow a client from the pool
val client = mySqlPool.borrowObject()
// Split the rows into those to insert and those to update
val insertAdStats = ArrayBuffer[AdStat]()
val updateAdStats = ArrayBuffer[AdStat]()
val selectSQL = "SELECT count(*) " +
"FROM ad_stat " +
"WHERE date=? " +
"AND province=? " +
"AND city=? " +
"AND adid=?"
for (adStat <- adStats) {
val params = Array[Any](adStat.date, adStat.province, adStat.city, adStat.adid)
// Use the query result to decide whether this row is an insert or an update
client.executeQuery(selectSQL, params, new QueryCallback {
override def process(rs: ResultSet): Unit = {
if (rs.next() && rs.getInt(1) > 0) {
updateAdStats += adStat
} else {
insertAdStats += adStat
}
}
})
}
// Batch insert the rows that need inserting
val insertSQL = "INSERT INTO ad_stat VALUES(?,?,?,?,?)"
val insertParamsList: ArrayBuffer[Array[Any]] = ArrayBuffer[Array[Any]]()
for (adStat <- insertAdStats) {
insertParamsList += Array[Any](adStat.date, adStat.province, adStat.city, adStat.adid, adStat.clickCount)
}
client.executeBatch(insertSQL, insertParamsList.toArray)
// Batch update the rows that need updating
// this UPDATE overwrites the count
val updateSQL = "UPDATE ad_stat SET clickCount=? " +
"WHERE date=? " +
"AND province=? " +
"AND city=? " +
"AND adid=?"
val updateParamsList: ArrayBuffer[Array[Any]] = ArrayBuffer[Array[Any]]()
for (adStat <- updateAdStats) {
updateParamsList += Array[Any](adStat.clickCount, adStat.date, adStat.province, adStat.city, adStat.adid)
}
client.executeBatch(updateSQL, updateParamsList.toArray)
// Return the client to the pool when done
mySqlPool.returnObject(client)
}
}
/**
 * Per-province top-3 hot ads DAO
 *
 * @author Administrator
 *
 */
object AdProvinceTop3DAO {
def updateBatch(adProvinceTop3s: Array[AdProvinceTop3]) {
// Get the singleton connection-pool object
val mySqlPool = CreateMySqlPool()
// Borrow a client from the pool
val client = mySqlPool.borrowObject()
// dateProvinces deduplicates the (date, province) pairs
// AdProvinceTop3: date province adid clickCount; every row carries date province adid,
// so taking only date and province is guaranteed to produce duplicates
val dateProvinces = ArrayBuffer[String]()
for (adProvinceTop3 <- adProvinceTop3s) {
// Build the composite key
val key = adProvinceTop3.date + "_" + adProvinceTop3.province
// Add the key only if dateProvinces does not contain it yet,
// which deduplicates the pairs
if (!dateProvinces.contains(key)) {
dateProvinces += key
}
}
// Batch delete by the deduplicated date and province:
// first remove all existing rows
val deleteSQL = "DELETE FROM ad_province_top3 WHERE date=? AND province=?"
val deleteParamsList: ArrayBuffer[Array[Any]] = ArrayBuffer[Array[Any]]()
for (dateProvince <- dateProvinces) {
val dateProvinceSplited = dateProvince.split("_")
val date = dateProvinceSplited(0)
val province = dateProvinceSplited(1)
val params = Array[Any](date, province)
deleteParamsList += params
}
client.executeBatch(deleteSQL, deleteParamsList.toArray)
// Batch insert all incoming rows
val insertSQL = "INSERT INTO ad_province_top3 VALUES(?,?,?,?)"
val insertParamsList: ArrayBuffer[Array[Any]] = ArrayBuffer[Array[Any]]()
// Convert the incoming rows into the parameter list
for (adProvinceTop3 <- adProvinceTop3s) {
insertParamsList += Array[Any](adProvinceTop3.date, adProvinceTop3.province, adProvinceTop3.adid, adProvinceTop3.clickCount)
}
client.executeBatch(insertSQL, insertParamsList.toArray)
// Return the client to the pool when done
mySqlPool.returnObject(client)
}
}
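The delete-then-insert above makes the write idempotent per (date, province), but the two batches are not atomic: if the process dies between them, a window can briefly show no top-3 rows. A minimal plain-JDBC sketch (independent of the project's pool; URL and credentials taken from commerce.properties below) of wrapping both statements in one transaction:
// Sketch only: delete + insert in one JDBC transaction
import java.sql.DriverManager

def replaceTop3Atomically(date: String, province: String, rows: Array[(Long, Long)]): Unit = {
  val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/commerce", "root", "123456")
  try {
    conn.setAutoCommit(false)
    val del = conn.prepareStatement("DELETE FROM ad_province_top3 WHERE date=? AND province=?")
    del.setString(1, date); del.setString(2, province); del.executeUpdate()
    val ins = conn.prepareStatement("INSERT INTO ad_province_top3 VALUES(?,?,?,?)")
    for ((adid, clickCount) <- rows) {
      ins.setString(1, date); ins.setString(2, province)
      ins.setLong(3, adid); ins.setLong(4, clickCount)
      ins.addBatch()
    }
    ins.executeBatch()
    conn.commit() // both statements become visible together
  } catch {
    case e: Exception => conn.rollback(); throw e
  } finally {
    conn.close()
  }
}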
/**
 * Ad click trend DAO
 *
 * @author Administrator
 *
 */
object AdClickTrendDAO {
def updateBatch(adClickTrends: Array[AdClickTrend]) {
// Get the singleton connection-pool object
val mySqlPool = CreateMySqlPool()
// Borrow a client from the pool
val client = mySqlPool.borrowObject()
// Split the rows into those to insert and those to update
val updateAdClickTrends = ArrayBuffer[AdClickTrend]()
val insertAdClickTrends = ArrayBuffer[AdClickTrend]()
val selectSQL = "SELECT count(*) " +
"FROM ad_click_trend " +
"WHERE date=? " +
"AND hour=? " +
"AND minute=? " +
"AND adid=?"
for (adClickTrend <- adClickTrends) {
// Use the query result to decide whether this row is an insert or an update
val params = Array[Any](adClickTrend.date, adClickTrend.hour, adClickTrend.minute, adClickTrend.adid)
client.executeQuery(selectSQL, params, new QueryCallback {
override def process(rs: ResultSet): Unit = {
if (rs.next() && rs.getInt(1) > 0) {
updateAdClickTrends += adClickTrend
} else {
insertAdClickTrends += adClickTrend
}
}
})
}
// Batch update
// this UPDATE overwrites the count
val updateSQL = "UPDATE ad_click_trend SET clickCount=? " +
"WHERE date=? " +
"AND hour=? " +
"AND minute=? " +
"AND adid=?"
val updateParamsList: ArrayBuffer[Array[Any]] = ArrayBuffer[Array[Any]]()
for (adClickTrend <- updateAdClickTrends) {
updateParamsList += Array[Any](adClickTrend.clickCount, adClickTrend.date, adClickTrend.hour, adClickTrend.minute, adClickTrend.adid)
}
client.executeBatch(updateSQL, updateParamsList.toArray)
// Batch insert
val insertSQL = "INSERT INTO ad_click_trend VALUES(?,?,?,?,?)"
val insertParamsList: ArrayBuffer[Array[Any]] = ArrayBuffer[Array[Any]]()
for (adClickTrend <- insertAdClickTrends) {
insertParamsList += Array[Any](adClickTrend.date, adClickTrend.hour, adClickTrend.minute, adClickTrend.adid, adClickTrend.clickCount)
}
client.executeBatch(insertSQL, insertParamsList.toArray)
// Return the client to the pool when done
mySqlPool.returnObject(client)
}
}
4. Case classes for the tables
/**
 * Requirement 7: case classes for real-time ad blacklist statistics
 */
/**
 * Ad blacklist
 * @author wuyufei
 *
 */
case class AdBlacklist(userid:Long)
/**
 * Per-user ad click count
 * @author wuyufei
 *
 */
case class AdUserClickCount(date:String,
userid:Long,
adid:Long,
clickCount:Long)
/**
 * Ad real-time statistics
 * @author wuyufei
 *
 */
case class AdStat(date:String,
province:String,
city:String,
adid:Long,
clickCount:Long)
/**
 * Per-province top-3 hot ads
 * @author wuyufei
 *
 */
case class AdProvinceTop3(date:String,
province:String,
adid:Long,
clickCount:Long)
/**
 * Ad click trend
 * @author wuyufei
 *
 */
case class AdClickTrend(date:String,
hour:String,
minute:String,
adid:Long,
clickCount:Long)
2) Configuration
commerce.properties
#
# commerce.properties
#
# JDBC config
jdbc.datasource.size=10
jdbc.url=jdbc:mysql://localhost:3306/commerce?useUnicode=true&characterEncoding=utf8
jdbc.user=root
jdbc.password=123456
# Available task parameters:
# startDate: format yyyy-MM-dd [required]
# endDate: format yyyy-MM-dd [required]
# startAge: range 0 - 59
# endAge: range 0 - 59
# professionals: range professionals[0 - 59]
# cities: 0 - 9 ((0,"北京","华北"),(1,"上海","华东"),(2,"南京","华东"),(3,"广州","华南"),(4,"三亚","华南"),(5,"武汉","华中"),(6,"长沙","华中"),(7,"西安","西北"),(8,"成都","西南"),(9,"哈尔滨","东北"))
# sex: range 0 - 1
# keywords: range ("火锅", "蛋糕", "重庆辣子鸡", "重庆小面", "呷哺呷哺", "新辣道鱼火锅", "国贸大厦", "太古商场", "日本料理", "温泉")
# categoryIds: 0 - 99, comma-separated
# targetPageFlow: 0 - 99, comma-separated
task.params.json={startDate:"2019-10-01", \
endDate:"2019-10-30", \
startAge: 20, \
endAge: 50, \
professionals: "", \
cities: "", \
sex:"", \
keywords:"", \
categoryIds:"", \
targetPageFlow:"1,2,3,4,5,6,7"}
# Kafka config
kafka.broker.list=node01:9092,node02:9092,node03:9092
kafka.topics=AdRealTimeLog0308
Constants file
/*
* Constants.scala
*/
package commons.constant
/**
 * Constants
 */
object Constants {
/**
 * Project configuration constants
 */
val JDBC_DATASOURCE_SIZE = "jdbc.datasource.size"
val JDBC_URL = "jdbc.url"
val JDBC_USER = "jdbc.user"
val JDBC_PASSWORD = "jdbc.password"
val KAFKA_BROKERS = "kafka.broker.list"
val KAFKA_TOPICS = "kafka.topics"
/**
 * Spark job constants
 */
val SPARK_APP_NAME_SESSION = "UserVisitSessionAnalyzeSpark"
val SPARK_APP_NAME_PAGE = "PageOneStepConvertRateSpark"
/**
 * Field-name constants for the user_visit_action, user_info and product_info tables
 */
val FIELD_SESSION_ID = "sessionid"
val FIELD_SEARCH_KEYWORDS = "searchKeywords"
val FIELD_CLICK_CATEGORY_IDS = "clickCategoryIds"
val FIELD_AGE = "age"
val FIELD_PROFESSIONAL = "professional"
val FIELD_CITY = "city"
val FIELD_SEX = "sex"
val FIELD_VISIT_LENGTH = "visitLength"
val FIELD_STEP_LENGTH = "stepLength"
val FIELD_START_TIME = "startTime"
val FIELD_CLICK_COUNT = "clickCount"
val FIELD_ORDER_COUNT = "orderCount"
val FIELD_PAY_COUNT = "payCount"
val FIELD_CATEGORY_ID = "categoryid"
/**
 * Spark accumulator key constants
 */
val SESSION_COUNT = "session_count"
val TIME_PERIOD_1s_3s = "1s_3s"
val TIME_PERIOD_4s_6s = "4s_6s"
val TIME_PERIOD_7s_9s = "7s_9s"
val TIME_PERIOD_10s_30s = "10s_30s"
val TIME_PERIOD_30s_60s = "30s_60s"
val TIME_PERIOD_1m_3m = "1m_3m"
val TIME_PERIOD_3m_10m = "3m_10m"
val TIME_PERIOD_10m_30m = "10m_30m"
val TIME_PERIOD_30m = "30m"
val STEP_PERIOD_1_3 = "1_3"
val STEP_PERIOD_4_6 = "4_6"
val STEP_PERIOD_7_9 = "7_9"
val STEP_PERIOD_10_30 = "10_30"
val STEP_PERIOD_30_60 = "30_60"
val STEP_PERIOD_60 = "60"
/**
 * Constants for the filter conditions in task.params.json
 */
val TASK_PARAMS = "task.params.json"
val PARAM_START_DATE = "startDate"
val PARAM_END_DATE = "endDate"
val PARAM_START_AGE = "startAge"
val PARAM_END_AGE = "endAge"
val PARAM_PROFESSIONALS = "professionals"
val PARAM_CITIES = "cities"
val PARAM_SEX = "sex"
val PARAM_KEYWORDS = "keywords"
val PARAM_CATEGORY_IDS = "categoryIds"
val PARAM_TARGET_PAGE_FLOW = "targetPageFlow"
}
Maven dependencies
<dependencies>
<dependency>
<groupId>com.atguigu</groupId>
<artifactId>commons</artifactId>
<version>1.0-SNAPSHOT</version>
</dependency>
<!-- Spark dependencies -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
</dependency>
<!-- Scala -->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>0.10.2.1</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<!-- The scala-maven-plugin compiles/tests/runs/documents Scala code in any Maven project -->
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<archive>
<manifest>
<mainClass>com.atguigu.stream.AdClickRealTimeStat</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
</plugin>
</plugins>
</build>