Abstract class extraction
BaseAppV2
import com.atguigu.gmall.realtime.util.MyKafkaUtil
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
abstract class BaseAppV2 {
  val topics: Set[String]
  val groupId: String
  val master: String
  val appName: String
  val bachTime: Int
  var ssc: StreamingContext = _

  def run(streams: Map[String, DStream[String]]): Unit

  def main(args: Array[String]): Unit = {
    // 1. Create the StreamingContext first
    val conf: SparkConf = new SparkConf().setMaster(master).setAppName(appName)
    ssc = new StreamingContext(conf, Seconds(bachTime))
    // 2. Get one stream per topic from Kafka
    val streams: Map[String, DStream[String]] = topics.map(topic => {
      (topic, MyKafkaUtil.getKafkaStream(ssc, groupId, Set(topic)))
    }).toMap
    // 3. Hand the streams to the concrete app's business logic
    run(streams)
    // 4. Start the context
    ssc.start()
    // 5. Block until termination
    ssc.awaitTermination()
  }
}
import org.apache.spark.streaming.dstream.DStream
// Template: a concrete app only fills in the configuration values and the business logic
object aa extends BaseAppV2 {
  override val topics: Set[String] = ???
  override val groupId: String = ???
  override val master: String = ???
  override val appName: String = ???
  override val bachTime: Int = ???
  override def run(streams: Map[String, DStream[String]]): Unit = ???
}
import com.alibaba.fastjson.JSON
import com.atguigu.gmall.realtime.bean.OrderInfo
import com.atguigu.realtime.gmall.common.Constant
import org.apache.spark.streaming.dstream.DStream
object OrderAppV2 extends BaseApp {
  override val topics: Set[String] = Set(Constant.ORDER_INFO_TOPIC)
  override val groupId: String = "OrderApp"
  override val master: String = "local[2]"
  override val appName: String = "OrderApp"
  override val bachTime: Int = 3

  override def run(sourceStream: DStream[String]): Unit = {
    // First verify that data can actually be consumed
    // sourceStream.print()
    sourceStream
      .map(json => JSON.parseObject(json, classOf[OrderInfo])) // .print()
      .foreachRDD(rdd => {
        import org.apache.phoenix.spark._
        rdd.saveToPhoenix(
          "gmall_order_info0421",
          Seq("ID", "PROVINCE_ID", "CONSIGNEE", "ORDER_COMMENT", "CONSIGNEE_TEL", "ORDER_STATUS", "PAYMENT_WAY", "USER_ID", "IMG_URL", "TOTAL_AMOUNT", "EXPIRE_TIME", "DELIVERY_ADDRESS", "CREATE_TIME", "OPERATE_TIME", "TRACKING_NO", "PARENT_ORDER_ID", "OUT_TRADE_NO", "TRADE_BODY", "CREATE_DATE", "CREATE_HOUR"),
          zkUrl = Option("hadoop102,hadoop105,hadoop104:2181"))
      })
  }
}
bean
import java.text.SimpleDateFormat
import java.util.Date
case class EventLog(mid: String,
                    uid: String,
                    appId: String,
                    area: String,
                    os: String,
                    logType: String,
                    eventId: String,
                    pageId: String,
                    nextPageId: String,
                    itemId: String,
                    ts: Long,
                    var logDate: String = null,
                    var logHour: String = null) {
  // Derive the date and hour fields from the event timestamp
  private val date = new Date(ts)
  logDate = new SimpleDateFormat("yyyy-MM-dd").format(date)
  logHour = new SimpleDateFormat("HH").format(date)
}
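A quick sanity check of the derived fields (the literal values below are made up for illustration):
val log = EventLog("mid_1", "u_1", "gmall", "beijing", "ios", "event", "addCart", "p1", "p2", "item_1", 1600000000000L)
// Assuming the JVM default time zone is Asia/Shanghai, 1600000000000L is 2020-09-13 20:26:40,
// so log.logDate == "2020-09-13" and log.logHour == "20"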
case class AlertInfo(mid: String,
                     uids: java.util.HashSet[String],
                     itemIds: java.util.HashSet[String],
                     events: java.util.List[String],
                     ts: Long)
util
import java.util.Properties
object ConfigUtil {
  // Load config.properties from the classpath once, at object initialization
  val is = ClassLoader.getSystemResourceAsStream("config.properties")
  val properties = new Properties()
  properties.load(is)

  def getConf(name: String): String = properties.getProperty(name)
}
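The property keys read in this document are kafka.servers, redis.server and redis.port. A minimal config.properties might look like the following; the host names and ports are placeholders, not confirmed values:
# config.properties
kafka.servers=hadoop102:9092,hadoop103:9092,hadoop104:9092
redis.server=hadoop102
redis.port=6379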
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010._
import scala.collection.mutable
object MyKafkaUtil {
  val kafkaParams = mutable.Map[String, Object](
    "bootstrap.servers" -> ConfigUtil.getConf("kafka.servers"),
    "key.deserializer" -> classOf[StringDeserializer],
    "value.deserializer" -> classOf[StringDeserializer],
    // If a committed offset exists for this group, resume from it; otherwise start from the latest offset
    "auto.offset.reset" -> "latest",
    "enable.auto.commit" -> (true: java.lang.Boolean)
  )

  def getKafkaStream(ssc: StreamingContext, groupId: String, topics: Set[String]) = {
    kafkaParams("group.id") = groupId
    KafkaUtils
      .createDirectStream[String, String](
        ssc,
        PreferConsistent,
        Subscribe[String, String](topics, kafkaParams)
      )
      .map(_.value())
  }
}
import redis.clients.jedis.Jedis
object RedisUtil {
  val host: String = ConfigUtil.getConf("redis.server")
  val port: Int = ConfigUtil.getConf("redis.port").toInt

  def getClient: Jedis = new Jedis(host, port)
}
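A typical usage pattern, mirroring how the client is handled per partition in the usage example below: get a client, use it, then close it. The key and value here are placeholders:
val client: Jedis = RedisUtil.getClient
client.setex("demo_key", 60, "demo_value") // placeholder key/value with a 60-second TTL
client.close()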
Usage example
import java.util
import com.alibaba.fastjson.JSON
import com.atguigu.gmall.realtime.bean.{OrderDetail, OrderInfo, SaleDetail, UserInfo}
import com.atguigu.gmall.realtime.util.{ESUtil, RedisUtil}
import com.atguigu.realtime.gmall.common.Constant
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.dstream.DStream
import org.json4s.jackson.Serialization
import redis.clients.jedis.Jedis
import scala.collection.JavaConverters._
import scala.collection.mutable
object SaleDetailApp extends BaseAppV2 {
  override val topics: Set[String] = Set(Constant.ORDER_INFO_TOPIC, Constant.ORDER_DETAIL_TOPIC)
  override val groupId: String = "SaleDetailApp"
  override val master: String = "local[2]"
  override val appName: String = "SaleDetailApp"
  override val bachTime: Int = 3

  // Cache an OrderInfo in Redis
  def cacheOrderInfo(client: Jedis, orderInfo: OrderInfo) = {
    // json4s handles case classes better than fastjson here
    implicit val f = org.json4s.DefaultFormats
    // client.set("order_info:" + orderInfo.id, Serialization.write(orderInfo))
    // Expire the entry after 30 minutes
    client.setex("order_info:" + orderInfo.id, 60 * 30, Serialization.write(orderInfo))
  }

  // Cache an OrderDetail in Redis (a hash keyed by order_id, one field per order_detail id)
  def cacheOrderDetail(client: Jedis, orderDetail: OrderDetail) = {
    implicit val f = org.json4s.DefaultFormats
    client.hset("order_detail:" + orderDetail.order_id, orderDetail.id, Serialization.write(orderDetail))
    client.expire("order_detail:" + orderDetail.order_id, 60 * 30)
  }
  // Approach 2: full outer join, handle the three cases, and cache unmatched records in Redis for later batches
  def joinOrderInfoOrderDetail(orderInfoStream: DStream[OrderInfo], orderDetailStream: DStream[OrderDetail]) = {
    // Join on OrderInfo.id and OrderDetail.order_id
    val orderIdToOrderStream: DStream[(String, OrderInfo)] = orderInfoStream
      .map(info => (info.id, info))
    val orderIdToOrderDetailStream: DStream[(String, OrderDetail)] = orderDetailStream
      .map(detail => (detail.order_id, detail))
    // A full outer join is required: DStream[(String, (Option[OrderInfo], Option[OrderDetail]))]
    orderIdToOrderStream
      .fullOuterJoin(orderIdToOrderDetailStream)
      .mapPartitions(it => {
        val client: Jedis = RedisUtil.getClient
        // flatMap: each joined record may produce 0..n SaleDetails
        val result: List[SaleDetail] = it.flatMap {
          // Some / Some: both sides arrived in this batch
          case (orderId, (Some(orderInfo), Some(orderDetail))) =>
            println("some some")
            // 1. Cache the orderInfo (later orderDetails may still arrive in other batches)
            cacheOrderInfo(client, orderInfo)
            // 2. Merge into one SaleDetail
            val saleDetail: SaleDetail = SaleDetail().mergeOrderInfo(orderInfo).mergeOrderDetail(orderDetail)
            // 3. Check the orderDetail cache: earlier unmatched details (possibly several) may be waiting there
            if (client.exists("order_detail:" + orderInfo.id)) {
              /*
              How orderDetail is stored in Redis:
              key                          value (hash)
              "order_detail:" + order_id   field: order_detail_id, value: JSON string
              */
              val t: List[SaleDetail] = client
                .hgetAll("order_detail:" + orderInfo.id)
                .asScala
                .map {
                  case (order_detail_id, json) =>
                    val orderDetail: OrderDetail = JSON.parseObject(json, classOf[OrderDetail])
                    SaleDetail().mergeOrderInfo(orderInfo).mergeOrderDetail(orderDetail)
                }
                .toList :+ saleDetail
              // Joined, so drop the whole hash
              client.del("order_detail:" + orderInfo.id)
              t
            } else {
              // flatMap requires a collection as the return value
              saleDetail :: Nil
            }
          // Some / None: only the orderInfo arrived
          case (orderId, (Some(orderInfo), None)) =>
            println("some none")
            // 1. Cache the orderInfo
            cacheOrderInfo(client, orderInfo)
            // 2. Check the orderDetail cache
            if (client.exists("order_detail:" + orderInfo.id)) {
              val t: List[SaleDetail] = client.hgetAll("order_detail:" + orderInfo.id)
                .asScala
                .map {
                  case (order_detail_id, json) =>
                    val orderDetail: OrderDetail = JSON.parseObject(json, classOf[OrderDetail])
                    SaleDetail().mergeOrderInfo(orderInfo).mergeOrderDetail(orderDetail)
                }
                .toList
              client.del("order_detail:" + orderInfo.id)
              t
            } else {
              // flatMap requires a collection as the return value
              Nil
            }
          // None / Some: only the orderDetail arrived
          case (orderId, (None, Some(orderDetail))) =>
            println("none some")
            // 1. Check the orderInfo cache
            /*
            How orderInfo is stored in Redis:
            key                        value
            "order_info:" + order_id   JSON string
            */
            if (client.exists("order_info:" + orderDetail.order_id)) {
              val json: String = client.get("order_info:" + orderDetail.order_id)
              val orderInfo: OrderInfo = JSON.parseObject(json, classOf[OrderInfo])
              SaleDetail().mergeOrderInfo(orderInfo).mergeOrderDetail(orderDetail) :: Nil
            } else {
              // No matching orderInfo yet: cache the orderDetail and wait for a later batch
              cacheOrderDetail(client, orderDetail)
              Nil
            }
        }.toList // materialize before closing the client: Iterator.flatMap is lazy and still needs the connection
        client.close()
        result.toIterator
      })
  }
  // Look up the user info from MySQL and join it in
  def joinUser(saleDetail: DStream[SaleDetail]) = {
    // Read via Spark SQL
    val spark: SparkSession = SparkSession.builder()
      .config(ssc.sparkContext.getConf)
      .getOrCreate()
    import spark.implicits._

    // 1. Query user_info: Spark SQL -> DF/DS -> RDD
    def readUserInfoes(ids: String) = {
      spark
        .read
        .format("jdbc")
        .option("url", "jdbc:mysql://hadoop102:3306/gmall0421?useSSL=false")
        .option("user", "root")
        .option("password", "123456")
        .option("query", s"select * from user_info where id in (${ids})")
        .load()
        .as[UserInfo]
        .rdd
        .map(userInfo => (userInfo.id, userInfo))
    }

    // 2. Join saleDetail with the user info: saleDetail -> RDD
    saleDetail.transform(rdd => {
      rdd.cache() // the RDD is used more than once, so caching is required
      // [1, 2, 3] => '1','2','3'
      val ids = rdd.map(_.user_id).collect().mkString("'", "','", "'")
      val userInfoRDD = readUserInfoes(ids) // user data is re-read every batch, restricted to the ids of this batch
      rdd
        .map(saleDetail => (saleDetail.user_id, saleDetail))
        .join(userInfoRDD)
        .map {
          case (_, (saleDetail, userInfo)) =>
            saleDetail.mergeUserInfo(userInfo)
        }
    })
  }
  // Write the joined result to Elasticsearch
  def write2ES(saleDetailWithUser: DStream[SaleDetail]): Unit = {
    saleDetailWithUser.foreachRDD(rdd => {
      rdd.foreachPartition(it => {
        ESUtil.insertBulk(
          "gmall0421_sale_detail",
          it.map(sale => (sale.order_detail_id, sale))
        )
      })
    })
  }
  override def run(streams: Map[String, DStream[String]]): Unit = {
    // Get the two streams separately
    val orderInfoStream = streams(Constant.ORDER_INFO_TOPIC).map(json => {
      JSON.parseObject(json, classOf[OrderInfo])
    })
    val orderDetailStream = streams(Constant.ORDER_DETAIL_TOPIC).map(json => {
      JSON.parseObject(json, classOf[OrderDetail])
    })
    // 1. Join the two streams (join / leftJoin / rightJoin / fullJoin)
    val saleDetail: DStream[SaleDetail] = joinOrderInfoOrderDetail(orderInfoStream, orderDetailStream)
    // 2. Join in the user info
    val saleDetailWithUser = joinUser(saleDetail)
    saleDetailWithUser.print()
    // 3. Write the data to ES
    write2ES(saleDetailWithUser)
  }
}
/*
How orderInfo is stored in Redis:
  key                        value
  "order_info:" + order_id   JSON string

How orderDetail is stored in Redis:
  key                          value (hash)
  "order_detail:" + order_id   field: order_detail_id, value: JSON string
*/
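ESUtil is referenced above but not shown. Below is a minimal sketch of what insertBulk might look like, assuming a Jest-based client and an Elasticsearch node at http://hadoop102:9200 (both the library choice and the address are assumptions, not the project's confirmed implementation); it only matches the call shape used in write2ES.
import io.searchbox.client.config.HttpClientConfig
import io.searchbox.client.{JestClient, JestClientFactory}
import io.searchbox.core.{Bulk, Index}

object ESUtil {
  // Hypothetical ES address; adjust to the real cluster
  private val factory = new JestClientFactory
  factory.setHttpClientConfig(new HttpClientConfig.Builder("http://hadoop102:9200").multiThreaded(true).build())

  // Bulk-insert (id, document) pairs into the given index
  def insertBulk(index: String, docs: Iterator[(String, AnyRef)]): Unit = {
    val client: JestClient = factory.getObject
    val bulk = new Bulk.Builder().defaultIndex(index).defaultType("_doc")
    docs.foreach {
      case (id, doc) => bulk.addAction(new Index.Builder(doc).id(id).build())
    }
    client.execute(bulk.build())
    client.shutdownClient()
  }
}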