import com.alibaba.fastjson.JSON
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
object StreamingETL2 {

  // One recharge record as produced to the Kafka topic, e.g.
  // {"user_name": "47vm9sfSn14","user_region": "甘肃省","user_id": 44953796,"pay_way": "支付宝支付","pay_Time": "1524837923","user_active_id": 1,"user_prop_id": 8,"pay_money": 536,"pay_times": 113}
  // Field names must match the JSON keys exactly for fastjson binding.
  case class pay_time(user_name: String, user_region: String, user_id: String,
                      pay_way: String, pay_Time: String, user_active_id: Int,
                      user_prop_id: Int, pay_money: Int, pay_times: Int)

  /**
   * Average spend per user: joins the per-user money total with the per-user
   * payment count and divides.
   *
   * @param usersum   (user_id, total money) pairs
   * @param usercount (user_id, payment count) pairs; counts are >= 1 because
   *                  they come from reducing literal 1s, so division is safe
   * @return (user_id, average spend) pairs
   */
  def payavg(usersum: DStream[(String, Int)],
             usercount: DStream[(String, Int)]): DStream[(String, Double)] =
    usersum.join(usercount).map { case (user, (money, count)) =>
      // toDouble forces floating-point division. BUG FIX: the original wrote
      // `0.1 * money / count`, which reports the true average divided by 10.
      (user, money.toDouble / count)
    }

  def main(args: Array[String]): Unit = {
    val ssc = Spark_Utils.apply
    // reduceByKeyAndWindow with an inverse function requires checkpointing.
    ssc.checkpoint("window")

    val topics = Array("recharge_topic_li")
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      // PreferConsistent distributes partitions evenly across executors.
      // Use PreferBrokers when executors run on the same hosts as the Kafka
      // brokers, or PreferFixed when partition load is heavily skewed.
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](topics, Spark_Utils.kafka_util)
    )

    // Parse each record's JSON payload once and reuse it for both metrics
    // (the original parsed the same payload twice, once per metric).
    val records = stream.map(r => JSON.parseObject(r.value(), classOf[pay_time]))

    // Per-batch total spend per user.
    val paymoney = records.map(p => (p.user_id, p.pay_money)).reduceByKey(_ + _)
    // Per-batch payment count per user.
    val paycount = records.map(p => (p.user_id, 1)).reduceByKey(_ + _)

    // Every 5 seconds, aggregate over the preceding 60 seconds.
    // BUG FIX: windowDuration comes BEFORE slideDuration in the API; the
    // original passed (Seconds(5), Seconds(60)), i.e. a 5s window sliding
    // every 60s — the opposite of what its own comments describe.
    val paymoneys = paymoney.reduceByKeyAndWindow(_ + _, _ - _, Seconds(60), Seconds(5))
    val paycounts = paycount.reduceByKeyAndWindow(_ + _, _ - _, Seconds(60), Seconds(5))

    // Map to the display string and let print() sample it on the driver.
    // BUG FIX: the original called println inside map() (output lands on the
    // executors) and then print()ed a DStream of Unit values.
    payavg(paymoneys, paycounts).map(f => f._1 + "平均消费" + f._2).print()

    ssc.start()
    ssc.awaitTermination()
  }
}
// SPARK计算用户不同时段的消费金额 (Spark: computing users' spend over different time windows)
// 最新推荐文章于 2024-04-16 04:13:58 发布 (blog-footer residue from the original source page)