0. Versions & Language
Scala 2.11.8, Spark 2.1.1, Kafka 0.10.2
1. Data Description
1.1 Data Source
1.2 Data Overview
Visit-action table:
Field        Description
user_id      user ID (requires registration/login)
session_id   unique session identifier (present even without login)
page_id      page ID
action_time  timestamp of an action (e.g. a page view)
User profile table (joined as alias i in section 2):
Field         Description
user_id       user ID
user_name     username
name          real name
age           age
professional  occupation
city          city
sex           gender
2. User Visit Session Analysis (Spark SQL)
2.1 Requirement
Query sessions by composite conditions: for example, which pages do men aged 25-30, who work as teachers and live in Hangzhou, visit, and how long do they stay on them?
2.2 Implementation
2.2.1 Composite-Condition Query
// The task parameters arrive as JSON; fastjson maps them onto a TaskParam bean.
val taskParamJsonStr = "{'ages':[25,30],'sex':['male'],'professional':['professional1'],'cities':['city1']}"
val taskParam = JSON.parseObject(taskParamJsonStr, classOf[TaskParam])

// Base query (assumed, not shown in the snippet; `i` aliases the user profile
// table, as referenced by the sex clause below).
val buffer = new StringBuffer(
  "select v.* from user_visit_action v join user_info i on v.user_id = i.user_id where 1=1")

// Append an "in (...)" clause for each non-empty condition; the sex clause is
// shown here, and the other parameters are handled analogously.
val sex = taskParam.getSex
if (sex != null && sex.size > 0) {
  buffer.append(" and i.sex in(")
    .append(JSON.toJSONString(sex, SerializerFeature.UseSingleQuotes)
      .replace("[", "")
      .replace("]", ""))
    .append(")")
}

// Run the assembled SQL and cache the filtered rows for the aggregations below.
spark.sql(buffer.toString).createOrReplaceTempView("filter_after_action")
spark.sqlContext.cacheTable("filter_after_action")
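For the sample parameters, printing the assembled statement gives roughly the following (the base query and the non-sex clauses are the assumptions noted above):

println(buffer.toString)
// select v.* from user_visit_action v join user_info i on v.user_id = i.user_id
//   where 1=1 ... and i.sex in('male')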
2.2.2 Computing Dwell Time and Step Length
2.2.2.1 Computing Dwell Time
// UDF: session dwell time in seconds = last action time minus first action time.
def getTimeLen(endTime: String, startTime: String): Long = {
  val sdf: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
  (sdf.parse(endTime).getTime - sdf.parse(startTime).getTime) / 1000
}
spark.udf.register("getTimeLen", getTimeLen _)

// Per session: stepLen = number of actions, timeLen = span of action times.
// Note that, despite its name, `rdd` is a DataFrame.
val rdd = spark.sql("select count(*) stepLen, getTimeLen(max(action_time), min(action_time)) timeLen from filter_after_action group by session_id")
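For reference, the same aggregation can also be expressed with the DataFrame API; a sketch assuming the cached filter_after_action view and the UDF registered above:

import org.apache.spark.sql.functions._

val perSession = spark.table("filter_after_action")
  .groupBy("session_id")
  .agg(
    count("*").as("stepLen"),                                                      // actions per session
    callUDF("getTimeLen", max("action_time"), min("action_time")).as("timeLen"))   // dwell time in seconds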
2.2.2.2 Computing Step Length
2.2.2.2.1 Custom Accumulator
package cn.uaap.session

import cn.uaap.constant.Constants
import cn.uaap.util.StringUtils
import org.apache.spark.util.AccumulatorV2

// Accumulates counts in a single concatenated string of the form
// "session_count=0|1s_3s=0|...": add(field) increments the named field by 1.
class SessionAggrStatAccumulator extends AccumulatorV2[String, String] {

  var result = Constants.AGGR_RESULT.toString

  // Zero state: the initial template with every count at 0.
  override def isZero: Boolean = result == Constants.AGGR_RESULT.toString

  override def copy(): AccumulatorV2[String, String] = {
    val acc = new SessionAggrStatAccumulator
    acc.result = this.result
    acc
  }

  override def reset(): Unit = result = Constants.AGGR_RESULT.toString

  // v names a field, e.g. Constants.SESSION_COUNT; increment that field by 1.
  override def add(v: String): Unit = {
    if (StringUtils.isNotEmpty(result) && StringUtils.isNotEmpty(v)) {
      val oldValue = StringUtils.getFieldFromConcatString(result, "\\|", v)
      if (oldValue != null) { // leave result untouched for unknown fields
        result = StringUtils.setFieldInConcatString(result, "\\|", v, String.valueOf(oldValue.toInt + 1))
      }
    }
  }

  // Merge by summing each field of the other accumulator into this one
  // (a sketch, assuming the "key=value|key=value" format above).
  override def merge(other: AccumulatorV2[String, String]): Unit = other match {
    case o: SessionAggrStatAccumulator =>
      for (kv <- o.result.split("\\|") if kv.contains("=")) {
        val Array(key, cnt) = kv.split("=")
        val oldValue = StringUtils.getFieldFromConcatString(result, "\\|", key)
        if (oldValue != null) {
          result = StringUtils.setFieldInConcatString(result, "\\|", key, String.valueOf(oldValue.toInt + cnt.toInt))
        }
      }
    case _ =>
  }

  override def value: String = result
}
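A quick local sanity check of the accumulator's semantics (illustrative; it assumes Constants.AGGR_RESULT contains a field named by Constants.SESSION_COUNT):

val a = new SessionAggrStatAccumulator
a.add(Constants.SESSION_COUNT)
a.add(Constants.SESSION_COUNT)
println(a.value) // the session_count field should now read 2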
2.2.2.2.2 Collecting Step Length and Dwell Time
// Bucket each session's step length into the matching accumulator field.
def aggStepLength(row: Row, acc: SessionAggrStatAccumulator): Unit = {
  val stepLength = row.getAs[Long]("stepLen")
  if (stepLength >= 1 && stepLength <= 3) acc.add(Constants.STEP_PERIOD_1_3)
  else if (stepLength >= 4 && stepLength <= 6) acc.add(Constants.STEP_PERIOD_4_6)
  else if (stepLength >= 7 && stepLength <= 9) acc.add(Constants.STEP_PERIOD_7_9)
  else if (stepLength >= 10 && stepLength <= 30) acc.add(Constants.STEP_PERIOD_10_30)
  else if (stepLength > 30 && stepLength <= 60) acc.add(Constants.STEP_PERIOD_30_60)
  else if (stepLength > 60) acc.add(Constants.STEP_PERIOD_60)
}

// Bucket each session's dwell time (seconds) into the matching accumulator field.
def aggCostTime(row: Row, acc: SessionAggrStatAccumulator): Unit = {
  val nowTimeLen = row.getAs[Long]("timeLen")
  if (nowTimeLen >= 1 && nowTimeLen <= 3) acc.add(Constants.TIME_PERIOD_1s_3s)
  else if (nowTimeLen >= 4 && nowTimeLen <= 6) acc.add(Constants.TIME_PERIOD_4s_6s)
  else if (nowTimeLen >= 7 && nowTimeLen <= 9) acc.add(Constants.TIME_PERIOD_7s_9s)
  else if (nowTimeLen >= 10 && nowTimeLen <= 30) acc.add(Constants.TIME_PERIOD_10s_30s)
  else if (nowTimeLen > 30 && nowTimeLen <= 60) acc.add(Constants.TIME_PERIOD_30s_60s)
  else if (nowTimeLen > 60 && nowTimeLen <= 180) acc.add(Constants.TIME_PERIOD_1m_3m)
  else if (nowTimeLen > 180 && nowTimeLen <= 600) acc.add(Constants.TIME_PERIOD_3m_10m)
  else if (nowTimeLen > 600 && nowTimeLen <= 1800) acc.add(Constants.TIME_PERIOD_10m_30m)
  else if (nowTimeLen > 1800) acc.add(Constants.TIME_PERIOD_30m)
}
// Register the accumulator with the SparkContext, then feed it the aggregated rows.
val acc: SessionAggrStatAccumulator = new SessionAggrStatAccumulator
spark.sparkContext.register(acc)
rdd.collect().foreach(row => {
  acc.add(Constants.SESSION_COUNT) // every row is one session
  aggStepLength(row, acc)
  aggCostTime(row, acc)
})
print(acc.value)
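From the accumulated string you can then derive each bucket's share of all sessions; a minimal sketch, reusing the StringUtils helper from the accumulator and the NumberUtils.formatDouble helper used in section 4:

// Share of sessions that fall into one bucket of the concatenated counter string.
def bucketRatio(aggr: String, field: String): Double = {
  val total = StringUtils.getFieldFromConcatString(aggr, "\\|", Constants.SESSION_COUNT).toDouble
  val cnt = StringUtils.getFieldFromConcatString(aggr, "\\|", field).toDouble
  NumberUtils.formatDouble(cnt / total, 2)
}
println("share of 1s-3s sessions: " + bucketRatio(acc.value, Constants.TIME_PERIOD_1s_3s))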
3. Real-Time Filtering of Click-Fraud Users (Spark Streaming)
3.1 Requirement
A click-fraud ("order brushing") user clicks the same item many times within a period of time, to push it onto the hot-item list and inflate its sales. The goal is to flag such users in real time.
3.2 Data
3.2.1 Data Source
3.2.2 Data Storage
3.2.3 Data Overview
Field      Description
timestamp  event timestamp (epoch milliseconds)
user_id    user ID
item_id    item ID
3.3 Implementation
3.3.1 Setting the Checkpoint
updateStateByKey (section 3.3.5) keeps state across micro-batches, so Spark Streaming requires a checkpoint directory.
// 5-second micro-batches; the checkpoint directory backs the keyed state used below.
val streamingContext = new StreamingContext(sparkSession.sparkContext, Seconds(5))
streamingContext.checkpoint("checkpoint")
val sc = streamingContext.sparkContext
3.3.2 Configuring Kafka
val kafka_brokers = ConfigurationManager.getProperty(Constants.KAFKA_BROKERS)
val kafka_topics = ConfigurationManager.getProperty(Constants.KAFKA_TOPICS)
val kafkaParam = Map(
  "bootstrap.servers" -> kafka_brokers,
  "key.deserializer" -> classOf[StringDeserializer],
  "value.deserializer" -> classOf[StringDeserializer],
  "group.id" -> "group1",
  "enable.auto.commit" -> (false: java.lang.Boolean)
)
3.3.3 Creating the DStream
val realTimeDStream = KafkaUtils.createDirectStream[String, String](
  streamingContext,
  LocationStrategies.PreferConsistent,
  ConsumerStrategies.Subscribe[String, String](Array(kafka_topics), kafkaParam)
)
3.3.4 Transform
val readTimeValueDStream = realTimeDStream.map(item => {
  val msg = item.value
  // Space-separated log line; tokens 0, 3 and 4 carry the timestamp, user_id
  // and item_id (the intervening tokens are other fields not used here).
  val logSplit = msg.split(" ")
  val date = DateUtils.formatDate(new Date(logSplit(0).trim.toLong))
  val userId = logSplit(3).trim
  val itemId = logSplit(4).trim
  // Key by (date, user, item) so counts aggregate per user-item pair per day.
  (date + "_" + userId + "_" + itemId, 1L)
})
3.3.5 updateStateByKey
// Fold this batch's counts for a key into its running total.
val updateFunc = (currentBatch: Seq[Long], historyAllBatch: Option[Long]) => {
  val nowBatchSum = currentBatch.sum
  val historyAllBatchSum = historyAllBatch.getOrElse(0L)
  Some(nowBatchSum + historyAllBatchSum)
}
val result = readTimeValueDStream.updateStateByKey(updateFunc)

// Print user-item pairs whose cumulative click count exceeds the threshold (3 here).
result.filter(_._2 > 3).foreachRDD(rdd => rdd.foreach(println(_)))
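To act on the flagged keys instead of only printing them, here is a minimal sketch (an assumption, not part of the pipeline above) that extracts the offending user_ids; note the streaming context must also be started for any of this to run:

// Pull the user_id token out of each flagged "date_userId_itemId" key.
result.filter(_._2 > 3)
  .map { case (key, _) => key.split("_")(1) }
  .foreachRDD { rdd =>
    rdd.distinct().collect().foreach(uid => println("blacklisted user: " + uid))
  }

streamingContext.start()
streamingContext.awaitTermination()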
4. Page Single-Hop Conversion Statistics (Spark Core)
4.1 Requirement
For a given period, compute the conversion rate between each consecutive pair of pages in a page flow (similar to an AARRR funnel). For example, if page 1 gets 100 clicks and page 2 gets 40, the 1_2 conversion rate is 0.4.
4.2 Implementation
val taskParam = JSON.parseObject("{'start_date':'2020-04-14','end_date':'2020-04-14','page_flow':[1,2,3,4,6]}", classOf[TaskParam])
4.2.1 Filtering Sessions by Condition
def filterSessionByCondition(spark: SparkSession, taskParam: TaskParam): (RDD[Row], java.util.List[Integer]) = {
  val buffer = new StringBuffer
  val startTime = taskParam.getStartDate
  val endTime = taskParam.getEndDate
  val page_flow = taskParam.getPageFlow
  // Restrict to the date range and to pages that appear in the flow.
  buffer.append("select * from user_visit_action where (date>='" + startTime + "') and (date<='" + endTime + "')")
  buffer.append(" and (page_id in ")
    .append(JSON.toJSONString(page_flow, SerializerFeature.UseSingleQuotes)
      .replace("[", "(")
      .replace("]", ")"))
    .append(")")
  (spark.sql(buffer.toString).rdd, page_flow)
}
4.2.2 Computing Page-to-Page Conversion Rates
def calPageConvertRate(result: (RDD[Row], java.util.List[Integer])): Unit = {
  val rdd = result._1
  val page_flow = result._2

  // Count total clicks for every page in the flow.
  val container = new mutable.HashMap[Integer, Integer]
  for (i <- 0 until page_flow.size) {
    val page_id = page_flow.get(i)
    val nowPageTotalClickCnt = rdd.filter(row => row.getAs[Long]("page_id") == page_id.toLong).count
    container.put(page_id, nowPageTotalClickCnt.toInt)
  }

  // Build the result template "1_2=0|2_3=0|...".
  val sbuilder = new mutable.StringBuilder()
  for (i <- 0 until page_flow.size - 1) {
    val now_page_id = page_flow.get(i)
    val next_page_id = page_flow.get(i + 1)
    sbuilder.append(now_page_id).append("_").append(next_page_id).append("=0").append("|")
  }
  sbuilder.deleteCharAt(sbuilder.length - 1)
  var resultStr = sbuilder.toString

  // Fill in each rate = clicks(next page) / clicks(current page).
  for (i <- 0 until page_flow.size - 1) {
    val now_page_id = page_flow.get(i)
    val next_page_id = page_flow.get(i + 1)
    val now_page_click_cnt = container.get(now_page_id).get
    val next_page_click_cnt = container.get(next_page_id).get
    var rate = 0.0
    if (now_page_click_cnt != 0) { // guard against division by zero
      // .toDouble forces floating-point division; otherwise Int/Int truncates to 0 or 1
      rate = NumberUtils.formatDouble(next_page_click_cnt.toDouble / now_page_click_cnt, 2)
    }
    val key = now_page_id + "_" + next_page_id
    resultStr = StringUtils.setFieldInConcatString(resultStr, "\\|", key, rate.toString)
  }
  print(resultStr)
}
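Wiring the two steps together (a sketch, assuming an existing SparkSession named spark and the taskParam parsed above):

calPageConvertRate(filterSessionByCondition(spark, taskParam))
// prints something like: 1_2=0.4|2_3=0.25|3_4=0.5|4_6=0.1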