1. 需求描述
处理后的点击日志格式如下:
- ip,访问时间,视频类别,请求状态,来源(referer)
- ClickLog(187.124.156.143,20171122,6,404,-)
需要的数据结果:
CategoryClickCount(dayCategoryId:String, clickCount:Int)
- dayCategoryId : 访问的日期 + "_" + 视频类别,例如 20171122_6
- clickCount : 访问的数量
2. API操作HBase
/**
 * Data-access object for the per-day, per-category click counters stored in HBase.
 *
 * Each counter lives in table `category_clickcount`, column `info:click_count`,
 * under a row key equal to the entry's `dayCategoryId`.
 */
object CategoryClickCountDAO {
  val tableName: String = "category_clickcount"
  val cf: String = "info"
  val qualifier: String = "click_count"

  /**
   * Adds every entry's click count to the counter stored under its row key.
   *
   * The table handle is closed once all increments have been issued, including
   * when one of them throws.
   * NOTE(review): this assumes `HBaseUtils.getTable` hands out a handle owned by
   * the caller (not a shared, cached instance) -- confirm against HBaseUtils.
   *
   * @param list entries to accumulate; `dayCategoryId` is used as the row key and
   *             `clickCount` as the increment amount
   * @author hepengpeng 2019/5/28 16:15
   */
  def save(list: ListBuffer[CategoryClickCount]): Unit = {
    val table = HBaseUtils.getInstance().getTable(tableName)
    try {
      // The column coordinates are the same for every row, so encode them once.
      val family = Bytes.toBytes(cf)
      val column = Bytes.toBytes(qualifier)
      list.foreach { entry =>
        table.incrementColumnValue(
          Bytes.toBytes(entry.dayCategoryId), family, column, entry.clickCount.toLong)
      }
    } finally {
      table.close()
    }
  }

  /**
   * Reads the click counter stored for one day/category row key.
   *
   * @param dayCategory row key to look up
   * @return the stored counter, or 0 when the row or column holds no value
   * @author hepengpeng 2019/5/28 16:15
   */
  def count(dayCategory: String): Long = {
    val table = HBaseUtils.getInstance().getTable(tableName)
    try {
      val get = new Get(Bytes.toBytes(dayCategory))
      // Encode family/qualifier with Bytes.toBytes, exactly as `save` does, instead of
      // String.getBytes(), whose result depends on the platform default charset.
      val value = table.get(get).getValue(Bytes.toBytes(cf), Bytes.toBytes(qualifier))
      // getValue yields null when the cell is absent; treat that as a zero count.
      Option(value).map(bytes => Bytes.toLong(bytes)).getOrElse(0L)
    } finally {
      table.close()
    }
  }

  /** Manual smoke check: prints the counter stored for row key "20190330_1". */
  def main(args: Array[String]): Unit = {
    print(count("20190330_1"))
  }
}
3. 保存收集数据到HBase
ClickLog(187.124.156.143,20171122,6,404,-)
ClickLog (ip:String, time:String, categoryId:Int, statusCode:Int, referer:String)
// Count clicks per "<first 8 chars of time>_<categoryId>" key within each batch,
// then add the per-key totals to the HBase counters one partition at a time.
cleanData.map { log =>
  (s"${log.time.substring(0, 8)}_${log.categoryId}", 1)
}.reduceByKey(_ + _).foreachRDD { rdd =>
  rdd.foreachPartition { partition =>
    val counts = new ListBuffer[CategoryClickCount]
    partition.foreach { case (dayCategoryId, clicks) =>
      counts.append(CategoryClickCount(dayCategoryId, clicks))
    }
    // Skip the DAO call for empty partitions so no HBase table handle is
    // requested when there is nothing to write.
    if (counts.nonEmpty) {
      CategoryClickCountDAO.save(counts)
    }
  }
}