情景:在机器学习中,构建模型时更重要的是特征。特征又分为很多种:原始基本特征、统计特征、偏好特征等等。今天想记录一下统计特征。假如给你一个需求,让你统计某一天内每个人看过别人朋友圈的次数,估计你会统计;但如果让你统计这些人在最近三天、七天、十五天、三十天内看过别人朋友圈的次数,并分别作为几个特征,你想过该怎么做吗?下面给大家分享一个我用过的方法。
1、先统计出每一天的数据,例如这一天每个人看过别人朋友圈多少次
2、建立hive表,按天分区
3、把每一天的数据load到hive中,其它的办法也可以
4、之后按照正常的统计逻辑操作就可以啦,分享一下自己写的部分代码吧,写的有点low~
/**
 * Builds rolling-window statistical features from a day-partitioned table.
 *
 * For each reference date, the `count` column of `table_name` is summed per
 * `uid` over several trailing windows (see `windowDays`). The per-window sums
 * are outer-joined on `uid` and written out as CSV with a header row.
 *
 * Relies on a `spark` session and the `DataFrame` type being in scope from
 * elsewhere in the file.
 */
object ownLookownphoto {

  /** Window lengths, in days; each one yields a `count_<days>` column. */
  private val windowDays: Seq[Int] = Seq(3, 7, 15, 30)

  /** Generates and saves the feature table for every hard-coded reference date. */
  def main(args: Array[String]): Unit = {
    val date = List("20190601", "20190602", "20190603", "20190604", "20190605")
    date.foreach(merge)
  }

  /**
   * Computes the joined window features for `date` and saves them as CSV.
   *
   * @param date reference day in `yyyyMMdd` form
   */
  def merge(date: String): Unit = {
    val statis_feature = acquire_statis_feature(date)
    // Each date gets its own sub-directory: Spark's default save mode fails
    // when the target path already exists, so reusing a single path would
    // abort on the second date processed by `main`.
    statis_feature.write
      .format("csv")
      .option("header", "true")
      .save(s"file_path/$date")
  }

  /**
   * Joins the per-window sums for `date` into one DataFrame keyed by `uid`.
   *
   * Outer joins are used, so a uid missing from a window has a null in that
   * window's column. Columns appear in `windowDays` order after `uid`.
   *
   * @param date reference day in `yyyyMMdd` form
   * @return one row per uid with columns `count_3`, `count_7`, `count_15`, `count_30`
   */
  def acquire_statis_feature(date: String): DataFrame =
    windowDays
      .map(days => acquire_day_feature(date, days))
      .reduceLeft((joined, next) => joined.join(next, Seq("uid"), "outer"))

  /**
   * Shifts a `yyyyMMdd` day string by `num` calendar days.
   *
   * @param dayStr day in `yyyyMMdd` form
   * @param num    number of days to add; negative values move backwards
   * @return the shifted day, formatted as `yyyyMMdd`
   */
  def acquireDay(dayStr: String, num: Int): String = {
    // SimpleDateFormat is not thread-safe, so a fresh instance is made per call.
    val formatter: SimpleDateFormat = new SimpleDateFormat("yyyyMMdd")
    val calendar: Calendar = Calendar.getInstance()
    calendar.setTime(formatter.parse(dayStr))
    calendar.add(Calendar.DATE, num)
    formatter.format(calendar.getTime)
  }

  /**
   * Sums `count` per `uid` over the `inter` days immediately before `date`.
   *
   * The window is `[date - inter, date)`: the start day is included and the
   * reference day itself is excluded, so exactly `inter` days are covered.
   * (Both bounds used to be exclusive, which covered only `inter - 1` days.)
   * NOTE(review): confirm the reference day should stay excluded; if it must
   * be part of the window, use `day > start and day <= date` instead.
   *
   * @param date  reference day in `yyyyMMdd` form
   * @param inter window length in days
   * @return DataFrame with columns `uid` and `count_<inter>`
   */
  def acquire_day_feature(date: String, inter: Int): DataFrame = {
    val windowStart = acquireDay(date, -inter)
    // Dates are quoted so they are compared as yyyyMMdd literals rather than
    // bare numbers. Assumes `day` holds yyyyMMdd values — TODO confirm.
    val windowRows = spark.sql(
      s"select * from table_name where day >= '$windowStart' and day < '$date'"
    )
    windowRows
      .groupBy("uid")
      .sum("count")
      // `sum("count")` names its output column "sum(count)".
      .withColumnRenamed("sum(count)", s"count_$inter")
  }
}