目录
AdUserClickRealTimeStatSpark.java
本篇文章记录广告点击流量实时统计-过滤出每个batch中的黑名单用户以生成动态黑名单。
动态黑名单
- 计算出每个batch中的每天每个用户对每个广告的点击量,并持久化到mysql中
- 依据上述计算出来的数据,对每个batch中按date、userid、adid聚合的数据逐条遍历,查询对应的累计点击次数;如果超过了100,就认定该用户为黑名单用户。然后对黑名单用户进行去重,去重后将黑名单用户持久化插入到mysql中。所以mysql中ad_blacklist表里的黑名单用户是动态地、实时地增长的,因此mysql中的ad_blacklist表就可以认为是一张动态黑名单。
- 基于上述计算出来的动态黑名单,在最一开始,就对每个batch中的点击行为,根据动态黑名单进行过滤,把黑名单中的用户的点击行为,直接过滤掉
代码
domain
AdBlacklist.java
package graduation.java.domain; /** * FileName: AdBlacklist * Author: hadoop * Email: 3165845957@qq.com * Date: 19-4-3 下午3:38 * Description: * 用户黑名单实体类 */ public class AdBlacklist { private long userid ; public long getUserid() { return userid; } public void setUserid(long userid) { this.userid = userid; } @Override public String toString() { return "AdBlacklist{" + "userid=" + userid + '}'; } }
dao
IAdBlacklistDAO.java
package graduation.java.dao;

import graduation.java.domain.AdBlacklist;

import java.util.List;

/**
 * FileName: IAdBlacklistDAO
 * Author: hadoop
 * Email: 3165845957@qq.com
 * Date: 19-4-3 下午3:39
 * Description: DAO interface for the ad blacklist (ad_blacklist table).
 */
public interface IAdBlacklistDAO {

    /**
     * Batch-inserts blacklisted users.
     *
     * @param adBlacklists blacklist entries to persist
     */
    void insertBatch(List<AdBlacklist> adBlacklists);

    /**
     * Looks up every blacklisted user.
     *
     * @return all blacklist entries currently persisted
     */
    List<AdBlacklist> findAll();
}
IAdUserClickCountDAO.java
package graduation.java.dao;

import graduation.java.domain.AdUserClickCount;

import java.util.List;

/**
 * FileName: IAdUserClickCountDAO
 * Author: hadoop
 * Email: 3165845957@qq.com
 * Date: 19-4-3 上午10:42
 * Description: DAO interface for per-user ad click counts
 * (ad_user_click_count table).
 */
public interface IAdUserClickCountDAO {

    /**
     * Batch-updates (insert-or-update) per-user ad click counts.
     *
     * @param adUserClickCounts click-count records to persist
     */
    void updateBatch(List<AdUserClickCount> adUserClickCounts);

    /**
     * Queries a user's click count for one ad on one day.
     *
     * @param date   date (day granularity)
     * @param userid user id
     * @param adid   ad id
     * @return the accumulated click count for that (date, user, ad) key
     */
    int findClickCountByMultiKey(String date, long userid, long adid);
}
impl
AdBlacklistDAOImpl.java
package graduation.java.impl; import graduation.java.dao.IAdBlacklistDAO; import graduation.java.domain.AdBlacklist; import graduation.java.jdbc.JDBCHelper; import java.sql.ResultSet; import java.util.ArrayList; import java.util.List; /** * FileName: AdBlacklistDAOImpl * Author: hadoop * Email: 3165845957@qq.com * Date: 19-4-3 下午3:43 * Description: */ public class AdBlacklistDAOImpl implements IAdBlacklistDAO { /** * 批量插入广告黑名单用户 * @param adBlacklists */ @Override public void insertBatch(List<AdBlacklist> adBlacklists) { String insertSQL = "INSERT INTO ad_blacklist VALUES(?)"; List<Object[]> paramLiist = new ArrayList<Object[]>(); for (AdBlacklist adBlacklist : adBlacklists){ Object[] params = new Object[]{adBlacklist.getUserid()}; paramLiist.add(params); } JDBCHelper jdbcHelper = JDBCHelper.getInstance(); jdbcHelper.executeBatch(insertSQL,paramLiist); } /** * 查询所有的广告黑名单用户 * @return */ @Override public List<AdBlacklist> findAll() { String selectSQL = "SELECT * FROM ad_blacklist"; final List<AdBlacklist> adBlacklists = new ArrayList<AdBlacklist>(); JDBCHelper jdbcHelper = JDBCHelper.getInstance(); jdbcHelper.executeQuery(selectSQL, null, new JDBCHelper.QueryCallback() { @Override public void process(ResultSet rs) throws Exception { while (rs.next()){ long userid = rs.getLong(1); //long userid = Long.valueOf(String.valueOf(rs.getInt(1)); AdBlacklist adBlacklist = new AdBlacklist(); adBlacklist.setUserid(userid); adBlacklists.add(adBlacklist); } } }); return adBlacklists; } }
AdUserClickCountDAOImpl.java
package graduation.java.impl; import graduation.java.dao.IAdUserClickCountDAO; import graduation.java.domain.AdUserClickCount; import graduation.java.jdbc.JDBCHelper; import graduation.java.model.AdUserClickCountQueryResult; import java.sql.ResultSet; import java.util.ArrayList; import java.util.List; /** * FileName: AdUserClickCountDAOImpl * Author: hadoop * Email: 3165845957@qq.com * Date: 19-4-3 上午10:44 * Description: * 广告用户点击量DAO实现类 */ public class AdUserClickCountDAOImpl implements IAdUserClickCountDAO { @Override public void updateBatch(List<AdUserClickCount> adUserClickCounts) { JDBCHelper jdbcHelper = JDBCHelper.getInstance(); //首先将用户广告点击量进行分类,分成待插入和已插入 List<AdUserClickCount> insertAdUserClickCounts = new ArrayList<AdUserClickCount>(); List<AdUserClickCount> updateAdUserClickCounts = new ArrayList<AdUserClickCount>(); String selectSQL = "SELECT count(*) FROM ad_user_click_count " + "WHERE date=? AND user_id=? AND adid=?"; Object[] selectParam = null; for (AdUserClickCount adUserClickCount : adUserClickCounts){ final AdUserClickCountQueryResult queryResult = new AdUserClickCountQueryResult(); selectParam = new Object[]{ adUserClickCount.getDate(), adUserClickCount.getUserid(), adUserClickCount.getAdid()}; jdbcHelper.executeQuery(selectSQL, selectParam, new JDBCHelper.QueryCallback() { @Override public void process(ResultSet rs) throws Exception { if (rs.next()){ int clickCount = rs.getInt(1); queryResult.setClickCount(clickCount); } } }); int count = queryResult.getClickCount(); if (count > 0){ updateAdUserClickCounts.add(adUserClickCount); }else{ insertAdUserClickCounts.add(adUserClickCount); } } //执行批量插入 String insertSQL = "INSERT INTO ad_user_click_count VALUES(?,?,?,?)"; List<Object[]> insertParamList = new ArrayList<Object[]>(); for (AdUserClickCount adUserClickCount : insertAdUserClickCounts){ Object[] insertParam = new Object[]{ adUserClickCount.getDate(), adUserClickCount.getUserid(), adUserClickCount.getAdid(), adUserClickCount.getClickCount()}; 
insertParamList.add(insertParam); } jdbcHelper.executeBatch(insertSQL,insertParamList); //执行批量更新操作 String updateSQL = "UPDATE ad_user_click_count SET click_count=? " + "WHERE date=? AND user_id=? AND adid=?"; List<Object[]> updateParamList = new ArrayList<Object[]>(); for (AdUserClickCount adUserClickCount:updateAdUserClickCounts){ Object[] updateParam = new Object[]{ adUserClickCount.getClickCount(), adUserClickCount.getDate(), adUserClickCount.getUserid(), adUserClickCount.getAdid() }; updateParamList.add(updateParam); } jdbcHelper.executeBatch(updateSQL,updateParamList); } @Override public int findClickCountByMultiKey(String date, long userid, long adid) { String sql = "SELECT click_count " + "FROM ad_user_click_count " + "WHERE date=? " + "AND user_id=? " + "AND ad_id=?"; Object[] params = new Object[]{date,userid,adid}; final AdUserClickCountQueryResult queryResult = new AdUserClickCountQueryResult(); JDBCHelper jdbcHelper = JDBCHelper.getInstance(); jdbcHelper.executeQuery(sql, params, new JDBCHelper.QueryCallback() { @Override public void process(ResultSet rs) throws Exception { while (rs.next()){ int clickCount = rs.getInt(1); queryResult.setClickCount(clickCount); } } }); int clickCount = queryResult.getClickCount(); return 0; } }
spark.ad
AdUserClickRealTimeStatSpark.java
// At this point MySQL already holds the accumulated per-day, per-user, per-ad
// click counts. For every record in the batch we look up that accumulated count
// and, once it reaches 100, treat the user as a blacklist user and persist them.
//
// We filter dailyUserAdClickCountDStream (keys are "yyyyMMdd_userid_adid")
// rather than the raw click stream because it is already aggregated per key —
// e.g. 10,000 raw events may collapse to 5,000 aggregated records — so we do
// strictly less work per batch while still covering every (day, user, ad) key.
JavaPairDStream<String,Long> blacklistDStream = dailyUserAdClickCountDStream.filter(new Function<Tuple2<String, Long>, Boolean>() {
    private static final long serialVersionUID = 1L;

    @Override
    public Boolean call(Tuple2<String, Long> tuple) throws Exception {
        String key = tuple._1;
        String[] keySplited = key.split("_");
        // key format: yyyyMMdd_userid_adid; re-format the date as yyyy-MM-dd
        // to match how the counts were persisted
        String date = DateUtils.formatDate(DateUtils.parseDateKey(keySplited[0]));
        long userid = Long.valueOf(keySplited[1]);
        long adid = Long.valueOf(keySplited[2]);
        // query the accumulated click count for this (date, user, ad) from MySQL
        IAdUserClickCountDAO adUserClickCountDAO = DAOFactory.getAdUserClickCountDAO();
        int clickCount = adUserClickCountDAO.findClickCountByMultiKey(date,userid,adid);
        // keep (true) only the keys whose accumulated count has reached the
        // blacklist threshold of 100 clicks
        if (clickCount >= 100){
            return true;
        }
        // below the threshold: leave the user alone for now
        return false;
    }
});

// blacklistDStream now carries only (key, count) pairs whose user has exceeded
// 100 clicks on some ad on some day. Because the earlier part of this job
// filters each batch against the blacklist, a user inserted here will not show
// up again — but the SAME batch can still contain one key per ad for one user
// (e.g. 20151220_10001_10002 and 20151220_10001_10003 both map to user 10001),
// so the userids must be de-duplicated before inserting into MySQL.
JavaDStream<Long> blacklistUseridDStream = blacklistDStream.map(new Function<Tuple2<String, Long>, Long>() {
    private static final long serialVersionUID = 1L;

    @Override
    public Long call(Tuple2<String, Long> tuple) throws Exception {
        // extract just the userid (middle segment of yyyyMMdd_userid_adid)
        String key = tuple._1;
        String[] keySplited = key.split("_");
        long userid = Long.valueOf(keySplited[1]);
        return userid;
    }
});

// global per-RDD de-duplication of the blacklisted userids
JavaDStream<Long> distinctBlacklistUseridDStream = blacklistUseridDStream.transform(new Function<JavaRDD<Long>, JavaRDD<Long>>() {
    private static final long serialVersionUID = 1L;

    @Override
    public JavaRDD<Long> call(JavaRDD<Long> rdd) throws Exception {
        return rdd.distinct();
    }
});

// Each RDD of distinctBlacklistUseridDStream now holds globally de-duplicated
// userids; persist them so ad_blacklist grows into the dynamic blacklist.
distinctBlacklistUseridDStream.foreachRDD(new VoidFunction<JavaRDD<Long>>() {
    private static final long serialVersionUID = 1L;

    @Override
    public void call(JavaRDD<Long> rdd) throws Exception {
        rdd.foreachPartition(new VoidFunction<Iterator<Long>>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void call(Iterator<Long> iterator) throws Exception {
                // collect the partition's userids into entities, then insert
                // them in one batch (one DB round-trip per partition)
                List<AdBlacklist> adBlacklists = new ArrayList<AdBlacklist>();
                while (iterator.hasNext()){
                    long userid = iterator.next();
                    AdBlacklist adBlacklist = new AdBlacklist();
                    adBlacklist.setUserid(userid);
                    adBlacklists.add(adBlacklist);
                }
                IAdBlacklistDAO adBlacklistDAO = DAOFactory.getAdBlacklistDAO();
                adBlacklistDAO.insertBatch(adBlacklists);
                // The dynamic blacklist is now complete:
                // 1. each batch's per-day/per-user/per-ad click counts are
                //    persisted to MySQL;
                // 2. each batch's (date, userid, adid) aggregates are checked
                //    against the accumulated counts, users over 100 clicks are
                //    de-duplicated and inserted into ad_blacklist, so that
                //    table grows dynamically in real time;
                // 3. at the very start of the job each batch's click events
                //    are filtered against this blacklist, dropping blacklisted
                //    users' clicks outright.
            }
        });
    }
});