目录
Spark Streaming foreachRDD的正确使用方式
对于这种实时计算程序的mysql插入,有两种pattern(模式)
AdUserClickCountQueryResult.java
本篇文章记录广告点击流量实时统计-使用高性能方式将实时计算结果写入MySQL中。
误区
Spark Streaming foreachRDD的正确使用方式
误区一:在driver上创建连接对象(比如网络连接或数据库连接)
如果在driver上创建连接对象,然后在RDD的算子函数内使用连接对象,那么就意味着需要将连接对象序列化后从driver传递到worker上。而连接对象(比如Connection对象)通常来说是不支持序列化的,此时通常会报序列化的异常(serialization errors)。因此连接对象必须在worker上创建,不要在driver上创建。
dstream.foreachRDD { rdd =>
val connection = createNewConnection() // runs on the driver; a Connection is not serializable, so shipping it to workers raises serialization errors
rdd.foreach { record =>
connection.send(record) // runs on a worker
}
}
误区二:为每一条记录都创建一个连接对象
dstream.foreachRDD { rdd =>
rdd.foreach { record =>
// Anti-pattern: a connection is created and torn down for every single
// record, which is expensive and drags down job throughput.
val connection = createNewConnection()
connection.send(record)
connection.close()
}
}
通常来说,连接对象的创建和销毁都是很消耗时间的。因此频繁地创建和销毁连接对象,可能会导致降低spark作业的整体性能和吞吐量。
正确做法一:为每个RDD分区创建一个连接对象
dstream.foreachRDD { rdd =>
rdd.foreachPartition { partitionOfRecords =>
// One connection per RDD partition: every record in the partition
// shares it, greatly reducing the number of connections created.
val connection = createNewConnection()
partitionOfRecords.foreach(record => connection.send(record))
connection.close()
}
}
比较正确的做法是:对DStream中的RDD,调用foreachPartition,对RDD中每个分区创建一个连接对象,使用一个连接对象将一个分区内的数据都写入底层MySQL中。这样可以大大减少创建的连接对象的数量。
正确做法二:为每个RDD分区使用一个连接池中的连接对象
dstream.foreachRDD { rdd =>
rdd.foreachPartition { partitionOfRecords =>
// Static connection pool; connections are created lazily.
val connection = ConnectionPool.getConnection()
partitionOfRecords.foreach(record => connection.send(record))
ConnectionPool.returnConnection(connection) // return the connection to the pool for reuse
}
}
对于这种实时计算程序的mysql插入,有两种pattern(模式)
1、每次插入前,先查询,看看有没有数据,如果有,则执行update语句;如果没有,则执行insert语句;好处在于,每个key就对应一条记录;坏处在于,本来对一个分区的数据就是一条insert batch,现在很麻烦,还得先执行select语句,再决定是insert还是update。
j2ee系统,查询某个key的时候,就直接查询指定的key就好。
2、每次插入记录,你就插入就好,但是呢,需要在mysql库中,给每一个表,都加一个时间戳(timestamp),对于同一个key,5秒一个batch,每隔5秒中就有一个记录插入进去。相当于在mysql中维护了一个key的多个版本。
j2ee系统,查询某个key的时候,还得限定是要order by timestamp desc limit 1,查询最新时间版本的数据
通过mysql来用这种方式,不是很好,很不方便后面j2ee系统的使用
这种方式不用mysql,若用hbase(timestamp的多个版本,而且它不区分insert和update,统一就是去对某个行键rowkey去做更新)
代码
domain
AdUserClickCount.java
package graduation.java.domain; /** * FileName: AdUserClickCount * Author: hadoop * Email: 3165845957@qq.com * Date: 19-4-3 上午10:39 * Description: * 广告用户点击量实体类 */ public class AdUserClickCount { private String date; private long userid ; private long adid; private long clickCount; public String getDate() { return date; } public void setDate(String date) { this.date = date; } public long getUserid() { return userid; } public void setUserid(long userid) { this.userid = userid; } public long getAdid() { return adid; } public void setAdid(long adid) { this.adid = adid; } public long getClickCount() { return clickCount; } public void setClickCount(long clickCount) { this.clickCount = clickCount; } @Override public String toString() { return "AdUserClickCount{" + "date='" + date + '\'' + ", userid=" + userid + ", adid=" + adid + ", clickCount=" + clickCount + '}'; } }
dao
IAdUserClickCountDAO.java
package graduation.java.dao;

import graduation.java.domain.AdUserClickCount;

import java.util.List;

/**
 * FileName: IAdUserClickCountDAO
 * Author: hadoop
 * Email: 3165845957@qq.com
 * Date: 19-4-3 上午10:42
 *
 * DAO contract for persisting per-user ad click counts.
 */
public interface IAdUserClickCountDAO {

    /**
     * Writes the given click counts in batch, inserting rows that do not
     * exist yet and updating those that do.
     *
     * @param adUserClickCounts records to persist
     */
    void updateBatch(List<AdUserClickCount> adUserClickCounts);
}
impl
AdUserClickCountDAOImpl.java
package graduation.java.impl; import graduation.java.dao.IAdUserClickCountDAO; import graduation.java.domain.AdUserClickCount; import graduation.java.jdbc.JDBCHelper; import graduation.java.model.AdUserClickCountQueryResult; import java.sql.ResultSet; import java.util.ArrayList; import java.util.List; /** * FileName: AdUserClickCountDAOImpl * Author: hadoop * Email: 3165845957@qq.com * Date: 19-4-3 上午10:44 * Description: * 广告用户点击量DAO实现类 */ public class AdUserClickCountDAOImpl implements IAdUserClickCountDAO { @Override public void updateBatch(List<AdUserClickCount> adUserClickCounts) { JDBCHelper jdbcHelper = JDBCHelper.getInstance(); //首先将用户广告点击量进行分类,分成待插入和已插入 List<AdUserClickCount> insertAdUserClickCounts = new ArrayList<AdUserClickCount>(); List<AdUserClickCount> updateAdUserClickCounts = new ArrayList<AdUserClickCount>(); String selectSQL = "SELECT count(*) FROM ad_user_click_count " + "WHERE date=? AND user_id=? AND adid=?"; Object[] selectParam = null; for (AdUserClickCount adUserClickCount : adUserClickCounts){ final AdUserClickCountQueryResult queryResult = new AdUserClickCountQueryResult(); selectParam = new Object[]{ adUserClickCount.getDate(), adUserClickCount.getUserid(), adUserClickCount.getAdid()}; jdbcHelper.executeQuery(selectSQL, selectParam, new JDBCHelper.QueryCallback() { @Override public void process(ResultSet rs) throws Exception { if (rs.next()){ int count = rs.getInt(1); queryResult.setCount(count); } } }); int count = queryResult.getCount(); if (count > 0){ updateAdUserClickCounts.add(adUserClickCount); }else{ insertAdUserClickCounts.add(adUserClickCount); } } //执行批量插入 String insertSQL = "INSERT INTO ad_user_click_count VALUES(?,?,?,?)"; List<Object[]> insertParamList = new ArrayList<Object[]>(); for (AdUserClickCount adUserClickCount : insertAdUserClickCounts){ Object[] insertParam = new Object[]{ adUserClickCount.getDate(), adUserClickCount.getUserid(), adUserClickCount.getAdid(), adUserClickCount.getClickCount()}; 
insertParamList.add(insertParam); } jdbcHelper.executeBatch(insertSQL,insertParamList); //执行批量更新操作 String updateSQL = "UPDATE ad_user_click_count SET click_count=? " + "WHERE date=? AND user_id=? AND adid=?"; List<Object[]> updateParamList = new ArrayList<Object[]>(); for (AdUserClickCount adUserClickCount:updateAdUserClickCounts){ Object[] updateParam = new Object[]{ adUserClickCount.getClickCount(), adUserClickCount.getDate(), adUserClickCount.getUserid(), adUserClickCount.getAdid() }; updateParamList.add(updateParam); } jdbcHelper.executeBatch(updateSQL,updateParamList); } }
factory
DAOFactory.java
/**
 * Returns the DAO managing per-user ad click counts.
 *
 * <p>Declared as the {@code IAdUserClickCountDAO} interface rather than the
 * concrete implementation, so callers depend only on the abstraction this
 * factory exists to provide. The fully-qualified name keeps this method
 * self-contained regardless of the file's import list.
 *
 * @return a new DAO instance
 */
public static graduation.java.dao.IAdUserClickCountDAO getAdUserClickCountDAO() {
    return new AdUserClickCountDAOImpl();
}
model
AdUserClickCountQueryResult.java
package graduation.java.model; /** * FileName: AdUserClickCountQueryResult * Author: hadoop * Email: 3165845957@qq.com * Date: 19-4-3 上午10:55 * Description: * 用户广告点击量查询结果 */ public class AdUserClickCountQueryResult { private int count; public int getCount() { return count; } public void setCount(int count) { this.count = count; } @Override public String toString() { return "AdUserClickCountQueryResult{" + "count=" + count + '}'; } }
spark.ad
AdUserRealTimeStatSpark.java
// 到这里为止,获取到了什么数据呢? // dailyUserAdClickCountDStream DStream // 源源不断的,每个5s的batch中,当天每个用户对每支广告的点击次数 // <yyyyMMdd_userid_adid, clickCount> dailyUserAdClickCountDStream.foreachRDD (new VoidFunction<JavaPairRDD<String, Long>>() { private static final long serialVersionUID = 1L; @Override public void call(JavaPairRDD<String, Long> rdd) throws Exception { rdd.foreachPartition(new VoidFunction<Iterator<Tuple2<String, Long>>>() { private static final long serialVersionUID = 1L; @Override public void call(Iterator<Tuple2<String, Long>> iterator) throws Exception { // 对每个分区的数据就去获取一次连接对象 // 每次都是从连接池中获取,而不是每次都创建 // 写数据库操作,性能已经提到最高了 List<AdUserClickCount> adUserClickCounts = new ArrayList<AdUserClickCount>(); while (iterator.hasNext()) { Tuple2<String, Long> tuple = iterator.next(); String[] keySplited = tuple._1.split("_"); String date = DateUtils.formatDate(DateUtils.parseDateKey(keySplited[0])); // yyyy-MM-dd long userid = Long.valueOf(keySplited[1]); long adid = Long.valueOf(keySplited[2]); long clickCount = tuple._2; AdUserClickCount adUserClickCount = new AdUserClickCount(); adUserClickCount.setDate(date); adUserClickCount.setUserid(userid); adUserClickCount.setAdid(adid); adUserClickCount.setClickCount(clickCount); adUserClickCounts.add(adUserClickCount); } IAdUserClickCountDAO adUserClickCountDAO = DAOFactory.getAdUserClickCountDAO(); adUserClickCountDAO.updateBatch(adUserClickCounts); } }); } });