转:http://blog.csdn.net/kwu_ganymede/article/details/49927311
1、优化前的SQL
- SELECT
- COUNT(*) pv
- FROM
- (
- SELECT
- cookieid,
- userid,
- to_date(DATETIME) day1
- FROM
- ods.tracklog_5min
- WHERE
- DAY>='20151001'
- AND DAY<='20151031'
- AND lower(requesturl) IN ('http://chat.hexun.com/',
- 'http://zhibo.hexun.com/'))t1
- INNER JOIN
- (
- SELECT
- cookieid,
- to_date(DATETIME) day2
- FROM
- ods.tracklog_5min
- WHERE
- DAY>='20151001'
- AND DAY<='20151031'
- AND ((
- lower(requesturl) LIKE 'http://zhibo.hexun.com/%'
- OR lower(requesturl) LIKE 'http://chat.hexun.com/%')
- AND requesturl LIKE '%/default.html%'))t2
- ON
- t1.cookieid=t2.cookieid
- AND t1.day1=t2.day2
- INNER JOIN
- (
- SELECT
- cookieid,
- to_date(DATETIME) day3
- FROM
- ods.tracklog_5min
- WHERE
- DAY>='20151001'
- AND DAY<='20151031'
- AND ( (
- lower(requesturl) LIKE 'http://px.hexun.com/%'
- AND lower(requesturl) LIKE '%/default.html%' )
- OR (
- lower(requesturl) LIKE 'http://px.hexun.com/pack/%'
- AND lower(requesturl) LIKE '%.html%' )
- OR (
- lower(requesturl) LIKE 'http://px.hexun.com/p/%'
- AND lower(requesturl) LIKE '%.html%' ) ))t3
- ON
- t1.cookieid=t3.cookieid
- AND t1.day1=t3.day3
- LEFT JOIN
- stage.saleplatform_productvisitdetail_temp t4
- ON
- t1.userid=t4.userid
- WHERE
- t4.createtime>t1.day1
- OR t4.userid IS NULL;
可以看,上面的SQL针对同一源表的数据查询了三次,浪费了系统的资源,相同的源完全可以通用。
2、优化后的SQL
抽出公共数据
- create table default.tracklog_10month as
- select * from ods.tracklog_5min
- WHERE DAY>='20151001' AND DAY<='20151031';
- SELECT
- COUNT(*) pv
- FROM
- (
- SELECT
- cookieid,
- userid,
- to_date(DATETIME) day1
- FROM
- default.tracklog_10month
- WHERE
- lower(requesturl) IN ('http://chat.hexun.com/',
- 'http://zhibo.hexun.com/'))t1
- INNER JOIN
- (
- SELECT
- cookieid,
- to_date(DATETIME) day2
- FROM
- default.tracklog_10month
- WHERE (lower(requesturl) LIKE 'http://zhibo.hexun.com/%'
- OR lower(requesturl) LIKE 'http://chat.hexun.com/%')
- AND requesturl LIKE '%/default.html%')t2
- ON
- t1.cookieid=t2.cookieid
- AND t1.day1=t2.day2
- INNER JOIN
- (
- SELECT
- cookieid,
- to_date(DATETIME) day3
- FROM
- default.tracklog_10month
- WHERE
- ( (
- lower(requesturl) LIKE 'http://px.hexun.com/%'
- AND lower(requesturl) LIKE '%/default.html%' )
- OR (
- lower(requesturl) LIKE 'http://px.hexun.com/pack/%'
- AND lower(requesturl) LIKE '%.html%' )
- OR (
- lower(requesturl) LIKE 'http://px.hexun.com/p/%'
- AND lower(requesturl) LIKE '%.html%' ) ))t3
- ON
- t1.cookieid=t3.cookieid
- AND t1.day1=t3.day3
- LEFT JOIN
- stage.saleplatform_productvisitdetail_temp t4
- ON
- t1.userid=t4.userid
- WHERE
- t4.createtime>t1.day1
- OR t4.userid IS NULL;
3、共享中间结果集
本质就是降IO,减少MR阶段中大量读写磁盘及网络IO的压力。