数据优化:
1、小表放在前面,大表放在后面。因为会把前面的表读进内存再进行关联。
2、把分区的条件在on关系后面,不要放在where后面。因为放where后面会把所有分区关联后再按分区过滤。
3、不要使用count(distinct()),使用先分组,后count()。
4、数据倾斜 mapjoin
对于第三点做的测试
优化之前时间:
Time taken: 3107.217 seconds
insert overwrite table TMP_CX_FIN_PAC_HOW_TRADE_INFOS_04_07_02
select
Partyno,
COUNT(DISTINCT concat(F.case_no,F.case_times)) AS DAMAGE_ID_count,
SUM(double(NVL(F.APPLY_INDEMNITY_AMOUNT,0))) AS APPLY_INDEMNITY_AMOUNT
from (
select
E.Partyno,
B.REPORT_NO,
C.ESTIMATE_LOSS_AMOUNT APPLY_INDEMNITY_AMOUNT,
f.case_no,
f.case_times
from gbd_cx.EPCIS_EPCISCLM_T_CLAIM_INFO B ----数据量 2995238
join gbd_cx.EPCIS_EPCISCLM_T_CLAIM_LIFE_BASE C ----数据量 3007382
ON B.REPORT_NO = C.report_no
and B.Case_Times=C.Case_Times
and NVL(C.CLAIM_MARK,'#')='5'
and C.END_DATE IS NOT NULL
and to_date(C.END_DATE)>= pa_monthadd(date_sub(from_unixtime(unix_timestamp()),1),-60)
AND to_date(C.END_DATE)<pa_monthadd(date_sub(from_unixtime(unix_timestamp()),1),0)
join gbd_cx.epcis_epcisclm_t_person_pay_detail F ----数据量 3023130
on b.case_no = F.case_no
and b.case_times = F.case_times
join BDL_CX_CLIENT_PARTY E ----数据量 353150133
on F.PARTY_NO=E.Clientno
join BDL_CX_CLIENT A ----数据量 575258453
on A.Policy_No=B.Policy_No
AND A.Y='2014'
AND A.M='03'
and A.Client_Role in('1','2')
) f
group by Partyno
--------------------------
优化之后时间:
Time taken: 1190.552 seconds
select
Partyno,
COUNT(concat(F.case_no,F.case_times)) AS DAMAGE_ID_count,
SUM(double(NVL(F.APPLY_INDEMNITY_AMOUNT,0))) AS APPLY_INDEMNITY_AMOUNT
from (
select
E.Partyno,
sum(double(NVL(C.ESTIMATE_LOSS_AMOUNT,0))) APPLY_INDEMNITY_AMOUNT,
f.case_no,
f.case_times
from ${hive_db_name_cx}.EPCIS_EPCISCLM_T_CLAIM_INFO B ----数据量 2995238
join ${hive_db_name_cx}.EPCIS_EPCISCLM_T_CLAIM_LIFE_BASE C ----数据量 3007382
ON B.REPORT_NO = C.report_no
and B.Case_Times=C.Case_Times
and NVL(C.CLAIM_MARK,'#')='5'
and C.END_DATE IS NOT NULL
and to_date(C.END_DATE)>= pa_monthadd(date_sub(from_unixtime(unix_timestamp()),1),-60)
AND to_date(C.END_DATE)<pa_monthadd(date_sub(from_unixtime(unix_timestamp()),1),0)
join ${hive_db_name_cx}.epcis_epcisclm_t_person_pay_detail F ----数据量 3023130
on b.case_no = F.case_no
and b.case_times = F.case_times
join BDL_CX_CLIENT_PARTY E ----数据量 353150133
on F.PARTY_NO=E.Clientno
join BDL_CX_CLIENT A ----数据量 575258453
on A.Policy_No=B.Policy_No
AND A.Y='${dateStr:0:4}'
AND A.M='${dateStr:4:2}'
and A.Client_Role in('1','2')
group by E.Partyno,f.case_no,f.case_times
) f
group by Partyno
以下4个语句的执行时间分别是 155,101,114,87秒。区别就是过滤条件以及表的先后顺序。
这个说明:1,过滤条件写到on里面效果确实好于写到where里;
2,小表写到前面也有一点作用。
create table test_performance_1 as
select a.*,B.REPORT_NO
from BDL_CX_CLIENT A ----数据量 575258453
join gbd_cx.EPCIS_EPCISCLM_T_CLAIM_INFO B ----数据量 2995238
on A.Policy_No=B.Policy_No
where A.Client_Role in('1','2')
AND A.Y='2014'
AND A.M='03';
create table test_performance_2 as
select a.*,B.REPORT_NO
from BDL_CX_CLIENT A ----数据量 575258453
join gbd_cx.EPCIS_EPCISCLM_T_CLAIM_INFO B ----数据量 2995238
on A.Policy_No=B.Policy_No
and A.Client_Role in('1','2')
AND A.Y='2014'
AND A.M='03';
create table test_performance_3 as
select a.*,B.REPORT_NO
from gbd_cx.EPCIS_EPCISCLM_T_CLAIM_INFO B ----数据量 2995238
join BDL_CX_CLIENT A ----数据量 575258453
on B.Policy_No=A.Policy_No
where A.Client_Role in('1','2')
AND A.Y='2014'
AND A.M='03';
create table test_performance_4 as
select a.*,B.REPORT_NO
from gbd_cx.EPCIS_EPCISCLM_T_CLAIM_INFO B ----数据量 2995238
join BDL_CX_CLIENT A ----数据量 575258453
on A.Policy_No=B.Policy_No
and A.Client_Role in('1','2')
AND A.Y='2014'
AND A.M='03';