1. Deduplication
There are two ways to deduplicate:
1. Use DISTINCT to deduplicate the values of a column.
2. Use GROUP BY to deduplicate; with GROUP BY, the number of reducers can be tuned to speed up execution.
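A minimal sketch of both approaches, assuming a hypothetical table orders with a user_id column:
-- Approach 1: deduplicate with DISTINCT
SELECT DISTINCT user_id FROM orders;
-- Approach 2: deduplicate with GROUP BY; the reducer count can be tuned (see the next section)
SELECT user_id FROM orders GROUP BY user_id;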
2. Setting a reasonable number of reducers
1) Method one
(1) The amount of data processed by each reducer defaults to 256 MB
hive.exec.reducers.bytes.per.reducer=256000000
(2) The maximum number of reducers per job, default 1009
hive.exec.reducers.max=1009
(3) Formula for the number of reducers:
N = min(parameter 2, total input size / parameter 1)
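For example, with the defaults above, a 10 GB input gives N = min(1009, 10,000,000,000 / 256,000,000) ≈ min(1009, 40) = 40 reducers.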
2) Method two
Change the cluster-wide default in Hadoop's mapred-default.xml, or set the number of reducers for an individual job from the Hive session:
set mapreduce.job.reduces = 15;
3) More reducers is not always better
(1) Starting and initializing too many reducers also costs time and resources;
(2) Each reducer produces one output file, so too many reducers mean many small files; if those small files become the input of the next job, that job in turn suffers from the small-file problem.
When choosing the number of reducers, keep both principles in mind: use enough reducers to handle a large data volume, but keep the amount of data each single reducer processes appropriate.
3. Hive settings that block some dangerous operations:
1) Querying a partitioned table without a partition filter
When hive.strict.checks.no.partition.filter is set to true, a query on a partitioned table is not allowed to run unless its WHERE clause contains a partition-column filter that limits the range; in other words, users may not scan all partitions. The rationale is that partitioned tables usually hold very large, fast-growing datasets, and a query with no partition restriction could consume an unacceptably large amount of resources.
2) ORDER BY without LIMIT
When hive.strict.checks.orderby.no.limit is set to true, any query that uses ORDER BY must also use LIMIT. Because ORDER BY sends all result rows to a single reducer to perform the sort, forcing users to add LIMIT prevents that reducer from running for an excessively long time.
3) Cartesian products
When hive.strict.checks.cartesian.product is set to true, Cartesian-product queries are blocked. Users who know relational databases well may expect to write a JOIN without an ON clause, putting the join condition in the WHERE clause instead, and rely on the optimizer to convert the WHERE predicate into an ON condition. Unfortunately, Hive does not perform this optimization, so on a large enough table such a query can get out of control.
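A sketch of the three switches in a session, assuming a hypothetical table logs partitioned by dt with an event_time column:
set hive.strict.checks.no.partition.filter=true;
set hive.strict.checks.orderby.no.limit=true;
set hive.strict.checks.cartesian.product=true;
-- rejected: no partition filter
-- SELECT * FROM logs;
-- rejected: ORDER BY without LIMIT
-- SELECT * FROM logs WHERE dt = '2020-01-01' ORDER BY event_time;
-- accepted: partition filter plus LIMIT
SELECT * FROM logs WHERE dt = '2020-01-01' ORDER BY event_time LIMIT 100;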
4. List the names and salaries of the two highest-paid employees in each department.
create table emp(
EMPNO int
,ENAME string
,JOB string
,MGR int
,HIREDATE string
,SAL int
,BONUS int
,DEPTNO int
)
row format delimited
fields terminated by ',';
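A hypothetical load to populate it, assuming a comma-delimited emp.txt in the data directory used elsewhere in these notes:
load data local inpath "/usr/local/soft/hive-3.1.2/data/emp.txt" into table emp;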
① Dimension: department
② Top two salaries within each department
WITH emp_sal_pm AS (
SELECT
T1.DEPTNO
,T1.ENAME
,T1.SAL
,ROW_NUMBER() OVER(PARTITION BY T1.DEPTNO ORDER BY T1.SAL DESC) as sal_pm
FROM emp T1
)
SELECT
T2.DEPTNO
,T2.ENAME
,T2.SAL
FROM emp_sal_pm T2
WHERE T2.sal_pm <= 2;
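Note: ROW_NUMBER() breaks ties arbitrarily; if two employees share the second-highest salary and both should be listed, DENSE_RANK() can be used in its place.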
5. Consecutive-login problem
How to approach the data:
1. Inspect the table's column information.
2. Understand what each row of data represents.
create table learn5.deal_tb(
id string
,datestr string
,amount string
)row format delimited fields terminated by ',';
load data local inpath "/usr/local/soft/hive-3.1.2/data/deal_tb.txt" into table learn5.deal_tb;
Requirement:
For each user, compute the total amount traded across consecutive trading days, the streak length in days, the streak start and end dates, the gap in days between streaks, and so on.
Analysis of the consecutive-trading total:
① What counts as consecutive trading?
The user trades every single day, with no day missed.
WITH sum_amount_table AS(
SELECT
T.id
,T.datestr
,SUM(amount) as sum_amount
FROM learn5.deal_tb T
GROUP BY T.id,T.datestr
)
SELECT
id
,datestr
,sum_amount
, ROW_NUMBER() OVER(PARTITION BY id ORDER BY datestr) days
FROM sum_amount_table
+-----+-------------+-------------+-------+
| id | datestr | sum_amount | days |
+-----+-------------+-------------+-------+
| 1 | 2019-02-08 | 12461.55 | 1 |
| 1 | 2019-02-09 | 1052.99 | 2 |
| 1 | 2019-02-10 | 85.69 | 3 |
| 1 | 2019-02-12 | 769.85 | 4 |
| 1 | 2019-02-13 | 943.86 | 5 |
| 1 | 2019-02-14 | 538.42 | 6 |
| 1 | 2019-02-15 | 369.76 | 7 |
| 1 | 2019-02-16 | 369.76 | 8 |
| 1 | 2019-02-18 | 795.15 | 9 |
| 1 | 2019-02-19 | 715.65 | 10 |
| 1 | 2019-02-21 | 537.71 | 11 |
| 2 | 2019-02-08 | 12461.55 | 1 |
| 2 | 2019-02-09 | 1052.99 | 2 |
| 2 | 2019-02-10 | 85.69 | 3 |
| 2 | 2019-02-12 | 769.85 | 4 |
| 2 | 2019-02-13 | 943.86 | 5 |
| 2 | 2019-02-14 | 943.18 | 6 |
| 2 | 2019-02-15 | 369.76 | 7 |
| 2 | 2019-02-18 | 795.15 | 8 |
| 2 | 2019-02-19 | 715.65 | 9 |
| 2 | 2019-02-21 | 537.71 | 10 |
| 3 | 2019-02-08 | 12461.55 | 1 |
| 3 | 2019-02-09 | 1052.99 | 2 |
| 3 | 2019-02-10 | 85.69 | 3 |
| 3 | 2019-02-12 | 769.85 | 4 |
| 3 | 2019-02-13 | 943.86 | 5 |
| 3 | 2019-02-14 | 276.81 | 6 |
| 3 | 2019-02-15 | 369.76 | 7 |
| 3 | 2019-02-16 | 369.76 | 8 |
| 3 | 2019-02-18 | 795.15 | 9 |
| 3 | 2019-02-19 | 715.65 | 10 |
| 3 | 2019-02-21 | 537.71 | 11 |
+-----+-------------+-------------+-------+
date_sub(datestr, days): subtracting each row's rank from its date collapses every run of consecutive dates onto one anchor date; for ranks 1, 2, 3 on 2019-02-08, 2019-02-09, 2019-02-10 the anchor is 2019-02-07 each time.
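A quick one-off sanity check of the function (date_sub takes a date and a number of days to subtract):
SELECT date_sub('2019-02-10', 3); -- returns 2019-02-07
Applying it as a start_day column gives: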
+-----+-------------+-------------+-------+-------------+
| id | datestr | sum_amount | days | start_day |
+-----+-------------+-------------+-------+-------------+
| 1 | 2019-02-08 | 12461.55 | 1 | 2019-02-07 |
| 1 | 2019-02-09 | 1052.99 | 2 | 2019-02-07 |
| 1 | 2019-02-10 | 85.69 | 3 | 2019-02-07 |
| 1 | 2019-02-12 | 769.85 | 4 | 2019-02-08 |
| 1 | 2019-02-13 | 943.86 | 5 | 2019-02-08 |
| 1 | 2019-02-14 | 538.42 | 6 | 2019-02-08 |
| 1 | 2019-02-15 | 369.76 | 7 | 2019-02-08 |
| 1 | 2019-02-16 | 369.76 | 8 | 2019-02-08 |
| 1 | 2019-02-18 | 795.15 | 9 | 2019-02-09 |
| 1 | 2019-02-19 | 715.65 | 10 | 2019-02-09 |
| 1 | 2019-02-21 | 537.71 | 11 | 2019-02-10 |
| 2 | 2019-02-08 | 12461.55 | 1 | 2019-02-07 |
| 2 | 2019-02-09 | 1052.99 | 2 | 2019-02-07 |
| 2 | 2019-02-10 | 85.69 | 3 | 2019-02-07 |
| 2 | 2019-02-12 | 769.85 | 4 | 2019-02-08 |
| 2 | 2019-02-13 | 943.86 | 5 | 2019-02-08 |
| 2 | 2019-02-14 | 943.18 | 6 | 2019-02-08 |
| 2 | 2019-02-15 | 369.76 | 7 | 2019-02-08 |
| 2 | 2019-02-18 | 795.15 | 8 | 2019-02-10 |
| 2 | 2019-02-19 | 715.65 | 9 | 2019-02-10 |
| 2 | 2019-02-21 | 537.71 | 10 | 2019-02-11 |
| 3 | 2019-02-08 | 12461.55 | 1 | 2019-02-07 |
| 3 | 2019-02-09 | 1052.99 | 2 | 2019-02-07 |
| 3 | 2019-02-10 | 85.69 | 3 | 2019-02-07 |
| 3 | 2019-02-12 | 769.85 | 4 | 2019-02-08 |
| 3 | 2019-02-13 | 943.86 | 5 | 2019-02-08 |
| 3 | 2019-02-14 | 276.81 | 6 | 2019-02-08 |
| 3 | 2019-02-15 | 369.76 | 7 | 2019-02-08 |
| 3 | 2019-02-16 | 369.76 | 8 | 2019-02-08 |
| 3 | 2019-02-18 | 795.15 | 9 | 2019-02-09 |
| 3 | 2019-02-19 | 715.65 | 10 | 2019-02-09 |
| 3 | 2019-02-21 | 537.71 | 11 | 2019-02-10 |
+-----+-------------+-------------+-------+-------------+
-- The result table must exist before the INSERT below can run
CREATE TABLE continue_res (
id string
,start_day STRING
,continue_amount STRING
,continue_days STRING
,continue_start STRING
,continue_end STRING
);

WITH sum_amount_table AS(
SELECT
T.id
,T.datestr
,SUM(amount) as sum_amount
FROM learn5.deal_tb T
GROUP BY T.id,T.datestr
)
, days_table AS (
SELECT
id
,datestr
,sum_amount
, ROW_NUMBER() OVER(PARTITION BY id ORDER BY datestr) days
FROM sum_amount_table
)
, date_sub_table AS(
SELECT
id
,datestr
,sum_amount
,days
,date_sub(datestr,days) start_day -- subtracting the rank from the date yields the streak's anchor date
FROM days_table
)
-- streak length in days, plus the streak's start and end dates
INSERT INTO TABLE continue_res
SELECT
id
,start_day
,sum(sum_amount) as continue_amount -- grouping rows that share an anchor date gives the total traded during the streak
,count(*) AS continue_days
,min(datestr) AS continue_start
,max(datestr) as continue_end
FROM date_sub_table
GROUP BY id,start_day;
The continue_res table now contains:
+-----+-------------+---------------------+----------------+-----------------+---------------+
| id | start_day | continue_amount | continue_days | continue_start | continue_end |
+-----+-------------+---------------------+----------------+-----------------+---------------+
| 1 | 2019-02-07 | 13600.23 | 3 | 2019-02-08 | 2019-02-10 |
| 1 | 2019-02-08 | 2991.6500000000005 | 5 | 2019-02-12 | 2019-02-16 |
| 1 | 2019-02-09 | 1510.8 | 2 | 2019-02-18 | 2019-02-19 |
| 1 | 2019-02-10 | 537.71 | 1 | 2019-02-21 | 2019-02-21 |
| 2 | 2019-02-07 | 13600.23 | 3 | 2019-02-08 | 2019-02-10 |
| 2 | 2019-02-08 | 3026.6499999999996 | 4 | 2019-02-12 | 2019-02-15 |
| 2 | 2019-02-10 | 1510.8 | 2 | 2019-02-18 | 2019-02-19 |
| 2 | 2019-02-11 | 537.71 | 1 | 2019-02-21 | 2019-02-21 |
| 3 | 2019-02-07 | 13600.23 | 3 | 2019-02-08 | 2019-02-10 |
| 3 | 2019-02-08 | 2730.04 | 5 | 2019-02-12 | 2019-02-16 |
| 3 | 2019-02-09 | 1510.8 | 2 | 2019-02-18 | 2019-02-19 |
| 3 | 2019-02-10 | 537.71 | 1 | 2019-02-21 | 2019-02-21 |
+-----+-------------+---------------------+----------------+-----------------+---------------+
-- Gap in days between streaks
LAG(col, n, default_val): returns col from the n-th row before the current row
LEAD(col, n, default_val): returns col from the n-th row after the current row
SELECT
T1.id
,T1.continue_end
,T1.last_day
,datediff(T1.last_day,T1.continue_end)
FROM (
SELECT
id
,continue_end
,LEAD(continue_start,1,"2022-04-15") OVER(PARTITION BY id ORDER BY start_day) as last_day -- current_date also works as the default
FROM learn5.continue_res
) T1
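datediff(T1.last_day, T1.continue_end) counts the days from one streak's end to the next streak's start, so adjacent streaks separated by a single missing day yield a gap of 2 (e.g. 2019-02-10 to 2019-02-12).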
6. Device anomaly requirement:
create table learn5.base_vio_sbbh(
sbbh STRING COMMENT "device ID",
wfsj STRING COMMENT "violation time"
);
Load the data into Hive using the statements in base_vio_sbbh.sql.
-- Requirement: if a device reports no violation data for a period of time, the device may be faulty.
Analysis:
1. Sort each device's violation times.
2. Compute the time difference between each pair of adjacent rows.
3. Sort the time differences.
4. Use a box plot to identify outliers.
Q1: lower quartile, the value at the 1/4 position of the sorted data.
Q3: upper quartile, the value at the 3/4 position of the sorted data.
IQR: interquartile range, IQR = Q3 - Q1, i.e. the difference between the Q3 and Q1 values.
The box plot gives us a standard for identifying outliers:
an outlier is defined as any value smaller than Q1 - 1.5 IQR or larger than Q3 + 1.5 IQR. The cutoff is somewhat arbitrary, but it comes from practical experience:
smaller than Q1 - 1.5(Q3-Q1), or larger than Q3 + 1.5(Q3-Q1).
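A worked example with the quartiles the queries below arrive at for this data (Q1 = 1 day, Q3 = 5 days): IQR = 5 - 1 = 4, so the upper fence is Q3 + 1.5 × IQR = 5 + 6 = 11, and any reporting gap longer than 11 days counts as anomalous; the lower fence, 1 - 6 = -5, can never be crossed by a non-negative gap, which is why the final query only checks the upper bound. The table below shows each device's violations ranked by time (the sbbh_wfsj_pm CTE further down produces it):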
+---------------------+-------------+-----+
| sbbh | wfsj | pm |
+---------------------+-------------+-----+
| 451000000000071113  | 2020-04-13  | 1   |
| 451000000000071113  | 2020-04-22  | 2   |
| 451000000000071113 | 2020-05-13 | 3 |
| 451000000000071113 | 2020-05-18 | 4 |
| 451000000000071113 | 2020-06-14 | 5 |
| 451000000000071113 | 2020-06-15 | 6 |
| 451000000000071113 | 2020-06-16 | 7 |
| 451000000000071113 | 2020-06-18 | 8 |
| 451000000000071113 | 2020-06-23 | 9 |
| 451000000000071113 | 2020-06-28 | 10 |
| 451000000000071113 | 2020-06-29 | 11 |
| 451000000000071113 | 2020-07-02 | 12 |
| 451000000000071113 | 2020-07-07 | 13 |
| 451000000000071113 | 2020-07-09 | 14 |
| 451000000000071113 | 2020-07-22 | 15 |
| 451000000000071113 | 2020-07-26 | 16 |
| 451000000000071113 | 2020-07-28 | 17 |
| 451000000000071113 | 2020-07-30 | 18 |
| 451000000000071113 | 2020-08-01 | 19 |
| 451000000000071113 | 2020-08-02 | 20 |
| 451000000000071113 | 2020-08-03 | 21 |
| 451000000000071113 | 2020-08-05 | 22 |
| 451000000000071113 | 2020-08-06 | 23 |
| 451000000000071113 | 2020-08-08 | 24 |
| 451000000000071113 | 2020-08-09 | 25 |
| 451000000000071113 | 2020-08-11 | 26 |
| 451000000000071113 | 2020-08-12 | 27 |
| 451000000000071113 | 2020-08-13 | 28 |
| 451000000000071113 | 2020-08-14 | 29 |
| 451000000000071113 | 2020-08-19 | 30 |
| 451000000000071113 | 2020-08-20 | 31 |
| 451000000000071113 | 2020-08-21 | 32 |
| 451000000000071113 | 2020-08-28 | 33 |
| 451000000000071113 | 2020-09-02 | 34 |
| 451000000000071113 | 2020-09-03 | 35 |
| 451000000000071113 | 2020-09-05 | 36 |
| 451000000000071113 | 2020-09-11 | 37 |
| 451000000000071113 | 2020-09-24 | 38 |
| 451000000000071113 | 2020-09-27 | 39 |
| 451000000000071113 | 2020-09-28 | 40 |
| 451000000000071113 | 2020-09-30 | 41 |
| 451000000000071113 | 2020-10-01 | 42 |
| 451000000000071113 | 2020-10-02 | 43 |
CREATE TABLE learn5.mid_sbbh_yczpx(
sbbh STRING
,last_wfsj STRING -- current violation time
,after_wfsj STRING -- next violation time for the same device
,sjc int -- gap between the two, in days
);
WITH sbbh_wfsj_pm AS (
SELECT
sbbh
,wfsj
,ROW_NUMBER() OVER(PARTITION BY sbbh ORDER BY wfsj) as pm
FROM learn5.base_vio_sbbh
)
INSERT OVERWRITE TABLE learn5.mid_sbbh_yczpx
SELECT
T1.sbbh
,T1.wfsj
,if(T2.wfsj is NULL,"2021-01-01",T2.wfsj) -- T2.wfsj is NULL after the LEFT JOIN for each device's last violation; the NVL function works here too
,datediff(NVL(T2.wfsj,"2021-01-01"),T1.wfsj) sjc
-- self-join on device ID and rank, pairing each violation with the next one
FROM sbbh_wfsj_pm T1 LEFT JOIN sbbh_wfsj_pm T2 ON T1.pm + 1 = T2.pm AND T1.sbbh = T2.sbbh
ORDER BY sjc desc
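An equivalent single-scan alternative (a sketch) replaces the self-join with LEAD, introduced earlier in these notes, using the same "2021-01-01" cutoff for each device's last violation:
SELECT
T.sbbh
,T.last_wfsj
,T.after_wfsj
,datediff(T.after_wfsj,T.last_wfsj) AS sjc
FROM (
SELECT
sbbh
,wfsj AS last_wfsj
,LEAD(wfsj,1,"2021-01-01") OVER(PARTITION BY sbbh ORDER BY wfsj) AS after_wfsj
FROM learn5.base_vio_sbbh
) T;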
+---------------------+----------------+-----------------+-------------+-------------+
| _u1.sbbh | _u1.last_wfsj | _u1.after_wfsj | _u1.q1_sjc | _u1.q3_sjc |
+---------------------+----------------+-----------------+-------------+-------------+
| 451000000000071113  | 2020-06-28     | 2020-06-29      | 1           | NULL        |
| 451000000000072222  | 2020-08-13     | 2020-08-14      | 1           | NULL        |
| 451000000000071113  | 2020-06-18     | 2020-06-23      | NULL        | 5           |
| 451000000000072222  | 2020-08-14     | 2020-08-19      | NULL        | 5           |
+---------------------+----------------+-----------------+-------------+-------------+
After SUM(...) GROUP BY sbbh (the query below) merges each device's two rows into one:
+---------------------+-------------+-------------+
| sbbh                | q1_sjc      | q3_sjc      |
+---------------------+-------------+-------------+
| 451000000000071113  | 1           | 5           |
| 451000000000072222  | 1           | 5           |
+---------------------+-------------+-------------+
CREATE TABLE learn5.mid_sbbh_Q_sjc(
sbbh STRING,
Q1_sjc int,
Q3_sjc int
);
WITH Q_num_table AS (
SELECT
sbbh
,floor(count(*) * 0.75) AS Q1_num -- rank of the lower quartile: ranks run in DESCENDING sjc order, so position 0.75n holds a small gap
,floor(count(*) * 0.25) AS Q3_num -- rank of the upper quartile: position 0.25n holds a large gap
FROM learn5.mid_sbbh_yczpx
GROUP BY sbbh
)
, sbbh_sjc_num_table AS (
SELECT
sbbh
,last_wfsj
,after_wfsj
,sjc
,ROW_NUMBER() OVER(PARTITION BY sbbh ORDER BY sjc desc) as sjc_num
FROM learn5.mid_sbbh_yczpx
)
INSERT OVERWRITE TABLE learn5.mid_sbbh_Q_sjc
SELECT
TT.sbbh
,SUM(TT.Q1_sjc) as Q1_sjc
,SUM(TT.Q3_sjc) as Q3_sjc
FROM (
SELECT
T2.sbbh
,T2.last_wfsj
,T2.after_wfsj
,T2.sjc as Q1_sjc
,null as Q3_sjc
FROM Q_num_table T1 JOIN sbbh_sjc_num_table T2 ON T1.sbbh = T2.sbbh
AND T1.Q1_num = T2.sjc_num
UNION ALL
SELECT
T2.sbbh
,T2.last_wfsj
,T2.after_wfsj
,null as Q1_sjc
,T2.sjc as Q3_sjc
FROM Q_num_table T1 JOIN sbbh_sjc_num_table T2 ON T1.sbbh = T2.sbbh
AND T1.Q3_num = T2.sjc_num
) TT
GROUP BY TT.sbbh
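SUM() ignores NULLs, which is why the GROUP BY above can merge the two UNION ALL branches (each carrying one quartile and a NULL) into a single row per device holding both quartiles.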
-- smaller than Q1 - 1.5(Q3-Q1), or larger than Q3 + 1.5(Q3-Q1)
SELECT
T2.*
,T1.Q1_sjc
,T1.Q3_sjc
,T1.Q1_sjc-(Q3_sjc-Q1_sjc)*1.5
,T1.Q3_sjc+(Q3_sjc-Q1_sjc)*1.5
FROM learn5.mid_sbbh_Q_sjc T1 JOIN learn5.mid_sbbh_yczpx T2 ON T1.sbbh = T2.sbbh
WHERE
T2.sjc > (T1.Q3_sjc+(T1.Q3_sjc-T1.Q1_sjc)*1.5) -- only the upper fence is checked: a long gap with no reports is what signals a faulty device
7. Enabling local mode
-- Generally used for testing; not recommended for large data volumes. Once enabled, the job runs on the current client's compute resources rather than on the cluster.
set hive.exec.mode.local.auto=true;
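Two companion thresholds control when auto local mode actually triggers; a job only runs locally if its input stays below both (the values shown should be the defaults):
set hive.exec.mode.local.auto.inputbytes.max=134217728; -- max total input size (128 MB)
set hive.exec.mode.local.auto.input.files.max=4; -- max number of input files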
8. Data warehouse modeling