Hive03
文章目录
1. nvl
-- nvl(comm, 0): returns 0 whenever comm is NULL (Hive's NULL-default helper,
-- equivalent to coalesce(comm, 0)).
select comm, nvl(comm, 0 ) from emp;
如果comm 为null, 用0来替代
2. avg
求平均
3. case when
-- Pivot gender counts per department: one output row per dept with
-- separate male / female totals (row-to-column via conditional sums).
select
    dept,
    sum(case when sex = 'M' then 1 else 0 end) as m_cnt,
    sum(case when sex = 'F' then 1 else 0 end) as f_cnt
from people
group by dept;
4. if
-- Same gender pivot as section 3, using Hive's if(cond, then, else)
-- instead of CASE.
select
dept,
sum(if(sex='M',1,0)) m_cnt,
sum(if(sex='F',1,0)) f_cnt
from people
group by dept;
5. concat_ws concat
-- Collapse all names in each (dept, sex) group into one "|"-separated
-- string via collect_list + concat_ws.
-- NOTE: Hive's concat() returns NULL if ANY argument is NULL;
-- concat_ws() skips NULL elements instead.
select
concat(dept,",",sex), concat_ws("|",collect_list(name))
from people
group by dept,sex;
5.1 collect_list collect_set
collect_list 不去重
collect_set 去重
6. 行列转换
-- Row-to-column transform: group key rendered as "dept,sex", all names
-- in the group aggregated into a single "|"-joined string
-- (collect_list keeps duplicates; collect_set would deduplicate).
select
concat(dept,",",sex), concat_ws("|",collect_list(name))
from people
group by dept,sex;
RD,PK,M
RD,RUOZE,M
RD,WANQIAN,F
QA,XINGXING,M
QA,LISIDANNI,F
QA,JINSHA,F
===>
RD,M PK|RUOZE
RD,F WANQIAN
7. 行列转换
page ads
page1 [1,2,3]
page2 [3,4,5]
page1 1
page1 2
page1 3
page2 3
page2 4
page2 5
-- Column-to-row: explode the ads array so every (page, ad_id) pair
-- becomes its own output row.
select page, ad_id
from page_ads lateral view explode(ads) tmp as ad_id;
-- Count how many pages carry each ad, after exploding the ads array.
select ad_id, count(1)
from page_ads lateral view explode(ads) tmp as ad_id group by ad_id;
8. 窗口函数之 cume_dist
-- cume_dist(): fraction of rows whose value is <= the current row's
-- value, i.e. (rows <= current) / (total rows in the window).
-- FIX: the window function name was missing before over(), which made
-- the statement invalid SQL.
select
dept,user,sal,
round(cume_dist() over(order by sal),2) rn1,                  -- over the whole result set
round(cume_dist() over(partition by dept order by sal),2) rn2 -- within each dept
from ruozedata_window02;
小于等于当前行值的行数 / 分组内的总行数
dept user sal rn1 rn2
dept01 ruoze 10000 0.2 0.3333333333333333 1/3
dept01 jepson 20000 0.4 0.6666666666666666 2/3
dept01 xingxing 30000 0.6 1.0 3/3
dept02 zhangsan 40000 0.8 0.5
dept02 lisi 50000 1.0 1.0
-- percent_rank(): relative rank of the current row within its window,
-- computed as (rank - 1) / (rows in window - 1).
select
    dept,
    user,
    sal,
    round(percent_rank() over (order by sal), 2)                   rn1, -- global ranking
    round(percent_rank() over (partition by dept order by sal), 2) rn2  -- per-department ranking
from ruozedata_window02;
分组内当前行的rank-1/分组内总行数-1
dept user sal rn1 rn2
dept01 ruoze 10000 0.0 (= (1-1)/(5-1)) 0.0 (= (1-1)/(3-1))
dept01 jepson 20000 0.25 (= (2-1)/(5-1)) 0.5 (= (2-1)/(3-1))
dept01 xingxing 30000 0.5 1.0
dept02 zhangsan 40000 0.75 0.0
dept02 lisi 50000 1.0 1.0
9. lag 、lead 、FIRST_VALUE、 LAST_VALUE
向前
-- lag(col, n, default): value of col from the row n positions BEFORE
-- the current row within the partition; default is returned when no
-- such row exists.
-- FIX: both columns were aliased pre1; the second is renamed pre2 to
-- match the companion lead query.
select
cookieid,
time,
lag(time, 1, '1970-01-01 00:00:00') over(partition by cookieid order by time) pre1,
lag(time, 2) over(partition by cookieid order by time) pre2  -- no default: NULL at partition start
from window03;
向后
-- lead(col, n, default): value of col from the row n positions AFTER
-- the current row within the partition.
-- NOTE(review): aliases pre1/pre2 read as "previous" but lead looks
-- forward; next1/next2 would be clearer — confirm before renaming,
-- since aliases are the query's output column names.
select cookieid,time,url,
lead(time, 1, '1970-01-01 00:00:00') over(partition by cookieid order by time) pre1,
lead(time, 2) over(partition by cookieid order by time) pre2
from window03;
-- first_value(url): the url of the earliest row (by time) in each
-- cookieid partition, repeated on every row of that partition.
select cookieid,time,url,
FIRST_VALUE(url) over(partition by cookieid order by time) rn
from window03;
-- last_value(url): url of the LAST row (by time) in each cookieid
-- partition.
-- FIX: with only "order by", the default window frame ends at the
-- CURRENT ROW, so last_value silently returns the current row's own
-- url. An explicit full-partition frame gives the true last value.
select cookieid,time,url,
LAST_VALUE(url) over(partition by cookieid order by time
                     rows between unbounded preceding and unbounded following) rn
from window03;
10. UDTF
package com.hpznyf.hive.UDTF;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.util.ArrayList;
import java.util.List;
/**
add jar /home/hadoop/lib/hpznyf-hive-1.0.jar;
create temporary function hpznyf_split as "com.hpznyf.hive.UDTF.SplitUDTF";
*/
/**
 * Hive UDTF that splits a delimited input string and emits one row per
 * token, with a single string output column named "word".
 */
public class SplitUDTF extends GenericUDTF {
// Declares the output schema: exactly one string column called "word".
@Override
public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
List<String> structFieldNames = new ArrayList<>();
List<ObjectInspector> structFieldObjectInspectors = new ArrayList<>();
structFieldNames.add("word");
structFieldObjectInspectors.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
return ObjectInspectorFactory.getStandardStructObjectInspector(structFieldNames, structFieldObjectInspectors);
}
// Reused output buffer: cleared and refilled once per emitted row.
List<String> list = new ArrayList<>();
// args[0] = the string to split, args[1] = the separator.
// NOTE(review): String.split treats the separator as a REGEX, so
// delimiters such as "|" or "." will not split literally — confirm
// callers only pass regex-safe separators, or quote with Pattern.quote.
// NOTE(review): a NULL input column makes args[0]/args[1] null and
// toString() would throw a NullPointerException — verify against usage.
@Override
public void process(Object[] args) throws HiveException {
String line = args[0].toString();
String split = args[1].toString();
String[] fields = line.split(split);
for(String field : fields){
list.clear();
list.add(field);
forward(list);
}
}
// Nothing buffered between rows, so there is no state to flush.
@Override
public void close() throws HiveException {
}
}
11. 执行流程
select * from emp;
TableScanOperator
ReduceSinkOperator => Map -> Reduce
JoinOperator
MapJoinOperator
SelectOperator
FileSinkOperator
FilterOperator
GroupByOperator