-- 基础查询
select 字段列表|表达式|子查询
from [临时]表(子查询|视图)
where not 条件A and|or 条件B --先面向原始行进行筛选
group by 字段A[,字段B...]
having 聚合条件(非原始字段) --后:针对聚合后的字段进行二次筛选
order by 字段A[,字段B,...] --再:全局排序(非limit 最后一条)
limit N(前N条记录)|M(行号偏移量),N(记录数)
where 子句
字段 ? 值
-- 数值或日期
关系: > >= < <= = <>
between (>=)small_value and (<=)big_value;
逻辑 not bool_expr,expr1 and|or expr2
if(bool_expr,value_iftrue,value_iffalse_or_null)
nvl(value,value_default)
case expr when v1 then val1 when v2 then val2...else valn end
case when c1 then val1 when c2 then val2...else valn end
通配符:like '%' | '_' 模糊匹配
% 表示任意长度的任意内容
_ 表示一个任意内容
rlike '\\d+$'正则匹配
--子查询
select 子查询(查看某个字段的值,与主查询存在逻辑主外键关系 select ONLY_ONE_FIELD_RELATIVE from T where PK=FK)
from 子查询(数据表的子集 select F1,...,FN from T where ... group by ...)
where 子查询(FIELD in|=|>= (select ONLY_ONE_FIELD_IN ...))
group by FIELDS|substr(FIELD,0,4)
having 子查询(FIELD in|=|>= (select ONLY_ONE_FIELD_IN ...))
order by FIELD|substr(FIELD,0,4)
limit...
---CTE:重用,结构层次清晰
with
sub_alia as(...),
sub_alia2 as (select...from sub_alia)
select ...
--连接查询:join
hive仅支持等值连接
隐式连接:
select A.*,B.*
from A,B
where A.pk=B.fk
连接:
--不可以进行非等值连接
select ...
from TABLE_A as A
inner join TABLE_B as B --内连接:取交集
left [outer] join TABLE_B as B --左外连接:取左全集,右交集
right [outer] join TABLE_B as B --右外连接:取右全集,左交集
full [outer] join TABLE_B as B --全集
cross join TABLE_B as B --笛卡尔积
on A.pk=B.fk
...
--union(去重),union all(不去重)
--insert
--写表操作
from SOURCE_TABLE
insert overwrite|into [table] [DATABASES_N].TABLE_N1 [partition(pf1...,pfn)]
select A,B,func(C)
insert overwrite|into [table] [DATABASES_N].TABLE_N1 [partition(pf1...,pfn)]
select B,E,D,F;
--写盘操作
from SOURCE_TABLE
insert overwrite [local] directory 'ABSOLUTE_PATH1'
row format serde|delimited
field terminated by '|'
collection items terminated by ','
map keys terminated by ':'
select ...
insert overwrite [local] directory 'ABSOLUTE_PATH2'
[row format ...]
select ...;
--表数据和hdfs之间相互导
export table TABLE_N [partition(pf1=v1,...)] to 'HDFS_PATH'
import table TABLE_N [partition(pf1=v1,...)] from 'HDFS_PATH'
--排序
order by expr(field|func|case...when)--全局排序:性能差
NULL 值在排序时值最小
order by if(NAME is NULL,1,0) --排序时如何将排序字段值为NULL后置
--支持爱找字段顺序编号排序
set hive.groupby.orderby.position.aloias=true;
select a,b,c from TABLE_N order by 2;
--在order by B 之前,可以先对数据进行distribute by A sort by B
sort by FIELD_N --每一个reducer端排序
当reducer 的数量为1时,等同于 order by
FIELD_N 必须是select 字段列表中的一员
cluster by -- distribute by FIELD_A sort by FIELD_A
--不支持 ASC|DESC
--分组
group by expr(field|func|case...when) --为了聚合而分组,否则类似distinct
distribute by expr(field|func|case...when) --类似于hadoop job 中的Partitioner
--聚合函数
--基础聚合
max,min,count,avg,sum
count(null) = 0
--集合类型聚合
collect_set(F|func|case...when) --去重后的列表
collect_list(F|func|case...when) --列表
--hive优化
-- 1、mapjoin
--如果两张表符合mapjoin的场景,则join可以走非等值连接
set hive.auto.convert.join=true --开启mapjoin支持,默认已开启
set hive.mapjoin.smalltable.filesize=25000000; --重新定义小表边界
set hive.optimize.bucketmapjoin=false; --分桶表默认关闭mapjoin
select count(order_customer_id) from cb_orders group by order_date limit 20;
--2、控制reducer的数量
set mapreduce.job.reduces=-1;
set mapred.reduce.tasks=-1;
set hive.exec.reducers.max=1009;
set hive.exec.reducers.bytes.per.reducer=256000000