前言
hivesql中很多常用的功能,过段时间没有使用就容易忘记,需要去网上搜索,这里总结一下,省的以后还去去搜,供自己以后参考。
查看分区的行
show rowcount extended table_name;
创建二级分区表
set hive.default.fileformat=Orc;
use db_name;
CREATE TABLE IF NOT EXISTS tb_name(
col_name1 STRING comment '',
col_name2 STRING comment '',
col_name3 STRING comment '',
col_name4 STRING comment '',
col_name5 STRING comment ''
)
COMMENT 'for demo'
PARTITION BY LIST(col_name)
SUBPARTITION BY LIST (col_name2) (SUBPARTITION default) ( PARTITION default)
STORED AS ORCFILE COMPRESS;
alter表(增加列)
use db_name;
ALTER TABLE tb_name ADD COLUMNS (
cols_name6 string COMMENT ''
);
alter表(drop列)
# 不写即删除, 删除col_name4,col_name5,col_name6,保留col_name1,col_name2,col_name3
use db_name;
ALTER TABLE tb_name ADD COLUMNS (
col_name1 STRING comment '',
col_name2 STRING comment '',
col_name3 STRING comment ''
);
删除表
use db_name;
drop table tb_name;
insert分区数据
# 将p_20221231分区的数据,写到p_20230101分区,注意分区名和值的统一,否则会写入失败
use db_name;
alter table tb_name drop partition (p_20230101);
alter table tb_name add partition p_20230101 values in(20230101);
insert table tb_name partition(col_name1=20230101)
select
'20230101' as col_name1,
col_name2,
col_name3,
col_name4
from tb_name
partition ( p_20221231)p
case-when操作
select
col_name1,
col_name2,
col_name3,
col_name4,
case
when col_name3 is null then CONCAT_WS('-', col_name1, col_name2)
when col_name4 is null then CONCAT_WS('-', col_name1, col_name2, col_name3)
else col_name1
end as new_col_name
from tb_name partition(p_20230101)t
LATERAL VIEW操作
col_name4的值为(字符串):‘{ “list_value”: [value1, value2]}’
# col_name4的值为:"[value1, value2]"
# 先将其col_name3转为json,然后获取list_value的值,接着使用regexp_replace替换掉("[]")等,变成字符串。再接着使用split分割成数组,并使用explode将其打平。
select
col_name1,
col_name2,
col_name3,
col_name4,
id_tmp
from tb_name partition(p_20230101)t
LATERAL VIEW explode(split(regexp_replace(get_json_object(col_name4, '$.list_value'), "\"|\"|\\[|\\]", ''), ',')) id_list as id_tmp
分组排序编号,获取topK操作
#
SELECT *
from (
SELECT col_name1,col_name2, row_number() over (partition by col_name2, col_name3 order by col_name4 desc) rank rank
from tb_name
)
where rank < 500