Hive----关联查询排序系统函数分桶表行列转换窗口函数

最新推荐文章于 2024-05-01 22:20:02 发布

T D Z

最新推荐文章于 2024-05-01 22:20:02 发布

阅读量408

点赞数 1

本文链接：https://blog.csdn.net/weixin_51077563/article/details/110459052

版权

一.关联查询

join on --可以多表连接
left join on --左连接
right join on --右连接
left semi join on --根据连接的条件返回主表的内容 right不能和semi一块使用,只能左连 (类似于子查询)
union all --联合查询,连接两个查询的结果集要求字段个数和数据类型一致 all(不会去重)
union --去重 (union all 和 union返回结果是两个表的结合)
full join --两个表的拼接(包括null值)

-- Multi-table join, form 1: chain the joins and give every join its own ON condition.
-- a, b, c and d are placeholder tables that each expose an `id` column.
-- Keeping each condition next to its join avoids leaving the earlier joins
-- without any condition until the very last ON clause.
select
    a.id
from a
join b
    on a.id = b.id
join c
    on b.id = c.id
join d
    on c.id = d.id;

-- Multi-table join, form 2: join two tables at a time and wrap every intermediate
-- result in an aliased subquery (t, then t2) before joining the next table.
-- a, b, c and d are placeholder tables that each expose an `id` column.
-- Each subquery selects a single `id` so the alias exposes an unambiguous column.
select
    t2.id
from (
    select
        t.id
    from (
        select
            a.id
        from a
        join b
            on a.id = b.id
    ) t
    join c
        on t.id = c.id
) t2
join d
    on t2.id = d.id;

-- left semi join: return rows of the main (left) table according to the join condition.
-- semi can only be combined with left; there is no right semi join.
select *
from tb_x as x
left semi join tb_y as y
    on x.id = y.id;

-- union all: concatenate two result sets without removing duplicates.
-- Both branches must return the same number of columns with matching data types.
-- Any row with id = 2 satisfies both filters, so it is returned by both branches.
select * from tb_a where id >= 2
union all
select * from tb_a where id <= 2;

-- union: combine two result sets and remove duplicate rows.
select * from tb_y
union
select * from tb_y;

-- Counter-example: this statement is expected to FAIL because the two branches
-- return a different number of columns.
select
* , name as name2  -- 3 columns (per the original note)
from
tb_x
where id >= 2
union all
select
*                  -- 2 columns (per the original note), so the union is rejected
from
tb_y
where id <= 2;

-- full join: stitch the two tables together, including the null values.
select *
from tb_a as a
full join tb_b as b
    on a.id = b.id;

二.排序

-- Per the original note, a SQL statement runs with a single reduce task by default.
set mapreduce.job.reduces=n;  -- set the number of reduce tasks (n is a placeholder for an integer)
set mapreduce.job.reduces;  -- show the currently configured value

order by --全局最终结果排序

-- order by sorts the final result globally.
select * from tb_a order by id;        -- ascending (asc) is the default when no direction is given
select * from tb_a order by id desc;   -- descending

distribute by --按指定字段对数据进行分区
通常与 sort by 结合使用

-- distribute by: partition the rows by the given column (here: name).
select *
from tb_x
distribute by name;

sort by --区内数据排序

-- sort by orders the rows inside each partition.
-- When combined with distribute by, sort by must be written after it.
select *
from tb_x
distribute by name
sort by name desc;

cluster by --当分区字段和排序字段相同并且是升序的时候使用cluster by 替换 distribute by sort by

-- The two statements below are equivalent: cluster by replaces distribute by + sort by
-- when both use the same column and the sort order is ascending.
select * from tb_x distribute by name sort by name;
select * from tb_x cluster by name;   -- cluster by only supports ascending order

三.系统函数详解

array(ele1,ele2…) --创建数组。数组是 Hive 中的一种集合类型,和 Java 中的数组类似

-- array(...) builds an array from its arguments.
select array(1,2,3,4);
select array(log_id,url,ct) from tb_log;      -- one array per row, built from three columns
select array(log_id,url,ct)[0] from tb_log;   -- element at index 0 of that array, i.e. log_id

array_contains(arr,element) --给我一个数组,判断element在arr中存不存在,返回true或false

-- array_contains(arr, element) returns true when element is present in arr, false otherwise.
select
    array_contains(array(1, 2, 3, 4), 1);   -- true: 1 is in the array
select
    array_contains(array(1, 2, 3, 4), 5);   -- false: 5 is not in the array

upper() --小写转大写

-- upper() converts lower-case letters to upper case.
select
    upper('a,b,c');   -- A,B,C

lower() --大写转小写

-- lower() converts upper-case letters to lower case.
select
    lower('A,B,C');   -- a,b,c

split(str , sep) --按分隔符 sep 切割字符串,返回数组

-- split(str, sep) cuts str at every occurrence of sep and returns an array.
select split('hello_tom_jim_city','_');      -- ["hello","tom","jim","city"]
select split('hello_tom_jim_city','_')[0];   -- element at index 0: hello

trim() --去除首尾空格

-- trim() removes the leading and trailing spaces.
select
    trim('      hello       ');   -- hello (the spaces on both sides are gone)

uuid() --生成一个随机的字符串

-- uuid() generates a random string.
select
    uuid();   -- e.g. 67e80de6-5fd6-40af-87e2-bbd6e16ec81d

replace(字符串 , 要替换掉的子串 , 替换掉的新的子串) --替换字符串

-- replace(str, search, replacement) substitutes every occurrence of search in str.
-- The original snippet omitted the function name, so no replacement was performed.
select replace('hello', 'll', 'LL');   -- hello --> heLLo
select replace(uuid(), '-', ' ');      -- e.g. 5a461055 3889 4443 8663 0bc9ac7a7792 (each '-' became a space)

substr(str , 起始位置(偏移量) [, 长度]) --偏移量从1开始,截取结果包括起始位置的字符;长度可省略
substring(str , 起始位置(偏移量) [, 长度]) --作用和substr一样,用哪个都可以
substring_index(str , '分隔符' , 个数) --指定一个分隔符,返回第"个数"个分隔符之前的内容,个数不可省略

-- substr(str, start [, length]): start is 1-based; without length the rest of the string is returned.
select substr('hello' , 2);                  -- ello
select substr('hello' , 2 , 3);              -- ell
-- substring_index(str, delimiter, count): everything before the count-th delimiter.
select substring_index('a-b-c' , '-' , 1);   -- a
select substring_index('a-b-c' , '-' , 2);   -- a-b
a JSS
b JSS-XZS
c JSS-XZS-FX   --表连接的时候可以用substring_index来切割

select trunc('2020-12-03','MM'); --当前月的第一天
select trunc('2020-12-03','Q'); --当前季度的第一天
select trunc('2020-12-03','YEAR'); --当前年的第一天
select date_sub('2020-12-03',1); --当前天减一天
select date_add('2020-12-03',1); --当前天加一天
select datediff('2020-12-03','2020-11-03'); --两天相差的天数

-- Date helpers, all evaluated on the literal date 2020-12-03.
select trunc('2020-12-03', 'MM');              -- first day of that month
select trunc('2020-12-03', 'Q');               -- first day of that quarter
select trunc('2020-12-03', 'YEAR');            -- first day of that year
select date_sub('2020-12-03', 1);              -- the date minus one day
select date_add('2020-12-03', 1);              -- the date plus one day
select datediff('2020-12-03', '2020-11-03');   -- number of days between the two dates

json_tuple --解析JSON字符串,一次调用可提取多个字段

-- json_tuple extracts several keys from the `data` column in a single call.
-- The aliases are matched by position, so they must follow the same order as the keys.
select
    json_tuple(data, 'movie', 'rate', 'timeStamp', 'uid')
        as (mid, rate, ctime, uid)
from tb_m;

四.分桶表和抽样查询
分区表是将数据分文件夹管理,查询时直接从对应文件夹中读取数据,减少数据扫描的文件范围
分桶表是对join和查询的优化,将数据按照指定字段的规则分文件存储

--分桶步骤
1 创建普通表  导入数据
create table tb_stu(
id int, 
name string)
row format delimited fields terminated

最低0.47元/天解锁文章

T D Z

关注

1
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
Hive----关联查询排序系统函数分桶表行列转换窗口函数

关联查询join on --可以多表连接left join on --左连接right join on --右连接left semi on --返回主表的内容 right不能和semi一块使用,只能左连 (类似于子查询)union all --联合查询,连接两个查询的结果集要求字段个数和数据类型一致 all(不会去重)union --去重 (union all 和 union返回结果是两个表的结合)full join --两个表的拼接(包括null值)--join的多表连
复制链接

扫一扫