hive基础查询笔记

#
==使用正则表达式==

hive (ods)> select symbol, `price.*` from stocks;
（注意：正则列名要用反引号 ` 包裹；用单引号只会得到字符串常量 'price.*'。Hive 0.13+ 还需先执行 set hive.support.quoted.identifiers=none;）

==表结构==

hive (ods)> 
          > desc emp1;
OK
col_name        data_type       comment
name                    string                                      
salary                  float                                       
subordinates            array<string>                               
deductions              map<string,float>                           
address                 struct<street:string,city:string,state:string,zip:int>                      
country                 string                                      
state                   string                                      

# Partition Information          
# col_name              data_type               comment             

country                 string                                      
state                   string

==查询数组、struct、map中的元素==

hive (ods)> select name, subordinates[0], deductions["shebao"] ,address.city from emp1;
OK
name    _c1     _c2     city
lucy    aLucy   100.0   Beijing
hive (ods)> select * from emp1;
OK
emp1.name       emp1.salary     emp1.subordinates       emp1.deductions emp1.address    emp1.country    emp1.state
lucy    10000.0 ["aLucy"]       {"shebao":100.0}        {"street":"xidan","city":"Beijing","state":"Dong","zip":100000} BJ      SHOUDU
Time taken: 0.137 seconds, Fetched: 1 row(s)

==计算==

hive (ods)> 
          > 
          > select upper(name), salary, deductions["shebao"], round(salary * (1-deductions["shebao"])) from emp1;
OK
_c0     salary  _c2     _c3
LUCY    10000.0 100.0   -990000.0
Time taken: 0.187 seconds, Fetched: 1 row(s)

注意：deductions["shebao"] 存的是绝对金额（100.0）而不是比例，所以 salary * (1 - 100.0) 算出负数 -990000.0；若要按比例扣除，map 中应存小数（如 0.01）。

==聚合查询==

hive (ods)> 
          > 
          > 
          > select count(*), avg(salary) from emp1;

==设置参数提高聚合性能==

set hive.map.aggr=true;
hive (ods)> 
          > select count(distinct symbol) from stocks;

表生成函数

hive 之列转行（一行拆成多行）：lateral view explode(col3) col3 as name

explode(ARRAY) 列表中的每个元素生成一行

explode(MAP) map中每个key-value对,生成一行,key为一列,value为一列

hive (ods)> 
          > create table explode_test(
          > col1 string,
          > col2 string,
          > col3 string
          > )
          > row format delimited fields terminated by '\t'
          > stored as textfile;
OK
Time taken: 0.207 seconds
hive (ods)> 
          > 
          > load data local inpath '/home/hadoop/study_hadoop/explode.txt' into table explode_test;
Loading data to table ods.explode_test
Table ods.explode_test stats: [numFiles=1, totalSize=20]
OK
Time taken: 0.622 seconds
hive (ods)> select *from explode_test;
OK
explode_test.col1       explode_test.col2       explode_test.col3
a       b       1,2,3
c       d       4,5,6
Time taken: 0.121 seconds, Fetched: 2 row(s)

==遍历数组中的每个元素==

hive (ods)> select col1, col2, name
          > from explode_test
          > lateral view explode(split(col3,',')) col3 as name;
OK
col1    col2    name
a       b       1
a       b       2
a       b       3
c       d       4
c       d       5
c       d       6
Time taken: 0.124 seconds, Fetched: 6 row(s)
二、遍历数组（array<int> 列）中的每个元素
hive (ods)> 
          > 
          > create table hzl_test
          > 
          > (
          > 
          > col1 string,
          > 
          > col2 string,
          > 
          > col3 array<int>
          > 
          > )
          > 
          > row format delimited 
          > 
          > fields terminated by '\t'
          > collection items terminated by ','  ;
OK
Time taken: 0.135 seconds
hive (ods)> 
          > 
          > load data local inpath '/home/hadoop/study_hadoop/explode.txt' into table hzl_test;
Loading data to table ods.hzl_test
Table ods.hzl_test stats: [numFiles=1, totalSize=20]
OK
Time taken: 0.466 seconds
hive (ods)> select * from hzl_test;
OK
hzl_test.col1   hzl_test.col2   hzl_test.col3
a       b       [1,2,3]
c       d       [4,5,6]
Time taken: 0.117 seconds, Fetched: 2 row(s)
hive (ods)> 
          > 
          > 
          > select col1,col2,name
          > from hzl_test
          > lateral view explode(col3) col3 as name;
OK
col1    col2    name
a       b       1
a       b       2
a       b       3
c       d       4
c       d       5
c       d       6
Time taken: 0.12 seconds, Fetched: 6 row(s)

==补充:==

hive (ods)> 
          > 
          > select t.list[0],t.list[1],t.list[2] from (
          > select (split(col3,',')) list from explode_test) t;
OK
_c0     _c1     _c2
1       2       3
4       5       6

==查看数组长度size==

hive (ods)> 
          > 
          > select size(split(col3,',')) list from explode_test ;
OK
list
3
3

hive 行转列（多行合并成一行）：concat_ws(',', collect_set(col3))

hive (ods)> 
          > 
          > select * from tmp_jiangzl_test;
OK
tmp_jiangzl_test.col1   tmp_jiangzl_test.col2   tmp_jiangzl_test.col3
a       b       1
a       b       2
a       b       3
c       d       4
c       d       5
c       d       6
Time taken: 0.116 seconds, Fetched: 6 row(s)
hive (ods)> 
          > 
          > select col1,col2,concat_ws(',',collect_set(col3))
          > from tmp_jiangzl_test
          > group by col1,col2;

col1    col2    _c2
a       b       1,2,3
c       d       4,5,6
Time taken: 34.791 seconds, Fetched: 2 row(s)          

parse_url_tuple

hive (ods)> select * from t_url;
OK
t_url.f1        t_url.f2
url1    http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1
url2    https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF#LanguageManualUDF-getjsonobject
url3    https://www.google.com.hk/#hl=zh-CN&newwindow=1&safe=strict&q=hive+translate+example&oq=hive+translate+example&gs_l=serp.3...10174.11861.6.12051.8.8.0.0.0.0.132.883.0j7.7.0...0.0...1c.1j4.8.serp.0B9C1T_n0Hs&bav=on.2,or.&bvm=bv.44770516,d.aGc&fp=e13e41a6b9dab3f6&biw=1241&bih=589
Time taken: 0.122 seconds, Fetched: 3 row(s)
hive (ods)> select f1,b.* from t_url lateral view parse_url_tuple(f2,'HOST','PATH','QUERTY','QUERTY:k1')b as host,path,querty,querty_id;
OK
f1      b.host  b.path  b.querty        b.querty_id
url1    facebook.com    /path1/p.php    NULL    NULL
url2    cwiki.apache.org        /confluence/display/Hive/LanguageManual+UDF     NULL    NULL
url3    www.google.com.hk       /       NULL    NULL
Time taken: 0.142 seconds, Fetched: 3 row(s)

注意：上面 querty 两列为 NULL，是因为关键字拼写错误（'QUERTY'）；正确写法是 'QUERY' 和 'QUERY:k1'。

#

lateral view

  • 是Hive中提供给UDTF的conjunction,它可以解决UDTF不能添加额外的select列的问题。当我们想对hive表中某一列进行split之后,想对其转换成1 to N的模式,即一行转多行。hive不允许我们在UDTF函数之外,再添加其它select语句。

get_json_object

hive (ods)> select get_json_object(t_json.f2, '$.owner') from t_json;
OK
_c0
amy1
amy2
amy3
Time taken: 0.106 seconds, Fetched: 3 row(s)
hive (ods)> 
          > 
          > select * from t_json;
OK
t_json.f1       t_json.f2       t_json.f3
first   {"store":{"fruit":[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],"bicycle":{"price":19.951,"color":"red1"}},"email":"amy@only_for_json_udf_test.net","owner":"amy1"}    third
first   {"store":{"fruit":[{"weight":9,"type":"apple"},{"weight":91,"type":"pear"}],"bicycle":{"price":19.952,"color":"red2"}},"email":"amy@only_for_json_udf_test.net","owner":"amy2"}   third
first   {"store":{"fruit":[{"weight":10,"type":"apple"},{"weight":911,"type":"pear"}],"bicycle":{"price":19.953,"color":"red3"}},"email":"amy@only_for_json_udf_test.net","owner":"amy3"} third
Time taken: 0.102 seconds, Fetched: 3 row(s)
hive (ods)>

URL解析函数:parse_url

hive (ods)> 
          > 
          > 
          > select parse_url('https://www.baidu.com/s?cl=3&tn=baidutop10&fr=top1000&wd=%E8%BF%AA%E5%A3%AB%E5%B0%BC%E6%94%B6%E8%B4%AD%E7%A6%8F%E5%85%8B%E6%96%AF&rsv_idx=2','HOST') from dual;
OK
_c0
www.baidu.com

其它内置函数

正则表达式

正则表达式替换函数:regexp_replace

hive (ods)> 
          > select regexp_replace('foobar', 'oo|ar', '') from dual;
OK
_c0
fb
Time taken: 0.112 seconds, Fetched: 1 row(s)
hive (ods)> select regexp_replace('foobar', 'oo|ar', '-') from dual;
OK
_c0
f-b-
Time taken: 0.105 seconds, Fetched: 1 row(s)
hive (ods)>

正则表达式解析函数:regexp_extract

hive (ods)> select regexp_extract('foothebar', 'foo(.*?)(bar)', 1) from dual;
OK
_c0
the
Time taken: 0.105 seconds, Fetched: 1 row(s)
hive (ods)> select regexp_extract('foothebar', 'foo(.*?)(bar)', 0) from dual;
OK
_c0
foothebar
Time taken: 0.104 seconds, Fetched: 1 row(s)
hive (ods)> select regexp_extract('foothebar', 'foo(.*?)(bar)', 2) from dual;
OK
_c0
bar

limit语句

hive (ods)> select * from staged_employees order by id limit 3;

嵌套select语句

hive (ods)> 
          > 
          > from (
          > select upper(emp1.name) as name, emp1.subordinates[0] as sub, emp1.salary, emp1.deductions["shebao"] as shebao, emp1.address.city 
          > from emp1 ) e
          > select e.name, e.sub,e.salary,e.shebao;
OK
e.name  e.sub   e.salary        e.shebao
LUCY    aLucy   10000.0 100.0
Time taken: 0.166 seconds, Fetched: 1 row(s)

case when

hive (ods)> select * from emp1;
OK
emp1.name       emp1.salary     emp1.subordinates       emp1.deductions emp1.address    emp1.country    emp1.state
lucy    10000.0 ["aLucy"]       {"shebao":100.0}        {"street":"xidan","city":"Beijing","state":"Dong","zip":100000} BJ      SHOUDU
Time taken: 0.109 seconds, Fetched: 1 row(s)
hive (ods)> select name,salary,
          > case when salary < 800 then 'low'
          > when salary >= 800 and salary <=5000 then 'middle'
          > when salary >5000 and salary <10000 then 'high'
          > else 'very high'
          > end as bracket 
          > from emp1;
OK
name    salary  bracket
lucy    10000.0 very high
Time taken: 0.3 seconds, Fetched: 1 row(s)
hive (ods)>
设置本地模式 set hive.exec.mode.local.auto = true;

列别名

  • 不能在where子句中使用列别名,但是可以使用嵌套select语句

like rlike

Time taken: 0.141 seconds, Fetched: 4 row(s)
hive (ods)> 
          > 
          > select emp1.address.street from emp1 where emp1.address.street like '%Dong%';
OK
street
DongDan
DongDan
Time taken: 0.103 seconds, Fetched: 2 row(s)
hive (ods)> select emp1.address.street from emp1 where emp1.address.street rlike '.*Dong|HouHai.*';
OK
street
DongDan
DongDan
HouHai
Time taken: 0.164 seconds, Fetched: 3 row(s)

group by having

hive (ods)> select aaa,symbol,ymd,count(*) from stocks1 group by aaa,symbol,ymd having count(*)>1;

Query ID = hadoop_20171218101126_b400d584-6699-447f-8011-1aeb3019a1de
Total jobs = 1
Launching Job 1 out of 1
Number of reduce tasks not specified. Estimated from input data size: 1
In order to change the average load for a reducer (in bytes):
  set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
  set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
  set mapreduce.job.reduces=<number>
Starting Job = job_1513562135174_0002, Tracking URL = http://master:8088/proxy/application_1513562135174_0002/
Kill Command = /home/hadoop/hadoop-2.6.4/bin/hadoop job  -kill job_1513562135174_0002
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 1
2017-12-18 10:11:38,117 Stage-1 map = 0%,  reduce = 0%
2017-12-18 10:11:49,277 Stage-1 map = 100%,  reduce = 0%, Cumulative CPU 1.72 sec
2017-12-18 10:12:02,299 Stage-1 map = 100%,  reduce = 100%, Cumulative CPU 4.86 sec
MapReduce Total cumulative CPU time: 4 seconds 860 msec
Ended Job = job_1513562135174_0002
MapReduce Jobs Launched: 
Stage-Stage-1: Map: 1  Reduce: 1   Cumulative CPU: 4.86 sec   HDFS Read: 9900 HDFS Write: 63 SUCCESS
Total MapReduce CPU Time Spent: 4 seconds 860 msec

OK
aaa     symbol  ymd     _c3
aa      ok      '2017-12-11'    2
aa      ok      '2017-12-12'    2
bb      ok      '2017-12-11'    2
Time taken: 37.285 seconds, Fetched: 3 row(s)

用 /*+STREAMTABLE(表名)*/ 来指定你想要作为流数据的表,就是大表

hive (ods)> select /*+STREAMTABLE(s)*/s.ymd, s.symbol, d.* from stocks s join dividends d on s.ymd = d.ymd;

join group by

Select * from
(select label,qu from temp_testjoin_ta where dt = '2014-08-08') a
Join
(select qu,inmyway from temp_testjoin_tb where dt = '2014-08-08') b
On a.qu = b.qu
group by label,inmyway,a.qu;

(tok_table_or_col label)        (. (tok_table_or_col a) qu)     (tok_table_or_col inmyway)
l1      q1      i1
l1      q1      i2
l1      q1      i3
Time taken: 51.026 seconds, Fetched: 3 row(s)

left semi Join

hive (ods)> Select * from
          > (select label,qu from temp_testjoin_ta where dt = '2014-08-08') a
          > left semi Join
          > (select qu,inmyway from temp_testjoin_tb where dt = '2014-08-08') b
          > On a.qu = b.qu
          > ;

a.label a.qu
l1      q1
hive (ods)> 
          > 
          > 
          > Select * from
          > (select qu,inmyway from temp_testjoin_tb where dt = '2014-08-08') b
          > left semi Join
          > (select label,qu from temp_testjoin_ta where dt = '2014-08-08') a
          > On b.qu = a.qu
          > ;


b.qu    b.inmyway
q1      i1
q1      i1
q1      i2
q1      i3

map side join

hive (ods)> 
          > 
          > 
          > Select /*+MAPJOIN(a)*/count(*) from
          > (select label,qu from temp_testjoin_ta where dt = '2014-08-08') a
          > left outer Join
          > (select qu,inmyway from temp_testjoin_tb where dt = '2014-08-08') b
          > On a.qu = b.qu
          > ;

order by 注意:(desc降序,asc升序)。

hive (ods)> select qu,inmyway from temp_testjoin_tb where dt = '2014-08-08' order by qu;

OK
qu      inmyway
q1      i3
q1      i2
q1      i1
q1      i1
q2      i2
q2      i1
q3      i10
Time taken: 33.083 seconds, Fetched: 7 row(s)

避免数据倾斜

set hive.groupby.skewindata=true;
  • hive.groupby.skewindata=true的原理是:当有数据倾斜的时候进行负载均衡,当选项设定为 true,生成的查询计划会有两个 MR Job。第一个 MR Job 中,Map 的输出结果集合会随机分布到 Reduce 中,每个 Reduce 做部分聚合操作,并输出结果,这样处理的结果是相同的 Group By Key 有可能被分发到不同的 Reduce 中,从而达到负载均衡的目的;第二个 MR Job 再根据预处理的数据结果按照 Group By Key 分布到 Reduce 中(这个过程可以保证相同的 Group By Key 被分布到同一个 Reduce 中),最后完成最终的聚合操作。

  • 总结:避免数据倾斜的问题,如果对于group by或distinct,设定 hive.groupby.skewindata=true

cluster by / sort by / distribute by

hive (ods)> select * from temp_testjoin_tb distribute by qu sort by qu;

q1      i3      2014-08-08
q1      i2      2014-08-08
q1      i1      2014-08-08
q1      i1      2014-08-08
q3      i10     2014-08-08
q2      i2      2014-08-08
q2      i1      2014-08-08

hive (ods)> 
          > 
          > select * from temp_testjoin_tb cluster by qu ;

q1      i3      2014-08-08
q1      i2      2014-08-08
q1      i1      2014-08-08
q1      i1      2014-08-08
q3      i10     2014-08-08
q2      i2      2014-08-08
q2      i1      2014-08-08

类型转换 cast(value as TYPE)

hive (ods)> select concat('$',cast(salary as string)) from emp1;
OK
_c0
$7500.0
$1200.0
$1200.0
$10000.0

hive抽样查询

  • rand()
hive (ods)> select * from numbers tablesample(bucket 3 out of 10 on rand());
OK
numbers.number
4
5

tablesample(bucket x out of y)：分子 x 表示取第几桶,分母 y 表示分成几个桶

hive (ods)> select * from numbers tablesample(bucket 3 out of 10 on number);
OK
numbers.number
2
Time taken: 0.122 seconds, Fetched: 1 row(s)
hive (ods)> select * from numbers tablesample(bucket 3 out of 10 on number);
OK
numbers.number
2
Time taken: 0.098 seconds, Fetched: 1 row(s)
hive (ods)> select * from numbers tablesample(bucket 3 out of 10 on number);
OK
numbers.number
2
Time taken: 0.101 seconds, Fetched: 1 row(s)
hive (ods)>
  • 数据块抽样 percent
hive (ods)> select * from numbers tablesample(9 percent);
OK
numbers.number
1
Time taken: 0.089 seconds, Fetched: 1 row(s)
hive (ods)> select * from numbers tablesample(50 percent);
OK
numbers.number
1
2
3
4
5
6
Time taken: 0.075 seconds, Fetched: 6 row(s)
hive (ods)> select * from numbers tablesample(40 percent);
OK
numbers.number
1
2
3
4
5
Time taken: 0.092 seconds, Fetched: 5 row(s)
hive (ods)>
  • 分桶表的输入裁剪
hive (ods)> set hive.enforce.bucketing = true;
hive (ods)> dfs -ls /user/hive/warehouse/ods.db/numbers_bucketed;
Found 3 items
-rwxrwxrwx   3 hadoop supergroup          6 2017-12-18 17:17 /user/hive/warehouse/ods.db/numbers_bucketed/000000_0
-rwxrwxrwx   3 hadoop supergroup          9 2017-12-18 17:17 /user/hive/warehouse/ods.db/numbers_bucketed/000001_0
-rwxrwxrwx   3 hadoop supergroup          6 2017-12-18 17:17 /user/hive/warehouse/ods.db/numbers_bucketed/000002_0
hive (ods)>


hive (ods)> select * from numbers_bucketed;
OK
numbers_bucketed.number
9
6
3
10
7
4
1
8
5
2
Time taken: 0.102 seconds, Fetched: 10 row(s)
hive (ods)> dfs -ls /user/hive/warehouse/ods.db/numbers_bucketed;
Found 3 items
-rwxrwxrwx   3 hadoop supergroup          6 2017-12-18 17:17 /user/hive/warehouse/ods.db/numbers_bucketed/000000_0
-rwxrwxrwx   3 hadoop supergroup          9 2017-12-18 17:17 /user/hive/warehouse/ods.db/numbers_bucketed/000001_0
-rwxrwxrwx   3 hadoop supergroup          6 2017-12-18 17:17 /user/hive/warehouse/ods.db/numbers_bucketed/000002_0
hive (ods)> dfs -cat /user/hive/warehouse/ods.db/numbers_bucketed/000000_0
          > ;
9
6
3
hive (ods)> dfs -cat /user/hive/warehouse/ods.db/numbers_bucketed/000001_0;
10
7
4
1
hive (ods)> dfs -cat /user/hive/warehouse/ods.db/numbers_bucketed/000002_0;
8
5
2
hive (ods)> 
取样如下:
hive (ods)> select * from numbers_bucketed tablesample(bucket 2 out of 3 on number);
OK
numbers_bucketed.number
10
7
4
1
Time taken: 0.107 seconds, Fetched: 4 row(s)
hive (ods)> select * from numbers_bucketed tablesample(bucket 1 out of 3 on number);
OK
numbers_bucketed.number
9
6
3
Time taken: 0.099 seconds, Fetched: 3 row(s)
hive (ods)> select * from numbers_bucketed tablesample(bucket 3 out of 3 on number);
OK
numbers_bucketed.number
8
5
2
Time taken: 0.
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值