Source: https://www.cnblogs.com/yejibigdata/p/6380744.html
Hive's collect-related functions are collect_list and collect_set.
Both turn a column within a group into an array and return it; the difference is that collect_list keeps duplicates while collect_set de-duplicates.
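For example (a minimal sketch; qa_test.order_test with columns user_id and item is a hypothetical table used only for illustration):
select user_id,
       collect_list(item) as items_all,   -- keeps duplicates
       collect_set(item)  as items_dedup  -- duplicates removed
from qa_test.order_test
group by user_id;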
JSON parsing function: get_json_object
Syntax: get_json_object(string json_string, string path)
Return type: string
Description: parses the JSON string json_string and returns the content addressed by path. If the input JSON string is invalid, NULL is returned.
Example: hive> select get_json_object('{"store":{"fruit":[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],"bicycle":{"price":19.95,"color":"red"} }, "email":"amy@only_for_json_udf_test.net","owner":"amy"}','$.store');
OK
{"fruit":[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],"bicycle":{"price":19.95,"color":"red"}}
Time taken: 0.108 seconds, Fetched: 1 row(s)
hive> select get_json_object('{"store":{"fruit":[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],"bicycle":{"price":19.95,"color":"red"} }, "email":"amy@only_for_json_udf_test.net","owner":"amy"}','$.email');
OK
amy@only_for_json_udf_test.net
hive> select get_json_object('{"store":{"fruit":[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],"bicycle":{"price":19.95,"color":"red"} }, "email":"amy@only_for_json_udf_test.net","owner":"amy"}','$.owner');
OK
amy
Time taken: 0.499 seconds, Fetched: 1 row(s)
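The path expression also supports array subscripts; against the same JSON string, a query like the following should return apple (a small sketch, not captured from a real session):
hive> select get_json_object('{"store":{"fruit":[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],"bicycle":{"price":19.95,"color":"red"} }, "email":"amy@only_for_json_udf_test.net","owner":"amy"}','$.store.fruit[0].type');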
Hive provides complex data types:
Structs: fields inside a struct are accessed with a dot (.). For example, if column c of a table has type STRUCT{a INT; b INT}, field a is accessed as c.a.
Maps (key-value pairs): a value is accessed by ["key name"]. For example, if a map M contains a 'group' -> gid pair, the value of gid can be fetched via M['group'].
Arrays: all elements of an array have the same type. For example, if array A holds the elements ['a','b','c'], then A[1] is 'b'.
Using Struct
create table qa_test.student_test(id INT, info struct<name:STRING, age:INT>)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
COLLECTION ITEMS TERMINATED BY ':';
hive> desc qa_test.student_test;
OK
id int
info struct<name:string,age:int>
Time taken: 0.048 seconds, Fetched: 2 row(s)
$ cat test5.txt
1,zhou:30
2,yan:30
3,chen:20
4,li:80
LOAD DATA LOCAL INPATH '/home/hadoop/test/test5' INTO TABLE qa_test.student_test;
hive> select * from qa_test.student_test;
OK
1 {"name":"zhou","age":30}
2 {"name":"yan","age":30}
3 {"name":"chen","age":20}
4 {"name":"li","age":80}
hive> select info.age from qa_test.student_test;
OK
30
30
20
80
Time taken: 0.234 seconds, Fetched: 4 row(s)
hive> select info.name from qa_test.student_test;
OK
zhou
yan
chen
li
Time taken: 0.08 seconds, Fetched: 4 row(s)
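Struct fields can also be used directly in a WHERE clause (a small sketch against the same table; output not shown):
hive> select id, info.name from qa_test.student_test where info.age >= 30;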
Using Array
create table qa_test.class_test(name string, student_id_list array<int>)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
COLLECTION ITEMS TERMINATED BY ':';
hive> desc qa_test.class_test;
OK
name string
student_id_list array<int>
Time taken: 0.052 seconds, Fetched: 2 row(s)
$ cat test6.txt
034,1:2:3:4
035,5:6
036,7:8:9:10
LOAD DATA LOCAL INPATH '/home/hadoop/test/test6' INTO TABLE qa_test.class_test;
hive> select * from qa_test.class_test;
OK
034 [1,2,3,4]
035 [5,6]
036 [7,8,9,10]
Time taken: 0.076 seconds, Fetched: 3 row(s)
hive> select student_id_list[3] from qa_test.class_test;
OK
4
NULL
10
Time taken: 0.12 seconds, Fetched: 3 row(s)
hive> select size(student_id_list) from qa_test.class_test;
OK
4
2
4
Time taken: 0.086 seconds, Fetched: 3 row(s)
hive> select array_contains(student_id_list,4) from qa_test.class_test;
OK
true
false
false
Time taken: 0.129 seconds, Fetched: 3 row(s)
hive>
> select sort_array(student_id_list) from qa_test.class_test;
OK
[1,2,3,4]
[5,6]
[7,8,9,10]
Time taken: 0.085 seconds, Fetched: 3 row(s)
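An array column can also be expanded into one row per element with explode; this is the lateral view pattern described in more detail below (a small sketch against the same table; output not shown):
hive> select name, sid from qa_test.class_test lateral view explode(student_id_list) t as sid;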
Using Map
create table qa_test.employee(id string, perf map<string, int>)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY ','
MAP KEYS TERMINATED BY ':';
$ cat test7.txt
1 job:80,team:60,person:70
2 job:60,team:80
3 job:90,team:70,person:100
LOAD DATA LOCAL INPATH '/home/hadoop/test/test7' INTO TABLE qa_test.employee;
hive> select * from qa_test.employee;
OK
1 {"job":80,"team":60,"person":70}
2 {"job":60,"team":80}
3 {"job":90,"team":70,"person":100}
Time taken: 0.075 seconds, Fetched: 3 row(s)
hive> select perf['job'] from qa_test.employee where perf['job'] is not null;
OK
80
60
90
Time taken: 0.096 seconds, Fetched: 3 row(s)
hive> select size(perf) from qa_test.employee;
OK
3
2
3
Time taken: 0.091 seconds, Fetched: 3 row(s)
hive> select map_keys(perf) from qa_test.employee;
OK
["job","team","person"]
["job","team"]
["job","team","person"]
Time taken: 0.136 seconds, Fetched: 3 row(s)
hive> select map_values(perf) from qa_test.employee;
OK
[80,60,70]
[60,80]
[90,70,100]
Time taken: 0.077 seconds, Fetched: 3 row(s)
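explode also works on a map, producing a key column and a value column (a small sketch against the same table; output not shown):
hive> select id, score_type, score from qa_test.employee lateral view explode(perf) t as score_type, score;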
Note: lateral view is used together with UDTFs such as json_tuple, parse_url_tuple, split and explode. It splits one row into multiple rows, and the split rows can then be aggregated.
Example:
hive> select s.x,sp from test.dual s lateral view explode(split(concat_ws(',','1','2','3','4','5','6','7','8','9'),',')) t as sp;
x sp
a 1
b 2
a 3
To explain: after FROM comes your table name, followed by LATERAL VIEW explode(...) (the row-to-rows SQL), and the generated column must be given an alias; here the alias is sp. The s.x after SELECT is simply a column of the original table; my table has only one column, x.
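On top of the expanded rows you can aggregate again, for example with the collect functions mentioned at the start (a minimal sketch reusing qa_test.class_test from above; output not shown):
hive> select name, collect_set(sid) as distinct_ids from qa_test.class_test lateral view explode(student_id_list) t as sid group by name;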
SQL with multiple lateral views looks like:
SELECT * FROM exampleTable LATERAL VIEW explode(col1) myTable1 AS myCol1 LATERAL VIEW explode(myCol1) myTable2 AS myCol2;
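json_tuple is used with lateral view in the same way (a small sketch reusing the test.dual helper table from the explode example and the email/owner fields of the JSON string above; output not shown):
hive> select t.email, t.owner from test.dual s lateral view json_tuple('{"email":"amy@only_for_json_udf_test.net","owner":"amy"}', 'email', 'owner') t as email, owner;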
drop table if exists t_ods_tmp_referurl;
create table t_ods_tmp_referurl as
SELECT a.*, b.*
FROM ods_origin_weblog a LATERAL VIEW parse_url_tuple(regexp_replace(http_referer, "\"", ""), 'HOST', 'PATH', 'QUERY', 'QUERY:id') b as host, path, query, query_id;
Copy the table, truncating the time down to the day:
drop table if exists t_ods_tmp_detail;
create table t_ods_tmp_detail as
select b.*,substring(time_local,0,10) as daystr,
substring(time_local,11) as tmstr,
substring(time_local,5,2) as month,
substring(time_local,8,2) as day,
substring(time_local,11,2) as hour
from t_ods_tmp_referurl b;