Handling JSON data in Hive
{"movie":"1193","rate":"5","time":"978300760","userid":"1"} ---格式
{"movie":"661","rate":"3","time":"978302109","userid":"1"}
{"movie":"914","rate":"3","time":"978301968","userid":"1"}
{"movie":"3408","rate":"4","time":"978300275","userid":"1"}
{"movie":"2355","rate":"5","time":"978824291","userid":"1"}
{"movie":"1197","rate":"3","time":"978302268","userid":"1"}
hive (d7_hive)> create table rating_json(json string); --- create a table to hold the raw JSON
OK
Time taken: 1.034 seconds
hive (d7_hive)> desc function json_tuple; --- check what the function does
OK
tab_name
json_tuple(jsonStr, p1, p2, ..., pn) - like get_json_object, but it takes multiple names and return a tuple. All the input parameters and output column types are string.
Time taken: 0.542 seconds, Fetched: 1 row(s)
hive (d7_hive)> load data local inpath '/home/hadoop/data/rating.json' into table rating_json; --- load the data
Loading data to table d7_hive.rating_json
hive (d7_hive)> select json_tuple(json,'movie','rate','time','userid') as (movie_id,rate,time,userid) from rating_json limit 10;
OK
movie_id rate time userid
1193 5 978300760 1
661 3 978302109 1
914 3 978301968 1
3408 4 978300275 1
2355 5 978824291 1
1197 3 978302268 1
1287 5 978302039 1
2804 5 978300719 1
594 4 978302268 1
919 4 978301368 1
Time taken: 0.241 seconds, Fetched: 10 row(s)
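As the desc output notes, json_tuple behaves like get_json_object but parses the string once for multiple fields. A minimal sketch of the single-field equivalent on the same table (JSONPath rooted at $):
select get_json_object(json,'$.movie') as movie_id,
       get_json_object(json,'$.rate')  as rate
from rating_json limit 3;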
Data cleansing
json ==> the columns we need (expanding the fields)
raw ==> a wide table
create table rating_width as --- create a wide table
select
movie_id,rate,time,user_id,
year(from_unixtime(cast(time as bigint))) as year,
month(from_unixtime(cast(time as bigint))) as month,
day(from_unixtime(cast(time as bigint))) as day,
hour(from_unixtime(cast(time as bigint))) as hour,
minute(from_unixtime(cast(time as bigint))) as minute,
from_unixtime(cast(time as bigint)) as ts
from
(
select
json_tuple(json,'movie','rate','time','userid') as (movie_id,rate,time,user_id)
from rating_json
) tmp
;
All subsequent statistical analysis is based on this wide table.
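A sketch of the kind of analysis the wide table enables (column names as created above; rate is stored as a string, so this relies on Hive's implicit numeric cast):
select hour, count(*) as cnt, avg(rate) as avg_rate
from rating_width
group by hour
order by hour;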
hive (d7_hive)> desc function parse_url_tuple; --- using parse_url_tuple
OK
tab_name
parse_url_tuple(url, partname1, partname2, ..., partnameN) - extracts N (N>=1) parts from a URL.
It takes a URL and one or multiple partnames, and returns a tuple. All the input parameters and output column types are string.
Time taken: 0.027 seconds, Fetched: 2 row(s)
hive (d7_hive)> select parse_url_tuple("http://www.data.com/d7/xxx.html?cookieid=1234567&a=b&c=d","HOST","PATH","QUERY","QUERY:cookie_id");
OK
c0 c1 c2 c3
www.data.com /d7/xxx.html cookieid=1234567&a=b&c=d NULL
Time taken: 0.094 seconds, Fetched: 1 row(s)
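c3 is NULL because the URL's parameter is named cookieid, not cookie_id. Querying the actual parameter name should return the value; a sketch with the same URL:
select parse_url_tuple("http://www.data.com/d7/xxx.html?cookieid=1234567&a=b&c=d","HOST","PATH","QUERY","QUERY:cookieid");
--- expected c3: 1234567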
Find the two oldest records for each gender
--- prepare the data
hive (d7_hive)> select * from hive_rownumber limit 10;
OK
hive_rownumber.id hive_rownumber.age hive_rownumber.name hive_rownumber.gender
1 18 zhangsan M
2 19 lisi M
3 22 wangwu F
4 16 zhaoliu F
5 30 tianqi M
6 26 wangba F
Time taken: 0.088 seconds, Fetched: 6 row(s)
First group by gender, then sort each group by age in descending order and take the top 2 rows.
Implemented with a window (analytic) function:
hive (d7_hive)> select id,age,name,gender,num
> from
> (
> select id,age,name,gender,
> ROW_NUMBER() over(PARTITION BY gender order by age desc) as num
> from hive_rownumber
> )
> t where t.num<=2;
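From the data above, the expected result is tianqi (30) and lisi (19) for M, and wangba (26) and wangwu (22) for F. If ties should share a rank instead of being numbered arbitrarily, rank() or dense_rank() can replace ROW_NUMBER(); a sketch on the same table:
select id,age,name,gender,
       rank() over(partition by gender order by age desc) as num
from hive_rownumber;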
HiveServer2 Clients
[hadoop@hadoop001 bin]$ ./hiveserver2
which: no hbase in (/home/hadoop/app/hive/bin:/home/hadoop/app/hadoop/bin:/usr/java/jdk1.8.0_45/bin:/usr/local/sbin:/usr/sbin:/sbin:/home/hadoop/bin)
[hadoop@hadoop001 bin]$ ./beeline -u jdbc:hive2://hadoop001:10000/d7_hive -n hadoop
which: no hbase in (/home/hadoop/app/hive/bin:/home/hadoop/app/hadoop/bin:/usr/java/jdk1.8.0_45/bin:/usr/local/mysql/bin:/usr/local/bin:/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/sbin:/home/hadoop/bin)
scan complete in 2ms
Connecting to jdbc:hive2://hadoop001:10000/d7_hive
Connected to: Apache Hive (version 1.1.0-cdh5.7.0)
Driver: Hive JDBC (version 1.1.0-cdh5.7.0)
Transaction isolation: TRANSACTION_REPEATABLE_READ
Beeline version 1.1.0-cdh5.7.0 by Apache Hive
0: jdbc:hive2://hadoop001:10000/d7_hive> show tables;
+-----------------+--+
| tab_name |
+-----------------+--+
| hive_rownumber |
| rating_json |
| rating_width |
| wc |
+-----------------+--+
Complex Data Types
Array(1,2,3,4): all elements must be of the same type.
[hadoop@hadoop001 data]$ cat hive_array.txt --- prepare the data
zhangsan beijing,shanghai,tianjin,hangzhou
lisi changchu,chengdu,wuhan,beijing
hive (d7_hive)> create table hive_array(name string, work_locations array<string>) --- create the table
> row format delimited fields terminated by '\t'
> COLLECTION ITEMS TERMINATED BY ',';
OK
Time taken: 0.679 seconds
hive (d7_hive)> load data local inpath '/home/hadoop/data/hive_array.txt' into table hive_array; --- load the data
hive (d7_hive)> select * from hive_array; --- query the data
OK
hive_array.name hive_array.work_locations
zhangsan ["beijing","shanghai","tianjin","hangzhou"]
lisi ["changchu","chengdu","wuhan","beijing"]
Time taken: 1.02 seconds, Fetched: 2 row(s)
hive (d7_hive)> select name,work_locations from hive_array;
OK
name work_locations
zhangsan ["beijing","shanghai","tianjin","hangzhou"]
lisi ["changchu","chengdu","wuhan","beijing"]
hive (d7_hive)> select name,work_locations[0] from hive_array; --- take the first element (index 0)
OK
name _c1
zhangsan beijing
lisi changchu
Time taken: 1.005 seconds, Fetched: 2 row(s)
hive (d7_hive)> select name,work_locations[3] from hive_array;
OK
name _c1
zhangsan hangzhou
lisi beijing
Time taken: 0.099 seconds, Fetched: 2 row(s)
hive (d7_hive)> select name,work_locations[4] from hive_array; --- there is no fifth element, so NULL
OK
name _c1
zhangsan NULL
lisi NULL
hive (d7_hive)> select name,size(work_locations) from hive_array; --- number of elements in the array
OK
name _c1
zhangsan 4
lisi 4
hive (d7_hive)> select * from hive_array where array_contains(work_locations,'hangzhou'); --- rows whose array contains a given element
OK
hive_array.name hive_array.work_locations
zhangsan ["beijing","shanghai","tianjin","hangzhou"]
Time taken: 0.135 seconds, Fetched: 1 row(s)
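An array can also be flattened to one row per element with explode() plus LATERAL VIEW; a sketch on the same table:
select name, loc
from hive_array lateral view explode(work_locations) t as loc;
--- one row per location: zhangsan beijing, zhangsan shanghai, ...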
Map('a',1,'b',2): all keys share one type, and all values share one type.
hive (d7_hive)> create table hive_map(id int,name string, members map<string,string>, age int)
> row format delimited fields terminated by ','
> COLLECTION ITEMS TERMINATED BY '#'
> MAP KEYS TERMINATED BY ':';
OK
Time taken: 0.34 seconds
hive (d7_hive)> load data local inpath '/home/hadoop/data/hive_map.txt' into table hive_map;
Loading data to table d7_hive.hive_map
Table d7_hive.hive_map stats: [numFiles=1, totalSize=222]
OK
Time taken: 1.291 seconds
hive (d7_hive)> select id,name,members["father"] from hive_map;
OK
id name _c2
1 zhangsan xiaoming
2 lisi mayun
3 wangwu wangjianlin
4 mayun mayongzhen
hive (d7_hive)> select id,name,members["father"] as father,members["sister"] as sister from hive_map;
OK
id name father sister
1 zhangsan xiaoming NULL
2 lisi mayun NULL
3 wangwu wangjianlin jingtian
4 mayun mayongzhen NULL
Time taken: 0.325 seconds, Fetched: 4 row(s)
hive (d7_hive)> select id,map_keys(members) from hive_map; --- extract the keys
OK
id _c1
1 ["father","mother","brother"]
2 ["father","mother","brother"]
3 ["father","mother","sister"]
4 ["father","mother"]
Time taken: 0.654 seconds, Fetched: 4 row(s)
hive (d7_hive)> select id,map_values(members) from hive_map;
OK
id _c1
1 ["xiaoming","xiaohuang","xiaoxu"]
2 ["mayun","huangyi","guanyu"]
3 ["wangjianlin","ruhua","jingtian"]
4 ["mayongzhen","angelababy"]
Time taken: 0.127 seconds, Fetched: 4 row(s)
hive (d7_hive)> select id,map_values(members),size(members) from hive_map;
OK
id _c1 _c2
1 ["xiaoming","xiaohuang","xiaoxu"] 3
2 ["mayun","huangyi","guanyu"] 3
3 ["wangjianlin","ruhua","jingtian"] 3
4 ["mayongzhen","angelababy"] 2
Time taken: 7.393 seconds, Fetched: 4 row(s)
hive (d7_hive)> select id,name,age,members['brother'] from hive_map where array_contains(map_keys(members),'brother'); --- find the rows that have a 'brother' key
OK
id name age _c3
1 zhangsan 28 xiaoxu
2 lisi 22 guanyu
Time taken: 0.845 seconds, Fetched: 2 row(s)
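A map can likewise be flattened with explode(), which emits a key column and a value column; a sketch on the same table:
select id, k, v
from hive_map lateral view explode(members) t as k, v;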
Struct('a',1,2,34,4): the fields may be of different types.
[hadoop@hadoop001 data]$ cat hive_struct.txt
192.168.1.1#zhangsan:40
192.168.1.2#lisi:50
192.168.1.3#wangwu:60
192.168.1.4#zhaoliu:70
hive (default)> create table hive_struct(ip string, userinfo struct<name:string, age:int>)
> row format delimited fields terminated by '#'
> COLLECTION ITEMS TERMINATED BY ':';
OK
Time taken: 2.06 seconds
hive (default)> load data local inpath '/home/hadoop/data/hive_struct.txt' into table hive_struct; --- load the data
Loading data to table default.hive_struct
Table default.hive_struct stats: [numFiles=1, totalSize=88]
OK
Time taken: 1.248 seconds
hive (default)> select * from hive_struct;
OK
hive_struct.ip hive_struct.userinfo
192.168.1.1 {"name":"zhangsan","age":40}
192.168.1.2 {"name":"lisi","age":50}
192.168.1.3 {"name":"wangwu","age":60}
192.168.1.4 {"name":"zhaoliu","age":70}
Time taken: 0.612 seconds, Fetched: 4 row(s)
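Individual struct fields are read with dot notation, e.g.:
select ip, userinfo.name, userinfo.age
from hive_struct
where userinfo.age > 50;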
Partitioned Tables
A Hive partitioned table (in big-data pipelines, typically partitioned by day or hour) is still a single table; the different partitions simply live in different directories.
[hadoop@hadoop001 data]$ cat order_created.txt
10703007267488 2014-05-01 06:01:12.334+01
10101043505096 2014-05-01 07:28:12.342+01
10103043509747 2014-05-01 07:50:12.33+01
10103043501575 2014-05-01 09:27:12.33+01
10104043514061 2014-05-01 09:03:12.324+01
hive (default)> create table order_partiton(
> order_no string,
> order_time string
> )
> PARTITIONED BY (event_month string)
> row format delimited fields terminated by '\t';
OK
Time taken: 3.021 seconds
hive (default)> select * from order_partiton;
OK
order_partiton.order_no order_partiton.order_time order_partiton.event_month
Time taken: 0.261 seconds
hive (default)>
hive (default)> load data local inpath '/home/hadoop/data/order_created.txt' into table order_partiton PARTITION (event_month='2014-05');
Loading data to table default.order_partiton partition (event_month=2014-05)
Partition default.order_partiton{event_month=2014-05} stats: [numFiles=1, numRows=0, totalSize=213, rawDataSize=0]
OK
Time taken: 3.324 seconds
hive (default)> select * from order_partiton;
OK
order_partiton.order_no order_partiton.order_time order_partiton.event_month --- the last column is the partition column
10703007267488 2014-05-01 06:01:12.334+01 2014-05
10101043505096 2014-05-01 07:28:12.342+01 2014-05
10103043509747 2014-05-01 07:50:12.33+01 2014-05
10103043501575 2014-05-01 09:27:12.33+01 2014-05
10104043514061 2014-05-01 09:03:12.324+01 2014-05
Time taken: 0.911 seconds, Fetched: 5 row(s)
hdfs:/user/hive/warehouse/order_partiton/event_month=2014-05
tablename/partition_column=partition_value
Note: the table's data files do not contain the partition column; it exists only as a directory name on HDFS.
hive (default)> select * from order_partiton where event_month='2014-05'; --- how to query by the partition column
OK
[hadoop@hadoop001 ~]$ hadoop fs -mkdir -p /user/hive/warehouse/order_partiton/event_month=2014-06
[hadoop@hadoop001 data]$ hadoop fs -put order_created.txt /user/hive/warehouse/order_partiton/event_month=2014-06
hive (default)> msck repair table order_partiton; --- register the new partition in the metastore
OK
Partitions not in metastore: order_partiton:event_month=2014-06
Repair: Added partition to metastore order_partiton:event_month=2014-06
Time taken: 3.304 seconds, Fetched: 2 row(s)
mysql> select * from partitions; --- the partition has been added (metastore view)
+---------+-------------+------------------+---------------------+-------+--------+
| PART_ID | CREATE_TIME | LAST_ACCESS_TIME | PART_NAME | SD_ID | TBL_ID |
+---------+-------------+------------------+---------------------+-------+--------+
| 1 | 1564404845 | 0 | event_month=2014-05 | 63 | 62 |
| 6 | 1564458320 | 0 | event_month=2014-05 | 67 | 66 |
| 11 | 1564465393 | 0 | event_month=2014-06 | 71 | 62 |
+---------+-------------+------------------+---------------------+-------+--------+
hive (default)> select * from order_partiton where event_month='2014-06'; --- data in the new partition can now be queried
OK
order_partiton.order_no order_partiton.order_time order_partiton.event_month
10703007267488 2014-05-01 06:01:12.334+01 2014-06
10101043505096 2014-05-01 07:28:12.342+01 2014-06
10103043509747 2014-05-01 07:50:12.33+01 2014-06
10103043501575 2014-05-01 09:27:12.33+01 2014-06
10104043514061 2014-05-01 09:03:12.324+01 2014-06
Time taken: 1.484 seconds, Fetched: 5 row(s)
hive (default)> show partitions order_partiton; --- list the table's partitions
OK
partition
event_month=2014-05
event_month=2014-06
Time taken: 0.123 seconds, Fetched: 2 row(s)
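Instead of msck repair, a single directory can also be registered explicitly with ADD PARTITION; a sketch equivalent to what msck did above:
alter table order_partiton add if not exists partition (event_month='2014-06');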
Multi-Level Partitioned Tables
hive (d7_hive)> create table order_mulit_partiton(
> order_no string,
> order_time string
> )
> PARTITIONED BY (event_month string, step string)
> row format delimited fields terminated by '\t';
OK
Time taken: 0.893 seconds
hive (d7_hive)>
> load data local inpath '/home/hadoop/data/order_created.txt' into table order_mulit_partiton PARTITION (event_month='2014-05', step='1');
Loading data to table d7_hive.order_mulit_partiton partition (event_month=2014-05, step=1)
Partition d7_hive.order_mulit_partiton{event_month=2014-05, step=1} stats: [numFiles=1, numRows=0, totalSize=213, rawDataSize=0]
OK
Time taken: 1.556 seconds
HDFS:/user/hive/warehouse/d7_hive.db/order_mulit_partiton/event_month=2014-05/step=1 --- two levels of directories
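Queries can prune on both partition levels, e.g.:
select * from order_mulit_partiton where event_month='2014-05' and step='1';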
Write people from the same department into one partition
hive (default)> CREATE TABLE emp_partition(
> empno int,
> ename string,
> job string,
> mgr int,
> hiredate string,
> sal double,
> comm double
> )
> PARTITIONED BY (deptno int) --- partition by department number
> ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
OK
Time taken: 0.619 seconds
hive (default)> select * from data_emp;
OK
data_emp.empno data_emp.ename data_emp.job data_emp.mgr data_emp.hiredate data_emp.sal data_emp.comm data_emp.deptno
7369 SMITH CLERK 7902 1980-12-17 00:00:00 800.0 NULL 20
7499 ALLEN SALESMAN 7698 1981-02-20 00:00:00 1600.0 300.0 30
7521 WARD SALESMAN 7698 1981-02-22 00:00:00 1250.0 500.0 30
7566 JONES MANAGER 7839 1981-04-02 00:00:00 2975.0 NULL 20
7654 MARTIN SALESMAN 7698 1981-09-28 00:00:00 1250.0 1400.0 30
7698 BLAKE MANAGER 7839 1981-05-01 00:00:00 2850.0 NULL 30
7782 CLARK MANAGER 7839 1981-06-09 00:00:00 2450.0 NULL 10
7788 SCOTT ANALYST 7566 1982-12-09 00:00:00 3000.0 NULL 20
7839 KING PRESIDENT NULL 1981-11-17 00:00:00 5000.0 NULL 10
7844 TURNER SALESMAN 7698 1981-09-08 00:00:00 1500.0 0.0 30
7876 ADAMS CLERK 7788 1983-01-12 00:00:00 1100.0 NULL 20
7900 JAMES CLERK 7698 1981-12-03 00:00:00 950.0 NULL 30
7902 FORD ANALYST 7566 1981-12-03 00:00:00 3000.0 NULL 20
7934 MILLER CLERK 7782 1982-01-23 00:00:00 1300.0 NULL 10
Time taken: 2.109 seconds, Fetched: 14 row(s)
With a static partition, the partition value comes from the PARTITION clause, not from the data. The insert below therefore writes the dept-10 rows into the deptno=30 partition, which is why the next query reports them with deptno=30:
hive (default)> insert into table emp_partition partition(deptno=30)
              > select empno,ename,job,mgr,hiredate,sal,comm from data_emp where deptno=10;
hive (default)> select * from emp_partition where deptno=30;
OK
emp_partition.empno emp_partition.ename emp_partition.job emp_partition.mgr emp_partition.hiredate emp_partition.sal emp_partition.comm emp_partition.deptno
7782 CLARK MANAGER 7839 1981-06-09 00:00:00 2450.0 NULL 30
7839 KING PRESIDENT NULL 1981-11-17 00:00:00 5000.0 NULL 30
7934 MILLER CLERK 7782 1982-01-23 00:00:00 1300.0 NULL 30
Time taken: 0.849 seconds, Fetched: 3 row(s)
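A sketch of the presumably intended insert, with the static PARTITION value matching the WHERE filter:
insert into table emp_partition partition(deptno=10)
select empno,ename,job,mgr,hiredate,sal,comm from data_emp where deptno=10;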
hive (default)> create table emp_dynamic_partition( --- dynamic partition table
> empno int,
> ename string,
> job string,
> mgr int,
> hiredate string,
> sal double,
> comm double
> )
> PARTITIONED BY (deptno int)
> ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
OK
Time taken: 0.82 seconds
hive (default)> insert overwrite table emp_dynamic_partition partition(deptno)
> select empno,ename,job,mgr,hiredate,sal,comm,deptno from data_emp; --- load data; the partition value is taken from the trailing deptno column
hive (default)> select * from emp_dynamic_partition where deptno = 20;
OK
emp_dynamic_partition.empno emp_dynamic_partition.ename emp_dynamic_partition.job emp_dynamic_partition.mgr emp_dynamic_partition.hiredate emp_dynamic_partition.sal emp_dynamic_partition.comm emp_dynamic_partition.deptno
7369 SMITH CLERK 7902 1980-12-17 00:00:00 800.0 NULL 20
7566 JONES MANAGER 7839 1981-04-02 00:00:00 2975.0 NULL 20
7788 SCOTT ANALYST 7566 1982-12-09 00:00:00 3000.0 NULL 20
7876 ADAMS CLERK 7788 1983-01-12 00:00:00 1100.0 NULL 20
7902 FORD ANALYST 7566 1981-12-03 00:00:00 3000.0 NULL 20
Time taken: 0.116 seconds, Fetched: 5 row(s)
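If the dynamic insert above fails with a dynamic-partition error, these settings are usually required first (hive.exec.dynamic.partition.mode defaults to strict, which rejects inserts that supply no static partition value):
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;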