1 Prepare the data
rating_json
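Each line of the raw file is one JSON record. Judging from the json_tuple call in step 3, a record presumably looks like this (hypothetical values):
{"movie":"1193","rate":"5","time":"978300760","userid":"1"}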
2 Import into Hive
# Create the raw data table
create table rating_json(json string);
# Load the data
load data local inpath '/home/hadoop/data/rating.json' overwrite into table rating_json;
# Preview
select * from rating_json limit 10;
3 Parse the JSON with the json_tuple function
select json_tuple(json,'movie','rate','time','userid') as (movie_id,rate,time,user_id) from rating_json limit 10;
4 Build a wide table
Turn the Unix timestamp into year/month/day (plus hour/minute) columns in a new wide table; all of the later statistics run against this rating_width table.
create table rating_width as
select
movie_id,rate,time,user_id,
year(from_unixtime(cast(time as bigint))) as year,
month(from_unixtime(cast(time as bigint))) as month,
day(from_unixtime(cast(time as bigint))) as day,
hour(from_unixtime(cast(time as bigint))) as hour,
minute(from_unixtime(cast(time as bigint))) as minute,
from_unixtime(cast(time as bigint)) as ts
from
(
select
json_tuple(json,'movie','rate','time','userid') as (movie_id,rate,time,user_id)
from rating_json
) tmp
;
select * from rating_width limit 10;
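With the wide table in place, later statistics can be expressed directly against it. As an illustrative sketch (not from the original notes), counting ratings per year:
select year,count(*) as cnt from rating_width group by year order by year;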
5 Top-N
Window (analytic) functions
Requirement: for each gender, return the two records with the largest age.
Partition by gender first, then sort by age in descending order and take the top 2 rows; the hive_rownumber source table is sketched below.
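The hive_rownumber table is not defined in these notes; a minimal DDL consistent with the query (the ',' delimiter is an assumption) would be:
create table hive_rownumber(id int, age int, name string, gender string)
row format delimited fields terminated by ',';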
select id,age,name,gender,r
from
(
select id,age,name,gender,
ROW_NUMBER() over(PARTITION BY gender order by age desc) as r
from hive_rownumber
) t where t.r<=2;
6 beeline
beeline is the connection method recommended by Hive (the old CLI may eventually be phased out) and is used together with HiveServer2 (HS2).
[hadoop@hadoop001 bin]$ ./beeline --help
which: no hbase in (/home/hadoop/app/hive/bin:/home/hadoop/app/hadoop/bin:/home/hadoop/app/hadoop/sbin:/usr/home/hadoop/bin:/bin:/usr/java/jdk1.8.0_45/bin:/usr/local/bin:/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/sbin:/home/hadoop/bin)
Usage: java org.apache.hive.cli.beeline.BeeLine
-u <database url> the JDBC URL to connect to ##连接jdbc url
-r reconnect to last saved connect url (in conjunction with !save)
-n <username> the username to connect as ##用户名
-p <password> the password to connect as ##密码
-d <driver class> the driver class to use ##驱动
-i <init file> script file for initialization
-e <query> query that should be executed
-f <exec file> script file that should be executed
-w (or) --password-file <password file> the password file to read password from
--hiveconf property=value Use value for given property
--hivevar name=value hive variable name and value
This is Hive specific settings in which variables
can be set at session level and referenced in Hive
commands or queries.
--property-file=<property-file> the file to read connection properties (url, driver, user, password) from
--color=[true/false] control whether color is used for display
--showHeader=[true/false] show column names in query results
--headerInterval=ROWS; the interval between which heades are displayed
--fastConnect=[true/false] skip building table/column list for tab-completion
--autoCommit=[true/false] enable/disable automatic transaction commit
--verbose=[true/false] show verbose error messages and debug info
--showWarnings=[true/false] display connection warnings
--showNestedErrs=[true/false] display nested errors
--numberFormat=[pattern] format numbers using DecimalFormat pattern
--force=[true/false] continue running script even after errors
--maxWidth=MAXWIDTH the maximum width of the terminal
--maxColumnWidth=MAXCOLWIDTH the maximum width to use when displaying columns
--silent=[true/false] be more silent
--autosave=[true/false] automatically save preferences
--outputformat=[table/vertical/csv2/tsv2/dsv/csv/tsv] format mode for result display
--incrementalBufferRows=NUMROWS the number of rows to buffer when printing rows on stdout,
defaults to 1000; only applicable if --incremental=true
and --outputformat=table
--truncateTable=[true/false] truncate table column when it exceeds length
--delimiterForDSV=DELIMITER specify the delimiter for delimiter-separated values output format (default: |)
--isolation=LEVEL set the transaction isolation level
--nullemptystring=[true/false] set to true to get historic behavior of printing null as empty string
--maxHistoryRows=MAXHISTORYROWS The maximum number of rows to store beeline history.
--convertBinaryArrayToString=[true/false] display binary column data as string or as byte array
--help display this message
Example:
1. Connect using simple authentication to HiveServer2 on localhost:10000
$ beeline -u jdbc:hive2://localhost:10000 username password
2. Connect using simple authentication to HiveServer2 on hs.local:10000 using -n for username and -p for password
$ beeline -n username -p password -u jdbc:hive2://hs2.local:10012
3. Connect using Kerberos authentication with hive/localhost@mydomain.com as HiveServer2 principal
$ beeline -u "jdbc:hive2://hs2.local:10013/default;principal=hive/localhost@mydomain.com
4. Connect using SSL connection to HiveServer2 on localhost at 10000
$ beeline jdbc:hive2://localhost:10000/default;ssl=true;sslTrustStore=/usr/local/truststore;trustStorePassword=mytruststorepassword
5. Connect using LDAP authentication
$ beeline -u jdbc:hive2://hs2.local:10013/default <ldap-username> <ldap-password>
# Start HiveServer2
bin/hiveserver2
# Start beeline
bin/beeline
# Connect to the local database
[hadoop@hadoop001 bin]$ ./beeline -u jdbc:hive2://hadoop001:10000/d7_hive -n hadoop
To change HS2's default port, add this property to hive-site.xml:
<property>
    <name>hive.server2.thrift.port</name>
    <value>10000</value>
</property>
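For example, if the port were changed to a hypothetical 10010, the connection command becomes:
./beeline -u jdbc:hive2://hadoop001:10010/d7_hive -n hadoop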
Complex data types
arrays: ARRAY<data_type> (Note: negative values and non-constant expressions are allowed as of Hive 0.14.)
maps: MAP<primitive_type, data_type> (Note: negative values and non-constant expressions are allowed as of Hive 0.14.)
structs: STRUCT<col_name : data_type [COMMENT col_comment], ...>
union: UNIONTYPE<data_type, data_type, ...> (Note: Only available starting with Hive 0.7.0.)
array
# Create the table
create table hive_array(name string, work_locations array<string>)
row format delimited fields terminated by '\t'
COLLECTION ITEMS TERMINATED BY ',';
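Given the '\t' field delimiter and ',' collection delimiter above, a line of hive_array.txt presumably looks like this (hypothetical sample):
pingping	beijing,shanghai,tianjin,hangzhou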
# Load the data
load data local inpath '/home/hadoop/data/hive_array.txt' overwrite into table hive_array;
# Query
select * from hive_array;
How to query array elements
work_locations[index] : the index starts from zero
select name,work_locations[0] from hive_array;
select name,size(work_locations) from hive_array;
select * from hive_array where array_contains(work_locations,"beijing");
map
create table hive_map(id int,name string, members map<string,string>, age int)
row format delimited fields terminated by ','
COLLECTION ITEMS TERMINATED BY '#'
MAP KEYS TERMINATED BY ':';
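Given the delimiters above, a line of hive_map.txt presumably looks like this (hypothetical sample; the members value is the fragment quoted at the end of this subsection):
1,zhangsan,father:xiaoming#mother:xiaohuang#brother:xiaoxu,28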
load data local inpath '/home/hadoop/data/hive_map.txt' overwrite into table hive_map;
select * from hive_map;
select id,name,members['father'] from hive_map;
select id,name,members['father'] as father,members['sister'] as sister from hive_map;
# Get the map's keys as an array
select id,map_keys(members) from hive_map;
# Get the map's values as an array
select id,map_values(members) from hive_map;
select id,map_values(members),size(members) from hive_map;
Raw format of the members column: father:xiaoming#mother:xiaohuang#brother:xiaoxu — individual values are looked up with members['key'].
Struct
create table hive_struct(ip string, userinfo struct<name:string, age:int>)
row format delimited fields terminated by '#'
COLLECTION ITEMS TERMINATED BY ':';
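A line of hive_struct.txt presumably looks like this (hypothetical sample matching the '#' and ':' delimiters):
192.168.1.1#zhangsan:40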
load data local inpath '/home/hadoop/data/hive_struct.txt' overwrite into table hive_struct;
select * from hive_struct;
Struct fields are accessed with dot notation: userinfo.<field>
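For example, a sketch of selecting the struct fields defined in the DDL above:
select ip,userinfo.name,userinfo.age from hive_struct;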
Partition tables
Hive: partitioned tables (partition)
Big data is commonly partitioned by day or hour; in Hive a partitioned table is still one table, only its partitions live in different directories.
Single-level partition
create table order_partiton(
order_no string,
order_time string
)
PARTITIONED BY (event_month string)
row format delimited fields terminated by '\t';
load data local inpath '/home/hadoop/data/order_created.txt' into table order_partiton PARTITION (event_month='2014-05');
select * from order_partiton;
Storage layout on HDFS: tablename/partition_column=partition_value
order_partiton/event_month=2014-05
The table's real columns do not include the partition column; the partition column is only a directory name on HDFS.
select * from order_partiton where event_month='2014-05';
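The partitions of a table can be listed with:
show partitions order_partiton;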
In production, data is usually cleaned and landed in an HDFS directory first, and then that directory is loaded into the partition table:
load data inpath '' into table order_partiton PARTITION (event_month='2014-06');
Multi-level partitions
create table order_mulit_partiton(
order_no string,
order_time string
)
PARTITIONED BY (event_month string, step string)
row format delimited fields terminated by '\t';
load data local inpath '/home/hadoop/data/order_created.txt' into table order_mulit_partiton PARTITION (event_month='2014-05', step='1');
select * from order_mulit_partiton;
Requirement: write people from the same department into the same partition.
Static partition
CREATE TABLE emp_partition(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double
)
PARTITIONED BY (deptno int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
insert into table emp_partition partition(deptno=10)
select empno,ename,job,mgr,hiredate,sal,comm from ruozedata_emp where deptno=10;
select * from emp_partition where deptno=10;
Dynamic partition
create table emp_dynamic_partition(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double
)
PARTITIONED BY (deptno int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
Set non-strict mode first: set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table emp_dynamic_partition partition(deptno)
select empno,ename,job,mgr,hiredate,sal,comm,deptno from ruozedata_emp;
select * from emp_dynamic_partition where deptno=XXX;
How to refresh partition metadata efficiently
Partitions can be refreshed with: MSCK [REPAIR] TABLE table_name;
Syntax: ALTER TABLE table_name ADD [IF NOT EXISTS] PARTITION partition_spec [LOCATION 'location'][, PARTITION partition_spec [LOCATION 'location'], ...];
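An illustrative use of this syntax on the order_partiton table (the LOCATION path is a hypothetical example):
ALTER TABLE order_partiton ADD IF NOT EXISTS PARTITION (event_month='2014-07') LOCATION '/user/hive/warehouse/order_partiton/event_month=2014-07';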