Hive Basics

I. Hive HQL Syntax

1 DDL

1.1 DataBase

1.1.1 Create Database Syntax
CREATE DATABASE [IF NOT EXISTS] database_name
  [COMMENT database_comment]
  [LOCATION hdfs_path]
  [WITH DBPROPERTIES (property_name=property_value, ...)];
 	
e.g.
CREATE DATABASE IF NOT EXISTS hive
COMMENT 'the 1st database demo'
LOCATION 'hdfs://192.168.49.200:9000/user/hive/warehouse'
WITH DBPROPERTIES('name'='lixi');

CREATE DATABASE IF NOT EXISTS hive2
COMMENT 'the 2nd database demo'
WITH DBPROPERTIES('name'='rocklee');
1.1.2 Drop Database Syntax
DROP DATABASE [IF EXISTS] database_name [CASCADE];  # CASCADE drops the database even if it still contains tables
1.1.3 Alter Database
ALTER DATABASE database_name SET DBPROPERTIES (property_name=property_value, ...);   -- (Note: SCHEMA added in Hive 0.14.0) # change the database properties
 
ALTER DATABASE database_name SET OWNER [USER|ROLE] user_or_role;   -- (Note: Hive 0.13.0 and later; SCHEMA added in Hive 0.14.0) # change the owning user or role
  
ALTER DATABASE database_name SET LOCATION hdfs_path; -- (Note: Hive 2.2.1, 2.4.0 and later)
# change where the database's data is stored on HDFS
1.1.4 Switch Database
Use Database

USE database_name;
USE DEFAULT;

1.2 Table

1.2.1 Create Table
//1. 
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name  # TEMPORARY = temporary table, EXTERNAL = external table
(
    col_name data_type  [COMMENT col_comment],
    ...... ,
    ......
)
[COMMENT table_comment] # table comment
[PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)] # partitions
[CLUSTERED BY (col_name, col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS] # buckets
[
   [ROW FORMAT row_format] 
   [STORED AS file_format]
     | STORED BY 'storage.handler.class.name' [WITH SERDEPROPERTIES (...)]
] # row format and file format
[LOCATION hdfs_path] # where the table is stored
[TBLPROPERTIES (property_name=property_value, ...)] # table properties
[AS select_statement] # populate the new table with the result of this query

//2. 
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
  LIKE existing_table_or_view_name
  [LOCATION hdfs_path]; # create a new table that copies the schema (not the data) of an existing table or view
1.2.1.1 Create Table in Practice (Key Point)
//1. Managed (internal) table
CREATE TABLE IF NOT EXISTS `hive`.`t_user` (
`id` int COMMENT 'userid',
`name` string COMMENT 'username',
`age` int COMMENT 'user age'
) COMMENT 'user info'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','  -- field separator
STORED AS textfile   
;

LOAD DATA LOCAL INPATH '/home/user.txt' INTO TABLE `hive`.`t_user`;  -- `hive` is the database name and may be omitted

//2. External table
CREATE EXTERNAL TABLE IF NOT EXISTS `hive`.`t_user2` (
`id` int COMMENT 'userid',
`name` string COMMENT 'username',
`age` int COMMENT 'user age'
) COMMENT 'user info'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS textfile
LOCATION '/t_user2'  -- custom HDFS location for the external table
;

LOAD DATA LOCAL INPATH '/home/user.txt' INTO TABLE `hive`.`t_user2`;

//3. Summary: after dropping the managed table and the external table we find:
3.1 Dropping the managed table removes both the data and the metadata
3.2 Dropping the external table removes only the metadata; the data itself is kept
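
A quick way to verify the difference (a minimal sketch, assuming the two tables above were created and loaded): DESCRIBE FORMATTED shows the table type, and listing the external table's directory after the DROP shows that its files are still on HDFS.
DESC FORMATTED `hive`.`t_user`;   -- Table Type: MANAGED_TABLE
DESC FORMATTED `hive`.`t_user2`;  -- Table Type: EXTERNAL_TABLE
DROP TABLE `hive`.`t_user2`;
dfs -ls /t_user2;                 -- the data files are still there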

//4. Temporary table
CREATE TEMPORARY TABLE IF NOT EXISTS `hive`.`t_user3` (
`id` int COMMENT 'userid',
`name` string COMMENT 'username',
`age` int COMMENT 'user age'
) COMMENT 'user info'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS textfile
;

LOAD DATA LOCAL INPATH '/home/user.txt' INTO TABLE `hive`.`t_user3`;

Conclusions:
4.1 The temporary table has no entry in the metastore and its data is not kept under the warehouse directory on HDFS; it exists only for the current session.
4.2 Because the table lives only in the session, it is removed as soon as the Hive client disconnects from the server.


//5. Difference between LIKE and AS SELECT
CREATE TABLE IF NOT EXISTS `hive`.`t_user2`
AS
SELECT * FROM `hive`.`t_user`; # creates a new directory and new data files (copies the data)

CREATE EXTERNAL TABLE IF NOT EXISTS `hive`.`t_user3`
LIKE `hive`.`t_user`
LOCATION '/user/hive/warehouse/hive.db/t_user'; # points at the original table's data, so if the original table's data disappears this table is affected too (without LOCATION, only the table schema is copied)
1.2.2 Alter Table (For Reference)
1. Rename a table
ALTER TABLE table_name RENAME TO new_table_name;

2. Alter Table Properties
ALTER TABLE table_name SET TBLPROPERTIES table_properties;
table_properties:
(property_name = property_value, property_name = property_value, ... )

2.1 Change the table comment
ALTER TABLE table_name SET TBLPROPERTIES ('comment' = new_comment);
1.2.3 Alter Columns
//1. Change a column

ALTER TABLE table_name CHANGE [COLUMN] col_old_name col_new_name column_type
  [COMMENT col_comment] [FIRST|AFTER column_name] [CASCADE];

e.g.
ALTER TABLE `hive`.`t_1` CHANGE COLUMN `id` `uid` string COMMENT 'this is uid';

ALTER TABLE `hive`.`t_1` CHANGE COLUMN `uname` `uname` string COMMENT 'this is uname' FIRST;

ALTER TABLE `hive`.`t_1` CHANGE COLUMN `age` `age` int COMMENT 'this is age' AFTER `uname`;

tip: changing a column's type or position does not change the underlying data files, but because the metadata changes, query results can be affected (the declared types no longer line up with the data, so values may fail to display)


//2. Add or replace columns
ALTER TABLE table_name ADD|REPLACE COLUMNS (col_name data_type [COMMENT col_comment], ...)
  [CASCADE] 
  
e.g.
ALTER TABLE `hive`.`t_1` ADD COLUMNS (`salary` double, `common` double); # add columns
ALTER TABLE `hive`.`t_1` REPLACE COLUMNS (`salary` double, `common` double); # replace all existing columns with the specified ones



1.2.4 Load Data
//1. load
LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
 
e.g.
LOAD DATA LOCAL INPATH '/home/user.txt' INTO TABLE `hive`.`t_1`;

LOAD DATA INPATH '/user.txt' INTO TABLE `hive`.`t_1`;

tip:
With LOCAL, the data is copied from the local file system into the table directory.
Without LOCAL, the data is moved (cut) from HDFS into the table directory.
LOAD can only be used when the table is stored as textfile.

//2. insert
insert into t_user3 values(4, 'narudo', 18);
tip:
This traditional style of insert works for a single row, but every row becomes one small file on HDFS and each insert runs a MapReduce job before producing a result. It is slow, it creates lots of small files, and it is essentially never used in practice.

INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]]  -- OVERWRITE replaces the existing data
select_statement1 FROM from_statement;


INSERT INTO TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)] select_statement1 FROM from_statement;

e.g.
CREATE TABLE `hive`.`t_user4` LIKE `hive`.`t_user3`;
INSERT OVERWRITE TABLE `hive`.`t_user4` IF NOT EXISTS
SELECT * FROM `hive`.`t_user3`;
1.2.5 Multi-table Insert
Scan one source table once and insert the result into several tables.
FROM from_statement
INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]] select_statement1
[INSERT OVERWRITE TABLE tablename2 [PARTITION ... [IF NOT EXISTS]] select_statement2]
[INSERT INTO TABLE tablename2 [PARTITION ...] select_statement2] ...;

e.g.
CREATE TABLE `hive`.`t_user5` LIKE `hive`.`t_user3`;
CREATE TABLE `hive`.`t_user6` LIKE `hive`.`t_user3`;

FROM `hive`.`t_user3`
INSERT OVERWRITE TABLE `hive`.`t_user5` IF NOT EXISTS SELECT * 
INSERT INTO TABLE `hive`.`t_user6` IF NOT EXISTS SELECT *  -- INTO appends instead of overwriting
;
1.2.6 Write Data
INSERT OVERWRITE [LOCAL] DIRECTORY directory1
  [ROW FORMAT row_format] [STORED AS file_format] 
  SELECT ... FROM ...
e.g.
INSERT OVERWRITE DIRECTORY '/write'
ROW FORMAT DELIMITED
SELECT * FROM `hive`.`t_user5`;  -- writes the table's data out to a directory on HDFS
1.2.7 Drop / Truncate Table
#1 Drop a table (removes data and metadata for managed tables)
DROP TABLE table_name;
#2 Remove all data but keep the table definition
TRUNCATE TABLE table_name;

II. Hive Partitions

1 What a Partition Is

Put simply, a Hive partition is just a subdirectory created under the table directory.

1.2 Creating a Partitioned Table

CREATE TABLE IF NOT EXISTS `hive`.`t_part` (
`id` int COMMENT 'userid',
`name` string COMMENT 'username',
`age` int COMMENT 'user age'
) COMMENT 'user info'
PARTITIONED BY (`birth` string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS textfile
;

LOAD DATA LOCAL INPATH '/home/user.txt' INTO TABLE `hive`.`t_part` PARTITION (`birth`='2020-07-01');
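
The partition shows up both in the metadata and as a subdirectory under the table directory; a quick check (a sketch, assuming the default warehouse path):
SHOW PARTITIONS `hive`.`t_part`;                 -- birth=2020-07-01
dfs -ls /user/hive/warehouse/hive.db/t_part;     -- .../t_part/birth=2020-07-01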




1.2.1 Three-level Partitions
CREATE TABLE IF NOT EXISTS `hive`.`t_part2` (
`id` int COMMENT 'userid',
`name` string COMMENT 'username',
`age` int COMMENT 'user age'
) COMMENT 'user info'
PARTITIONED BY (`year` string, `month` string, `day` string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS textfile
;

LOAD DATA LOCAL INPATH '/home/user.txt' INTO TABLE `hive`.`t_part2` PARTITION (`year`='2020', `month`='01', `day`='01');

tip:
The examples above all use static partitions: with static partitioning you specify the partition values yourself.
1.2.2 Common Partition Commands
show partitions tablename; # list the table's partitions
1.2.3 Alter Partitions: Add a Partition
//1. 语法
ALTER TABLE table_name ADD [IF NOT EXISTS] PARTITION partition_spec [LOCATION 'location'][, PARTITION partition_spec [LOCATION 'location'], ...];
 
partition_spec:
  : (partition_column = partition_col_value, partition_column = partition_col_value, ...)

e.g. add a partition
ALTER TABLE `hive`.`t_part` ADD PARTITION (birth='2008-08-08') location '/user/hive/warehouse/hive.db/t_part';
1.2.4 Drop a Partition
ALTER TABLE `hive`.`t_part` DROP PARTITION(birth='2020-07-01');

tip: this removes the partition's metadata as well as its data in HDFS
1.2.5 Change a Partition's HDFS Location
ALTER TABLE table_name [PARTITION partition_spec] SET LOCATION "new location";

ALTER TABLE `hive`.`t_part` PARTITION(birth='2020-07-01')
SET LOCATION 'hdfs://qphone01:9000/birth=2020-07-01';

tip:
This only changes the path recorded in the metadata for that partition; it does not create the new directory in HDFS.

2 Dynamic Partitions

2.1 What Is a Dynamic Partition

Partitioning automatically based on the values of existing columns is called dynamic partitioning.
Hive has two modes: strict and nonstrict. In strict mode, a dynamic-partition insert must include at least one static partition. The relevant settings can also be applied per session, as shown in the sketch below, instead of in hive-site.xml.
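
A minimal sketch of the session-level equivalent of the hive-site.xml settings in 2.2:
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;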

2.2 Configure hive-site.xml

<configuration>
<property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://qphone02:3306/hive</value>
    <description>JDBC connect string for a JDBC metastore</description>
</property>

<property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
    <description>Driver class name for a JDBC metastore</description>
</property>

<property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>root</value>
    <description>username to use against metastore database</description>
</property>

<property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>123456</value>
    <description>password to use against metastore database</description>
</property>
<property>
    <name>hive.exec.mode.local.auto</name>
    <value>true</value>
 <description>Let Hive decide automatically whether to run a query in local mode</description>
</property>

<property>
    <name>hive.exec.mode.local.auto.inputbytes.max</name>
    <value>134217728</value>
    <description>
        When hive.exec.mode.local.auto=true, the input size must be smaller than this value for local mode.
    </description>
</property>

<property>
    <name>hive.exec.mode.local.auto.input.files.max</name>
    <value>4</value>
    <description>
        When hive.exec.mode.local.auto=true, the number of tasks must be smaller than this for local mode.
    </description>
</property>
<property>
        <name>hive.exec.dynamic.partition</name>
        <value>true</value>
        <description>Whether dynamic partitions are allowed in DML/DDL.</description>
</property>
<property>
        <name>hive.exec.dynamic.partition.mode</name>
        <value>nonstrict</value>
        <description>
                In strict mode the user must specify at least one static partition, to guard against accidentally overwriting every partition.
        In nonstrict mode all partitions may be dynamic.
        Allowed values: strict, nonstrict
        </description>
</property>

<property>
    <name>hive.exec.max.dynamic.partitions</name>
    <value>1000</value>
    <description>Maximum total number of dynamic partitions that may be created.</description>
</property>

 <property>
    <name>hive.exec.max.dynamic.partitions.pernode</name>
    <value>100</value>
    <description>Maximum number of dynamic partitions that may be created on each mapper/reducer node.</description>
 </property>
</configuration>

2.3 Dynamic Partition Demo

Dynamic partitions cannot be populated with LOAD, so first create a source table and then copy its data into the partitioned table.
//1. Create an ordinary source table
CREATE TABLE IF NOT EXISTS `hive`.`t_dynamic_source` (
`id` int COMMENT 'userid',
`name` string COMMENT 'username',
`age` int COMMENT 'user age',
`birth` string COMMENT 'user birth'
) COMMENT 'dynamic source'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS textfile
;

//2. Create a partitioned table
CREATE TABLE IF NOT EXISTS `hive`.`t_dynamic_part` (
`id` int COMMENT 'userid',
`name` string COMMENT 'username',
`age` int COMMENT 'user age'
) COMMENT 'dynamic part'
PARTITIONED BY (`birth` string)  -- birth is the partition column, so it must not appear in the column list above
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS textfile
;

//3.1 Load data into the source table t_dynamic_source
LOAD DATA LOCAL INPATH '/root/part.txt' INTO TABLE `hive`.`t_dynamic_source`;
//3.2 Insert the source table's data into the partitioned table: dynamic partitioning
FROM `hive`.`t_dynamic_source`
INSERT OVERWRITE TABLE `hive`.`t_dynamic_part` PARTITION(`birth`)
SELECT *;

-- equivalent form (OVERWRITE replaces existing data, INTO appends):
INSERT OVERWRITE|INTO TABLE `hive`.`t_dynamic_part` PARTITION(`birth`)
SELECT * FROM `hive`.`t_dynamic_source`;

2.4 Dynamic Partitions in Strict Mode (Mixed Partitions)

1. First, switch hive.exec.dynamic.partition.mode back to strict in hive-site.xml
2. Create a new source table
CREATE TABLE IF NOT EXISTS `hive`.`t_dynamic_source2` (
`id` int COMMENT 'userid',
`name` string COMMENT 'username',
`age` int COMMENT 'user age',
`month` string COMMENT 'user month'
) COMMENT 'dynamic source2'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS textfile
;
3. Create a new partitioned table
CREATE TABLE IF NOT EXISTS `hive`.`t_dynamic_part2` (
`id` int COMMENT 'userid',
`name` string COMMENT 'username',
`age` int COMMENT 'user age'
) COMMENT 'dynamic part'
PARTITIONED BY (`year` string, `month` string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS textfile
;

//4.1 Load data into the source table t_dynamic_source2
LOAD DATA LOCAL INPATH '/root/part2.txt' INTO TABLE `hive`.`t_dynamic_source2`;
//4.2 Insert the source table's data into the partitioned table: mixed static + dynamic partitions
FROM `hive`.`t_dynamic_source2`
INSERT OVERWRITE TABLE `hive`.`t_dynamic_part2` PARTITION(`year`='2020', `month`)  -- `year` is static (hard-coded), `month` is dynamic
SELECT * where `id` = 4;

III. Hive Bucketing

Note: bucketed tables cannot be populated with LOAD.

SELECT Syntax

SELECT [ALL | DISTINCT] select_expr, select_expr, ...
  FROM table_reference
  [WHERE where_condition]
  [GROUP BY col_list]
  [ORDER BY col_list]
  [CLUSTER BY col_list
    | [DISTRIBUTE BY col_list] [SORT BY col_list]
  ]
 [LIMIT [offset,] rows]

1 Introduction

Put simply, Hive bucketing can be compared directly with partitioning in MapReduce: a MapReduce partition corresponds to a Hive bucket. The biggest benefit of bucketing is that it helps us reduce Cartesian products, i.e. it makes joins cheaper.

2 Using Buckets

2.1 Data

1,lixi,man,18,java
2,lixi2,man,18,java
3,lixi3,man,18,java
4,lixi4,man,18,java
5,lixi5,man,18,java
6,lixi6,man,18,java
7,lixi7,man,18,java
8,lixi8,man,18,java
9,lixi9,man,18,java
10,lixi11,man,18,java
11,lixi12,man,18,java
12,lixi13,man,18,java
13,lixi14,man,18,java
14,lixi15,man,18,java
15,lixi16,man,18,java
16,lixi17,man,18,java
17,lixi18,man,18,java
18,lixi19,man,18,j

2.2 Testing Buckets

1. Create the source table
create table `hive`.`t_stu`(
`sno` int,
`sname` string,
`sex` string,
`sage` int,
`sclass` string)
row format delimited
fields terminated by ','
stored as textfile;

load data local inpath '/root/stu.txt' into table `hive`.`t_stu`;

2. Bucketed queries
- select * from t_stu cluster by(sno);
- select * from t_stu distribute by(sno) sort by (sno);
tip: the two statements above are equivalent

3. Manually set the number of reducers
set mapreduce.job.reduces=4;

4. Create a bucketed table
create table t_buk7(
sno int,
sname string,
sex string,
sage int,
sclass string)
clustered by (sno) sorted by (sname desc) into 6 buckets
row format delimited
fields terminated by ','
stored as textfile;

5. Load data into the bucketed table
insert into t_buk7
select * from hive.t_stu distribute by(sno) sort by (sname desc);
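
Note: on Hive 1.x the insert above only produces properly bucketed files if bucketing is enforced first (or if the reducer count matches the bucket count); a hedged sketch of the check, assuming the table lives in the hive database under the default warehouse path:
set hive.enforce.bucketing=true;               -- set before running the insert
dfs -ls /user/hive/warehouse/hive.db/t_buk7;   -- expect one file per bucket: 000000_0 ... 000005_0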

In Hive, only the ORC file format supports ACID (transactions).

3 Sampling Queries (For Reference)

3.1 Sampling by Bucket

//1. Sample by bucket
TABLESAMPLE (BUCKET x OUT OF y [ON colname])

e.g.
SELECT * FROM t_buk TABLESAMPLE(BUCKET 1 OUT OF 4 ON sno) s;
tip:
hashes the table into 4 buckets on sno and returns the rows in the 1st bucket

3.2 Sampling by Percentage

//2. Sample by percentage
TABLESAMPLE (n PERCENT)

e.g.
SELECT * FROM t_buk TABLESAMPLE (0.5 PERCENT);

3.3 Sampling by Data Size

SELECT * FROM t_buk TABLESAMPLE(2b);

3.4 Sampling by Rows

SELECT * FROM t_buk TABLESAMPLE(10 ROWS);

IV. DML

1 Load Data

//1. load
LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
 
e.g.
LOAD DATA LOCAL INPATH '/home/user.txt' INTO TABLE `hive`.`t_1`;

LOAD DATA INPATH '/user.txt' INTO TABLE `hive`.`t_1`;

tip:
With LOCAL, the data is copied from the local file system into the table directory.
Without LOCAL, the data is moved (cut) from HDFS into the table directory.
LOAD can only be used when the table is stored as textfile.

//2. insert
insert into t_user3 values(4, 'narudo', 18);
tip:
This traditional style of insert works for a single row, but every row becomes one small file on HDFS and each insert runs a MapReduce job before producing a result. It is slow, it creates lots of small files, and it is essentially never used in practice.

INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]] 
select_statement1 FROM from_statement;
INSERT INTO TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)] select_statement1 FROM from_statement;

e.g.
CREATE TABLE `hive`.`t_user4` LIKE `hive`.`t_user3`;
INSERT OVERWRITE TABLE `hive`.`t_user4` IF NOT EXISTS
SELECT * FROM `hive`.`t_user3`;

2 Multi-table Insert

FROM from_statement
INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]] select_statement1
[INSERT OVERWRITE TABLE tablename2 [PARTITION ... [IF NOT EXISTS]] select_statement2]
[INSERT INTO TABLE tablename2 [PARTITION ...] select_statement2] ...;

e.g.
CREATE TABLE `hive`.`t_user5` LIKE `hive`.`t_user3`;
CREATE TABLE `hive`.`t_user6` LIKE `hive`.`t_user3`;

FROM `hive`.`t_user3`
INSERT OVERWRITE TABLE `hive`.`t_user5` IF NOT EXISTS SELECT * 
INSERT OVERWRITE TABLE `hive`.`t_user6` IF NOT EXISTS SELECT * 
;

3 Write Data

INSERT OVERWRITE [LOCAL] DIRECTORY directory1
  [ROW FORMAT row_format] [STORED AS file_format] 
  SELECT ... FROM ...
e.g.
INSERT OVERWRITE DIRECTORY '/write'
ROW FORMAT DELIMITED
SELECT * FROM `hive`.`t_user5`;

4 Update/Delete (For Reference; only usable on ORC tables)

5 Join

5.1 Syntax

SELECT [ALL | DISTINCT] select_expr, select_expr, ...  -- DISTINCT removes duplicate rows
  FROM table_reference
  [WHERE where_condition]
  [GROUP BY col_list]
  [ORDER BY col_list]
  [CLUSTER BY col_list
    | [DISTRIBUTE BY col_list] [SORT BY col_list]
  ]
 [LIMIT [offset,] rows]
 [UNION/UNION ALL]
 
 join_table:
    table_reference [INNER] JOIN table_factor [join_condition]
  | table_reference {LEFT|RIGHT|FULL} [OUTER] JOIN table_reference join_condition
  | table_reference LEFT SEMI JOIN table_reference join_condition
  | table_reference CROSS JOIN table_reference [join_condition]

5.2 Data

//1. emp
1,lixi,1
2,zhangshuai,2
3,cuikai,3
4,xiaojianxiong,4
6,ranldo,10

//2. dept
1,Java
2,Bigdata
3,UI
4,HTML5
5,Python

5.3 Tests

//1. Create the tables
create table if not exists t_emp(
eid int,
ename string,
did int
)
row format delimited
fields terminated by ','
;

create table if not exists t_dept(
did int,
dname string
)
row format delimited
fields terminated by ','
;

//2. Load data
load data local inpath '/home/emp.txt' into table t_emp;
load data local inpath '/home/dept.txt' into table t_dept;


//3. Inner join
select
e.eid,
e.ename,
d.dname
from
t_emp e
join
t_dept d
on
e.did = d.did
;

//4. Left join
select
e.eid,
e.ename,
d.dname
from
t_emp e
left join
t_dept d
on
e.did = d.did
;

//5. Right join
select
e.eid,
e.ename,
d.dname
from
t_emp e
right join
t_dept d
on
e.did = d.did
;

//6. Full outer join
select
e.eid,
e.ename,
d.dname
from
t_emp e
full join
t_dept d
on
e.did = d.did
;

//7. Left semi join: returns rows from the left table that have a match; columns from the right table cannot appear in the SELECT list (doing so raises an error). An equivalent IN rewrite is sketched after this query.
select
e.eid,
e.ename,
e.did
from
t_emp e
left semi join
t_dept d
on
e.did = d.did
;
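
LEFT SEMI JOIN behaves like an IN / EXISTS subquery; an equivalent rewrite (a sketch):
select
e.eid,
e.ename,
e.did
from
t_emp e
where
e.did in (select d.did from t_dept d)
;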

6 Group By

6.1 Prepare the Data

//1. emp
1,lixi,1
2,zhangshuai,2
3,cuikai,2
4,xiaojianxiong,2
6,ranldo,1

//2. dept
1,Java
2,Bigdata
3,UI
4,HTML5
5,Python

6.2 Grouped Query: Head Count per Department

SELECT
d.did,
COUNT(*)
FROM
t_emp e
JOIN
t_dept d
ON
e.did = d.did
GROUP BY
d.did
;

SELECT
d.dname,
COUNT(*)
FROM
t_emp e
JOIN
t_dept d
ON
e.did = d.did
group by
d.did, d.dname
;

tip:
SQL that runs fine in MySQL may fail in Hive: in a grouped Hive query, only the GROUP BY columns (and aggregates) may appear in the SELECT list. Non-grouped columns cannot be selected directly; see the sketch below for a workaround.
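
If a non-grouped column really is needed in the output, one workaround is to wrap it in an aggregate (a sketch against the tables above):
SELECT
d.did,
max(d.dname) dname,
COUNT(*)
FROM
t_emp e
JOIN
t_dept d
ON
e.did = d.did
GROUP BY
d.did
;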

7 Order By vs. Sort By

- sort by: partial ordering; only guarantees that each individual reducer's output is sorted
e.g.
select * from a
distribute by (id) sort by(name);

- order by: global ordering; guarantees one total order across all reducers
e.g.
select * from a order by (name);
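
The difference only shows up when there is more than one reducer; a sketch using the t_stu table from the bucketing section:
set mapreduce.job.reduces=3;
select * from t_stu sort by (sage);    -- each reducer's output is sorted, but the overall result is not
select * from t_stu order by (sage);   -- one global order (forces a single reducer)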

V. Data Types

1 Array

1.1 Data

重庆市	渝中区,江北区,南岸区,沙坪坝区,九龙坡区,渝北区,大渡口区,巴南区,北碚区
浙江省	杭州市,绍兴市,宁波市,台州市

1.2 Usage

create table if not exists t_arr(
province string,
city array<string>
)
row format delimited 
fields terminated by '\t'
COLLECTION ITEMS TERMINATED BY ','
;

load data local inpath '/home/arr.txt' into table t_arr;

1.3 Summary

1. Declare the column type as array<string> when creating the table
2. COLLECTION ITEMS TERMINATED BY ',' specifies the separator between array elements
3. Access an element with an index, e.g. city[0], in queries (see the example below)
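
For example (a sketch against the t_arr table above; size() returns the number of elements):
select province, city[0], size(city) from t_arr;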

2 Map

2.1 Data

马蓉	皮鞭:2,蜡烛:1,手铐:1 
宋喆	红酒:1,花生:1,皮皮虾:10

2.2 Usage

create table if not exists t_map(
uname string,
item_id map<string, int>
)
row format delimited
fields terminated by '\t'
collection items terminated by ','
map keys terminated by ':';

load data local inpath '/home/t_map.txt' into table t_map;

select uname, item_id['皮鞭'] from t_map;

2.3 Summary

1. Declare the map type at creation time: map<string, int>, where the first type is the key type and the second is the value type
2. A map needs two separators:
collection items terminated by ',' separates the map entries
map keys terminated by ':' separates each key from its value
3. To read a map value, use item_id['key'], which returns the value for that key (see the example below)
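
Related helpers (a sketch against the t_map table above): map_keys() and map_values() return the keys and values as arrays, and size() returns the number of entries.
select uname, map_keys(item_id), map_values(item_id), size(item_id) from t_map;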

3 Struct

3.1 Data

林志玲	台湾省,台北市,台中街道
杨幂	上海市,浦东区,xxx街道

3.2 Usage

create table if not exists t_str(
uname string,
item_id struct<province:string, city:string, street:string>
)
row format delimited
fields terminated by '\t'
collection items terminated by ',';

load data local inpath '/home/str.txt' into table t_str;

Test
select uname, item_id.province, item_id.city, item_id.street from t_str;

3.3 Summary

1. Declare the struct type at creation time: struct<province:string, city:string, street:string>
2. Struct data looks a lot like array data, but a struct is more like an object
3. As with arrays, only collection items terminated by ',' is needed; it separates the struct's fields
4. Read a struct field with column_name.field_name, e.g. item_id.province

4 explode: Turning One Row into Multiple Rows

hive> select explode(city) from t_arr;
OK
渝中区
江北区
南岸区
沙坪坝区
九龙坡区
渝北区
大渡口区
巴南区
北碚区
杭州市
绍兴市
宁波市
台州市

tip:
This function turns one array row into multiple rows.
However, it cannot be combined with other columns in the SELECT list:
hive> select province,explode(city) from t_arr;
FAILED: SemanticException [Error 10081]: UDTF's are not supported outside the SELECT clause, nor nested in expressions
hive>

//2. map
hive> select explode(item_id) as (itemname, count) from t_map;
OK
皮鞭    2
蜡烛    1
手铐    1
红酒    1
花生    1
皮皮虾  10

5 Lateral View

5.1 Syntax

LATERAL VIEW udtf(expression) tableAlias AS columnAlias (',' columnAlias)*
fromClause: FROM baseTable (lateralView)*

e.g. with the array table
SELECT province, ccity
FROM t_arr LATERAL VIEW explode(city) scity AS ccity;  -- scity is the virtual view alias, ccity is the virtual column alias

hive> SELECT uname, itemname, count
    > FROM t_map LATERAL VIEW explode(item_id) item_tab AS itemname, count;
OK
马蓉    皮鞭    2
马蓉    蜡烛    1
马蓉    手铐    1
宋喆    红酒    1
宋喆    花生    1
宋喆    皮皮虾  10

VI. Hive Functions

1 Introduction

Built-in functions
User-defined functions:
UDF : user-defined function (key point)
UDAF : user-defined aggregate function
UDTF : user-defined table-generating function, e.g. explode

2 Ranking Functions (Window Functions)

2.1 Hive's Three Ranking Functions

row_number(): no ties; rows with equal values still get consecutive, distinct ranks
rank(): ties share a rank, and the rank(s) immediately after a tie are skipped
dense_rank(): ties share a rank, and no rank is skipped

2.2 Data

01 gp1802 84
02 gp1801 84
03 gp1802 84
04 gp1802 84
05 gp1801 81
06 gp1802 81
07 gp1802 81
08 gp1801 81
09 gp1802 81
10 gp1802 81
11 gp1803 81
12 gp1802 89
13 gp1802 89
14 gp1802 89
15 gp1803 89
16 gp1802 91
17 gp1802 97
18 gp1802 72
19 gp1804 73
20 gp1802 77
21 gp1802 71
22 gp1802 61
23 gp1803 65
24 gp1804 67
25 gp1804 62
26 gp1804 61
27 gp1802 91
28 gp1801 93
29 gp1802 91
30 gp1804 92
31 gp1803 41
32 gp1802 41
33 gp1802 42

2.3 Create the Table

create table t_class(
sid string,
sclass string,
score int
)
row format delimited
fields terminated by ' '
;

load data local inpath '/home/class.txt' into table hive.t_class;

2.4 Group by Class and Rank by Score (per-class scores, descending)

select
sclass,
sid,
score,
row_number() over(distribute by(sclass) sort by (score desc)) rank
from 
t_class
;

OK
gp1801  28      93      1
gp1801  02      84      2
gp1801  05      81      3
gp1801  08      81      4
gp1802  17      97      1
gp1802  29      91      2
gp1802  27      91      3
gp1802  16      91      4
gp1802  12      89      5
gp1802  13      89      6
gp1802  14      89      7
gp1802  01      84      8
gp1802  04      84      9
gp1802  03      84      10
gp1802  10      81      11
gp1802  06      81      12
gp1802  07      81      13
gp1802  09      81      14
gp1802  20      77      15
gp1802  18      72      16
gp1802  21      71      17
gp1802  22      61      18
gp1802  33      42      19
gp1802  32      41      20
gp1803  15      89      1
gp1803  11      81      2
gp1803  23      65      3
gp1803  31      41      4
gp1804  30      92      1
gp1804  19      73      2
gp1804  24      67      3
gp1804  25      62      4
gp1804  26      61      5

2.5 Top 3 per Class

select
clazz.sclass,
clazz.sid,
clazz.score,
clazz.rank
from
(
select
sclass,
sid,
score,
row_number() over(distribute by(sclass) sort by (score desc)) rank
from 
t_class
) clazz
where
clazz.rank <= 3;

2.6 rank and dense_rank

select
sclass,
sid,
score,
row_number() over(distribute by(sclass) sort by (score desc)) row_rank,
rank() over(distribute by(sclass) sort by (score desc)) rank,
dense_rank() over(distribute by(sclass) sort by (score desc)) dense_rank
from 
t_class
;
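
For gp1801 (scores 93, 84, 81, 81 in the data from 2.2), the three functions only differ on the tied 81s; a sketch of the expected output (tie order may vary):
gp1801  28      93      1       1       1
gp1801  02      84      2       2       2
gp1801  05      81      3       3       3
gp1801  08      81      4       3       3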

3 Built-in Functions

3.1 Commands

show functions;            -- list all of Hive's built-in functions
desc function func_name;   -- show how to use a specific function

3.2 Numeric

1. round : round a number to d decimal places
round(x[, d]) - round x to d decimal places
e.g.

hive> select round(3.55);
OK
4.0

2. floor: round a number down to the nearest integer

floor(x) - Find the largest integer not greater than x

hive> select floor(3.99);
OK
3

3. ceil: round a number up to the nearest integer
ceil(x) - Find the smallest integer not smaller than x
hive> select ceil(3.11);
OK
4

4. rand: return a random number between 0 and 1
rand([seed]) - Returns a pseudorandom number between 0 and 1
hive> select rand();
OK
0.08310287888355339

3.3 Math

1. abs: absolute value
abs(x) - returns the absolute value of x

2. pow: raise a number to a power
pow(x1, x2) - raise x1 to the power of x2
hive> select pow(2,3);
OK
8.0

3.4 Conditional

1. IF

IF(expr1,expr2,expr3) - If expr1 is TRUE (expr1 <> 0 and expr1 <> NULL) then IF() returns expr2; otherwise it returns expr3. IF() returns a numeric or string value, depending on the context in which it is used.

e.g.
hive> select if(1=1,"basketball", "football");
OK
basketball

2. case when

CASE a WHEN b THEN c [WHEN d THEN e]* [ELSE f] END - When a = b, returns c; when a = d, return e; else return f

hive> select case 1 when 1 then "abc" else "def" end;
OK
abc


3.5 Date Functions

1. current_date() : return the current date
e.g.
select current_date();

2. current_timestamp() : return the current timestamp (date + time)
e.g.
hive> select  current_timestamp();
OK
2020-07-03 14:13:01.915

3. add_months(start_date, num_months) : add the given number of months to a date
e.g.
hive> select add_months(current_date(), 6);
OK
2021-01-03

4. date_add(start_date, num_days) : add the given number of days to a date
e.g.

hive> select date_add(current_date(), 6); 
OK
2020-07-09

5. date_sub(start_date, num_days) : subtract the given number of days from a date
e.g.
hive> select date_sub(current_date(), 1);
OK
2020-07-02

6. next_day(start_date, day_of_week) : starting from the given date, return the date of the next given weekday
e.g.
hive> select next_day(current_date(), 'SATURDAY');
OK
2020-07-04

7. dayofmonth(date) : return the day of the month
e.g.
hive> select dayofmonth(current_date());
OK
3

8. weekofyear(date) : return which week of the year the date falls in
e.g.

hive> select weekofyear(current_date());
OK
27

9. minute/hour/day/month/year : extract the minute/hour/day/month/year from a date
e.g.

hive> select minute(current_timestamp());
OK
23

10. date_format(date/timestamp/string, fmt) : format a date using the given pattern
e.g.
hive> select date_format(current_timestamp(), 'yyyy/MM/dd:HH=mm=ss');
OK
2020/07/03:14=25=59

11. datediff(date1, date2) : return the number of days between two dates
e.g.

hive> select datediff('2020-07-03', '2020-07-02');
OK
1

12. to_unix_timestamp(date[, pattern]) : convert a date to a Unix timestamp
e.g.
hive> select to_unix_timestamp(current_date());
OK
1593705600

13. from_unixtime(unix_time, format) : convert a Unix timestamp to a date string
e.g.
hive> select from_unixtime(1593705600);
OK
2020-07-03 00:00:00

14. to_date(datetime) : extract the date part of a datetime
hive> select to_date('2011-11-11 11:11:11');
OK
2011-11-11

3.6 String

1. instr : return the position (1-based) of the first occurrence of substr in str
instr(str, substr) - Returns the index of the first occurance of substr in str
e.g.
select instr('lixi', 'l');

2. length : return the length of a string
e.g.
select length('lixi')

3. substr/substring : extract a substring; in the example, start at position 4 and take 5 characters
e.g.
select substr('lixirocklee', 4, 5);

4. concat : concatenate n strings
e.g.
select concat('www', 'baidu', 'com')


5. concat_ws : concatenate n strings with the given separator
e.g.
select concat_ws(".", "www", "baidu", "com")

3.7 Collection / Aggregate Functions

1. index(arr, n) - return the element arr[n]
e.g.
select index(array(1,2,3), 1)

2. sum/count/max/avg/min

3.8 Special Functions

1. array(n0, n1...) - return an array
hive> select array(1,2,3,4,5);
OK
[1,2,3,4,5]

2. collect_set(x) - return a set with duplicates removed
hive> select collect_set(age) from teacher;
OK
[0,1,2,3]

3. collect_list(x) - return a list that keeps duplicates
hive> select collect_list(age) from teacher;
OK
[0,0,0,1,2,3]

4. explode(array) - turn an array into multiple rows
hive> select explode(array(1,2,3,4,5));
OK
1
2
3
4
5

5. cast(value as type2) - convert a value to data type type2
hive> select length(cast(1.0 as int));

6. split(str, regex) - split str on the regex and return an array of strings
hive> select split('1,2,3,4,5', ',');
OK
["1","2","3","4","5"]

3.9 Word Count

select
tmp.word,
count(tmp.word) cnt
from
(
    select explode(array("lixi", "lixi", "rock", "lee", "lee", "rocklee")) word
) tmp
group by
tmp.word;
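
For the literal array above, the expected output is (row order may vary):
lee     2
lixi    2
rock    1
rocklee 1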

4 User-Defined Functions

4.1 Introduction

UDF: User Defined Function. One-to-one input and output; very practical. (covered here)
UDAF: User Defined Aggregation Function. Many-to-one input and output (see Spark SQL).
UDTF: User Defined Table-Generating Function.

4.2 Writing a UDF

4.2.1 Create a Maven Project and Add the Dependency
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>cn.qphone</groupId>
    <artifactId>day04-hive</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>1.2.1</version>
        </dependency>
    </dependencies>
</project>
4.2.2 Code: Return the Length of a String
package cn.qphone.udf;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDF;

/**
 * Return the length of a string
 */
public class MyUDF extends UDF {
    public int evaluate(String str) {
        return StringUtils.isNotEmpty(str) ? str.length() : 0;
    }
}
4.2.3 Build the Jar, Upload It, and Register It in Hive
- Method 1: effective only for the current session
1. Upload the built jar and add it to Hive's classpath.

hive> add jar /home/udf.jar;
Added [/home/udf.jar] to class path
Added resources: [/home/udf.jar]

2. Create a temporary function name for it
hive> create temporary function `len` as 'cn.qphone.udf.MyUDF';

3. show functions;

4. Test
hive> select len('lixi'), length('lixi');
OK
4       4

- Method 2: upload the jar to HDFS and register the function permanently
1. Upload the jar anywhere on HDFS
2. Create the function in Hive
CREATE FUNCTION `hive`.`len` AS 'cn.qphone.udf.MyUDF' USING JAR 'hdfs://192.168.49.200:9000/udf.jar';
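
Once registered, the function can be used like any built-in and dropped when no longer needed (a sketch):
select `hive`.`len`(`name`) from `hive`.`t_user`;
DROP FUNCTION `hive`.`len`;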

5 Maven Environment Setup

5.1 Configure the Maven Environment Variables


5.2 Edit Maven's Configuration File: settings.xml

<!-- local Maven repository path -->
<localRepository>D:\DeveopProgram\apache-maven-3.5.0\conf\respository</localRepository>

<!-- mirror repositories -->
<mirrors>
    <!-- Aliyun mirror -->
    <mirror>
        <id>alimaven</id>
        <mirrorOf>central</mirrorOf>
        <name>aliyun maven</name>
        <url>http://maven.aliyun.com/nexus/content/repositories/central/</url>
    </mirror>

    <!-- central repository mirror 1 -->
    <mirror>
        <id>repo1</id>
        <mirrorOf>central</mirrorOf>
        <name>Human Readable Name for this Mirror.</name>
        <url>http://repo1.maven.org/maven2/</url>
    </mirror>

    <!-- central repository mirror 2 -->
    <mirror>
        <id>repo2</id>
        <mirrorOf>central</mirrorOf>
        <name>Human Readable Name for this Mirror.</name>
        <url>http://repo2.maven.org/maven2/</url>
    </mirror>
</mirrors>

<profile>
    <id>jdk-1.8</id>
    <activation>
        <jdk>1.8</jdk>
    </activation>
    <properties>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <maven.compiler.compilerVersion>1.8</maven.compiler.compilerVersion>
    </properties>
</profile>

5.3 Configure Maven in IDEA

1. File --> Settings --> search for "maven" -->


6 Hive UDF Example: Birthday to Age

package UDF;

import org.apache.hadoop.hive.ql.exec.UDF;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;

public class BirUDF extends UDF {
    // Convert a birthday string in yyyy-MM-dd format into an age in years.
    public int evaluate(String birth) {
        Date bir = null;
        SimpleDateFormat s = new SimpleDateFormat("yyyy-MM-dd");
        try {
            bir = s.parse(birth);
        } catch (ParseException e) {
            e.printStackTrace();
        }
        Calendar now = Calendar.getInstance();
        Calendar birs = Calendar.getInstance();
        birs.setTime(bir);
        // If this year's birthday month has already been reached, age = year difference;
        // otherwise subtract one (only the month is compared, not the day of month).
        if (now.get(Calendar.MONTH) >= birs.get(Calendar.MONTH)) {
            return now.get(Calendar.YEAR) - birs.get(Calendar.YEAR);
        } else {
            return now.get(Calendar.YEAR) - birs.get(Calendar.YEAR) - 1;
        }
    }
}
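
A sketch of registering and calling this UDF from Hive (the jar path and function name here are assumptions):
add jar /home/birth-udf.jar;
create temporary function birth2age as 'UDF.BirUDF';
select `name`, birth2age(`birth`) from `hive`.`t_part`;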

7 Hive UDF Example: Key to Value

7.1 Data

e.g. sex=1&height=180&weight=180&sal=50000

Input:
source:sex=1&height=180&weight=180&sal=50000
key : sal

7.2 Code

package cn.qphone.udf;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.json.JSONException;
import org.json.JSONObject;

import java.util.HashMap;
import java.util.Map;

/**           source
 * sex=1&heig&ht=180&weight=180sal=50000
 * sex=2&height=170&weight=140&sal=5000
 * sex=1&height=160&weight&sal=500
 *
 * "{"name":"lixi", "sex":"1"}"
 */
public class MyKey2ValueUDF extends UDF {

    /**
     * Map-based approach
     */
    /*public String evaluate(String log, String key) {
        //1. Create the map that holds the parsed key/value pairs
        Map<String, String> map = new HashMap<>();
        //2. Split the log string into key=value elements
        String[] elements = log.split("&");
        //3. Iterate over the elements to collect the key/value pairs
        for (String kvs : elements) {
            //4. Split each element into key and value
            String[] kv = kvs.split("=");
            map.put(kv[0], kv[1]);
        }
        return map.get(key);
    }*/

    /**
     * JSON-based approach
     */
    public String evaluate(String log, String key) throws JSONException {
        //1. Split the log string into key=value elements
        String[] elements = log.split("&");
        //2. Iterate over the elements
        String json = "{";
        for (String kvs : elements) {
            //3. Split each element into key and value
            String[] kv = kvs.split("=");
            //4. Append the pair to the JSON string
            json = json + "\"" + kv[0] + "\"" + ":" + "\"" + kv[1] + "\"" + ",";
        }
        json += "}";
        //5. Parse the string into a JSON object and look up the key
        JSONObject object = new JSONObject(json);
        return (String) object.get(key);
    }
    
    public static void main(String[] args) throws JSONException {
        System.out.println(new MyKey2ValueUDF().evaluate("sex=1&height=180&weight=180&sal=50000", "sal"));
    }
}
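
A sketch of how this UDF would be called from Hive (the jar path, function name, and log table are assumptions):
add jar /home/udf.jar;
create temporary function get_field as 'cn.qphone.udf.MyKey2ValueUDF';
select get_field(log, 'sal') from t_log;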

8 Parsing Movie Data with JSON

8.1 Raw Data

{"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}
{"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"}
{"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"}
{"movie":"3408","rate":"4","timeStamp":"978300275","uid":"1"}
{"movie":"2355","rate":"5","timeStamp":"978824291","uid":"1"}
{"movie":"1197","rate":"3","timeStamp":"978302268","uid":"1"}
{"movie":"1287","rate":"5","timeStamp":"978302039","uid":"1"}

8.2 Read a JSON Record and Convert It to a Java Object

1. Create the table and load the data
create table if not exists t_json(
    log string
);
show tables;
load data local inpath '/home/json.txt' into table t_json;

select * from t_json;

2. Create the JavaBean
class MovieBean {
    private String movie;
    private int rate;
    private long timeStamp;
    private int uid;

    public MovieBean() {
    }

    public MovieBean(String movie, int rate, long timeStamp, int uid) {
        this.movie = movie;
        this.rate = rate;
        this.timeStamp = timeStamp;
        this.uid = uid;
    }

    public String getMovie() {
        return movie;
    }

    public void setMovie(String movie) {
        this.movie = movie;
    }

    public int getRate() {
        return rate;
    }

    public void setRate(int rate) {
        this.rate = rate;
    }

    public long getTimeStamp() {
        return timeStamp;
    }

    public void setTimeStamp(long timeStamp) {
        this.timeStamp = timeStamp;
    }

    public int getUid() {
        return uid;
    }

    public void setUid(int uid) {
        this.uid = uid;
    }

    @Override
    public String toString() {
        return movie + '\t' + rate + '\t' + timeStamp + '\t' + uid;
    }
}

3. Convert the JSON record to a Java object
package cn.qphone.udf;

import com.alibaba.fastjson.JSON;
import org.apache.hadoop.hive.ql.exec.UDF;
import parquet.org.codehaus.jackson.map.ObjectMapper;   // only needed by the commented-out Jackson variant

/**
 * {"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}
 */
public class MyJson2Java extends UDF {
    // jackson
    /*public String evaluate(String json_str) {
        try {
            ObjectMapper mapper = new ObjectMapper();
            MovieBean movieBean = mapper.readValue(json_str, MovieBean.class);
            return movieBean.toString();
        }catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }*/

    //fast json
    public String evaluate(String json_str) {
        MovieBean movieBean = JSON.parseObject(json_str, MovieBean.class);
        return movieBean.toString();
    }
    
    public static void main(String[] args) {
        System.out.println(new MyJson2Java().evaluate("{\"movie\":\"1193\",\"rate\":\"5\",\"timeStamp\":\"978300760\",\"uid\":\"1\"}"));
    }
}

class MovieBean {
    private String movie = "default";
    private int rate = 0;
    private long timeStamp = 0L;
    private int uid = 0;

    public MovieBean() {
    }

    public MovieBean(String movie, int rate, long timeStamp, int uid) {
        this.movie = movie;
        this.rate = rate;
        this.timeStamp = timeStamp;
        this.uid = uid;
    }

    public String getMovie() {
        return movie;
    }

    public void setMovie(String movie) {
        this.movie = movie;
    }

    public int getRate() {
        return rate;
    }

    public void setRate(int rate) {
        this.rate = rate;
    }

    public long getTimeStamp() {
        return timeStamp;
    }

    public void setTimeStamp(long timeStamp) {
        this.timeStamp = timeStamp;
    }

    public int getUid() {
        return uid;
    }

    public void setUid(int uid) {
        this.uid = uid;
    }

    @Override
    public String toString() {
        return movie + '\t' + rate + '\t' + timeStamp + '\t' + uid;
    }
}
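
A sketch of applying the UDF to the t_json table created above (the jar path and function name are assumptions):
add jar /home/json-udf.jar;
create temporary function json2movie as 'cn.qphone.udf.MyJson2Java';
select json2movie(log) from t_json limit 5;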

8.3 ObjectMapper Usage

e.g.
ObjectMapper mapper = new ObjectMapper();
MovieBean movieBean = mapper.readValue(json_str, MovieBean.class);

mapper.readValue(input JSON string, class of the object to produce)


8.4 Fastjson

<!-- fastjson dependency (1.2.46) -->
<dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.46</version>
</dependency>

// usage is similar to ObjectMapper and can be called directly:
MovieBean movieBean = JSON.parseObject(json_str, MovieBean.class);
    
    
    

VII. Hive JDBC

1 Dependencies

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>cn.qphone</groupId>
    <artifactId>day04-hive</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.8.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>1.2.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-jdbc</artifactId>
            <version>1.2.1</version>
        </dependency>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.17</version>
        </dependency>
    </dependencies>
</project>

2 Code

package cn.qphone.jdbc;

import java.sql.*;

public class TestDemo {
    public static void main(String[] args) throws ClassNotFoundException, SQLException {
        //1. The four connection parameters: driver, url, user, password
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        Connection connection = DriverManager.getConnection("jdbc:hive2://192.168.49.200:10000/hive", "root", "123456");
        //2. Run a query
        PreparedStatement preparedStatement = connection.prepareStatement("select * from t_emp");
        ResultSet resultSet = preparedStatement.executeQuery();
        while (resultSet.next()) {
            int eid = resultSet.getInt("eid");
            String ename = resultSet.getString("ename");
            int did = resultSet.getInt("did");
            System.out.println(eid + "," + ename + "," + did);
        }
        resultSet.close();
        preparedStatement.close();
        connection.close();
    }
}
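Note: this client connects to HiveServer2 (port 10000 in the URL above), so the hiveserver2 service has to be running on the server before the program is executed.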
Topics still to cover:
CSV and TSV formats?

File storage formats
What bucketing is for
Querying under partitions

VIII. Serialization and Deserialization (SerDe)

1 What Are Serialization and Deserialization

SerDe is short for Serializer/Deserializer. Hive uses the SerDe interface for IO. The interface handles both serialization and deserialization, and it also interprets the serialized bytes as the individual fields to be processed.

A SerDe lets Hive read data from a table and write it back out to HDFS in any custom format. Anyone can write their own SerDe for their own data format.

See the Hive SerDe documentation for an introduction to SerDes.

2 Tests

2.1 Loading CSV/TSV Files

TSV files are separated by tabs.
CSV files are usually separated by ',' (other separators are also possible).

CREATE TABLE t_csv1(
sid string, 
classname string, 
score int)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'  -- fully-qualified class name
WITH SERDEPROPERTIES (
   "separatorChar" = ",",   -- field separator
   "quoteChar"     = "\"",
   "escapeChar"    = "\\"
);
Note: WITH SERDEPROPERTIES can be omitted (the default separator is then ',').
CREATE TABLE t_csv2(
sid string,  
classname string, 
score int)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde';
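
Loading and querying work the same as for any other textfile table; keep in mind that OpenCSVSerde reads every column as a string. A sketch (the file path is an assumption):
load data local inpath '/home/csv.txt' into table t_csv2;
select * from t_csv2;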

2.2 The Regex SerDe (For Reference)

CREATE TABLE t_rex(
id string, 
name string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex" = "(.*)\\|\\|(.*)",
'output.format.string'='%1$s %2$s'
);

2.3 Json

1. Add the jar

hive> add jar /home/json-serde-1.3.9-SNAPSHOT-jar-with-dependencies.jar;
Added [/home/json-serde-1.3.9-SNAPSHOT-jar-with-dependencies.jar] to class path
Added resources: [/home/json-serde-1.3.9-SNAPSHOT-jar-with-dependencies.jar]

2. Create the table
CREATE TABLE t_json(
uid string, 
uname string,
age string)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe';

3. Create a table for json2.txt
CREATE TABLE t_json2(
uname string,
score map<string,array<int>>)  -- the nested JSON structure is recognized and converted automatically
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe';


3 File Storage Formats

In Hive, only textfile tables can be loaded with LOAD DATA; none of the other formats can.

1. textfile
create table t_1(
id int,
name string,
age int
)
row format delimited
fields terminated by ','
NULL DEFINED AS 'empty'
;

2. sequencefile (partially compressed)
create table t_b(
id int,
name string,
age int
)
row format delimited
fields terminated by ','
STORED AS SEQUENCEFILE
;

insert overwrite table t_b
select * from t_a;


3. orc (supports transactions)
create table t_c(
id int,
name string,
age int
)
row format delimited
fields terminated by ','
STORED AS ORC
;

insert overwrite table t_c
select * from t_a;


IX. Indexes and Views

1 Indexes (For Reference)

	An index exists purely to speed up queries: building an index on a column makes lookups on that column faster than without one. Maintaining an index has extra cost, though, so indexes should not be created carelessly; and in Hive, indexes are of fairly limited use anyway.

1.1 Syntax

CREATE INDEX index_name
  ON TABLE base_table_name (col_name, ...)
  AS index_type
  [WITH DEFERRED REBUILD]
  [IDXPROPERTIES (property_name=property_value, ...)]
  [IN TABLE index_table_name]
  [
     [ ROW FORMAT ...] STORED AS ...
     | STORED BY ...
  ]
  [LOCATION hdfs_path]
  [TBLPROPERTIES (...)]
  [COMMENT "index comment"];
  
e.g. create
CREATE INDEX a_name_idx on table t_a(name) as 'compact'
WITH DEFERRED REBUILD;
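
With WITH DEFERRED REBUILD the index starts out empty; it has to be rebuilt before it contains anything (a sketch):
ALTER INDEX a_name_idx ON t_a REBUILD;
SHOW INDEX ON t_a;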

e.g. drop
DROP INDEX a_name_idx ON t_a;

2 Views

2.1 Syntax

CREATE VIEW [IF NOT EXISTS] [db_name.]view_name [(column_name [COMMENT column_comment], ...) ]
  [COMMENT view_comment]
  [TBLPROPERTIES (property_name = property_value, ...)]
  AS SELECT ...;
  
e.g.
CREATE VIEW IF NOT EXISTS t_buk_v(
sno,
sname,
sex,
sage,
sclass)
AS 
SELECT * from t_buk
;

tip:
A view's data is, in essence, just the result of querying the underlying table. If the table is gone, the view can no longer return any data.

X. Window Functions (+++++)

1 What Is a Window Function?

	A window function, also called an analytic function, is a powerful tool for complex reporting requirements. Like an aggregate function it computes a value over a group of rows, but unlike an aggregate function it returns a row of output for every input row rather than one row per group.
    The OVER clause defines the window of rows the function operates on, and that window can change from row to row.

2 Setup

2.1 Data

name, purchase date, quantity purchased
saml,2018-01-01,10
tony,2018-01-02,15
saml,2018-02-03,23
tony,2018-01-04,29
saml,2018-01-05,46
saml,2018-04-06,42
tony,2018-01-07,50
saml,2018-01-08,55
mart,2018-04-08,62
mart,2018-04-09,68
neil,2018-05-10,12
mart,2018-04-11,75
neil,2018-06-12,80
mart,2018-04-13,94

2.2 Create the Table

Create table t_order(
name string,
orderdate string,
cnt int
)
row format delimited
fields terminated by ','
;

load data local inpath '/home/order.txt' into table t_order;

2.3 Test Case

SELECT
*,
count(*) over()
from
t_order;

2.4 The partition by Clause

SELECT
*,
count(*) over(partition by name)
from
t_order;

2.5 The order by Clause

SELECT
*,
count(*) over(partition by name order by orderdate)
from
t_order;

2.6 The window Clause

2.6.1 Syntax
To divide the window into finer-grained frames, use a window clause. The common keywords are:

- PRECEDING: rows before the current row
- FOLLOWING: rows after the current row
- CURRENT ROW: the current row
- UNBOUNDED: the boundary of the partition
- UNBOUNDED PRECEDING: from the start of the partition
- UNBOUNDED FOLLOWING: to the end of the partition
2.6.2 Example
use hive;
select
    name,
    orderdate,
    cnt,
    sum(cnt) over() as sample1,--sum over all rows
    sum(cnt) over(partition by name) as sample2,--partition by name, sum within each partition
    sum(cnt) over(partition by name order by orderdate) as sample3,--partition by name, running total within each partition
    sum(cnt) over(partition by name order by orderdate rows between UNBOUNDED PRECEDING AND current row )  as sample4 ,--same as sample3: aggregate from the partition start to the current row
    sum(cnt) over(partition by name order by orderdate rows between 1 PRECEDING AND current row) as sample5, --current row plus the previous row
    sum(cnt) over(partition by name order by orderdate rows between 1 PRECEDING AND 1 FOLLOWING  ) as sample6,--previous row, current row and next row
    sum(cnt) over(partition by name order by orderdate rows between current row and UNBOUNDED FOLLOWING ) as sample7 --current row through to the end of the partition
from t_order;

2.7 ntile: Split the Rows into Numbered Slices

select
    name,
    orderdate,
    cnt,
    ntile(3) over(partition by name) -- partition by name and split each partition's rows into 3 numbered slices
from t_order;

2.8 LAG and LEAD

  • lag returns the value from the previous row of the window
  • lead returns the value from the next row of the window
select
    name,
    orderdate,
    cnt,
    lag(orderdate,1,'yyyy-MM-dd') over(partition by name order by orderdate ) as time1 -- the 3rd argument is the default returned for the first row of each partition (here the literal string 'yyyy-MM-dd')
from t_order;

select 
name,
orderdate,
cnt,
lead(orderdate,1) over(partition by name order by orderdate) as time1
from t_order;

2.9 first_value and last_value

select 
name,
orderdate,
cnt,
first_value(orderdate) over(partition by name order by orderdate) as time1,
last_value(orderdate) over(partition by name order by orderdate) as time2
from t_order;
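
Note: with an order by and the default window frame, last_value returns the value of the current row, not the last row of the whole partition; to get the real last value, widen the frame explicitly (a sketch):
select 
name,
orderdate,
cnt,
last_value(orderdate) over(partition by name order by orderdate
    rows between unbounded preceding and unbounded following) as time2
from t_order;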

XI. Miscellaneous

1 Hive CLI Commands (+++)

hive -e "sql"
hive -f xxxx.file

e.g.
[root@qphone01 bin]# hive -e "select * from t_order"
[root@qphone01 home]# hive -f hive.hql
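
A sketch of what hive.hql might contain (its contents are an assumption):
use hive;
select name, count(*) from t_order group by name;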

2 Hive Logs

- System log: hive.log
Default location: /tmp/${user.name}

2.1 hive-site.xml

<property>
  <name>hive.querylog.location</name>
  <value>${HIVE_HOME}/log</value>
  <description>Location of Hive run time structured log file</description>
</property>

XII. Summary

1 Data Warehouse Layers
