hive学习官方网站:
General Info
============
For the latest information about Hive, please visit our website at:
http://hive.apache.org/
Getting Started
===============
- Installation Instructions and a quick tutorial:
https://cwiki.apache.org/confluence/display/Hive/GettingStarted
- A longer tutorial that covers more features of HiveQL:
https://cwiki.apache.org/confluence/display/Hive/Tutorial
- The HiveQL Language Manual:
https://cwiki.apache.org/confluence/display/Hive/LanguageManual
1.显示数据库
show databases ;
2.创建数据库
create database db_hive ;
3. 显示表,显示表的详细信息,显示表的详细信息(格式化后),创建表
-- List tables in the current database.
show tables ;
-- Show extended table metadata (raw form).
desc extended student ;
-- Pretty-printed table metadata. Fixed typo: the command is "formatted", not "formated".
desc formatted student ;
-- Two-column table; fields are tab-separated to match /opt/datas/student.txt.
create table student(id int, name string) ROW FORMAT DELIMITED FIELDS
TERMINATED BY '\t' ;
4. 装载数据,最好加上数据库名
hive> load data local inpath '/opt/datas/student.txt' into
> table db_hive.student ;
5. 查看函数,查看函数使用方法
show functions ;
desc function upper ;
6. hive中操作文件系统
dfs -ls /user/hive/warehouse ;
dfs -rm -R /user/hive/warehouse/emp;
7. 查看hive的交互命令
hive -help
usage: hive
-d,--define <key=value> Variable subsitution to apply to hive
commands. e.g. -d A=B or --define A=B
--database <databasename> Specify the database to use
-e <quoted-query-string> SQL from command line
例如:[root@zhangbk hadoop]# hive -e 'select * from db_hive.student ;'
-f <filename> SQL from files
例如:vi /opt/datas/hivef.sql
select * from db_hive.student ;
[root@zhangbk datas]# hive -f /opt/datas/hivef.sql
也可将查询结果写入文件:[root@zhangbk datas]# hive -f /opt/datas/hivef.sql > /opt/datas/hivef.txt
-h <hostname> connecting to Hive Server on remote host
-H,--help Print help information
--hiveconf <property=value> Use value for given property
--hivevar <key=value> Variable subsitution to apply to hive
commands. e.g. --hivevar A=B
-i <filename> Initialization SQL file
例如:与用户自定义的udf相互使用。
-p <port> connecting to Hive Server on port number
-S,--silent Silent mode in interactive shell
-v,--verbose Verbose mode (echo executed SQL to the
console)
8. 交互命令hive cli
exit/quit ;
dfs -ls / ;
!ls ;查看操作系统本地的文件
================================================================
第二章 笔记2
1. 创建表
-- Web access log table, one row per request, comma-delimited text under an
-- explicit warehouse location.
create table if not exists db_hive.log_20190512(
ip string comment '',
-- `user` is a reserved keyword in Hive 2.x+; backticks keep the DDL parseable.
`user` string,
req_url string comment 'user request url'
)
comment 'Web Access Logs'
row format delimited fields terminated by ','
-- collection items terminated by '\n'
STORED AS TEXTFILE
location '/user/hive/warehouse/db_hive.db/log_20190512' ;
-- Copy a subset of columns via CTAS (create-table-as-select).
create table db_hive.log_20190512_bak as
select ip, `user` from db_hive.log_20190512 ;
-- Copy the schema only (no data). Renamed: the original reused the name
-- log_20190512_bak, which the CTAS above already created, so this statement
-- would have failed with "table already exists".
create table db_hive.log_20190512_copy like db_hive.log_20190512;
2. 删除数据库,表
drop database db_hive cascade ;
drop table student ;
================================================================
CREATE TABLE [IF NOT EXISTS] [db_name.]table_name
[(col_name data_type [COMMENT col_comment], ... [constraint_specification])]
[COMMENT table_comment]
[PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
[CLUSTERED BY (col_name, col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
[SKEWED BY (col_name, col_name, ...)
ON ((col_value, col_value, ...), (col_value, col_value, ...), ...)
[STORED AS DIRECTORIES]
[
[ROW FORMAT row_format]
[STORED AS file_format]
| STORED BY 'storage.handler.class.name' [WITH SERDEPROPERTIES (...)]
]
[LOCATION hdfs_path]
[TBLPROPERTIES (property_name=property_value, ...)]
[AS select_statement];
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
LIKE existing_table_or_view_name
[LOCATION hdfs_path];
data_type
: primitive_type
| array_type --常用
| map_type --常用
| struct_type
| union_type
primitive_type
: TINYINT
| SMALLINT
| INT --常用
| BIGINT
| BOOLEAN
| FLOAT
| DOUBLE --常用
| DOUBLE PRECISION
| STRING --常用
| BINARY
| TIMESTAMP
| DECIMAL
| DECIMAL(precision, scale)
| DATE
| VARCHAR
| CHAR
array_type
: ARRAY < data_type >
map_type
: MAP < primitive_type, data_type >
struct_type
: STRUCT < col_name : data_type [COMMENT col_comment], ...>
union_type
: UNIONTYPE < data_type, data_type, ... >
row_format
: DELIMITED [FIELDS TERMINATED BY char [ESCAPED BY char]] [COLLECTION ITEMS TERMINATED BY char]
[MAP KEYS TERMINATED BY char] [LINES TERMINATED BY char]
[NULL DEFINED AS char] -- (Note: Available in Hive 0.13 and later)
| SERDE serde_name [WITH SERDEPROPERTIES (property_name=property_value, property_name=property_value, ...)]
file_format:
: SEQUENCEFILE
| TEXTFILE -- (Default, depending on hive.default.fileformat configuration)
| RCFILE
| ORC
| PARQUET
| AVRO
| JSONFILE
| INPUTFORMAT input_format_classname OUTPUTFORMAT output_format_classname
constraint_specification:
: [, PRIMARY KEY (col_name, ...) DISABLE NOVALIDATE ]
[, CONSTRAINT constraint_name FOREIGN KEY (col_name, ...) REFERENCES table_name(col_name, ...) DISABLE NOVALIDATE ]
=================================================================================================
员工表
-- Employee table: one row per employee, tab-delimited text
-- (layout matches /opt/datas/emp.txt).
CREATE TABLE IF NOT EXISTS default.emp(
    empno    INT,
    ename    STRING,
    job      STRING,
    mgr      INT,
    hiredate STRING,
    sal      DOUBLE,
    comm     DOUBLE,
    deptno   INT
)
COMMENT 'emp table'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' ;
部门表
-- Department table: tab-delimited text (layout matches /opt/datas/dept.txt).
CREATE TABLE IF NOT EXISTS default.dept(
    deptno INT,
    dname  STRING,
    loc    STRING
)
COMMENT 'dept table'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' ;
-- OVERWRITE replaces any existing table data with the local files.
LOAD DATA LOCAL INPATH '/opt/datas/emp.txt' OVERWRITE INTO TABLE default.emp ;
LOAD DATA LOCAL INPATH '/opt/datas/dept.txt' OVERWRITE INTO TABLE default.dept;
-- CTAS copy of dept (schema inferred from the SELECT).
CREATE TABLE IF NOT EXISTS default.dept_cats AS
SELECT * FROM dept ;
清除表数据:truncate table dept_cats ;
修改表名:alter table dept_cats rename to dept_sub ;
删除表:drop table if exists dept_sub ;
Hive中表的类型
内部表(管理表),删除表,会删除表数据和元数据
外部表(托管表),删除表,只会删除元数据,不会删除数据
-- External (unmanaged) table: DROP TABLE removes only the metadata;
-- the data files under LOCATION are left untouched.
CREATE EXTERNAL TABLE IF NOT EXISTS default.ext_emp(
    empno    INT,
    ename    STRING,
    job      STRING,
    mgr      INT,
    hiredate STRING,
    sal      DOUBLE,
    comm     DOUBLE,
    deptno   INT
)
COMMENT 'emp table'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/user/hive/warehouse/ext_emp'; -- place the data files in this directory
分区表
分区表实际上就是对应一个HDFS文件系统上的独立的文件夹,该文件夹下是该分区所有的数据文件,
Hive中的分区就是分目录,把一个大的数据集根据业务需要分割成小的数据集。
-- Partitioned table: each (month, day) value pair maps to its own HDFS
-- subdirectory under the table directory.
-- Fixed typo: the table was created as "emp_partiton", but the LOAD and
-- SELECT below reference "emp_partition", so they would have failed.
create table if not exists default.emp_partition(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double,
deptno int
)
partitioned by (month string, day string)
row format delimited fields terminated by '\t' ;
-- Loads must name the target partition explicitly.
-- NOTE(review): day='' loads into an empty-string partition value, which is
-- almost certainly unintended — confirm the intended day (e.g. day='13').
load data local inpath '/opt/datas/emp.txt'
overwrite into table emp_partition partition (month='201905',day='');
-- The month predicate lets Hive prune to the month=201905 directories only.
select * from emp_partition where month='201905' ;
无论内部表还是外部表,创建好表之后,可直接将数据文件放在表文件夹下即可,对于分区表
需要注意:
-- Create the partition directory by hand and upload the data file into it.
dfs -mkdir -p /user/hive/warehouse/dept_part/day=20190513 ;
dfs -put /opt/datas/dept.txt /user/hive/warehouse/dept_part/day=20190513 ;
-- Data copied straight into HDFS is invisible until the partition is
-- registered in the metastore (ADD PARTITION, or MSCK REPAIR TABLE).
alter table dept_part add partition (day='20190513') ;
=============================================================================================
导入数据:
1. 加载本地文件到hive表
load data local inpath '/opt/datas/emp.txt' overwrite into table emp ;
2. 加载hdfs文件到hive表
load data inpath '/opt/datas/emp.txt' overwrite into table emp ;
3. 创建表,通过insert加载
insert overwrite table emp
select * from emp_bak ;
4.将数据文件上传到hive表目录下
=============================================================================================
导出数据
insert overwrite local directory '/opt/datas/hive_emp'
row format delimited fields terminated by '\t'
collection items terminated by '\n'
select * from emp ;
hive -e "select * from emp ;" > /opt/datas/hive_emp2
==========================================================================
查询分析数据
select * from emp
where
group by
limit number ;
=, >=, <=, between and
is null ,is not null, in, not in
max, min, count,sum,avg