Hive Operations

Hive Syntax

DDL

Create Database

CREATE (DATABASE|SCHEMA) [IF NOT EXISTS] database_name
  [COMMENT database_comment]
  [LOCATION hdfs_path]
  [MANAGEDLOCATION hdfs_path]
  [WITH DBPROPERTIES (property_name=property_value, ...)];

create database if not exists db_name;
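
A fuller form using the optional clauses from the syntax above (the comment, location, and property values are illustrative placeholders):

create database if not exists db_name
  comment 'demo database'
  location '/hive/warehouse/db_name.db'
  with dbproperties ('creator'='hive');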

Drop Database

DROP (DATABASE|SCHEMA) [IF EXISTS] database_name [RESTRICT|CASCADE];

drop database db_name;           // drop an empty database
drop database if exists db_name; // check that the database exists before dropping
drop database db_name cascade;   // force-drop a non-empty database with CASCADE

Alter Database

ALTER (DATABASE|SCHEMA) database_name SET DBPROPERTIES (property_name=property_value, ...);   -- (Note: SCHEMA added in Hive 0.14.0)
ALTER (DATABASE|SCHEMA) database_name SET OWNER [USER|ROLE] user_or_role;   -- (Note: Hive 0.13.0 and later; SCHEMA added in Hive 0.14.0)
ALTER (DATABASE|SCHEMA) database_name SET LOCATION hdfs_path; -- (Note: Hive 2.2.1, 2.4.0 and later)
ALTER (DATABASE|SCHEMA) database_name SET MANAGEDLOCATION hdfs_path; -- (Note: Hive 4.0.0 and later)

Use Database

USE database_name;
USE DEFAULT;

Show Database

SHOW (DATABASES|SCHEMAS) [LIKE 'identifier_with_wildcards'];

show databases;                  // list databases
show databases like 'db_name*';  // list databases matching a pattern
desc database db_name;           // show basic database information
desc database extended db_name;  // show detailed database information
select current_database();       // show the database currently in use
show create database db_name;    // show the CREATE DATABASE statement

Create Table

CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name    -- (Note: TEMPORARY available in Hive 0.14.0 and later)
  [(col_name data_type [column_constraint_specification] [COMMENT col_comment], ... [constraint_specification])]
  [COMMENT table_comment]
  [PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
  [CLUSTERED BY (col_name, col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
  [SKEWED BY (col_name, col_name, ...)                  -- (Note: Available in Hive 0.10.0 and later)]
     ON ((col_value, col_value, ...), (col_value, col_value, ...), ...)
     [STORED AS DIRECTORIES]
  [
   [ROW FORMAT row_format] 
   [STORED AS file_format]
     | STORED BY 'storage.handler.class.name' [WITH SERDEPROPERTIES (...)]  -- (Note: Available in Hive 0.6.0 and later)
  ]
  [LOCATION hdfs_path]
  [TBLPROPERTIES (property_name=property_value, ...)]   -- (Note: Available in Hive 0.6.0 and later)
  [AS select_statement];   -- (Note: Available in Hive 0.5.0 and later; not supported for external tables)
 
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
  LIKE existing_table_or_view_name
  [LOCATION hdfs_path];
 
	data_type
	  : primitive_type
	  | array_type
	  | map_type
	  | struct_type
	  | union_type  -- (Note: Available in Hive 0.7.0 and later)
	 
	primitive_type
	  : TINYINT
	  | SMALLINT
	  | INT
	  | BIGINT
	  | BOOLEAN
	  | FLOAT
	  | DOUBLE
	  | DOUBLE PRECISION -- (Note: Available in Hive 2.2.0 and later)
	  | STRING
	  | BINARY      -- (Note: Available in Hive 0.8.0 and later)
	  | TIMESTAMP   -- (Note: Available in Hive 0.8.0 and later)
	  | DECIMAL     -- (Note: Available in Hive 0.11.0 and later)
	  | DECIMAL(precision, scale)  -- (Note: Available in Hive 0.13.0 and later)
	  | DATE        -- (Note: Available in Hive 0.12.0 and later)
	  | VARCHAR     -- (Note: Available in Hive 0.12.0 and later)
	  | CHAR        -- (Note: Available in Hive 0.13.0 and later)
	 
	array_type
	  : ARRAY < data_type >
	 
	map_type
	  : MAP < primitive_type, data_type >
	 
	struct_type
	  : STRUCT < col_name : data_type [COMMENT col_comment], ...>
	 
	union_type
	   : UNIONTYPE < data_type, data_type, ... >  -- (Note: Available in Hive 0.7.0 and later)
	 
	row_format
	  : DELIMITED [FIELDS TERMINATED BY char [ESCAPED BY char]] [COLLECTION ITEMS TERMINATED BY char]
	        [MAP KEYS TERMINATED BY char] [LINES TERMINATED BY char]
	        [NULL DEFINED AS char]   -- (Note: Available in Hive 0.13 and later)
	  | SERDE serde_name [WITH SERDEPROPERTIES (property_name=property_value, property_name=property_value, ...)]
	 
	file_format:
	  : SEQUENCEFILE
	  | TEXTFILE    -- (Default, depending on hive.default.fileformat configuration)
	  | RCFILE      -- (Note: Available in Hive 0.6.0 and later)
	  | ORC         -- (Note: Available in Hive 0.11.0 and later)
	  | PARQUET     -- (Note: Available in Hive 0.13.0 and later)
	  | AVRO        -- (Note: Available in Hive 0.14.0 and later)
	  | JSONFILE    -- (Note: Available in Hive 4.0.0 and later)
	  | INPUTFORMAT input_format_classname OUTPUTFORMAT output_format_classname
	 
	column_constraint_specification:
	  : [ PRIMARY KEY|UNIQUE|NOT NULL|DEFAULT [default_value]|CHECK  [check_expression] ENABLE|DISABLE NOVALIDATE RELY/NORELY ]
	 
	default_value:
	  : [ LITERAL|CURRENT_USER()|CURRENT_DATE()|CURRENT_TIMESTAMP()|NULL ] 
	 
	constraint_specification:
	  : [, PRIMARY KEY (col_name, ...) DISABLE NOVALIDATE RELY/NORELY ]
	    [, CONSTRAINT constraint_name FOREIGN KEY (col_name, ...) REFERENCES table_name(col_name, ...) DISABLE NOVALIDATE ]
	    [, CONSTRAINT constraint_name UNIQUE (col_name, ...) DISABLE NOVALIDATE RELY/NORELY ]
	    [, CONSTRAINT constraint_name CHECK [check_expression] ENABLE|DISABLE NOVALIDATE RELY/NORELY ]


# Managed (internal) table
create table if not exists student
(id int, name string)
row format delimited fields terminated by ',';

# External table
create external table if not exists student
(id int, name string)
row format delimited fields terminated by ','
location '/hive/data/';

# Partitioned table
create table if not exists student
(id int, name string)
partitioned by (age int comment 'partition comment')
row format delimited fields terminated by ',';   # the partition column must not be one of the table's own columns

# Bucketed table
create table if not exists student
(id int, name string, age int)
clustered by (age) sorted by (age desc)
into 10 buckets
row format delimited fields terminated by ',';   # the bucketing column must be one of the table's own columns

# LIKE
create table student like t_student;   # copies only the table structure; partitioned and bucketed tables can be copied the same way (a partitioned table copies only the partition information present at creation time, not partitions added later)

# CTAS
create table student as select * from t_student;   # create the table and copy the data

Drop Table

DROP TABLE [IF EXISTS] table_name [PURGE];     -- (Note: PURGE available in Hive 0.14.0 and later)

drop table if exists tb_name; // removes both the data and the metadata

Truncate Table

TRUNCATE [TABLE] table_name [PARTITION partition_spec];
	partition_spec:
	  : (partition_column = partition_col_value, partition_column = partition_col_value, ...)

truncate table tb_name; // removes the data only; the metadata is kept
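
A single partition can also be truncated while the others are kept, assuming the table is partitioned by dt (hypothetical value):

truncate table tb_name partition (dt='2008-08-08');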

Alter Table

Rename a table:
ALTER TABLE table_name RENAME TO new_table_name;
Change table properties:
ALTER TABLE table_name SET TBLPROPERTIES table_properties;
	table_properties:
	  : (property_name = property_value, 'comment' = new_comment, ... )
Change the table delimiter (SerDe properties):
ALTER TABLE table_name [PARTITION partition_spec] SET SERDEPROPERTIES serde_properties;
	serde_properties:
	  : (property_name = property_value, 'field.delim' = ',', ... )
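
Concrete examples of the three statements above, using a hypothetical student table:

alter table student rename to student_bak;
alter table student set tblproperties ('comment' = 'student info table');
alter table student set serdeproperties ('field.delim' = '\t');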

Alter Partition

Add partitions. Note that there must be NO comma between the PARTITION clauses -- see the example below.
ALTER TABLE table_name ADD [IF NOT EXISTS] PARTITION partition_spec [LOCATION 'location'][, PARTITION partition_spec [LOCATION 'location'], ...];
	partition_spec:
	  : (partition_column = partition_col_value, partition_column = partition_col_value, ...)
Example: since Hive 0.8 several partitions can be added in a single statement:
ALTER TABLE page_view ADD PARTITION (dt='2008-08-08', country='us') location '/path/to/us/part080808'
                          PARTITION (dt='2008-08-09', country='us') location '/path/to/us/part080809';

Rename a partition:
ALTER TABLE table_name PARTITION partition_spec RENAME TO PARTITION partition_spec;
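For example, using the page_view table from above (partition values are illustrative):
ALTER TABLE page_view PARTITION (dt='2008-08-08', country='us') RENAME TO PARTITION (dt='2008-08-09', country='us');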

Drop partitions. The table must have been created as a partitioned table. When dropping several partitions at once, the PARTITION clauses MUST be separated by commas.
ALTER TABLE table_name DROP [IF EXISTS] PARTITION partition_spec[, PARTITION partition_spec, ...]
  [IGNORE PROTECTION] [PURGE];            -- (Note: PURGE available in Hive 1.2.0 and later, IGNORE PROTECTION not available 2.0.0 and later)
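
For example, dropping two partitions of page_view in one statement (note the comma between the PARTITION clauses; values are illustrative):

ALTER TABLE page_view DROP IF EXISTS PARTITION (dt='2008-08-08', country='us'), PARTITION (dt='2008-08-09', country='us');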

Alter Column

Change a column's name/type/position/comment:
ALTER TABLE table_name [PARTITION partition_spec] CHANGE [COLUMN] col_old_name col_new_name column_type
  [COMMENT col_comment] [FIRST|AFTER column_name] [CASCADE|RESTRICT];

Example:
CREATE TABLE test_change (a int, b int, c int);
// First change column a's name to a1.
ALTER TABLE test_change CHANGE a a1 INT;
// Next change column a1's name to a2, its data type to string, and put it after column b.
ALTER TABLE test_change CHANGE a1 a2 STRING AFTER b;
// The new table's structure is:  b int, a2 string, c int.
// Then change column c's name to c1, and put it as the first column.
ALTER TABLE test_change CHANGE c c1 INT FIRST;
// The new table's structure is:  c1 int, b int, a2 string.
// Add a comment to column a1
ALTER TABLE test_change CHANGE a1 a1 INT COMMENT 'this is column a1';

Add/Replace Columns:
ALTER TABLE table_name 
  [PARTITION partition_spec]                 -- (Note: Hive 0.14.0 and later)
  ADD|REPLACE COLUMNS (col_name data_type [COMMENT col_comment], ...)
  [CASCADE|RESTRICT]                         -- (Note: Hive 1.1.0 and later)
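
Examples against the test_change table above (column names are illustrative). ADD COLUMNS appends new columns after the existing ones, while REPLACE COLUMNS drops all existing column definitions and uses the new list:

ALTER TABLE test_change ADD COLUMNS (d INT COMMENT 'new column d');
ALTER TABLE test_change REPLACE COLUMNS (c1 INT, b INT, a2 STRING);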

Partial Partition Specification: similar to dynamic partitioning, only part of the partition spec needs to be given when changing columns:
// First set the Hive property
// hive.exec.dynamic.partition needs to be set to true to enable dynamic partitioning with ALTER PARTITION
SET hive.exec.dynamic.partition = true;
// Static change
ALTER TABLE foo PARTITION (ds='2008-04-08', hr=11) CHANGE COLUMN dec_column_name dec_column_name DECIMAL(38,18);
// This will alter all existing partitions in the table with ds='2008-04-08' -- be sure you know what you are doing!
ALTER TABLE foo PARTITION (ds='2008-04-08', hr) CHANGE COLUMN dec_column_name dec_column_name DECIMAL(38,18);
// This will alter all existing partitions in the table -- be sure you know what you are doing!
ALTER TABLE foo PARTITION (ds, hr) CHANGE COLUMN dec_column_name dec_column_name DECIMAL(38,18);

Show Partitions

SHOW PARTITIONS table_name;
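
A partial partition spec can be supplied to narrow the listing (value is illustrative):

SHOW PARTITIONS page_view PARTITION (dt='2008-08-08');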

Show Create Table

Shows the CREATE TABLE statement that creates the given table:
SHOW CREATE TABLE ([db_name.]table_name|view_name);

Show Columns

SHOW COLUMNS (FROM|IN) table_name [(FROM|IN) db_name];

Describe

desc database formatted db_name;
desc formatted tb_name;

DML

Import / Export

The EXPORT command exports the data and metadata of a table or partition to the specified output location.
That output location can then be moved to a different Hadoop or Hive instance and imported from there with the IMPORT command.
The exported metadata is stored in the target directory, and the data files are stored in subdirectories.
When exporting, if the target exists and is not partitioned it must be empty; in other words, export to an empty path.

EXPORT TABLE tablename [PARTITION (part_column="value"[, ...])]
  TO 'export_target_path' [ FOR replication('eventid') ]

IMPORT [[EXTERNAL] TABLE new_or_original_tablename [PARTITION (part_column="value"[, ...])]]
  FROM 'source_path'
  [LOCATION 'import_target_path']

Examples:
// Simple export and import:
export table department to 'hdfs_exports_location/department';
import from 'hdfs_exports_location/department';
// Rename table on import:
export table department to 'hdfs_exports_location/department';
import table imported_dept from 'hdfs_exports_location/department';
// Export partition and import:
export table employee partition (emp_country="in", emp_state="ka") to 'hdfs_exports_location/employee';
import from 'hdfs_exports_location/employee';
// Export table and import partition:
export table employee to 'hdfs_exports_location/employee';
import table employee partition (emp_country="us", emp_state="tn") from 'hdfs_exports_location/employee';
//Specify the import location:
export table department to 'hdfs_exports_location/department';
import table department from 'hdfs_exports_location/department' 
       location 'import_target_location/department';
//Import as an external table:
export table department to 'hdfs_exports_location/department';
import external table department from 'hdfs_exports_location/department';

Loading files into tables

LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]

load data local inpath '/root/data/a.txt' into table stu;
load data inpath '/wordcount.txt' into table wordcount;
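# Loading into a specific partition, optionally overwriting it (table, path, and partition value are hypothetical):
load data local inpath '/root/data/a.txt' overwrite into table stu_ptn partition (dt='2024-01-01');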

Inserting data into Hive Tables from queries

1. Standard syntax
Overwrite mode:
INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]] select_statement1 FROM from_statement;
Append mode:
INSERT INTO TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)] select_statement1 FROM from_statement;
Example:
insert into table student_ptn partition(department='SC') select id, name, age, sex from student where dept='SC';

2. Multiple inserts:
FROM from_statement
INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]] select_statement1
[INSERT OVERWRITE TABLE tablename2 [PARTITION ... [IF NOT EXISTS]] select_statement2]
[INSERT INTO TABLE tablename2 [PARTITION ...] select_statement2] ...;

FROM from_statement
INSERT INTO TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)] select_statement1
[INSERT INTO TABLE tablename2 [PARTITION ...] select_statement2]
[INSERT OVERWRITE TABLE tablename2 [PARTITION ... [IF NOT EXISTS]] select_statement2] ...;

Example:
from student   # the student table is scanned only once
insert into table student_ptn partition(department='SC') select id, name, age, sex where dept='SC'
insert into table student_ptn partition(department='AC') select id, name, age, sex where dept='AC'
insert into table student_ptn partition(department='ES') select id, name, age, sex where dept='ES';

Static vs. dynamic partition inserts:
Static partition insert: the partition that receives the data is specified manually, before the insert runs.
Dynamic partition insert: removes that limitation. Hive inspects the value of a partition column in the query result and, for each distinct value it encounters, writes to (and creates, if needed) the corresponding partition automatically.

3. Dynamic partition inserts:
INSERT OVERWRITE TABLE tablename PARTITION (partcol1[=val1], partcol2[=val2] ...) select_statement FROM from_statement;
INSERT INTO TABLE tablename PARTITION (partcol1[=val1], partcol2[=val2] ...) select_statement FROM from_statement;

Examples:
Static form:
insert into table student_ptn partition(department='SC') select id, name, age, sex from student where dept='SC';
Dynamic form: the partition columns are written as the last columns of the SELECT list, in order, right before FROM
insert into table student_ptn partition(department) select id, name, age, sex, dept from student;
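
Dynamic partition inserts normally require these settings first; nonstrict mode is needed when every partition column is dynamic, as in the example above:

set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;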

Writing data into the filesystem from queries

Write query results out to the file system.
1. Standard syntax:
INSERT OVERWRITE [LOCAL] DIRECTORY directory1
  [ROW FORMAT row_format] [STORED AS file_format] (Note: Only available starting with Hive 0.11.0)
  SELECT ... FROM ...
 
2. Multiple inserts:
FROM from_statement
INSERT OVERWRITE [LOCAL] DIRECTORY directory1 select_statement1
[INSERT OVERWRITE [LOCAL] DIRECTORY directory2 select_statement2] ...
 
# Single export
insert overwrite local directory 'linux path' select * from t_name;
# Multiple exports
from t_name
insert overwrite local directory 'linux path' select * where ...
insert overwrite local directory 'linux path' select * where ...

Inserting values into tables from SQL

Use the INSERT ... VALUES statement to insert data into tables directly.

INSERT INTO TABLE tablename [PARTITION (partcol1[=val1], partcol2[=val2] ...)] VALUES values_row [, values_row ...]

Examples:
Static:
INSERT INTO TABLE pageviews PARTITION (datestamp = '2014-09-23')
  VALUES ('jsmith', 'mail.com', 'sports.com'), ('jdoe', 'mail.com', null);
Dynamic:
INSERT INTO TABLE pageviews PARTITION (datestamp)
  VALUES ('tjohnson', 'sports.com', 'finance.com', '2014-09-23'), ('tlee', 'finance.com', null, '2014-09-21');

Update/Delete

Avoid modifying data in Hive whenever possible.
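
If row-level changes are unavoidable, UPDATE and DELETE only work on transactional (ACID) tables stored as ORC with the transaction manager enabled; a minimal sketch under those assumptions (table name is hypothetical):

-- requires ACID support to be configured (hive.support.concurrency, transaction manager)
create table student_acid (id int, name string)
stored as orc
tblproperties ('transactional'='true');
update student_acid set name = 'tom' where id = 1;
delete from student_acid where id = 2;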

Merge

MERGE has been supported since Hive 2.2; it is not used here.

DQL (Queries)

Select

SELECT [ALL | DISTINCT] select_expr, select_expr, ...
  FROM table_reference
  [WHERE where_condition]
  [GROUP BY col_list]
  [ORDER BY col_list]
  [CLUSTER BY col_list
    | [DISTRIBUTE BY col_list] [SORT BY col_list]
  ]
 [LIMIT [offset,] rows]

Where

SELECT * FROM sales WHERE amount > 10 AND region = "US"

ALL and DISTINCT

The default is ALL, which returns all matching rows.
DISTINCT removes duplicate rows from the result.

Partition Based Queries


SELECT page_views.*
FROM page_views
WHERE page_views.date >= '2008-03-01' AND page_views.date <= '2008-03-31'

HAVING

CREATE TABLE t1(col1 int, col2 int);

SELECT 
	col1 
FROM 
	t1 
GROUP BY 
	col1 
HAVING 
	SUM(col2) > 10

LIMIT

// The LIMIT clause constrains the number of rows returned by a SELECT statement.
SELECT * FROM customers LIMIT 5

Group By

Every expression in the SELECT list must be either one of the GROUP BY keys (a below) or an aggregate function over the remaining columns (sum(b) below).

CREATE TABLE t1(a INTEGER, b INTEGER);

SELECT
   a,
   sum(b)
FROM
   t1
GROUP BY
   a;

Order, Sort, Cluster, and Distribute By

order by: performs a global sort over the whole input, so only one reducer is used (multiple reducers cannot guarantee a total order).
With a single reducer, a large input therefore takes a long time to sort.

sort by: not a global sort; the sorting is done before the data enters each reducer. If sort by is used
and mapred.reduce.tasks > 1, sort by only guarantees that each reducer's output is sorted, not that the overall output is.

distribute by: controls how the map output is split among the reducers. Hive distributes rows to reducers
according to the distribute by columns, by default with a hash algorithm; sort by then produces one sorted file per reducer.
Sometimes you need to control which reducer a particular row goes to, usually for a subsequent aggregation,
and distribute by does exactly that. For this reason distribute by is often used together with sort by.

Note: typical scenarios for distribute by and sort by
1. Map output files of uneven size.
2. Reduce output files of uneven size.
3. Too many small files.
4. Oversized files.

cluster by: combines the behavior of distribute by and sort by on the same columns,
but the rows are sorted in ascending order only; ASC or DESC cannot be specified.
distribute by + sort by example:
year	tempra
2008	30`C
2008	35`C
2008	32.5`C
2008	31.5`C
2008	31`C
2015	41`C
2015	39`C
2015	36`C
2015	33`C
2015	35`C
2015	37`C
Sort the weather data by year and temperature, ensuring that all rows with the same year end up in the same reduce partition.
select * from temperature distribute by year sort by year asc,tempra desc;
year	tempra
2008	35`C
2008	32.5`C
2008	31`C
2008	31.5`C
2008	30`C
2015	41`C
2015	39`C
2015	37`C
2015	36`C
2015	35`C
2015	33`C
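
When the distribute column and the sort column are the same and ascending order is acceptable, cluster by is a shorthand; a sketch equivalent to distributing and sorting by year only:

select * from temperature cluster by year;
-- same as:
select * from temperature distribute by year sort by year;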

JOIN

# Inner join: the intersection of the two tables
select a.*,b.* from a inner join b on a.id=b.id;  

# Left outer join: the table on the left of JOIN is the base table; all of its rows are returned, matching rows from the right table are filled in, and non-matches are padded with NULL
select a.*,b.* from a left join b on a.id=b.id;

# Right outer join: the table on the right of JOIN is the base table
select a.*,b.* from a right join b on a.id=b.id;

# Full outer join: the union of the two tables
select a.*,b.* from a full join b on a.id=b.id;

# Left semi join: an efficient way to express an uncorrelated IN/EXISTS subquery.
# Returns the rows of the left table that have a match in the right table
select * from a left semi join b on a.id=b.id;
Equivalent to:
select * from a where a.id in (select b.id from b)

# Chaining JOINs across multiple tables
SELECT a.val, b.val, c.val FROM a JOIN b ON (a.key = b.key1) JOIN c ON (c.key = b.key2)

Union

UNION combines the results of multiple SELECT statements into a single result set.
The SELECT statements must be compatible: each must return the same number of columns with the same names.

select * from a union select * from b;
select * from a union all select * from b;

union: combines the results and removes duplicate rows
union all: combines the results without removing duplicates