Hive - 基础2 DDL&DML& Function
-
DDL
`CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name -- (Note: TEMPORARY available in Hive ``0.14``.``0` `and later)`` ``[(col_name data_type [COMMENT col_comment], ... [constraint_specification])]`` ``[COMMENT table_comment]`` ``[PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]`` ``[CLUSTERED BY (col_name, col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]`` ``[SKEWED BY (col_name, col_name, ...) -- (Note: Available in Hive ``0.10``.``0` `and later)]`` ``ON ((col_value, col_value, ...), (col_value, col_value, ...), ...)`` ``[STORED AS DIRECTORIES]`` ``[`` ``[ROW FORMAT row_format] `` ``[STORED AS file_format]`` ``| STORED BY ``'storage.handler.class.name'` `[WITH SERDEPROPERTIES (...)] -- (Note: Available in Hive ``0.6``.``0` `and later)`` ``]`` ``[LOCATION hdfs_path]`` ``[TBLPROPERTIES (property_name=property_value, ...)] -- (Note: Available in Hive ``0.6``.``0` `and later)`` ``[AS select_statement]; -- (Note: Available in Hive ``0.5``.``0` `and later; not supported ``for` `external tables)` `CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name`` ``LIKE existing_table_or_view_name`` ``[LOCATION hdfs_path];` `data_type`` ``: primitive_type`` ``| array_type`` ``| map_type`` ``| struct_type`` ``| union_type -- (Note: Available in Hive ``0.7``.``0` `and later)` `primitive_type`` ``: TINYINT`` ``| SMALLINT`` ``| INT`` ``| BIGINT`` ``| BOOLEAN`` ``| FLOAT`` ``| DOUBLE`` ``| DOUBLE PRECISION -- (Note: Available in Hive ``2.2``.``0` `and later)`` ``| STRING`` ``| BINARY -- (Note: Available in Hive ``0.8``.``0` `and later)`` ``| TIMESTAMP -- (Note: Available in Hive ``0.8``.``0` `and later)`` ``| DECIMAL -- (Note: Available in Hive ``0.11``.``0` `and later)`` ``| DECIMAL(precision, scale) -- (Note: Available in Hive ``0.13``.``0` `and later)`` ``| DATE -- (Note: Available in Hive ``0.12``.``0` `and later)`` ``| VARCHAR -- (Note: Available in Hive ``0.12``.``0` `and later)`` ``| CHAR -- (Note: Available in Hive ``0.13``.``0` `and later)` `array_type`` ``: ARRAY < data_type >` `map_type`` ``: MAP < primitive_type, data_type >` `struct_type`` ``: STRUCT < col_name : data_type [COMMENT col_comment], ...>` `union_type`` ``: UNIONTYPE < data_type, data_type, ... > -- (Note: Available in Hive ``0.7``.``0` `and later)` `row_format`` ``: DELIMITED [FIELDS TERMINATED BY ``char` `[ESCAPED BY ``char``]] [COLLECTION ITEMS TERMINATED BY ``char``]`` ``[MAP KEYS TERMINATED BY ``char``] [LINES TERMINATED BY ``char``]`` ``[NULL DEFINED AS ``char``] -- (Note: Available in Hive ``0.13` `and later)`` ``| SERDE serde_name [WITH SERDEPROPERTIES (property_name=property_value, property_name=property_value, ...)]` `file_format:`` ``: SEQUENCEFILE`` ``| TEXTFILE -- (Default, depending on hive.``default``.fileformat configuration)`` ``| RCFILE -- (Note: Available in Hive ``0.6``.``0` `and later)`` ``| ORC -- (Note: Available in Hive ``0.11``.``0` `and later)`` ``| PARQUET -- (Note: Available in Hive ``0.13``.``0` `and later)`` ``| AVRO -- (Note: Available in Hive ``0.14``.``0` `and later)`` ``| JSONFILE -- (Note: Available in Hive ``4.0``.``0` `and later)`` ``| INPUTFORMAT input_format_classname OUTPUTFORMAT output_format_classname` `constraint_specification:`` ``: [, PRIMARY KEY (col_name, ...) DISABLE NOVALIDATE ]`` ``[, CONSTRAINT constraint_name FOREIGN KEY (col_name, ...) REFERENCES table_name(col_name, ...) DISABLE NOVALIDATE `
- 创建表
# Alter Column ALTER TABLE table_name [PARTITION partition_spec] CHANGE [COLUMN] col_old_name col_new_name column_type [COMMENT col_comment] [FIRST|AFTER column_name] [CASCADE|RESTRICT];
# 正常创建 CREATE TABLE IF NOT EXISTS hive_emp(EMPNO int,ENAME string,JOB string,MGR int,HIREDATE date,SAL int,COMM int,DEPTNO int)ROW FORMAT DELIMITED FIELDS TERMINATED BY ','; # 复制表结构 CREATE TABLE IF NOT EXISTS hive_emp_copy LIKE hive_emp; # CTAS复制表结构和内容 - create table as select CREATE TABLE emp_ctas AS SELECT * FROM hive_emp;
- 修改表
#Alter Table Constraints ALTER TABLE table_name ADD CONSTRAINT constraint_name PRIMARY KEY (column, ...) DISABLE NOVALIDATE; ALTER TABLE table_name ADD CONSTRAINT constraint_name FOREIGN KEY (column, ...) REFERENCES table_name(column, ...) DISABLE NOVALIDATE RELY; ALTER TABLE table_name DROP CONSTRAINT constraint_name; ... # Alter Column ALTER TABLE table_name [PARTITION partition_spec] CHANGE [COLUMN] col_old_name col_new_name column_type [COMMENT col_comment] [FIRST|AFTER column_name] [CASCADE|RESTRICT];
# 修改表名 ALTER TABLE hive_emp RENAME TO emp; #修改列信息 ALTER TABLE hive_emp_copy CHANGE COLUMN ENAME EMP_NAME string COMMENT '员工姓名'
- 删除表
# 删除表数据 TRUNCATE TABLE hive_emp_copy; # 删除表结构和数据 DROP TABLE IF EXISTS hive_emp_copy;
Hive外部表和内部表
Hive默认创建的表为内部表,内部表被drop,直接啥都没了
外部表需要手动指定external关键字,drop之后,数据库数据还在,一般创建表时会指定Location进行存放数据
-
DML
- Load加载数据
LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)] LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)] [INPUTFORMAT 'inputformat' SERDE 'serde'] (3.0 or later)
LOAD DATA LOCAL INPATH '/emp.data' OVERWRITE INTO TABLE emp
- insert data into table from query 从查询结果集导入数据
Standard syntax: INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]] select_statement1 FROM from_statement; INSERT INTO TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)] select_statement1 FROM from_statement; Hive extension (multiple inserts): FROM from_statement INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]] select_statement1 [INSERT OVERWRITE TABLE tablename2 [PARTITION ... [IF NOT EXISTS]] select_statement2] [INSERT INTO TABLE tablename2 [PARTITION ...] select_statement2] ...; FROM from_statement INSERT INTO TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)] select_statement1 [INSERT INTO TABLE tablename2 [PARTITION ...] select_statement2] [INSERT OVERWRITE TABLE tablename2 [PARTITION ... [IF NOT EXISTS]] select_statement2] ...; Hive extension (dynamic partition inserts): INSERT OVERWRITE TABLE tablename PARTITION (partcol1[=val1], partcol2[=val2] ...) select_statement FROM from_statement; INSERT INTO TABLE tablename PARTITION (partcol1[=val1], partcol2[=val2] ...) select_statement FROM from_statement;
INSERT OVERWRITE INTO TABLE emp_ctas SELECT * FROM emp
- Writing data into the filesystem from queries 导出数据
Standard syntax: INSERT OVERWRITE [LOCAL] DIRECTORY directory1 [ROW FORMAT row_format] [STORED AS file_format] (Note: Only available starting with Hive 0.11.0) SELECT ... FROM ... Hive extension (multiple inserts): FROM from_statement INSERT OVERWRITE [LOCAL] DIRECTORY directory1 select_statement1 [INSERT OVERWRITE [LOCAL] DIRECTORY directory2 select_statement2] ... row_format : DELIMITED [FIELDS TERMINATED BY char [ESCAPED BY char]] [COLLECTION ITEMS TERMINATED BY char] [MAP KEYS TERMINATED BY char] [LINES TERMINATED BY char] [NULL DEFINED AS char] (Note: Only available starting with Hive 0.13)
INSERT OVERWRITE LOCAL DIRECTORY '/hive/data/emp' ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' SELECT * FROM emp;
- export/import 导出/导入表结构和数据
EXPORT TABLE tablename [PARTITION (part_column="value"[, ...])] TO 'export_target_path' [ FOR replication('eventid') ] IMPORT [[EXTERNAL] TABLE new_or_original_tablename [PARTITION (part_column="value"[, ...])]] FROM 'source_path' [LOCATION 'import_target_path']
EXPORT TABLE emp TO '/hive/data/emp'; IMPORT EXTERNAL TABLE emp_import FROM '/hive/data/emp' LOCATION '/hive/db/external/emp_import'
-
内置函数
hive提供了很多的内置函数,具体看官方文档
show functions;
desc function split;
desc function extended explode;
返回类型 | 签名 | 描述 |
---|---|---|
BIGINT | round(double a) | 返回BIGINT最近的double值。 |
BIGINT | floor(double a) | 返回最大BIGINT值等于或小于double。 |
BIGINT | ceil(double a) | 它返回最小BIGINT值等于或大于double。 |
string | concat(string A, string B,…) | 它返回从A后串联B产生的字符串 |
string | substr(string A, int start) | 它返回一个起始,从起始位置的子字符串,直到A.结束 |
string | substr(string A, int start, int length) | 返回从给定长度的起始start位置开始的字符串。 |
string | upper(string A) | 它返回从转换的所有字符为大写产生的字符串。 |
value of <type> | cast(<expr> as <type>) | 它把表达式的结果expr<类型>如cast(‘1’作为BIGINT)代表整体转换为字符串’1’。如果转换不成功,返回的是NULL。 |
string | from_unixtime(int unixtime) | 转换的秒数从Unix纪元(1970-01-0100:00:00 UTC)代表那一刻,在当前系统时区的时间戳字符的串格式:“1970-01-01 00:00:00” |
string | to_date(string timestamp) | 返回一个字符串时间戳的日期部分:to_date(“1970-01-01 00:00:00”) = “1970-01-01” |
int | year(string date) | 返回年份部分的日期或时间戳字符串:year(“1970-01-01 00:00:00”) = 1970, year(“1970-01-01”) = 1970 |
int | month(string date) | 返回日期或时间戳记字符串月份部分:month(“1970-11-01 00:00:00”) = 11, month(“1970-11-01”) = 11 |
int | day(string date) | 返回日期或时间戳记字符串当天部分:day(“1970-11-01 00:00:00”) = 1, day(“1970-11-01”) = 1 |
string | get_json_object(string json_string, string path) | 提取从基于指定的JSON路径的JSON字符串JSON对象,并返回提取的JSON字符串的JSON对象。如果输入的JSON字符串无效,返回NULL。 |
T | explode(ARRAY<T> a) | Explodes an array to multiple rows. Returns a row-set with a single column (col), one row for each element from the array.行转列 |
# 2.0
SELECT round(1.535);
# 2
SELECT ceil(1.535);
# 2
SELECT ceiling(1.535);
SELECT concat ('abc',12345);
# 正向取bcdefg12345
SELECT substr('abcdefg12345',2);
# 逆向取 45
SELECT substr('abcdefg12345',-2);
# abcde,下标从1开始
SELECT substr('abcdefg12345',1,5);
# substring同substr
SELECT substring('abcdefg12345',1,5);
SELECT upper('abCDefG');
SELECT cast('123' as int);
# 2019-07-23 11:23:18
SELECT from_unixtime(1563852198);
# 2019-07-23SELECT year('2019-07-23 11:23:18');
SELECT month('2019-07-23 11:23:18');
SELECT day('2019-07-23 11:23:18');
SELECT to_date('2019-07-23 11:23:18');
SELECT year('2019-07-23 11:23:18');
SELECT month('2019-07-23 11:23:18');
SELECT day('2019-07-23 11:23:18');
# 提取json
# [{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}]
SELECT get_json_object('{
"store": {
"fruit": [{
"weight": 8,
"type": "apple"
}, {
"weight": 9,
"type": "pear"
}],
"bicycle": {
"price": 19.95,
"color": "red"
}
},
"email": "amy@only_for_json_udf_test.net",
"owner": "amy"
}','$.store.fruit') fruit;
SELECT explode(split('Explodes an array to multiple rows. Returns a row-set with a single column (col), one row for each element from the array.',' ')) as word;
- 一个小的wordcount例子
SELECT tmp.word,count(*) as count_num FROM (
SELECT explode(split('Explodes an array to multiple rows. Returns a row-set with a single column (col), one row for each element from the array.',' ')) word
) tmp group by tmp.word