1、插入于法 insert into 和insert overwrite
INSERT INTO TABLE tablename1
[PARTITION (partcol1=val1, partcol2=val2 ...)]
select_statement1 FROM from_statement;
INSERT OVERWRITE TABLE tablename1
[PARTITION (partcol1=val1, partcol2=val2 ...)
[IF NOT EXISTS]]
select_statement1 FROM from_statement;
INSERT INTO
用于附加数据到配置单元表和分区表和,INSERT OVERWRITE
用来从表中删除现有的数据并插入新的数据。
2、静态插入
2.1 无分区插入
出入新增值
INSERT INTO employee VALUES
(13,'Mari',51,'M'),
(14,'Pat',34,'F');
从查询语句中插入
INSERT INTO employee_tmp SELECT * FROM employee;
2.2 带分区表
2.2.1插入特定分区
INSERT INTO zipcodes PARTITION(state='FL') VALUES
(891,'US','TAMPA',33605);
3、动态分区插入
在插入前,需要开启hive的动态插入功能,
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
在这里,必须将partition列保留为最后一列。
动态分区插入主要是由于如果分区太多,手动写入分区很麻烦,因此可以基于查询参数推测除需要创建的分区。
insert overwrite table sport2 partition(year) select split(n1,'\t')[0],split(n1,'\t')[2],split(n1,'\t')[3],split(n1,'\t')[4],split(n1,'\t')[5],split(n1,'\t')[6],split(n1,'\t')[7],split(n1,'\t')[1] from sport;
在上面的语句中,partition(year)
和在查询中和分区匹配的split(n1,'\t')[1]
名称是不一样的,这也说明了在查询插入过程中,是基于位置插入,而不是名称插入。
4、静动混合
如果同时使用静态和动态,这个必须静态在前,动态在后。
INSERT OVERWRITE TABLE order_partition
partition (year='2019',month)
SELECT order_id, order_date, order_status,
substr(order_date,1,4) ye, substr(order_date,5,2) mon
FROM orders
WHERE substr(order_date,1,4) ="2019";
单个查询语句中创建表并加载数据
create table 2012_sport as select n1,n2,n3 from sport2 where year=2012;
describe formatted 2012_sport;
OK
2021-05-18 21:05:02,303 INFO [9b64abd8-c822-4803-ab03-dd4a27670202 main] mapred.FileInputFormat: Total input files to process : 1
# col_name data_type comment
n1 string
n2 string
n3 string
# Detailed Table Information
Database: default
Owner: root
CreateTime: Tue May 18 21:03:22 CST 2021
LastAccessTime: UNKNOWN
Retention: 0
Location: hdfs://hacluster/user/hive/warehouse/2012_sport
Table Type: MANAGED_TABLE
Table Parameters:
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
numFiles 1
numRows 74
rawDataSize 1768
totalSize 1842
transient_lastDdlTime 1621343003
# Storage Information
SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
InputFormat: org.apache.hadoop.mapred.TextInputFormat
OutputFormat: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
Compressed: No
Num Buckets: -1
Bucket Columns: []
Sort Columns: []
Storage Desc Params:
serialization.format 1
Time taken: 0.385 seconds, Fetched: 32 row(s)
这个功能适合从一个宽表中导出一个小表
导出数据
insert overwrite local directory '/tmp/ca_eemm'
select name from sport2