Loading files into tables
CREATE TABLE employee LIKE sensor.employee;
LOAD DATA LOCAL INPATH '/home/hadoop-jrq/hive-course/employee.txt' OVERWRITE INTO TABLE employee;
LOAD DATA LOCAL INPATH '/home/hadoop-jrq/hive-course/employee.txt' INTO TABLE employee; -- without OVERWRITE the data is appended
CREATE TABLE sensor_format LIKE sensor.sensor_format;
--1. The path may be a directory (all files in it are loaded)
--2. The source files are moved into the table's LOCATION
LOAD DATA INPATH 'hdfs://master:8020/user/hadoop-jrq/hive-course/omneo' OVERWRITE INTO TABLE sensor_format;
LOAD DATA INPATH 'hdfs://master:8020/user/hadoop-jrq/hive-course/omneo' INTO TABLE sensor_format;
CREATE TABLE sensor_managed_partition LIKE sensor.sensor_managed_partition;
LOAD DATA INPATH 'hdfs://master:8020/user/hadoop-jrq/hive-course/omneo' OVERWRITE INTO TABLE sensor_managed_partition PARTITION(year=2019, month=201908, day=20190830);
LOAD DATA INPATH 'hdfs://master:8020/user/hadoop-jrq/hive-course/omneo' INTO TABLE sensor_managed_partition PARTITION(year=2019, month=201908, day=20190830);
--Syntax:
LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2, ...)]
INSERT data into Hive tables from queries
LOAD DATA moves the source files away, so they no longer exist at their original path; to keep the original files, load the data with INSERT ... SELECT instead.
-- A pattern frequently used in ETL: load data through a staging table
--Create an external table stored in Hive's default warehouse location, in PARQUET format
CREATE EXTERNAL TABLE IF NOT EXISTS dml.sensor(
id STRING,
event_id STRING,
event_type STRING,
part_name STRING,
part_number STRING,
version STRING,
payload STRING)
STORED AS PARQUET;
--Create a staging external table with the same columns
CREATE EXTERNAL TABLE IF NOT EXISTS sensor_temp (
id STRING,
event_id STRING,
event_type STRING,
part_name STRING,
part_number STRING,
version STRING,
payload STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION 'hdfs://master:8020/user/hadoop-jrq/hive-course/omneo-temp/';
--Load the data from sensor_temp into sensor; any existing data in sensor is overwritten by the sensor_temp data
-- This achieves two things: the original files are preserved, and the file format is converted (text to Parquet here)
INSERT OVERWRITE TABLE sensor SELECT * FROM sensor_temp;
--Append the data from sensor_temp to the sensor table
INSERT INTO TABLE sensor SELECT * FROM sensor_temp;
--Syntax:
INSERT OVERWRITE|INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2, ...) [IF NOT EXISTS]] SELECT select_statement FROM from_statement
--Create a partitioned table
CREATE EXTERNAL TABLE IF NOT EXISTS sensor_partition(
id STRING,
event_id STRING,
event_type STRING,
part_name STRING,
part_number STRING,
version STRING,
payload STRING)
PARTITIONED BY(year INT, month INT, day INT)
STORED AS PARQUET;
--Load the data from sensor_temp into partition (year=2019, month=201908, day=20190830) of sensor_partition
INSERT OVERWRITE TABLE sensor_partition PARTITION (year=2019, month=201908, day=20190830) SELECT * FROM sensor_temp; -- load into the specified partition
--Append the data from sensor_temp to a specific partition of sensor_partition
INSERT INTO TABLE sensor_partition PARTITION (year=2019, month=201908, day=20190830) SELECT * FROM sensor_temp;
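A sketch of the [IF NOT EXISTS] option from the syntax above: with a fully specified partition, IF NOT EXISTS skips the overwrite if that partition already exists.
INSERT OVERWRITE TABLE sensor_partition PARTITION (year=2019, month=201908, day=20190830) IF NOT EXISTS SELECT * FROM sensor_temp;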
-- Another example: create a table
CREATE EXTERNAL TABLE IF NOT EXISTS sensor_group_20190830(
id STRING,
event_type STRING,
part_name STRING,
cnt INT)
STORED AS AVRO;
-- Populate it from one partition of another table
INSERT OVERWRITE TABLE sensor_group_20190830
SELECT
id, event_type, part_name, count(*) AS cnt
FROM sensor_partition WHERE day = 20190830
GROUP BY id, event_type, part_name; -- aggregate by these three columns
--CTAS: equivalent to the CREATE + INSERT above, but it can only create managed (internal) tables
CREATE TABLE IF NOT EXISTS sensor_group_20190830_a
STORED AS AVRO
AS SELECT
id, event_type, part_name, count(*) AS cnt
FROM sensor_partition WHERE day = 20190830
GROUP BY id, event_type, part_name;
-- Using multiple INSERT clauses (multi-insert)
CREATE EXTERNAL TABLE IF NOT EXISTS sensor_alert LIKE sensor_partition;
FROM sensor_temp
INSERT OVERWRITE TABLE sensor_partition PARTITION (year=2019, month=201908, day=20190830) SELECT id, event_id, event_type, part_name, part_number, version, payload
INSERT OVERWRITE TABLE sensor_alert PARTITION (year=2019, month=201908, day=20190830) SELECT id, event_id, event_type, part_name, part_number, version, payload WHERE event_type='ALERT';
--Syntax:
FROM from_statement
INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2, ...) [IF NOT EXISTS]] SELECT select_statement1
[INSERT OVERWRITE TABLE tablename2 [PARTITION (partcol1=val1, partcol2=val2, ...) [IF NOT EXISTS]] SELECT select_statement2]
[INSERT INTO TABLE tablename2 [PARTITION (partcol1=val1, partcol2=val2, ...)] SELECT select_statement2]
...;
Dynamic and static partitions
Note:
If every partition column is dynamic (no static partition column), the insert fails under the default strict mode; setting the following two parameters fixes it:
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
--Static partitioning
INSERT OVERWRITE TABLE sensor_partition PARTITION (year=2019, month=201908, day=20190809) SELECT * FROM sensor_temp;
CREATE EXTERNAL TABLE IF NOT EXISTS sensor_event_type_partition (
id STRING,
event_id STRING,
part_name STRING,
part_number STRING,
version STRING,
payload STRING)
PARTITIONED BY(year INT, month INT, event_type STRING)
STORED AS PARQUET;
--Dynamic partition insert: year is static; month and event_type are dynamic
INSERT OVERWRITE TABLE sensor_event_type_partition PARTITION (year=2019, month, event_type) SELECT id, event_id, part_name, part_number, version, payload, month, event_type FROM sensor_partition; -- month and event_type must appear in the SELECT, as its last two columns, in partition-column order
In other words: year=2019 is a static, fixed value, whereas writing just year makes it a dynamic partition column whose values come from the query results.
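A sketch of the fully dynamic case the note above warns about (no static partition column at all), assuming the tables defined earlier:
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
--all three partition columns are dynamic; their values come from the last three SELECT columns
INSERT OVERWRITE TABLE sensor_event_type_partition PARTITION (year, month, event_type)
SELECT id, event_id, part_name, part_number, version, payload, year, month, event_type
FROM sensor_partition;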
Writing data into the filesystem from queries
--For large data volumes, write to files on HDFS
INSERT OVERWRITE DIRECTORY '/user/hadoop-jrq/hive-course/employee'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY ','
MAP KEYS TERMINATED BY ':'
STORED AS TEXTFILE
SELECT * FROM dml.employee;
--For smaller data volumes, write to the local filesystem
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop-jrq/hive-course/employee'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY ','
MAP KEYS TERMINATED BY ':'
STORED AS TEXTFILE
SELECT * FROM dml.employee;
-- Export Hive data into an HDFS directory as PARQUET files
INSERT OVERWRITE DIRECTORY '/user/hadoop-jrq/hive-course/employee-parquet'
STORED AS PARQUET
SELECT * FROM dml.employee;
Inserting values into tables from SQL
-- Write a single row into a partition; the VALUES list supplies every column of the table in order
INSERT INTO TABLE sensor_partition PARTITION (year=2019, month=201908, day=20190830) VALUES("testid", "testEventId", "event_type", "part_name", "part_number", "version", "payload");
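A sketch of the multi-row form (supported since Hive 0.14; the values here are made up): several rows can be written in one statement.
INSERT INTO TABLE sensor_partition PARTITION (year=2019, month=201908, day=20190830)
VALUES ("id1", "event1", "ALERT", "part1", "num1", "v1", "payload1"),
       ("id2", "event2", "STATUS", "part2", "num2", "v2", "payload2");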
Two ways to improve Hive performance
Bucketing
Motivation:
--The query below filters fast on the date partition, but when a single partition holds a lot of data, filtering on id within it is still slow
SELECT * FROM dml.sensor_partition WHERE day = 20190830 AND id = 'f277';
-- Bucketing can improve the performance of the SQL above
CREATE TABLE dml.sensor_partition_bucket(
id STRING,
event_id STRING,
event_type STRING,
part_name STRING,
part_number STRING,
version STRING,
payload STRING)
PARTITIONED BY(year INT, month INT, day INT)
CLUSTERED BY(id) INTO 12 BUCKETS -- split the data into 12 buckets by the hash of id
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
SET hive.enforce.bucketing = true; -- must be set before loading, otherwise rows are not distributed into their buckets and bucket-based queries return no data; no longer needed as of Hive 2.x
SELECT * FROM dml.sensor_partition_bucket WHERE day = 20190830 AND id = 'f277';
-- Load the data
INSERT OVERWRITE TABLE dml.sensor_partition_bucket PARTITION (year=2019, month=201908, day=20190830) SELECT * FROM dml.sensor_temp;
--Because the bucket is chosen by "f277".hashCode() % 12, the statement below only needs to scan the single file /user/hive/warehouse/dml.db/sensor_partition_bucket/year=2019/month=201908/day=20190830/000010_0
SELECT * FROM dml.sensor_partition_bucket WHERE day = 20190830 AND id = 'f277';
--Adding SORTED BY(event_time) keeps the rows within each bucket sorted by event_time in ascending order
CREATE TABLE sensor_partition_bucket(
id STRING,
event_id STRING,
event_type STRING,
part_name STRING,
part_number STRING,
version STRING,
payload STRING,
event_time STRING)
PARTITIONED BY(year INT, month INT, day INT)
CLUSTERED BY(id) SORTED BY(event_time) INTO 12 BUCKETS
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
Two uses of bucketed tables
1. Data sampling
--TABLESAMPLE is the sampling clause; syntax: TABLESAMPLE(BUCKET x OUT OF y ON bucket_column)
--y must be a multiple or a factor of the table's total bucket count; Hive derives the sampling fraction from y.
--For example, with 12 buckets in total: y=6 samples 12/6 = 2 buckets of data; y=24 samples 12/24 = 1/2 of a bucket.
--x is the bucket to start sampling from. For example, with 12 buckets, TABLESAMPLE(BUCKET 4 OUT OF 6) samples 12/6 = 2 buckets of data:
--the 4th bucket and the (4+6=)10th bucket.
SELECT * FROM sensor_partition_bucket TABLESAMPLE(BUCKET 4 OUT OF 6 ON id);
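A sketch of sampling on a different expression: TABLESAMPLE also accepts ON rand() for a random subset, but because rand() is not the bucketing column, bucket pruning cannot be used and the whole table is scanned.
SELECT * FROM sensor_partition_bucket TABLESAMPLE(BUCKET 1 OUT OF 12 ON rand());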
2. Speeding up certain query operations, such as map-side joins, as sketched below
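A minimal sketch of a bucket map join, assuming a hypothetical lookup table sensor_meta_bucket (with a made-up part_desc column) bucketed on the same key into a compatible bucket count:
set hive.optimize.bucketmapjoin = true;
--hypothetical lookup table, bucketed on id like sensor_partition_bucket
CREATE TABLE sensor_meta_bucket(id STRING, part_desc STRING)
CLUSTERED BY(id) INTO 12 BUCKETS
STORED AS TEXTFILE;
--the MAPJOIN hint lets Hive join bucket against bucket on the map side, avoiding a shuffle
SELECT /*+ MAPJOIN(m) */ b.id, b.event_type, m.part_desc
FROM dml.sensor_partition_bucket b
JOIN sensor_meta_bucket m ON b.id = m.id;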
Skewed tables (for mitigating data skew)
Scenario: a handful of values in one or a few columns account for a disproportionately large share of a table's rows
CREATE TABLE dml.skewed_single (key STRING, value STRING)
SKEWED BY (key) ON ("1", "5", "6") -- 1, 5 and 6 are the heavily over-represented keys
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
-- Create an external staging table for loading the data
CREATE EXTERNAL TABLE skewed_test_tmp (
key STRING,
value STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/user/hadoop-jrq/hive-course/skewed';
--A skewed table by itself only records the skew metadata
INSERT OVERWRITE TABLE dml.skewed_single SELECT * FROM dml.skewed_test_tmp;
For a more visible performance gain:
A list bucketing table is a skewed table that additionally tells Hive to create a subdirectory for each skewed value.
CREATE TABLE list_bucket (key STRING, value STRING)
SKEWED BY (key) ON ("1", "5", "6") STORED AS DIRECTORIES -- create a subdirectory for each skewed value
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
-- Load the data; the skewed values end up stored in separate subdirectories
INSERT OVERWRITE TABLE dml.list_bucket SELECT * FROM dml.skewed_test_tmp;
--The following properties must be set, otherwise the query returns no data: unlike partitions, the subdirectories are not read by default
set hive.mapred.supports.subdirectories=true;
set mapred.input.dir.recursive=true;
SELECT * FROM dml.list_bucket WHERE key=1; --only scans the directory /user/hive/warehouse/dml.db/list_bucket/key=1
--Syntax:
CREATE TABLE table_name (key STRING, value STRING)
SKEWED BY (key) ON (1,5,6) [STORED AS DIRECTORIES];
Summary: the full CREATE TABLE syntax
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name -- (Note: TEMPORARY available in Hive 0.14.0 and later)
[(col_name data_type [COMMENT col_comment], ... [constraint_specification])]
[COMMENT table_comment]
[PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
[CLUSTERED BY (col_name, col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
[SKEWED BY (col_name, col_name, ...) -- (Note: Available in Hive 0.10.0 and later)]
ON ((col_value, col_value, ...), (col_value, col_value, ...), ...)
[STORED AS DIRECTORIES]
[
[ROW FORMAT row_format]
[STORED AS file_format]
| STORED BY 'storage.handler.class.name' [WITH SERDEPROPERTIES (...)] -- (Note: Available in Hive 0.6.0 and later)
]
[LOCATION hdfs_path]
[TBLPROPERTIES (property_name=property_value, ...)] -- (Note: Available in Hive 0.6.0 and later)
[AS select_statement]; -- (Note: Available in Hive 0.5.0 and later; not supported for external tables)
-- If another table has a similar structure, the new table can be created with LIKE
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
LIKE existing_table_or_view_name
[LOCATION hdfs_path];
Hive and HBase integration
1. Start HBase
2. How Hive interacts with HBase
Hive must be started with a few extra parameters and the dependent jars:
hive --auxpath $HIVE_HOME/lib/zookeeper-3.4.6.jar,$HIVE_HOME/lib/hive-hbase-handler-2.3.3.jar,$HIVE_HOME/lib/hbase-server-1.1.1.jar --hiveconf hbase.zookeeper.quorum=master,slave1,slave2
-- Create a table
CREATE TABLE pokes (foo INT, bar STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
LOAD DATA LOCAL INPATH '/home/hadoop-jrq/hive-course/kv3.txt' OVERWRITE INTO TABLE pokes;
CREATE TABLE hbase_table_1(key int, value string)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' -- storage is delegated to HBase via the storage handler
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf1:val") -- column mapping: the first column maps to the row key, the second to cf1:val
TBLPROPERTIES ("hbase.table.name" = "xyz", "hbase.mapred.output.outputtable" = "xyz"); -- hbase.table.name is the name of the HBase table to create
-- Insert the pokes data into hbase_table_1
INSERT OVERWRITE TABLE hbase_table_1 SELECT * FROM pokes;
SELECT * FROM hbase_table_1;
scan 'xyz'  # run in the HBase shell to verify
--Column Mapping
--Mapping multiple columns across multiple column families
CREATE TABLE hbase_table_2(key int, value1 string, value2 int, value3 int)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES (
"hbase.columns.mapping" = ":key,a:b,a:c,d:e" -- 一一映射
);
-- Insert data derived from pokes
INSERT OVERWRITE TABLE hbase_table_2 SELECT foo, bar, foo+1, foo+2 FROM pokes;
--A Hive MAP column corresponds to an HBase column family
CREATE TABLE hbase_table_map(value map<string,int>, row_key int)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES (
"hbase.columns.mapping" = "cf:,:key"
);
INSERT OVERWRITE TABLE hbase_table_map SELECT map(bar, foo), foo FROM pokes;
--Create a Hive table over the existing HBase table sensor
-- The Avro mapping hits a Hive bug that is only fixed in Hive 3.0; non-Avro mappings do not trigger the error
CREATE EXTERNAL TABLE sensor_hbase_avro
ROW FORMAT SERDE 'org.apache.hadoop.hive.hbase.HBaseSerDe'
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES (
"hbase.columns.mapping" = ":key,v:event_id",
"v.event_id.serialization.type" = "avro",
"v.event_id.avro.schema.url" = "hdfs://master:8020/user/hadoop-jrq/hive-course/sensor.avsc")
TBLPROPERTIES (
"hbase.table.name" = "sensor",
"hbase.mapred.output.outputtable" = "sensor",
"hbase.struct.autogenerate"="true");
--The statement above fails; it is a Hive bug (see the link below):
https://issues.apache.org/jira/browse/HIVE-17829?src=confmacro