Introduction to Hive storage formats
demo
- Textfile: row-oriented storage and the default format for Hive tables. No compression is applied by default, so disk usage and parsing overhead are high; when the files are compressed with a codec such as gzip they cannot be split, which rules out parallel processing of a single file.
- ORC: hybrid row/column storage. Data is divided into stripes of rows, and within each stripe the values are stored column by column together with an index. ORC supports the none, zlib, and snappy compression options (zlib by default) and is splittable; it generally improves the read, write, and processing performance of Hive tables.
- Parquet: columnar storage, a column-oriented binary file format that is not directly human-readable. Each file contains both data and metadata, so the format is self-describing. It is efficient for large analytical queries and performs particularly well on deeply nested data. By default Hive writes Parquet uncompressed.
Hive mainly supports the gzip, zlib, snappy, and lzo compression codecs.
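The compression codec for ORC and Parquet tables can be chosen per table through TBLPROPERTIES. The snippet below is only a sketch for reference (the demo_* table names are placeholders, not tables used elsewhere in this note):
-- ORC with snappy instead of the default zlib
CREATE TABLE demo_orc_snappy (`id` int, `msg` string)
STORED AS ORC
TBLPROPERTIES ('orc.compress'='SNAPPY');
-- Parquet with snappy instead of the default uncompressed
CREATE TABLE demo_parquet_snappy (`id` int, `msg` string)
STORED AS PARQUET
TBLPROPERTIES ('parquet.compression'='SNAPPY');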
Creating a Hive table in CSV (textfile) format & loading data
CREATE EXTERNAL TABLE IF NOT EXISTS `test_format`(
`code` string COMMENT 'import id',
`vin` string COMMENT 'import name',
`vichel` string COMMENT 'import message')
row format delimited fields terminated by ','
LINES TERMINATED BY '\n'
stored as textfile;
hive> show create table test_format;
OK
CREATE EXTERNAL TABLE `test_format`(
`code` string COMMENT 'import id',
`vin` string COMMENT 'import name',
`vichel` string COMMENT 'import message')
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
'field.delim'=',',
'line.delim'='\n',
'serialization.format'=',')
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
'hdfs://emr-header-1.cluster-49091:9000/user/hive/warehouse/test_format'
TBLPROPERTIES (
'transient_lastDdlTime'='1681719612')
# load data
load data local inpath '/home/hadoop/test.txt' overwrite into table test_format;
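A quick sanity check after the load, just to confirm the rows were parsed with the comma delimiter (the output of course depends on the contents of test.txt):
hive> select * from test_format limit 5;
hive> select count(*) from test_format;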
Creating a Hive table in Parquet format
CREATE EXTERNAL TABLE IF NOT EXISTS test_parquet(
`code` string COMMENT 'import id',
`vin` string COMMENT 'import name',
`vichel` string COMMENT 'import message')
STORED AS PARQUET;
hive> show create table test_parquet;
OK
CREATE EXTERNAL TABLE `test_parquet`(
`code` string COMMENT 'import id',
`vin` string COMMENT 'import name',
`vichel` string COMMENT 'import message')
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
'hdfs://emr-header-1.cluster-49091:9000/user/hive/warehouse/test_parquet'
TBLPROPERTIES (
'transient_lastDdlTime'='1681719650')
Creating a Hive table in ORC format
CREATE EXTERNAL TABLE IF NOT EXISTS test_orc(
`code` string COMMENT 'import id',
`vin` string COMMENT 'import name',
`vichel` string COMMENT 'import message')
STORED AS ORC;
hive> show create table test_orc;
OK
CREATE EXTERNAL TABLE `test_orc`(
`code` string COMMENT 'import id',
`vin` string COMMENT 'import name',
`vichel` string COMMENT 'import message')
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
LOCATION
'hdfs://emr-header-1.cluster-49091:9000/user/hive/warehouse/test_orc'
TBLPROPERTIES (
'transient_lastDdlTime'='1681720136')
Then insert the CSV data into the Parquet and ORC tables:
insert overwrite table test_parquet select * from test_format;
insert overwrite table test_orc select * from test_format;
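To confirm the conversion actually produced ORC and Parquet data, describe formatted shows each table's SerDe, input/output format, and total data size:
hive> describe formatted test_orc;
hive> describe formatted test_parquet;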
Exporting the data to the local filesystem
insert overwrite local directory '/home/hadoop/orc' select * from test_orc;
insert overwrite local directory '/home/hadoop/parquet' select * from test_parquet;
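Note that INSERT OVERWRITE DIRECTORY writes plain text (\001-delimited) by default, regardless of the source table's format. If the goal is to export actual ORC/Parquet files, the directory clause needs an explicit STORED AS, roughly like this (same placeholder paths as above):
insert overwrite local directory '/home/hadoop/orc' stored as orc select * from test_orc;
insert overwrite local directory '/home/hadoop/parquet' stored as parquet select * from test_parquet;
Alternatively, since the table data already sits in ORC/Parquet under the table's LOCATION, the files can simply be copied out of HDFS.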
A very simple tutorial: https://www.yiibai.com/hive/hive_create_table.html
CREATE TABLE syntax reference: https://blog.csdn.net/hyj_king/article/details/126776080
Data conversion walkthrough: https://www.freesion.com/article/9733618966/
Several ways to export Hive data: https://blog.csdn.net/hyj_king/article/details/96849837
Converting the SSB data
Table schemas on the Hive side
# customer table
# csv
CREATE TABLE IF NOT EXISTS `customer_csv` (
`c_custkey` int COMMENT "",
`c_name` string COMMENT "",
`c_address` string COMMENT "",
`c_city` string COMMENT "",
`c_nation` string COMMENT "",
`c_region` string COMMENT "",
`c_phone` string COMMENT "",
`c_mktsegment` string COMMENT ""
)
row format delimited fields terminated by ','
LINES TERMINATED BY '\n'
STORED AS textfile;
# orc
CREATE TABLE IF NOT EXISTS `customer_orc` (
`c_custkey` int COMMENT "",
`c_name` string COMMENT "",
`c_address` string COMMENT "",
`c_city` string COMMENT "",
`c_nation` string COMMENT "",
`c_region` string COMMENT "",
`c_phone` string COMMENT "",
`c_mktsegment` string COMMENT ""
)
STORED AS orc;
# parquet
CREATE TABLE IF NOT EXISTS `customer_parquet` (
`c_custkey` int COMMENT "",
`c_name` string COMMENT "",
`c_address` string COMMENT "",
`c_city` string COMMENT "",
`c_nation` string COMMENT "",
`c_region` string COMMENT "",
`c_phone` string COMMENT "",
`c_mktsegment` string COMMENT ""
)
STORED AS parquet;
dates table
CREATE TABLE IF NOT EXISTS `dates_csv` (
`d_datekey` int COMMENT "",
`d_date` string COMMENT "",
`d_dayofweek` string COMMENT "",
`d_month` string COMMENT "",
`d_year` int COMMENT "",
`d_yearmonthnum` int COMMENT "",
`d_yearmonth` string COMMENT "",
`d_daynuminweek` int COMMENT "",
`d_daynuminmonth` int COMMENT "",
`d_daynuminyear` int COMMENT "",
`d_monthnuminyear` int COMMENT "",
`d_weeknuminyear` int COMMENT "",
`d_sellingseason` string COMMENT "",
`d_lastdayinweekfl` int COMMENT "",
`d_lastdayinmonthfl` int COMMENT "",
`d_holidayfl` int COMMENT "",
`d_weekdayfl` int COMMENT ""
)
row format delimited fields terminated by ','
LINES TERMINATED BY '\n'
STORED AS textfile;
CREATE TABLE IF NOT EXISTS `dates_orc` (
`d_datekey` int COMMENT "",
`d_date` string COMMENT "",
`d_dayofweek` string COMMENT "",
`d_month` string COMMENT "",
`d_year` int COMMENT "",
`d_yearmonthnum` int COMMENT "",
`d_yearmonth` string COMMENT "",
`d_daynuminweek` int COMMENT "",
`d_daynuminmonth` int COMMENT "",
`d_daynuminyear` int COMMENT "",
`d_monthnuminyear` int COMMENT "",
`d_weeknuminyear` int COMMENT "",
`d_sellingseason` string COMMENT "",
`d_lastdayinweekfl` int COMMENT "",
`d_lastdayinmonthfl` int COMMENT "",
`d_holidayfl` int COMMENT "",
`d_weekdayfl` int COMMENT ""
)
STORED AS orc;
CREATE TABLE IF NOT EXISTS `dates_parquet` (
`d_datekey` int COMMENT "",
`d_date` string COMMENT "",
`d_dayofweek` string COMMENT "",
`d_month` string COMMENT "",
`d_year` int COMMENT "",
`d_yearmonthnum` int COMMENT "",
`d_yearmonth` string COMMENT "",
`d_daynuminweek` int COMMENT "",
`d_daynuminmonth` int COMMENT "",
`d_daynuminyear` int COMMENT "",
`d_monthnuminyear` int COMMENT "",
`d_weeknuminyear` int COMMENT "",
`d_sellingseason` string COMMENT "",
`d_lastdayinweekfl` int COMMENT "",
`d_lastdayinmonthfl` int COMMENT "",
`d_holidayfl` int COMMENT "",
`d_weekdayfl` int COMMENT ""
)
STORED AS parquet;
lineorder table
CREATE TABLE IF NOT EXISTS `lineorder_csv` (
`lo_orderkey` int COMMENT "",
`lo_linenumber` int COMMENT "",
`lo_custkey` int COMMENT "",
`lo_partkey` int COMMENT "",
`lo_suppkey` int COMMENT "",
`lo_orderdate` int COMMENT "",
`lo_orderpriority` string COMMENT "",
`lo_shippriority` int COMMENT "",
`lo_quantity` int COMMENT "",
`lo_extendedprice` int COMMENT "",
`lo_ordtotalprice` int COMMENT "",
`lo_discount` int COMMENT "",
`lo_revenue` int COMMENT "",
`lo_supplycost` int COMMENT "",
`lo_tax` int COMMENT "",
`lo_commitdate` int COMMENT "",
`lo_shipmode` string COMMENT ""
)
row format delimited fields terminated by ','
LINES TERMINATED BY '\n'
STORED AS textfile;
CREATE TABLE IF NOT EXISTS `lineorder_orc` (
`lo_orderkey` int COMMENT "",
`lo_linenumber` int COMMENT "",
`lo_custkey` int COMMENT "",
`lo_partkey` int COMMENT "",
`lo_suppkey` int COMMENT "",
`lo_orderdate` int COMMENT "",
`lo_orderpriority` string COMMENT "",
`lo_shippriority` int COMMENT "",
`lo_quantity` int COMMENT "",
`lo_extendedprice` int COMMENT "",
`lo_ordtotalprice` int COMMENT "",
`lo_discount` int COMMENT "",
`lo_revenue` int COMMENT "",
`lo_supplycost` int COMMENT "",
`lo_tax` int COMMENT "",
`lo_commitdate` int COMMENT "",
`lo_shipmode` string COMMENT ""
)
STORED AS orc;
CREATE TABLE IF NOT EXISTS `lineorder_parquet` (
`lo_orderkey` int COMMENT "",
`lo_linenumber` int COMMENT "",
`lo_custkey` int COMMENT "",
`lo_partkey` int COMMENT "",
`lo_suppkey` int COMMENT "",
`lo_orderdate` int COMMENT "",
`lo_orderpriority` string COMMENT "",
`lo_shippriority` int COMMENT "",
`lo_quantity` int COMMENT "",
`lo_extendedprice` int COMMENT "",
`lo_ordtotalprice` int COMMENT "",
`lo_discount` int COMMENT "",
`lo_revenue` int COMMENT "",
`lo_supplycost` int COMMENT "",
`lo_tax` int COMMENT "",
`lo_commitdate` int COMMENT "",
`lo_shipmode` string COMMENT ""
)
STORED AS parquet;
part table
CREATE TABLE IF NOT EXISTS `part_csv` (
`p_partkey` int COMMENT "",
`p_name` string COMMENT "",
`p_mfgr` string COMMENT "",
`p_category` string COMMENT "",
`p_brand` string COMMENT "",
`p_color` string COMMENT "",
`p_type` string COMMENT "",
`p_size` int COMMENT "",
`p_container` string COMMENT ""
)
row format delimited fields terminated by ','
LINES TERMINATED BY '\n'
STORED AS textfile;
CREATE TABLE IF NOT EXISTS `part_orc` (
`p_partkey` int COMMENT "",
`p_name` string COMMENT "",
`p_mfgr` string COMMENT "",
`p_category` string COMMENT "",
`p_brand` string COMMENT "",
`p_color` string COMMENT "",
`p_type` string COMMENT "",
`p_size` int COMMENT "",
`p_container` string COMMENT ""
)
STORED AS orc;
CREATE TABLE IF NOT EXISTS `part_parquet` (
`p_partkey` int COMMENT "",
`p_name` string COMMENT "",
`p_mfgr` string COMMENT "",
`p_category` string COMMENT "",
`p_brand` string COMMENT "",
`p_color` string COMMENT "",
`p_type` string COMMENT "",
`p_size` int COMMENT "",
`p_container` string COMMENT ""
)
STORED AS parquet;
supplier table
CREATE TABLE IF NOT EXISTS `supplier_csv` (
`s_suppkey` int COMMENT "",
`s_name` string COMMENT "",
`s_address` string COMMENT "",
`s_city` string COMMENT "",
`s_nation` string COMMENT "",
`s_region` string COMMENT "",
`s_phone` string COMMENT ""
)
row format delimited fields terminated by ','
LINES TERMINATED BY '\n'
STORED AS textfile;
CREATE TABLE IF NOT EXISTS `supplier_orc` (
`s_suppkey` int COMMENT "",
`s_name` string COMMENT "",
`s_address` string COMMENT "",
`s_city` string COMMENT "",
`s_nation` string COMMENT "",
`s_region` string COMMENT "",
`s_phone` string COMMENT ""
)
STORED AS orc;
CREATE TABLE IF NOT EXISTS `supplier_parquet` (
`s_suppkey` int COMMENT "",
`s_name` string COMMENT "",
`s_address` string COMMENT "",
`s_city` string COMMENT "",
`s_nation` string COMMENT "",
`s_region` string COMMENT "",
`s_phone` string COMMENT ""
)
STORED AS parquet;
Data generation
https://github.com/electrum/ssb-dbgen
Build with make
(customer.tbl)
dbgen -s 1 -T c
(part.tbl)
dbgen -s 1 -T p
(supplier.tbl)
dbgen -s 1 -T s
(date.tbl)
dbgen -s 1 -T d
(fact table lineorder.tbl)
dbgen -s 1 -T l
(for all SSBM tables)
dbgen -s 1 -T a
To generate the refresh (insert/delete) data set:
(create delete.[1-4] and lineorder.tbl.u[1-4] with refreshing fact 0.05%)
dbgen -s 1 -r 5 -U 4
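One caveat before loading: dbgen writes '|'-delimited .tbl files (with a trailing '|'), while the *_csv tables above declare ',' as the field delimiter. Either convert the .tbl files to commas first, or define the textfile tables with the pipe delimiter instead. A sketch of the latter, as an alternative definition of the customer table (the trailing '|' just yields an extra empty field, which the default SerDe ignores):
CREATE TABLE IF NOT EXISTS `customer_csv` (
`c_custkey` int COMMENT "",
`c_name` string COMMENT "",
`c_address` string COMMENT "",
`c_city` string COMMENT "",
`c_nation` string COMMENT "",
`c_region` string COMMENT "",
`c_phone` string COMMENT "",
`c_mktsegment` string COMMENT ""
)
row format delimited fields terminated by '|'
LINES TERMINATED BY '\n'
STORED AS textfile;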
Load the generated data into the Hive tables, then export the resulting ORC and Parquet data.
load data local inpath '/home/hadoop/dates' overwrite into table dates_csv;
insert overwrite table dates_orc select * from dates_csv;
insert overwrite table dates_parquet select * from dates_csv;
insert overwrite local directory '/home/hadoop/orc' select * from dates_orc;
insert overwrite local directory '/home/hadoop/parquet' select * from dates_parquet;
load data local inpath '/home/hadoop/data/supplier' overwrite into table supplier_csv;
insert overwrite table supplier_orc select * from supplier_csv;
insert overwrite table supplier_parquet select * from supplier_csv;
insert overwrite local directory '/home/hadoop/data/orc_supplier' select * from supplier_orc;
insert overwrite local directory '/home/hadoop/data/parquet_supplier' select * from supplier_parquet;
load data local inpath '/home/hadoop/customer' overwrite into table customer_csv;
insert overwrite table customer_orc select * from customer_csv;
insert overwrite table customer_parquet select * from customer_csv;
insert overwrite local directory '/home/hadoop/data/orc_customer' select * from customer_orc;
insert overwrite local directory '/home/hadoop/data/parquet_customer' select * from customer_parquet;
load data local inpath '/home/hadoop/part' overwrite into table part_csv;
insert overwrite table part_orc select * from part_csv;
insert overwrite table part_parquet select * from part_csv;
insert overwrite local directory '/home/hadoop/data/orc_part' select * from part_orc;
insert overwrite local directory '/home/hadoop/data/parquet_part' select * from part_parquet;
load data local inpath '/home/hadoop/data/lineorder' overwrite into table lineorder_csv;
insert overwrite table lineorder_orc select * from lineorder_csv;
insert overwrite table lineorder_parquet select * from lineorder_csv;
insert overwrite local directory '/home/hadoop/data/orc_lineorder' select * from lineorder_orc;
insert overwrite local directory '/home/hadoop/data/parquet_lineorder' select * from lineorder_parquet;
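After the conversions, a simple cross-check is to compare row counts across the three copies of each table; the counts should match exactly. For example, for lineorder:
hive> select count(*) from lineorder_csv;
hive> select count(*) from lineorder_orc;
hive> select count(*) from lineorder_parquet;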
Converting file formats when the data is in OSS
Reference: https://blog.csdn.net/BabyFish13/article/details/78851067