使用hive来进行文件格式转换

hive格式介绍

demo

  1. Textfile:行式存储,这是hive表的默认存储格式,默认不做数据压缩,磁盘开销大,数据解析开销大,数据不支持分片(即代表着会带来无法对数据进行并行操作)
  2. ORC:行列式存储,将数据按行分块,每个块按列存储,其中每个块都存储着一个索引,支持none和zlib和snappy这3种压缩方式,默认采用zlib压缩方式,文件按stripe组织、可切分(支持并行读取),orc存储格式能提高hive表的读取写入和处理的性能。
  3. Parquet:列式存储,是一个面向列的二进制文件格式(不可直接读取),文件中包含数据和元数据,所以该存储格式是自解析的,在大型查询时效率很高,parquet主要用在存储多层嵌套式数据上提供良好的性能支持,默认采用uncompressed不压缩方式。

hive主要支持gzip、zlib、snappy、lzo 这四种压缩方式。

hive建表,创建一个csv格式的表 & 插入数据

-- Demo textfile (CSV-style) table: comma-delimited fields, newline-terminated rows.
-- EXTERNAL: dropping the table keeps the underlying HDFS files.
CREATE EXTERNAL TABLE IF NOT EXISTS `test_format` (
    `code`   STRING COMMENT 'import id',
    `vin`    STRING COMMENT 'import name',
    `vichel` STRING COMMENT 'import message'
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

hive> show create table test_format;
OK
CREATE EXTERNAL TABLE `test_format`(
  `code` string COMMENT 'import id',
  `vin` string COMMENT 'import name',
  `vichel` string COMMENT 'import message')
ROW FORMAT SERDE
  'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
  'field.delim'=',',
  'line.delim'='\n',
  'serialization.format'=',')
STORED AS INPUTFORMAT
  'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
  'hdfs://emr-header-1.cluster-49091:9000/user/hive/warehouse/test_format'
TBLPROPERTIES (
  'transient_lastDdlTime'='1681719612')
-- 插入数据: load the local CSV file into the textfile table.
-- FIX: HiveQL comments use "--"; a leading "#" is not valid comment syntax
-- and errors out when this is run as a script.
LOAD DATA LOCAL INPATH '/home/hadoop/test.txt' OVERWRITE INTO TABLE test_format;

hive建表,创建一个parquet格式的表

-- Demo Parquet table: same schema as test_format, columnar binary storage.
CREATE EXTERNAL TABLE IF NOT EXISTS `test_parquet` (
    `code`   STRING COMMENT 'import id',
    `vin`    STRING COMMENT 'import name',
    `vichel` STRING COMMENT 'import message'
)
STORED AS PARQUET;

hive> show create table test_parquet;
OK
CREATE EXTERNAL TABLE `test_parquet`(
  `code` string COMMENT 'import id',
  `vin` string COMMENT 'import name',
  `vichel` string COMMENT 'import message')
ROW FORMAT SERDE
  'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
  'hdfs://emr-header-1.cluster-49091:9000/user/hive/warehouse/test_parquet'
TBLPROPERTIES (
  'transient_lastDdlTime'='1681719650')

hive建表,创建一个orc格式的表

-- Demo ORC table: same schema as test_format, row-group/columnar storage.
CREATE EXTERNAL TABLE IF NOT EXISTS `test_orc` (
    `code`   STRING COMMENT 'import id',
    `vin`    STRING COMMENT 'import name',
    `vichel` STRING COMMENT 'import message'
)
STORED AS ORC;

hive> show create table test_orc;
OK
CREATE EXTERNAL TABLE `test_orc`(
  `code` string COMMENT 'import id',
  `vin` string COMMENT 'import name',
  `vichel` string COMMENT 'import message')
ROW FORMAT SERDE
  'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
STORED AS INPUTFORMAT
  'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
LOCATION
  'hdfs://emr-header-1.cluster-49091:9000/user/hive/warehouse/test_orc'
TBLPROPERTIES (
  'transient_lastDdlTime'='1681720136')

然后将csv数据插入到parquet格式和orc格式的表中

-- Rewrite the textfile rows into the columnar tables (format conversion
-- happens automatically via each target table's SerDe).
INSERT OVERWRITE TABLE test_parquet SELECT * FROM test_format;
INSERT OVERWRITE TABLE test_orc     SELECT * FROM test_format;

数据导出到本地

-- Export to the local filesystem.
-- FIX: without STORED AS, INSERT OVERWRITE DIRECTORY writes plain text
-- (Ctrl-A delimited), NOT ORC/Parquet files — which defeats the purpose of
-- the conversion. Declare the file format explicitly (supported since Hive 0.11).
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/orc'
    STORED AS ORC
SELECT * FROM test_orc;

INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/parquet'
    STORED AS PARQUET
SELECT * FROM test_parquet;

非常简单的教程:https://www.yiibai.com/hive/hive_create_table.html
建表语句参考:https://blog.csdn.net/hyj_king/article/details/126776080
数据转换过程:https://www.freesion.com/article/9733618966/
hive的数据导出的几种方式:https://blog.csdn.net/hyj_king/article/details/96849837

ssb数据的转换

hive侧各个表的结构

-- customer dimension table, in three storage formats.
-- csv (textfile)
CREATE TABLE IF NOT EXISTS `customer_csv` (
    `c_custkey`    INT COMMENT "",
    `c_name`       STRING COMMENT "",
    `c_address`    STRING COMMENT "",
    `c_city`       STRING COMMENT "",
    `c_nation`     STRING COMMENT "",
    `c_region`     STRING COMMENT "",
    `c_phone`      STRING COMMENT "",
    `c_mktsegment` STRING COMMENT ""
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

-- orc
CREATE TABLE IF NOT EXISTS `customer_orc` (
    `c_custkey`    INT COMMENT "",
    `c_name`       STRING COMMENT "",
    `c_address`    STRING COMMENT "",
    `c_city`       STRING COMMENT "",
    `c_nation`     STRING COMMENT "",
    `c_region`     STRING COMMENT "",
    `c_phone`      STRING COMMENT "",
    `c_mktsegment` STRING COMMENT ""
)
STORED AS ORC;

-- parquet
CREATE TABLE IF NOT EXISTS `customer_parquet` (
    `c_custkey`    INT COMMENT "",
    `c_name`       STRING COMMENT "",
    `c_address`    STRING COMMENT "",
    `c_city`       STRING COMMENT "",
    `c_nation`     STRING COMMENT "",
    `c_region`     STRING COMMENT "",
    `c_phone`      STRING COMMENT "",
    `c_mktsegment` STRING COMMENT ""
)
STORED AS PARQUET;

dates表

-- dates dimension table, in three storage formats.
-- csv (textfile)
CREATE TABLE IF NOT EXISTS `dates_csv` (
    `d_datekey`          INT COMMENT "",
    `d_date`             STRING COMMENT "",
    `d_dayofweek`        STRING COMMENT "",
    `d_month`            STRING COMMENT "",
    `d_year`             INT COMMENT "",
    `d_yearmonthnum`     INT COMMENT "",
    `d_yearmonth`        STRING COMMENT "",
    `d_daynuminweek`     INT COMMENT "",
    `d_daynuminmonth`    INT COMMENT "",
    `d_daynuminyear`     INT COMMENT "",
    `d_monthnuminyear`   INT COMMENT "",
    `d_weeknuminyear`    INT COMMENT "",
    `d_sellingseason`    STRING COMMENT "",
    `d_lastdayinweekfl`  INT COMMENT "",
    `d_lastdayinmonthfl` INT COMMENT "",
    `d_holidayfl`        INT COMMENT "",
    `d_weekdayfl`        INT COMMENT ""
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

-- orc
CREATE TABLE IF NOT EXISTS `dates_orc` (
    `d_datekey`          INT COMMENT "",
    `d_date`             STRING COMMENT "",
    `d_dayofweek`        STRING COMMENT "",
    `d_month`            STRING COMMENT "",
    `d_year`             INT COMMENT "",
    `d_yearmonthnum`     INT COMMENT "",
    `d_yearmonth`        STRING COMMENT "",
    `d_daynuminweek`     INT COMMENT "",
    `d_daynuminmonth`    INT COMMENT "",
    `d_daynuminyear`     INT COMMENT "",
    `d_monthnuminyear`   INT COMMENT "",
    `d_weeknuminyear`    INT COMMENT "",
    `d_sellingseason`    STRING COMMENT "",
    `d_lastdayinweekfl`  INT COMMENT "",
    `d_lastdayinmonthfl` INT COMMENT "",
    `d_holidayfl`        INT COMMENT "",
    `d_weekdayfl`        INT COMMENT ""
)
STORED AS ORC;

-- parquet
CREATE TABLE IF NOT EXISTS `dates_parquet` (
    `d_datekey`          INT COMMENT "",
    `d_date`             STRING COMMENT "",
    `d_dayofweek`        STRING COMMENT "",
    `d_month`            STRING COMMENT "",
    `d_year`             INT COMMENT "",
    `d_yearmonthnum`     INT COMMENT "",
    `d_yearmonth`        STRING COMMENT "",
    `d_daynuminweek`     INT COMMENT "",
    `d_daynuminmonth`    INT COMMENT "",
    `d_daynuminyear`     INT COMMENT "",
    `d_monthnuminyear`   INT COMMENT "",
    `d_weeknuminyear`    INT COMMENT "",
    `d_sellingseason`    STRING COMMENT "",
    `d_lastdayinweekfl`  INT COMMENT "",
    `d_lastdayinmonthfl` INT COMMENT "",
    `d_holidayfl`        INT COMMENT "",
    `d_weekdayfl`        INT COMMENT ""
)
STORED AS PARQUET;

lineorder表

-- lineorder fact table, in three storage formats.
-- csv (textfile)
CREATE TABLE IF NOT EXISTS `lineorder_csv` (
    `lo_orderkey`      INT COMMENT "",
    `lo_linenumber`    INT COMMENT "",
    `lo_custkey`       INT COMMENT "",
    `lo_partkey`       INT COMMENT "",
    `lo_suppkey`       INT COMMENT "",
    `lo_orderdate`     INT COMMENT "",
    `lo_orderpriority` STRING COMMENT "",
    `lo_shippriority`  INT COMMENT "",
    `lo_quantity`      INT COMMENT "",
    `lo_extendedprice` INT COMMENT "",
    `lo_ordtotalprice` INT COMMENT "",
    `lo_discount`      INT COMMENT "",
    `lo_revenue`       INT COMMENT "",
    `lo_supplycost`    INT COMMENT "",
    `lo_tax`           INT COMMENT "",
    `lo_commitdate`    INT COMMENT "",
    `lo_shipmode`      STRING COMMENT ""
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

-- orc
CREATE TABLE IF NOT EXISTS `lineorder_orc` (
    `lo_orderkey`      INT COMMENT "",
    `lo_linenumber`    INT COMMENT "",
    `lo_custkey`       INT COMMENT "",
    `lo_partkey`       INT COMMENT "",
    `lo_suppkey`       INT COMMENT "",
    `lo_orderdate`     INT COMMENT "",
    `lo_orderpriority` STRING COMMENT "",
    `lo_shippriority`  INT COMMENT "",
    `lo_quantity`      INT COMMENT "",
    `lo_extendedprice` INT COMMENT "",
    `lo_ordtotalprice` INT COMMENT "",
    `lo_discount`      INT COMMENT "",
    `lo_revenue`       INT COMMENT "",
    `lo_supplycost`    INT COMMENT "",
    `lo_tax`           INT COMMENT "",
    `lo_commitdate`    INT COMMENT "",
    `lo_shipmode`      STRING COMMENT ""
)
STORED AS ORC;

-- parquet
CREATE TABLE IF NOT EXISTS `lineorder_parquet` (
    `lo_orderkey`      INT COMMENT "",
    `lo_linenumber`    INT COMMENT "",
    `lo_custkey`       INT COMMENT "",
    `lo_partkey`       INT COMMENT "",
    `lo_suppkey`       INT COMMENT "",
    `lo_orderdate`     INT COMMENT "",
    `lo_orderpriority` STRING COMMENT "",
    `lo_shippriority`  INT COMMENT "",
    `lo_quantity`      INT COMMENT "",
    `lo_extendedprice` INT COMMENT "",
    `lo_ordtotalprice` INT COMMENT "",
    `lo_discount`      INT COMMENT "",
    `lo_revenue`       INT COMMENT "",
    `lo_supplycost`    INT COMMENT "",
    `lo_tax`           INT COMMENT "",
    `lo_commitdate`    INT COMMENT "",
    `lo_shipmode`      STRING COMMENT ""
)
STORED AS PARQUET;

part表

-- part dimension table, in three storage formats.
-- csv (textfile)
CREATE TABLE IF NOT EXISTS `part_csv` (
    `p_partkey`   INT COMMENT "",
    `p_name`      STRING COMMENT "",
    `p_mfgr`      STRING COMMENT "",
    `p_category`  STRING COMMENT "",
    `p_brand`     STRING COMMENT "",
    `p_color`     STRING COMMENT "",
    `p_type`      STRING COMMENT "",
    `p_size`      INT COMMENT "",
    `p_container` STRING COMMENT ""
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

-- orc
CREATE TABLE IF NOT EXISTS `part_orc` (
    `p_partkey`   INT COMMENT "",
    `p_name`      STRING COMMENT "",
    `p_mfgr`      STRING COMMENT "",
    `p_category`  STRING COMMENT "",
    `p_brand`     STRING COMMENT "",
    `p_color`     STRING COMMENT "",
    `p_type`      STRING COMMENT "",
    `p_size`      INT COMMENT "",
    `p_container` STRING COMMENT ""
)
STORED AS ORC;

-- parquet
CREATE TABLE IF NOT EXISTS `part_parquet` (
    `p_partkey`   INT COMMENT "",
    `p_name`      STRING COMMENT "",
    `p_mfgr`      STRING COMMENT "",
    `p_category`  STRING COMMENT "",
    `p_brand`     STRING COMMENT "",
    `p_color`     STRING COMMENT "",
    `p_type`      STRING COMMENT "",
    `p_size`      INT COMMENT "",
    `p_container` STRING COMMENT ""
)
STORED AS PARQUET;

supplier表

-- supplier dimension table, in three storage formats.
-- csv (textfile)
CREATE TABLE IF NOT EXISTS `supplier_csv` (
    `s_suppkey` INT COMMENT "",
    `s_name`    STRING COMMENT "",
    `s_address` STRING COMMENT "",
    `s_city`    STRING COMMENT "",
    `s_nation`  STRING COMMENT "",
    `s_region`  STRING COMMENT "",
    `s_phone`   STRING COMMENT ""
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

-- orc
CREATE TABLE IF NOT EXISTS `supplier_orc` (
    `s_suppkey` INT COMMENT "",
    `s_name`    STRING COMMENT "",
    `s_address` STRING COMMENT "",
    `s_city`    STRING COMMENT "",
    `s_nation`  STRING COMMENT "",
    `s_region`  STRING COMMENT "",
    `s_phone`   STRING COMMENT ""
)
STORED AS ORC;

-- parquet
CREATE TABLE IF NOT EXISTS `supplier_parquet` (
    `s_suppkey` INT COMMENT "",
    `s_name`    STRING COMMENT "",
    `s_address` STRING COMMENT "",
    `s_city`    STRING COMMENT "",
    `s_nation`  STRING COMMENT "",
    `s_region`  STRING COMMENT "",
    `s_phone`   STRING COMMENT ""
)
STORED AS PARQUET;

数据生成

https://github.com/electrum/ssb-dbgen
make编译

(customer.tbl)
dbgen -s 1 -T c

(part.tbl)
dbgen -s 1 -T p

(supplier.tbl)
dbgen -s 1 -T s

(date.tbl)
dbgen -s 1 -T d

(fact table lineorder.tbl)
dbgen -s 1 -T l

(for all SSBM tables)
dbgen -s 1 -T a

To generate the refresh (insert/delete) data set:
(create delete.[1-4] and lineorder.tbl.u[1-4] with refreshing fact 0.05%)
dbgen -s 1 -r 5 -U 4

将生成数据插入到hive表中,然后生成的orc和parquet再导出来。

-- dates: load the generated file, convert, then export.
LOAD DATA LOCAL INPATH '/home/hadoop/dates' OVERWRITE INTO TABLE dates_csv;
INSERT OVERWRITE TABLE dates_orc     SELECT * FROM dates_csv;
INSERT OVERWRITE TABLE dates_parquet SELECT * FROM dates_csv;
-- FIX: STORED AS is required here; without it the directory export is
-- plain text (Ctrl-A delimited), not real ORC/Parquet files.
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/orc' STORED AS ORC SELECT * FROM dates_orc;
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/parquet' STORED AS PARQUET SELECT * FROM dates_parquet;

-- supplier: load the generated file, convert, then export.
LOAD DATA LOCAL INPATH '/home/hadoop/data/supplier' OVERWRITE INTO TABLE supplier_csv;
INSERT OVERWRITE TABLE supplier_orc     SELECT * FROM supplier_csv;
INSERT OVERWRITE TABLE supplier_parquet SELECT * FROM supplier_csv;
-- FIX: STORED AS is required here; without it the directory export is
-- plain text (Ctrl-A delimited), not real ORC/Parquet files.
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/data/orc_supplier' STORED AS ORC SELECT * FROM supplier_orc;
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/data/parquet_supplier' STORED AS PARQUET SELECT * FROM supplier_parquet;

-- customer: load the generated file, convert, then export.
LOAD DATA LOCAL INPATH '/home/hadoop/customer' OVERWRITE INTO TABLE customer_csv;
INSERT OVERWRITE TABLE customer_orc     SELECT * FROM customer_csv;
INSERT OVERWRITE TABLE customer_parquet SELECT * FROM customer_csv;
-- FIX: STORED AS is required here; without it the directory export is
-- plain text (Ctrl-A delimited), not real ORC/Parquet files.
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/data/orc_customer' STORED AS ORC SELECT * FROM customer_orc;
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/data/parquet_customer' STORED AS PARQUET SELECT * FROM customer_parquet;


-- part: load the generated file, convert, then export.
LOAD DATA LOCAL INPATH '/home/hadoop/part' OVERWRITE INTO TABLE part_csv;
INSERT OVERWRITE TABLE part_orc     SELECT * FROM part_csv;
INSERT OVERWRITE TABLE part_parquet SELECT * FROM part_csv;
-- FIX: STORED AS is required here; without it the directory export is
-- plain text (Ctrl-A delimited), not real ORC/Parquet files.
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/data/orc_part' STORED AS ORC SELECT * FROM part_orc;
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/data/parquet_part' STORED AS PARQUET SELECT * FROM part_parquet;

-- lineorder (fact table): load the generated file, convert, then export.
LOAD DATA LOCAL INPATH '/home/hadoop/data/lineorder' OVERWRITE INTO TABLE lineorder_csv;
INSERT OVERWRITE TABLE lineorder_orc     SELECT * FROM lineorder_csv;
INSERT OVERWRITE TABLE lineorder_parquet SELECT * FROM lineorder_csv;
-- FIX: STORED AS is required here; without it the directory export is
-- plain text (Ctrl-A delimited), not real ORC/Parquet files.
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/data/orc_lineorder' STORED AS ORC SELECT * FROM lineorder_orc;
INSERT OVERWRITE LOCAL DIRECTORY '/home/hadoop/data/parquet_lineorder' STORED AS PARQUET SELECT * FROM lineorder_parquet;

数据在oss,进行文件格式转换

参考:https://blog.csdn.net/BabyFish13/article/details/78851067

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值