目的:预将hive中 null 与 empty string 统一,便捷后续开发
问题:orc 文件 使用 SET SERDEPROPERTIES('serialization.null.format' = '') 失效
解决方案:探索 null 再各个file_format 的存储方式
RCFile
(一)原始状态
create table test.rcfile (id int ,name string) STORED AS rcfile;
show create table test.rcfile;
CREATE TABLE `test.rcfile`(
`id` int,
`name` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.RCFileInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.RCFileOutputFormat'
LOCATION
'hdfs://devCluster/user/hive/warehouse/test.db/rcfile'
TBLPROPERTIES (
'COLUMN_STATS_ACCURATE'='{\"BASIC_STATS\":\"true\"}',
'numFiles'='0',
'numRows'='0',
'rawDataSize'='0',
'totalSize'='0',
'transient_lastDdlTime'='1623394872')
插入数据,查看 null and empty string 存储形式
insert overwrite table test.rcfile values ('1','ddd');
insert into table test.rcfile values (2,'');
insert into table test.rcfile values (3,null);
insert into table test.rcfile values (4,'\N');
insert into table test.rcfile values (5,'\\N');
insert into table test.rcfile values ('qwe','ppp');
insert into table test.rcfile values ('sdd','ooo');
SELECT
*
from
test.rcfile
;
发现,id 的数据类型是 int ,插入数值的string类型成功,但是插入字符串后其值变成null。
(二)使用参数
使用 SET SERDEPROPERTIES('serialization.null.format' = '')参数
create table test.rcfile_par (id int ,name string) STORED AS rcfile;
alter table test.rcfile_par SET SERDEPROPERTIES('serialization.null.format' = '');
show create table test.rcfile_par;
CREATE TABLE `test.rcfile_par`(
`id` int,
`name` string)
ROW FORMAT DELIMITED
NULL DEFINED AS ''
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.RCFileInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.RCFileOutputFormat'
LOCATION
'hdfs://devCluster/user/hive/warehouse/test.db/rcfile_par'
TBLPROPERTIES (
'last_modified_by'='qll',
'last_modified_time'='1623398000',
'numFiles'='0',
'numRows'='0',
'rawDataSize'='0',
'totalSize'='0',
'transient_lastDdlTime'='1623398000')
插入数据,查看 null and empty string 存储形式
insert overwrite table test.rcfile_par values ('1','ddd');
insert into table test.rcfile_par values (2,'');
insert into table test.rcfile_par values (3,null);
insert into table test.rcfile_par values (4,'\N');
insert into table test.rcfile_par values (5,'\\N');
insert into table test.rcfile_par values ('qwe','ppp');
insert into table test.rcfile_par values ('sdd','ooo');
SELECT
*
from
test.rcfile_par
;
发现,id 的数据类型是 int ,插入数值的string类型成功,但是插入字符串后其值变成null。空值并未和 empty string 统一,即SET SERDEPROPERTIES('serialization.null.format' = '') 并未起作用。