一、测试数据
{
"bibliographic-data":{
"classification-data":{
"classification-ipcr":{
"further-classification":[
{
"main-group":"11",
"class":"01",
"subgroup":"11",
"subclass":"L",
"section":"H"
},
{
"main-group":"11",
"class":"02",
"subgroup":"34",
"subclass":"S",
"section":"H"
}
],
"main-classification":{
"main-group":"31",
"class":"01",
"subgroup":"05",
"subclass":"L",
"section":"H"
}
}
},
"invention-title":[
{
"data-format":"original",
"text":发呆发翻腾而过热狗",
"lang":"CN"
}
],
"parties":{
"agency":[
{
"data-format":"original",
"sequence":1,
"rep-type":"agency",
"name":"务所",
"lang":"CN"
}
],
"inventors":[
{
"data-format":"original",
"sequence":1,
"name":"发的发电",
"lang":"CN"
},
{
"data-format":"original",
"sequence":2,
"name":"都发大水",
"lang":"CN"
},
{
"data-format":"original",
"sequence":3,
"name":"地区",
"lang":"CN"
}
],
"agents":[
{
"data-format":"original",
"sequence":1,
"rep-type":"agent",
"name":"发动v法",
"lang":"CN"
}
]
},
"application-reference":{
"date":20200619,
"country":"CN",
"doc-number":"202021161647.X"
},
"patent-type":"熬到VVT",
"publication-reference":{
"date":20210323,
"country":"CN",
"kind":"U",
"doc-number":"3479"
}
},
"update-ts":20210831,
"abstract":[
{
"data-format":"original",
"text":"fdfdfdfd",
"lang":"CN"
}
],
"patent-legal":{
"legal-update-date":20210323,
"legal-status":[
"3"
]
}
}
可以看出上述json十分复杂,如果函数对JSON字符串进行解析,灵活性比较高,但是对于如果整个文件就是一个JSON文件,在使用起来就相对比较麻烦。Hive中为了简化对于JSON文件的处理,内置了一种专门用于解析JSON文件的Serde解析器, 在创建表时,只要指定使用 JSONSerde 解析表的文件,就会自动将 JSON 文件中的每一列进行解析。
二、下载JSONSerde包
1.github地址
GitHub - rcongiu/Hive-JSON-Serde: Read - Write JSON SerDe for Apache Hive.
2.jar包下载地址
Index of /hive-json-serde (congiu.net)
将 JAR 放入 Hive/lib 或者在 Hive 中使用 ADD JAR
hive -e "add jar /app/hive/lib/json-serde-1.3.8-jar-with-dependencies.jar"
hive -e "add jar /app/hive/lib/json-udf-1.3.8-jar-with-dependencies.jar"
三、使用hive-json-schema自动生成建表语句
虽然可以使用Serde进行自动解析,但是如此复杂的json字符串,定义表结构是个很麻烦的时,而且容易出错,所以用程序自动化创建再好不过了。
1.github地址
GitHub - quux00/hive-json-schema: Tool to generate a Hive schema from a JSON example doc
2.使用方式:
-
使用maven打包
mvn package
-
执行如下命令
java -jar target/json-hive-schema-1.0-jar-with-dependencies.jar F:\大数据\数据源\json\data\data.json mytable
-
生成create table 命令
CREATE TABLE mytable ( abstract array<struct<data-format:string, lang:string, text:string>>, bibliographic-data struct<application-reference:struct<country:string, date:int, doc-number:string>, classification-data:struct<classification-ipcr:struct<further-classification:array<struct<class:string, main-group:string, sectio n:string, subclass:string, subgroup:string>>, main-classification:struct<class:string, main-group:string, section:string, subclass:string, subgroup:string>>>, invention-title:array<struct<data-format:string, lang:string, text:string >>, parties:struct<agency:array<struct<data-format:string, lang:string, name:string, rep-type:string, sequence:int>>, agents:array<struct<data-format:string, lang:string, name:string, rep-type:string, sequence:int>>, inventors:array <struct<data-format:string, lang:string, name:string, sequence:int>>>, patent-type:string, publication-reference:struct<country:string, date:int, doc-number:string, kind:string>>, patent-legal struct<legal-status:array<string>, legal-update-date:int>, update-ts int) ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe';
四、hive建表语句中出现关键字处理
1.hive建表字段出现date关键字处理
可以使用``反引号引起来,因为在hive -e 命令中所以使用`进行转义
hive -e "use mydb;CREATE TABLE mytable (abstract array<struct<data_format:string, lang_r:string, text:string>>,bibliographic_data struct<application_reference:struct<country:string,\`date\`:int, doc_number:string>, classification_data:struct<classification_ipcr:struct<further_classification:array<struct<class:string, main_group:string, section:string, subclass:string, subgroup:string>>, main_classification:struct<class:string, main_group:string, section:string, subclass:string, subgroup:string>>>, invention_title:array<struct<data_format:string, lang_r:string, text:string>>, parties:struct<agency:array<struct<data_format:string, lang_r:string, name:string, rep_type:string, sequence:int>>, agents:array<struct<data_format:string, lang_r:string, name:string, rep_type:string, sequence:int>>, inventors:array<struct<data_format:string, lang_r:string, name:string, sequence:int>>>, patent_type:string, publication_reference:struct<country:string,\`date\`:int, doc_number:string, kind:string>>, patent_legal struct<legal_status:array<string>, legal_update_date:int>,update_ts int)ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe' with serdeproperties('mapping.data_format' = 'data-format','mapping.bibliographic_data' = 'bibliographic-data','mapping.application_reference' = 'application-reference','mapping.doc_number' = 'doc-number','mapping.classification_data' = 'classification-data','mapping.classification_ipcr' = 'classification-ipcr','mapping.further_classification' = 'further-classification','mapping.main_group' = 'main-group','mapping.main_classification' = 'main-classification','mapping.main_group' = 'main-group','mapping.invention_title' = 'invention-title','mapping.data_format' = 'data-format','mapping.rep_type' = 'rep-type','mapping.patent_type' = 'patent-type','mapping.publication_reference' = 'publication-reference','mapping.update_ts' = 'update-ts','mapping.patent_legal' = 'patent-legal','mapping.legal_status' = 'legal-status','mapping.legal_update_date' = 'legal-update-date','mapping.lang_r' = 'lang') STORED AS TEXTFILE;"
2.hive创建表中字段出现-处理
with serdeproperties('mapping.data_format' = 'data-format','mapping.bibliographic_data' = 'bibliographic-data','mapping.application_reference' = 'application-reference','mapping.doc_number' = 'doc-number','mapping.classification_data' = 'classification-data','mapping.classification_ipcr' = 'classification-ipcr','mapping.further_classification' = 'further-classification','mapping.main_group' = 'main-group','mapping.main_classification' = 'main-classification','mapping.main_group' = 'main-group','mapping.invention_title' = 'invention-title','mapping.data_format' = 'data-format','mapping.rep_type' = 'rep-type','mapping.patent_type' = 'patent-type','mapping.publication_reference' = 'publication-reference','mapping.update_ts' = 'update-ts','mapping.patent_legal' = 'patent-legal','mapping.legal_status' = 'legal-status','mapping.legal_update_date' = 'legal-update-date','mapping.lang_r' = 'lang')
使用serdeproperties对字段名进行映射
3.加载数据
hive -e "use mydb;load data local inpath '/app/shell/datasource/json/mydata/mydata20210331_bakfile/outFile/data.json' into table mytable"
4.查看数据
hive -e "use mydb;select * from mytable limit 1;"