Importing Data from HDFS into Hive

1. Via the Hive View

CREATE EXTERNAL TABLE IF NOT EXISTS finance.json_serde_optd_table (
   retCode string,
   retMsg string,
   data array<struct<secID:string, tradeDate:date, optID:string, ticker:string,
     secShortName:string, exchangeCD:string, preSettlePrice:double, preClosePrice:double,
     openPrice:double, highestPrice:double, lowestPrice:double, closePrice:double,
     settlPrice:double, turnoverVol:double, turnoverValue:double, openInt:int>>)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
LOCATION 'hdfs://wdp.xxxxx.cn:8020/nifi/finance1/optd/';
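
Before flattening the array, it is worth confirming that the JsonSerDe can parse the files at all. A minimal spot check (size() simply counts the array elements), assuming JSON files already sit under the LOCATION path:

SELECT retCode, retMsg, size(data)
FROM finance.json_serde_optd_table
LIMIT 5;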
CREATE TABLE IF NOT EXISTS finance.tb_optd
AS
SELECT b.data.secID,
         b.data.tradeDate,
         b.data.optID,
         b.data.ticker,
         b.data.secShortName,
         b.data.exchangeCD,
         b.data.preSettlePrice,
         b.data.preClosePrice,
         b.data.openPrice,
         b.data.highestPrice,
         b.data.lowestPrice,
         b.data.closePrice,
         b.data.settlPrice,
         b.data.turnoverVol,
         b.data.turnoverValue,
         b.data.openInt
FROM finance.json_serde_optd_table LATERAL VIEW explode(json_serde_optd_table.data) b AS data;
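
Once the CTAS finishes, the flattened table can be queried like any ordinary table. A quick sanity check, assuming the load above succeeded:

SELECT ticker, tradeDate, closePrice
FROM finance.tb_optd
LIMIT 10;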

2. Via Zeppelin

 

First load the hive-hcatalog-core JAR so that the JsonSerDe class is available to the Zeppelin interpreter:

%dep
z.load("/usr/hdp/2.4.2.0-258/hive-hcatalog/share/hcatalog/hive-hcatalog-core.jar");

 

// Define the collection of Hive models to import
case class HiveConfig(database: String, modelName: String, hdfsPath: String, schema: String, schema_tb: String);
var hiveConfigList = List[HiveConfig]();
// Build the equd data structure
// Define the JSON schema
val schema_json_equd_serde = """retCode string,
                               retMsg string,
                               data array<struct<secID:string, tradeDate:date, ticker:string, secShortName:string, exchangeCD:string, preClosePrice:double, actPreClosePrice:double, openPrice:double, highestPrice:double, lowestPrice:double, closePrice:double, turnoverVol:double, turnoverValue:double, dealAmount:int, turnoverRate:double, accumAdjFactor:double, negMarketValue:double, marketValue:double, PE:double, PE1:double, PB:double, isOpen:int>>""";
var schema_equd = """b.data.secID,
                     b.data.ticker,
                     b.data.secShortName,
                     b.data.exchangeCD,
                     b.data.tradeDate,
                     b.data.preClosePrice,
                     b.data.actPreClosePrice,
                     b.data.openPrice,
                     b.data.highestPrice,
                     b.data.lowestPrice,
                     b.data.closePrice,
                     b.data.turnoverVol,
                     b.data.turnoverValue,
                     b.data.dealAmount,
                     b.data.turnoverRate,
                     b.data.accumAdjFactor,
                     b.data.negMarketValue,
                     b.data.marketValue,
                     b.data.PE,
                     b.data.PE1,
                     b.data.PB,
                     b.data.isOpen""";
hiveConfigList = hiveConfigList :+ HiveConfig("finance", "equd", "hdfs://wdp.xxxxx.cn:8020/nifi/finance1/", schema_json_equd_serde, schema_equd);

 

// Build the idxd data structure
// Define the JSON schema
val schema_json_idxd_serde = """retCode string,
                               retMsg string,
                               data array<struct<indexID:string, tradeDate:date, ticker:string, porgFullName:string, secShortName:string, exchangeCD:string, preCloseIndex:double, openIndex:double, lowestIndex:double, highestIndex:double, closeIndex:double, turnoverVol:double, turnoverValue:double, CHG:double, CHGPct:double>>""";
var schema_idxd = """b.data.indexID,
                     b.data.tradeDate,
                     b.data.ticker,
                     b.data.porgFullName,
                     b.data.secShortName,
                     b.data.exchangeCD,
                     b.data.preCloseIndex,
                     b.data.openIndex,
                     b.data.lowestIndex,
                     b.data.highestIndex,
                     b.data.closeIndex,
                     b.data.turnoverVol,
                     b.data.turnoverValue,
                     b.data.CHG,
                     b.data.CHGPct""";
hiveConfigList = hiveConfigList :+ HiveConfig("finance", "idxd", "hdfs://wdp.xxxxx.cn:8020/nifi/finance1/", schema_json_idxd_serde, schema_idxd);

 

// Load each configured model into Hive
def loadDataToHive(args: HiveConfig) {
  val loadPath = args.hdfsPath + args.modelName;                  // HDFS directory holding the raw JSON
  val tb_json_serde = "json_serde_" + args.modelName + "_table";  // staging table backed by the JsonSerDe
  val tb = "tb_" + args.modelName;                                // final flattened table
  val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc)
  if (args.database != "" && args.schema != "") {
    print("Creating database... " + args.modelName)
    hiveContext.sql("CREATE DATABASE IF NOT EXISTS " + args.database);
    print("Building the extended model...");
    hiveContext.sql("CREATE TABLE IF NOT EXISTS " + args.database + "." + tb_json_serde + "(" + args.schema + ") ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' LOCATION '" + loadPath + "/'");
    // Print the generated CTAS statement for debugging, then execute it
    println("CREATE TABLE IF NOT EXISTS " + args.database + "." + tb + " as select " + args.schema_tb + " from " + args.database + "." + tb_json_serde + " LATERAL VIEW explode(" + tb_json_serde + ".data) b AS data");
    hiveContext.sql("CREATE TABLE IF NOT EXISTS " + args.database + "." + tb + " as select " + args.schema_tb + " from " + args.database + "." + tb_json_serde + " LATERAL VIEW explode(" + tb_json_serde + ".data) b AS data");
    println(args.modelName + " extended model load complete!");
  }
}
hiveConfigList.size; // show how many models are configured
hiveConfigList.foreach { x => loadDataToHive(x) };
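
Once the loop completes, the generated tables can be spot-checked back in the Hive view. A minimal sanity check, assuming the equd config above was registered and its HDFS path contains data:

SHOW TABLES IN finance;
SELECT COUNT(*) FROM finance.tb_equd;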

 

3. A second extraction method

Because data is an array inside the JSON, the conversion above is somewhat involved. The method below first extracts the data array from the JSON and writes its elements to HDFS, then loads them into Hive directly with the following statement:

Use NiFi's SplitJson processor to extract and split the data array; its JsonPath Expression property points at the array (presumably $.data here), so each element lands in HDFS as a separate JSON record.


CREATE EXTERNAL TABLE IF NOT EXISTS finance.awen_optd (
   secid string,
   tradedate date,
   optid string,
   ticker string,
   secshortname string,
   exchangecd string,
   presettleprice double,
   precloseprice double,
   openprice double,
   highestprice double,
   lowestprice double,
   closeprice double,
   settlprice double,
   turnovervol double,
   turnovervalue double,
   openint int)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
LOCATION 'hdfs://wdp.xxxx.cn:8020/nifi/finance2/optd/';
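
Because the array was already split upstream, this table needs no LATERAL VIEW. A quick check, assuming the split records have landed under the LOCATION path:

SELECT ticker, tradedate, closeprice
FROM finance.awen_optd
LIMIT 10;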

 

This article was reposted from the 疯吻IT blog on cnblogs. Original link: http://www.cnblogs.com/fengwenit/p/5631455.html. Please contact the original author before reprinting.

