An Attempt at Integrating Impala with HBase for ETL (1)

I. Goal

I recently received a requirement to build and store a large wide table (100+ fields). The wide table is produced by joining several side tables and is currently implemented on an Impala-style SQL-over-Hadoop stack. Whenever one of the side tables changes, the wide table has to be backfilled, but none of the frameworks in use support an update operation, so we are forced to wipe and reload the whole table (truncate + insert), which is expensive. Could we update just a subset of the fields instead? My idea is to store the wide table in HBase: combined with Impala, updates can be issued through SQL, and both queries and write-backs can be expressed in SQL, so users never need to know that HBase sits behind it.
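
To make this concrete, here is a minimal sketch of what such a partial update might look like once the wide table is mapped into Impala (it uses the table and column names defined later in this post; the new value is purely illustrative). Against an HBase-backed table, an Impala INSERT that reuses an existing row key behaves like an upsert: only the columns listed are rewritten, and the rest of the wide row is left untouched.

-- Hypothetical partial update: refresh one column of one row
-- without rewriting the other 100+ fields.
INSERT INTO hbase_cust_info (cust_id, col_02_03)
VALUES ('1000000001', 'val_02_03_recalculated');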

There is another potential concern: HBase is not well suited to Impala-style ad hoc analytics, so the data would have to be dumped back out for that purpose. How long would it take to export the data from HBase back into Impala?
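
The export in question would presumably be a plain CTAS from the HBase-backed table into a native Impala format such as Parquet; a sketch under that assumption (the target table name is made up):

-- Hypothetical dump of the HBase-backed wide table into Parquet for ad hoc
-- analytics; the open question is how long this step takes at real scale.
CREATE TABLE cust_info_parquet STORED AS PARQUET AS
SELECT * FROM hbase_cust_info;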

In this post I look at functionality first; performance will be verified later.

II. Preparing the Base Data

1. Create the table in HBase and insert sample data

 

create 'cust_info', 'cf_01', 'cf_02', 'cf_03'

 

# Insert sample records, 30 fields each (3 column families x 10 columns)

put 'cust_info',  '1000000001', 'cf_01:col_01_01', 'val_01_01_1000000001'

put 'cust_info',  '1000000001', 'cf_01:col_01_02', 'val_01_02_1000000001'

put 'cust_info',  '1000000001', 'cf_01:col_01_03', 'val_01_03_1000000001'

put 'cust_info',  '1000000001', 'cf_01:col_01_04', 'val_01_04_1000000001'

put 'cust_info',  '1000000001', 'cf_01:col_01_05', 'val_01_05_1000000001'

put 'cust_info',  '1000000001', 'cf_01:col_01_06', 'val_01_06_1000000001'

put 'cust_info',  '1000000001', 'cf_01:col_01_07', 'val_01_07_1000000001'

put 'cust_info',  '1000000001', 'cf_01:col_01_08',  'val_01_08_1000000001'

put 'cust_info',  '1000000001', 'cf_01:col_01_09', 'val_01_09_1000000001'

put 'cust_info',  '1000000001', 'cf_01:col_01_10', 'val_01_10_1000000001'

 

put 'cust_info',  '1000000001', 'cf_02:col_02_01', 'val_02_01_1000000001'

put 'cust_info',  '1000000001', 'cf_02:col_02_02', 'val_02_02_1000000001'

put 'cust_info',  '1000000001', 'cf_02:col_02_03', 'val_02_03_1000000001'

put 'cust_info',  '1000000001', 'cf_02:col_02_04', 'val_02_04_1000000001'

put 'cust_info',  '1000000001', 'cf_02:col_02_05', 'val_02_05_1000000001'

put 'cust_info',  '1000000001', 'cf_02:col_02_06', 'val_02_06_1000000001'

put 'cust_info',  '1000000001', 'cf_02:col_02_07', 'val_02_07_1000000001'

put 'cust_info',  '1000000001', 'cf_02:col_02_08',  'val_02_08_1000000001'

put 'cust_info',  '1000000001', 'cf_02:col_02_09', 'val_02_09_1000000001'

put 'cust_info',  '1000000001', 'cf_02:col_02_10', 'val_02_10_1000000001'

 

put 'cust_info',  '1000000001', 'cf_03:col_03_01', 'val_03_01_1000000001'

put 'cust_info',  '1000000001', 'cf_03:col_03_02', 'val_03_02_1000000001'

put 'cust_info',  '1000000001', 'cf_03:col_03_03', 'val_03_03_1000000001'

put 'cust_info',  '1000000001', 'cf_03:col_03_04', 'val_03_04_1000000001'

put 'cust_info',  '1000000001', 'cf_03:col_03_05', 'val_03_05_1000000001'

put 'cust_info',  '1000000001', 'cf_03:col_03_06', 'val_03_06_1000000001'

put 'cust_info',  '1000000001', 'cf_03:col_03_07', 'val_03_07_1000000001'

put 'cust_info',  '1000000001', 'cf_03:col_03_08', 'val_03_08_1000000001'

put 'cust_info',  '1000000001', 'cf_03:col_03_09', 'val_03_09_1000000001'

put 'cust_info',  '1000000001', 'cf_03:col_03_10', 'val_03_10_1000000001'

 

put 'cust_info',  '1000000002', 'cf_01:col_01_01', 'val_01_01_1000000002'

put 'cust_info',  '1000000002', 'cf_01:col_01_02', 'val_01_02_1000000002'

put 'cust_info',  '1000000002', 'cf_01:col_01_03', 'val_01_03_1000000002'

put 'cust_info',  '1000000002', 'cf_01:col_01_04', 'val_01_04_1000000002'

put 'cust_info',  '1000000002', 'cf_01:col_01_05', 'val_01_05_1000000002'

put 'cust_info',  '1000000002', 'cf_01:col_01_06', 'val_01_06_1000000002'

put 'cust_info',  '1000000002', 'cf_01:col_01_07', 'val_01_07_1000000002'

put 'cust_info',  '1000000002', 'cf_01:col_01_08', 'val_01_08_1000000002'

put 'cust_info',  '1000000002', 'cf_01:col_01_09', 'val_01_09_1000000002'

put 'cust_info',  '1000000002', 'cf_01:col_01_10', 'val_01_10_1000000002'

 

put 'cust_info',  '1000000002', 'cf_02:col_02_01', 'val_02_01_1000000002'

put 'cust_info',  '1000000002', 'cf_02:col_02_02', 'val_02_02_1000000002'

put 'cust_info',  '1000000002', 'cf_02:col_02_03',  'val_02_03_1000000002'

put 'cust_info',  '1000000002', 'cf_02:col_02_04', 'val_02_04_1000000002'

put 'cust_info',  '1000000002', 'cf_02:col_02_05', 'val_02_05_1000000002'

put 'cust_info',  '1000000002', 'cf_02:col_02_06', 'val_02_06_1000000002'

put 'cust_info',  '1000000002', 'cf_02:col_02_07', 'val_02_07_1000000002'

put 'cust_info',  '1000000002', 'cf_02:col_02_08', 'val_02_08_1000000002'

put 'cust_info',  '1000000002', 'cf_02:col_02_09', 'val_02_09_1000000002'

put 'cust_info',  '1000000002', 'cf_02:col_02_10', 'val_02_10_1000000002'

 

put 'cust_info',  '1000000002', 'cf_03:col_03_01', 'val_03_01_1000000002'

put 'cust_info',  '1000000002', 'cf_03:col_03_02', 'val_03_02_1000000002'

put 'cust_info',  '1000000002', 'cf_03:col_03_03', 'val_03_03_1000000002'

put 'cust_info',  '1000000002', 'cf_03:col_03_04', 'val_03_04_1000000002'

put 'cust_info',  '1000000002', 'cf_03:col_03_05', 'val_03_05_1000000002'

put 'cust_info',  '1000000002', 'cf_03:col_03_06', 'val_03_06_1000000002'

put 'cust_info',  '1000000002', 'cf_03:col_03_07', 'val_03_07_1000000002'

put 'cust_info',  '1000000002', 'cf_03:col_03_08', 'val_03_08_1000000002'

put 'cust_info',  '1000000002', 'cf_03:col_03_09', 'val_03_09_1000000002'

put 'cust_info',  '1000000002', 'cf_03:col_03_10', 'val_03_10_1000000002'

 

put 'cust_info',  '1000000003', 'cf_01:col_01_01', 'val_01_01_1000000003'

put 'cust_info',  '1000000003', 'cf_01:col_01_02', 'val_01_02_1000000003'

put 'cust_info',  '1000000003', 'cf_01:col_01_03', 'val_01_03_1000000003'

put 'cust_info',  '1000000003', 'cf_01:col_01_04', 'val_01_04_1000000003'

put 'cust_info',  '1000000003', 'cf_01:col_01_05', 'val_01_05_1000000003'

put 'cust_info',  '1000000003', 'cf_01:col_01_06', 'val_01_06_1000000003'

put 'cust_info',  '1000000003', 'cf_01:col_01_07', 'val_01_07_1000000003'

put 'cust_info',  '1000000003', 'cf_01:col_01_08',  'val_01_08_1000000003'

put 'cust_info',  '1000000003', 'cf_01:col_01_09', 'val_01_09_1000000003'

put 'cust_info',  '1000000003', 'cf_01:col_01_10', 'val_01_10_1000000003'

 

put 'cust_info',  '1000000003', 'cf_02:col_02_01', 'val_02_01_1000000003'

put 'cust_info',  '1000000003', 'cf_02:col_02_02', 'val_02_02_1000000003'

put 'cust_info',  '1000000003', 'cf_02:col_02_03', 'val_02_03_1000000003'

put 'cust_info',  '1000000003', 'cf_02:col_02_04', 'val_02_04_1000000003'

put 'cust_info',  '1000000003', 'cf_02:col_02_05', 'val_02_05_1000000003'

put 'cust_info',  '1000000003', 'cf_02:col_02_06', 'val_02_06_1000000003'

put 'cust_info',  '1000000003', 'cf_02:col_02_07', 'val_02_07_1000000003'

put 'cust_info',  '1000000003', 'cf_02:col_02_08', 'val_02_08_1000000003'

put 'cust_info',  '1000000003', 'cf_02:col_02_09', 'val_02_09_1000000003'

put 'cust_info',  '1000000003', 'cf_02:col_02_10', 'val_02_10_1000000003'

 

put 'cust_info',  '1000000003', 'cf_03:col_03_01', 'val_03_01_1000000003'

put 'cust_info',  '1000000003', 'cf_03:col_03_02', 'val_03_02_1000000003'

put 'cust_info',  '1000000003', 'cf_03:col_03_03', 'val_03_03_1000000003'

put 'cust_info',  '1000000003', 'cf_03:col_03_04',  'val_03_04_1000000003'

put 'cust_info',  '1000000003', 'cf_03:col_03_05', 'val_03_05_1000000003'

put 'cust_info',  '1000000003', 'cf_03:col_03_06', 'val_03_06_1000000003'

put 'cust_info',  '1000000003', 'cf_03:col_03_07', 'val_03_07_1000000003'

put 'cust_info',  '1000000003', 'cf_03:col_03_08', 'val_03_08_1000000003'

put 'cust_info',  '1000000003', 'cf_03:col_03_09', 'val_03_09_1000000003'

put 'cust_info',  '1000000003', 'cf_03:col_03_10', 'val_03_10_1000000003'

 

2. Create the external table in Hive

 

CREATE EXTERNAL TABLE hbase_cust_info(

cust_id string,

col_01_01 string,

col_01_02 string,

col_01_03 string,

col_01_04 string,

col_01_05 string,

col_01_06 string,

col_01_07 string,

col_01_08 string,

col_01_09 string,

col_01_10 string,

col_02_01 string,

col_02_02 string,

col_02_03 string,

col_02_04 string,

col_02_05 string,

col_02_06 string,

col_02_07 string,

col_02_08 string,

col_02_09 string,

col_02_10 string,

col_03_01 string,

col_03_02 string,

col_03_03 string,

col_03_04 string,

col_03_05 string,

col_03_06 string,

col_03_07 string,

col_03_08 string,

col_03_09 string,

col_03_10 string

) ROW FORMAT SERDE 'org.apache.hadoop.hive.hbase.HBaseSerDe'

STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'

WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf_01:col_01_01,cf_01:col_01_02,cf_01:col_01_03,cf_01:col_01_04,cf_01:col_01_05,cf_01:col_01_06,cf_01:col_01_07,cf_01:col_01_08,cf_01:col_01_09,cf_01:col_01_10,cf_02:col_02_01,cf_02:col_02_02,cf_02:col_02_03,cf_02:col_02_04,cf_02:col_02_05,cf_02:col_02_06,cf_02:col_02_07,cf_02:col_02_08,cf_02:col_02_09,cf_02:col_02_10,cf_03:col_03_01,cf_03:col_03_02,cf_03:col_03_03,cf_03:col_03_04,cf_03:col_03_05,cf_03:col_03_06,cf_03:col_03_07,cf_03:col_03_08,cf_03:col_03_09,cf_03:col_03_10")

TBLPROPERTIES ("hbase.table.name" = "cust_info");
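
Because the table is created through Hive, Impala will not see it until its metadata cache is refreshed; a minimal step to run in impala-shell before the tests below:

-- Make the Hive-created mapping visible to Impala.
INVALIDATE METADATA hbase_cust_info;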

 

3. Query tests in Hive and impala-shell
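
On the Hive side the mapped table can be queried directly; a minimal sketch of the kind of check run there (output omitted):

-- Hive-side sanity check against the HBase-backed table.
SELECT cust_id, col_01_01, col_02_01, col_03_01
FROM hbase_cust_info
LIMIT 3;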

Unfortunately, in impala-shell I ran into a problem, one I had not encountered before.

Query: describe hbase_cust_info

+-----------+--------+---------+

| name      | type   | comment |

+-----------+--------+---------+

| cust_id   | string |         |

| col_01_01 | string |         |

| col_01_02 | string |         |

| col_01_03 | string |         |

| col_01_04 | string |         |

| col_01_05 | string |         |

| col_01_06 | string |         |

| col_01_07 | string |         |

| col_01_09 | string |         |

| col_01_08 | string |         |

| col_01_10 | string |         |

| col_02_01 | string |         |

| col_02_02 | string |         |

| col_02_03 | string |         |

| col_02_04 | string |         |

| col_02_05 | string |         |

| col_02_06 | string |         |

| col_02_07 | string |         |

| col_02_09 | string |         |

| col_02_08 | string |         |

| col_02_10 | string |         |

| col_03_01 | string |         |

| col_03_02 | string |         |

| col_03_03 | string |         |

| col_03_04 | string |         |

| col_03_05 | string |         |

| col_03_06 | string |         |

| col_03_07 | string |         |

| col_03_09 | string |         |

| col_03_08 | string |         |

| col_03_10 | string |         |

+-----------+--------+---------+

Fetched 31 row(s) in 0.02s

[bd-133:21000] > SELECT * FROM hbase_cust_info;

Query: select * FROM hbase_cust_info

ERROR: NoSuchMethodError: org.apache.hadoop.hbase.Cell.getTagsLength()I

III. Resolving the Problem

The error message points to an HBase class, and further analysis suggests an HBase version conflict that caused the wrong jars to be loaded.

Searching by the keyword pattern hbase*0.9*.jar (the cluster has already been upgraded to HBase 1.0, so the old versions are no longer needed) turned up /var/lib/impala, which holds jars I had copied in during an earlier manual Impala installation. Renaming them resolved the problem:

mv /var/lib/impala/hbase-client-0.98.6-cdh5.3.3.jar   /var/lib/impala/hbase-client-0.98.6-cdh5.3.3.jar_

mv /var/lib/impala/hbase-hadoop-compat-0.98.6-cdh5.3.3.jar   /var/lib/impala/hbase-hadoop-compat-0.98.6-cdh5.3.3.jar_

mv /var/lib/impala/hbase-testing-util-0.98.6-cdh5.3.3.jar   /var/lib/impala/hbase-testing-util-0.98.6-cdh5.3.3.jar_

mv /var/lib/impala/hbase-hadoop2-compat-0.98.6-cdh5.3.3-tests.jar  /var/lib/impala/hbase-hadoop2-compat-0.98.6-cdh5.3.3-tests.jar_

mv /var/lib/impala/hbase-examples-0.98.6-cdh5.3.3.jar   /var/lib/impala/hbase-examples-0.98.6-cdh5.3.3.jar_

mv /var/lib/impala/hbase-common-0.98.6-cdh5.3.3.jar   /var/lib/impala/hbase-common-0.98.6-cdh5.3.3.jar_

mv /var/lib/impala/hbase-prefix-tree-0.98.6-cdh5.3.3.jar   /var/lib/impala/hbase-prefix-tree-0.98.6-cdh5.3.3.jar_

mv /var/lib/impala/hbase-protocol-0.98.6-cdh5.3.3.jar   /var/lib/impala/hbase-protocol-0.98.6-cdh5.3.3.jar_

mv /var/lib/impala/hbase-server-0.98.6-cdh5.3.3.jar   /var/lib/impala/hbase-server-0.98.6-cdh5.3.3.jar_

mv /var/lib/impala/hbase-hadoop-compat-0.98.6-cdh5.3.3-tests.jar  /var/lib/impala/hbase-hadoop-compat-0.98.6-cdh5.3.3-tests.jar_

mv /var/lib/impala/hbase-thrift-0.98.6-cdh5.3.3.jar   /var/lib/impala/hbase-thrift-0.98.6-cdh5.3.3.jar_

mv /var/lib/impala/hbase-server-0.98.6-cdh5.3.3-tests.jar   /var/lib/impala/hbase-server-0.98.6-cdh5.3.3-tests.jar_

mv /var/lib/impala/hbase-hadoop2-compat-0.98.6-cdh5.3.3.jar   /var/lib/impala/hbase-hadoop2-compat-0.98.6-cdh5.3.3.jar_

mv /var/lib/impala/hbase-it-0.98.6-cdh5.3.3.jar             /var/lib/impala/hbase-it-0.98.6-cdh5.3.3.jar_

mv /var/lib/impala/hbase-shell-0.98.6-cdh5.3.3.jar   /var/lib/impala/hbase-shell-0.98.6-cdh5.3.3.jar_

mv /var/lib/impala/hbase-common-0.98.6-cdh5.3.3-tests.jar   /var/lib/impala/hbase-common-0.98.6-cdh5.3.3-tests.jar_

mv /var/lib/impala/hbase-it-0.98.6-cdh5.3.3-tests.jar   /var/lib/impala/hbase-it-0.98.6-cdh5.3.3-tests.jar_
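
The renamed jars only drop out of the classpath once the impalad and catalogd daemons are restarted, since the classpath is assembled at startup; after the restart, the statement that failed earlier can be retried:

-- Re-run the query that previously died with NoSuchMethodError.
SELECT * FROM hbase_cust_info;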

 

 
