Test data: http://www.nber.org/patents/apat63_99.zip
Test environment: hadoop-2.3 + hive-0.13.1 + hbase-0.98.4
Test performance: a 6-column, 600-million-row Hive table bulk-loads in about half an hour
- Create hfile.hql
-- create a table to hold the sampled region split keys
DROP TABLE IF EXISTS hbase_splits;
CREATE EXTERNAL TABLE IF NOT EXISTS hbase_splits(partition STRING)
ROW FORMAT
SERDE 'org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe'
STORED AS
INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveNullValueSequenceFileOutputFormat'
LOCATION '/tmp/hbase_splits_out';
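These two storage clauses are the point of this table: BinarySortableSerDe encodes keys so that byte order matches sort order, and HiveNullValueSequenceFileOutputFormat writes them as key/NullWritable pairs in a SequenceFile, which is the partition-file format Hadoop's TotalOrderPartitioner consumes. Because the INPUTFORMAT here is plain text, the table is effectively write-only from Hive's side; once the INSERT further down has run, sanity-check the output by listing the directory instead:
dfs -ls /tmp/hbase_splits_out;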
-- create a location to store the resulting HFiles
DROP TABLE IF EXISTS hbase_hfiles;
-- the first column becomes the HBase rowkey and the rest become cells in the
-- column family; rename the columns to match your own data as needed
CREATE TABLE IF NOT EXISTS hbase_hfiles(rowkey STRING, pageviews STRING, bytes STRING)
STORED AS
INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.hbase.HiveHFileOutputFormat'
TBLPROPERTIES('hfile.family.path' = '/tmp/hbase_hfiles/w');
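A note on hfile.family.path: its last path component ('w' here) must match a column family of the target HBase table, and the generated HFiles land under that directory. After the HFile-generating step at the end of this section has run, the result can be checked with:
dfs -ls /tmp/hbase_hfiles/w;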
-- the row_seq UDF ships in hive-contrib
ADD JAR /root/hive-0.13.1/lib/hive-contrib-0.13.1.jar;
-- a single reducer forces one totally ordered output for the sampling query below
SET mapred.reduce.tasks=1;
CREATE TEMPORARY FUNCTION row_seq AS 'org.apache.hadoop.hive.contrib.udf.UDFRowSequence';
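row_seq() simply numbers the rows each task sees 1, 2, 3, ...; the modulo filter in the next query uses it to thin the sample to every 300th row. A minimal illustration, assuming the apat source table referenced below:
-- seq comes back as 1, 2, 3, 4, 5 alongside the first five patents read
SELECT PATENT, row_seq() AS seq FROM apat LIMIT 5;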
-- sample the source table so as to produce 4 split keys (i.e. 5 key ranges)
INSERT OVERWRITE TABLE hbase_splits
SELECT PATENT FROM
  (SELECT PATENT, row_seq() AS seq
   FROM apat TABLESAMPLE(BUCKET 1 OUT OF 1000 ON PATENT) s
   ORDER BY PATENT
   LIMIT 10000000) x
WHERE (seq % 300) = 0
ORDER BY PATENT
LIMIT 4;
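The arithmetic behind the two limits, as a worked sketch:
-- (seq % 300) = 0   -> keep every 300th sampled row
-- LIMIT 4           -> keep 4 of those rows as split keys
-- 4 split keys      -> 5 key ranges -> 5 reducers in the HFile step below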
-- after this finishes, copy the splits file out to a single, stable path:
dfs -rmr /tmp/hbase_splits;
dfs -cp /tmp/hbase_splits_out/* /tmp/hbase_splits;
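TotalOrderPartitioner wants a single partition file at a fixed path rather than a job output directory, hence the copy. Worth verifying before the next step:
dfs -ls /tmp/hbase_splits;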
ADD JA
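The script breaks off at the ADD JAR line above. For orientation, the remaining steps of the standard Hive HBaseBulkLoad recipe are sketched below: register the HBase and hive-hbase-handler jars, then generate the HFiles with a total-order sort over the split keys. The jar paths and the select list are assumptions to adapt, not the author's exact script.
-- jar paths are placeholders for whatever your install provides
ADD JAR /root/hive-0.13.1/lib/hive-hbase-handler-0.13.1.jar;
ADD JAR /root/hbase-0.98.4/lib/hbase-common-0.98.4-hadoop2.jar;
ADD JAR /root/hbase-0.98.4/lib/hbase-server-0.98.4-hadoop2.jar;

SET mapred.reduce.tasks=5;  -- one reducer per key range (4 split keys => 5 ranges)
SET hive.mapred.partitioner=org.apache.hadoop.mapred.lib.TotalOrderPartitioner;
SET total.order.partitioner.path=/tmp/hbase_splits;

-- illustrative select list: rowkey first, then one expression per declared column
INSERT OVERWRITE TABLE hbase_hfiles
SELECT PATENT, GYEAR, CLAIMS FROM apat CLUSTER BY PATENT;
Once that completes, the HFiles under /tmp/hbase_hfiles are handed to HBase's bulk loader from a shell, e.g. hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles /tmp/hbase_hfiles apat, where apat is a hypothetical pre-created HBase table with column family w.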