HBase和Hive都提供了Thrift服务,作为供非JVM语言访问的接口,使用起来非常简单。以下是封装好的两个访问接口:数据最终被组织成一个列表,列表里的每个元素都是一个字典。这样元数据和数据放在一起,虽然占用的内存多了一些,但是使用起来方便了许多,并且从效果上也拉近了和关系数据库的距离,这点类似于MongoDB。
如果要使用以下这两个工具类,必须首先将HBase和Hive对应的Thrift Python模块安装好:或者将模块放置于PYTHONPATH中,或者通过sys.path.insert来动态加入。
首先是HBase的代码,其中包含了存储数据和查询数据的接口:
'''
Created on 2013-3-20
@author: panfei
'''
from thrift.transport import TSocket
from thrift.protocol import TBinaryProtocol
from hbase import Hbase
from hbase.ttypes import Mutation
class HBaseEngine(object):
    """Thin wrapper around the HBase Thrift client.

    Every row handed back by the query methods is a plain dict that maps each
    key of the Thrift row's ``columns`` to the corresponding cell ``value``,
    plus the row key stored under ``HBASE_ROWKEY``.

    The connection is opened by the constructor; the class can also be used
    as a context manager, in which case the transport is closed on exit.
    """

    # Dictionary key under which the row key is stored in every record.
    HBASE_ROWKEY = 'rowkey'

    def __init__(self, host='192.168.1.141', port=9090):
        """Build the Thrift client for ``host``:``port`` and open the transport."""
        self.transport = TSocket.TSocket(host, port)
        self.protocol = TBinaryProtocol.TBinaryProtocol(self.transport)
        self.client = Hbase.Client(self.protocol)
        self.transport.open()

    def __enter__(self):
        # open() is a no-op when the constructor already opened the transport.
        self.open()
        return self

    def __exit__(self, type, value, traceback):
        # Returns None, so an exception raised inside the ``with`` body propagates.
        self.close()

    def close(self):
        """Close the underlying transport."""
        self.transport.close()

    def open(self):
        """Open the underlying transport unless it is already open."""
        if not self.is_open():
            self.transport.open()

    def is_open(self):
        """Return whether the underlying transport reports itself as open."""
        return self.transport.isOpen()

    @staticmethod
    def _row_to_dict(row_result):
        """Convert one Thrift row result into a new ``{column: value}`` dict.

        A fresh dict is built so the Thrift result object is left untouched.
        The row key is added last under ``HBASE_ROWKEY``.
        """
        record = dict((key, cell.value) for key, cell in row_result.columns.items())
        record[HBaseEngine.HBASE_ROWKEY] = row_result.row
        return record

    def get_value_by_tbl_rowid(self, table, row):
        """Fetch a single row; return its record dict, or None if nothing came back."""
        row_results = self.client.getRow(table, row)
        if not row_results:
            return None
        return self._row_to_dict(row_results[0])

    def get_values_by_tbl_rowids(self, table, rows):
        """Fetch several rows; return a list with one record dict per returned row."""
        row_results = self.client.getRows(table, rows)
        return [self._row_to_dict(row_result) for row_result in row_results]

    def get_value_by_tbl_rowid_cf_cs(self, table, row, columns):
        """Fetch the given columns of one row; return a record dict or None."""
        # getRowWithColumns returns a list of row results.
        row_results = self.client.getRowWithColumns(table, row, columns)
        if not row_results:
            return None
        return self._row_to_dict(row_results[0])

    def get_values_by_tbl_rowids_cf_cs(self, table, rows, columns):
        """Fetch the given columns of several rows; return a list of record dicts."""
        row_results = self.client.getRowsWithColumns(table, rows, columns)
        return [self._row_to_dict(row_result) for row_result in row_results]

    def scan_by_range(self, table, start_row, stop_row, columns):
        """Scan from ``start_row`` to ``stop_row`` and return all rows as record dicts.

        The scanner is closed even when fetching fails part-way through, so a
        failed scan does not leave the scanner open.
        """
        scanner = self.client.scannerOpenWithStop(table, start_row, stop_row, columns)
        datas = []
        try:
            row_results = self.client.scannerGet(scanner)
            # An empty result from scannerGet ends the scan.
            while row_results:
                datas.append(self._row_to_dict(row_results[0]))
                row_results = self.client.scannerGet(scanner)
        finally:
            self.client.scannerClose(scanner)
        return datas

    def put_row_column_value(self, table, row, column, value):
        """Write a single ``column`` = ``value`` cell into ``row`` of ``table``."""
        mutations = [Mutation(column=column, value=value)]
        self.client.mutateRow(table, row, mutations)

    def put_row_columns_values(self, table, row, columns, values):
        """Write several cells into ``row``; ``columns`` and ``values`` are paired by position."""
        mutations = [Mutation(column=col, value=val) for col, val in zip(columns, values)]
        self.client.mutateRow(table, row, mutations)

    @staticmethod
    def copy_from_tbl2tbl(src_engine, src_tbl, dest_engine, dest_tbl, rows):
        '''
        Copy the rows with the given keys from src_tbl (read through src_engine)
        into dest_tbl (written through dest_engine).

        src_table and dest_table must have the same column family
        '''
        for data in src_engine.get_values_by_tbl_rowids(src_tbl, rows):
            # The row key travels inside the record; remove it so that only
            # real columns are written back.
            rowkey = data.pop(HBaseEngine.HBASE_ROWKEY)
            columns = list(data.keys())
            values = list(data.values())
            dest_engine.put_row_columns_values(dest_tbl, rowkey, columns, values)
if __name__ == '__main__':
    # Demo: copy two rows from the first cluster to the second one, then read
    # one of them back from the first cluster and print it.
    rows = ['1001_20130315', '1001_20130316']
    with HBaseEngine(host='192.168.1.141') as hbase_engine:
        with HBaseEngine(host='192.168.1.236') as hbase_engine2:
            HBaseEngine.copy_from_tbl2tbl(
                hbase_engine, 'dau_realtime',
                hbase_engine2, 'dau_realtime',
                rows,
            )
            datas = hbase_engine.get_value_by_tbl_rowid('dau_realtime', '1001_20130316')
            print(datas)
以下是Hive访问接口,只有查询接口:
'''
Created on 2013-3-20
@author: panfei
'''
from hive import ThriftHive
from thrift.transport import TSocket
from thrift.transport import TTransport
from thrift.protocol import TBinaryProtocol
class HiveEngine(object):
    '''
    Query-only wrapper around the Hive Thrift client.

    assume the result won't be too big
    '''

    def __init__(self, host='192.168.1.141', port=10000):
        """Build the Thrift client for ``host``:``port`` and open the transport."""
        raw_socket = TSocket.TSocket(host, port)
        # The client talks through a buffered transport wrapped around the socket.
        self.transport = TTransport.TBufferedTransport(raw_socket)
        self.protocol = TBinaryProtocol.TBinaryProtocol(self.transport)
        self.client = ThriftHive.Client(self.protocol)
        self.transport.open()

    def __enter__(self):
        # open() does nothing when the transport is already open.
        self.open()
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def is_open(self):
        """Return whether the transport reports itself as open."""
        return self.transport.isOpen()

    def open(self):
        """Open the transport unless it is already open."""
        if self.is_open():
            return
        self.transport.open()

    def close(self):
        """Close the transport."""
        self.transport.close()

    def execute(self, query, sep='\t'):
        """Run ``query`` and return all rows as a list of ``{field: text}`` dicts.

        Each fetched line is split on ``sep`` and paired positionally with the
        field names of the current schema.
        """
        self.client.execute(query)
        field_names = self.get_current_schema()
        records = []
        for raw_line in self.client.fetchAll():
            cells = raw_line.split(sep)
            records.append(dict(zip(field_names, cells)))
        return records

    def get_current_schema(self):
        """Return the field names of the schema reported by the client."""
        names = []
        for field_schema in self.client.getSchema().fieldSchemas:
            names.append(field_schema.name)
        return names
if __name__ == '__main__':
    # Demo: run one aggregate query and print every result row.
    query = "select hour(ts) as dau, count(1) cnt from twadv where ds='2013-03-15' group by hour(ts)"
    with HiveEngine(host='192.168.1.141') as hive_engine:
        for line in hive_engine.execute(query):
            print(line)
几点说明:
1. 通过使用with语法,将连接的建立和关闭隐藏起来,使得代码更加简洁。前提是在类定义中加入__enter__和__exit__方法的实现。
2. 通过将Hive、HBase的查询结果都转化为字典格式,两者的数据结构保持一致,便于在操作HBase的代码中直接使用Hive的查询结果,也便于统计功能的实现。