使用Python通过Thrift接口访问HBase和Hive

31 篇文章 0 订阅
13 篇文章 0 订阅

HBase和Hive都提供了thrift服务,作为给其它非JVM语言访问的接口,其使用本身非常简单。以下是封装好的两个访问接口,数据最终被组织成一个列表,列表里的每个元素都是一个字典,这样元数据和数据放在一起,虽然占用的内存多了一些,但是使用起来方便了许多,并且从效果上也拉近了和关系数据库的距离,这点类似于MongoDB。

如果要使用以下这两个工具类,必须首先安装HBase和Hive对应的Thrift Python模块:可以将模块放置于PYTHONPATH中,也可以通过sys.path.insert将模块所在目录动态加入搜索路径。


首先是HBase的代码,其中包含了存储数据和查询数据的接口:

'''
Created on 2013-3-20

@author: panfei
'''

from thrift.transport import TSocket
from thrift.protocol import TBinaryProtocol
from hbase import Hbase
from hbase.ttypes import Mutation

class HBaseEngine(object):
    '''
    Thin wrapper around the HBase Thrift client that returns rows as dicts.

    Every record handed back by the query methods is a plain dict mapping
    each column name to its cell value, with the row key stored under
    HBASE_ROWKEY.  Instances can be used in a ``with`` statement; the
    transport is closed when the block exits.
    '''

    # Dict key under which the row key is stored in every returned record.
    HBASE_ROWKEY = 'rowkey'

    def __init__(self, host='192.168.1.141', port=9090):
        '''
        Build the socket/protocol/client stack and open the transport.
        '''
        self.transport = TSocket.TSocket(host, port)
        self.protocol = TBinaryProtocol.TBinaryProtocol(self.transport)
        self.client = Hbase.Client(self.protocol)
        self.transport.open()

    def __enter__(self):
        # __init__ already opened the transport; open() is a no-op then, but
        # it lets an engine that was close()d be reused in a new with-block.
        self.open()
        return self

    def __exit__(self, type, value, traceback):
        # Returns None, so an exception raised inside the with-block propagates.
        self.close()

    def close(self):
        '''Close the underlying transport.'''
        self.transport.close()

    def open(self):
        '''Open the underlying transport unless it is already open.'''
        if not self.is_open():
            self.transport.open()

    def is_open(self):
        '''Return whether the underlying transport reports itself open.'''
        return self.transport.isOpen()

    @staticmethod
    def _to_record(row_result):
        '''
        Flatten one Thrift row result into a {column: value, rowkey: key} dict.

        A new dict is built so the Thrift result object is left untouched.
        '''
        record = dict(
            (column, cell.value) for column, cell in row_result.columns.items()
        )
        record[HBaseEngine.HBASE_ROWKEY] = row_result.row
        return record

    @staticmethod
    def _first_record(row_results):
        '''Return the record for the first row result, or None if there is none.'''
        if not row_results:
            return None
        return HBaseEngine._to_record(row_results[0])

    def get_value_by_tbl_rowid(self, table, row):
        '''
        Fetch a single row with all its columns.

        Returns the record dict, or None when the client returns no row.
        '''
        return self._first_record(self.client.getRow(table, row))

    def get_values_by_tbl_rowids(self, table, rows):
        '''
        Fetch several rows with all their columns.

        Returns a list with one record dict per row result returned by the
        client (an empty list when nothing is returned).
        '''
        return [self._to_record(row_result)
                for row_result in self.client.getRows(table, rows)]

    def get_value_by_tbl_rowid_cf_cs(self, table, row, columns):
        '''
        Fetch a single row restricted to the given columns.

        Returns the record dict, or None when the client returns no row.
        '''
        return self._first_record(
            self.client.getRowWithColumns(table, row, columns))

    def get_values_by_tbl_rowids_cf_cs(self, table, rows, columns):
        '''
        Fetch several rows restricted to the given columns.

        Returns a list with one record dict per row result returned.
        '''
        return [self._to_record(row_result)
                for row_result in self.client.getRowsWithColumns(table, rows, columns)]

    def scan_by_range(self, table, start_row, stop_row, columns):
        '''
        Scan the rows between start_row and stop_row and collect them.

        Rows are pulled with scannerGet until it returns an empty result.
        The scanner is closed even if fetching or converting a row raises,
        so a failed scan does not leave the scanner open on the server.

        Returns a list of record dicts in the order the scanner yields them.
        '''
        scanner = self.client.scannerOpenWithStop(table, start_row, stop_row, columns)
        datas = []
        try:
            row_results = self.client.scannerGet(scanner)
            while row_results:
                # Only the first element of each batch is used, as before.
                datas.append(self._to_record(row_results[0]))
                row_results = self.client.scannerGet(scanner)
        finally:
            self.client.scannerClose(scanner)
        return datas

    def put_row_column_value(self, table, row, column, value):
        '''Write a single column value into the given row.'''
        mutations = [Mutation(column=column, value=value)]
        self.client.mutateRow(table, row, mutations)

    def put_row_columns_values(self, table, row, columns, values):
        '''
        Write several column values into the given row in one mutateRow call.

        columns and values are paired positionally with zip, so any surplus
        entries in the longer of the two are ignored.
        '''
        mutations = [Mutation(column=col, value=val) for col, val in zip(columns, values)]
        self.client.mutateRow(table, row, mutations)

    @staticmethod
    def copy_from_tbl2tbl(src_engine, src_tbl, dest_engine, dest_tbl, rows):
        '''
        Copy the given rows from src_tbl (via src_engine) to dest_tbl
        (via dest_engine).

        src_table and dest_table must have the same column family
        '''
        for data in src_engine.get_values_by_tbl_rowids(src_tbl, rows):
            # The row key is not a column; take it out before writing.
            rowkey = data.pop(HBaseEngine.HBASE_ROWKEY)
            columns = list(data.keys())
            values = [data[column] for column in columns]
            dest_engine.put_row_columns_values(dest_tbl, rowkey, columns, values)


if __name__ == '__main__':
    # Demo: copy two rows of 'dau_realtime' from one cluster to another,
    # then read one of them back from the source cluster and print it.
    row_keys = ['1001_20130315', '1001_20130316']
    with HBaseEngine(host='192.168.1.141') as source_engine:
        with HBaseEngine(host='192.168.1.236') as target_engine:
            HBaseEngine.copy_from_tbl2tbl(
                source_engine, 'dau_realtime',
                target_engine, 'dau_realtime',
                row_keys,
            )
            record = source_engine.get_value_by_tbl_rowid('dau_realtime', '1001_20130316')
            print(record)

以下是Hive访问接口,只有查询接口:

'''
Created on 2013-3-20

@author: panfei
'''
from hive import ThriftHive
from thrift.transport import TSocket
from thrift.transport import TTransport
from thrift.protocol import TBinaryProtocol


class HiveEngine(object):
    '''
    Query-only wrapper around the Hive Thrift client.

    Results are fetched in one go with fetchAll, so this is intended for
    result sets that are not too big.
    '''

    def __init__(self, host='192.168.1.141', port=10000):
        '''
        Build a buffered socket transport, protocol and client, then open
        the transport.
        '''
        raw_socket = TSocket.TSocket(host, port)
        self.transport = TTransport.TBufferedTransport(raw_socket)
        self.protocol = TBinaryProtocol.TBinaryProtocol(self.transport)
        self.client = ThriftHive.Client(self.protocol)
        self.transport.open()

    def __enter__(self):
        # No-op right after construction; reopens a transport that was closed.
        self.open()
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def is_open(self):
        '''Return whether the transport reports itself open.'''
        return self.transport.isOpen()

    def open(self):
        '''Open the transport unless it is already open.'''
        if self.is_open():
            return
        self.transport.open()

    def close(self):
        '''Close the transport.'''
        self.transport.close()

    def execute(self, query, sep='\t'):
        '''
        Run a query and return its rows as a list of dicts.

        Each fetched line is split on sep and zipped with the column names
        from the current schema, so every value in the result is a string.
        '''
        self.client.execute(query)
        column_names = self.get_current_schema()
        records = []
        for raw_line in self.client.fetchAll():
            fields = raw_line.split(sep)
            records.append(dict(zip(column_names, fields)))
        return records

    def get_current_schema(self):
        '''Return the column names of the client's current schema, in order.'''
        names = []
        for field_schema in self.client.getSchema().fieldSchemas:
            names.append(field_schema.name)
        return names
    
if __name__ == '__main__':
    # Demo: count rows per hour for one day partition and print each record.
    hourly_query = "select hour(ts) as dau, count(1) cnt from twadv where ds='2013-03-15' group by hour(ts)"
    with HiveEngine(host='192.168.1.141') as engine:
        records = engine.execute(hourly_query)
        for record in records:
            print(record)

几点说明:

1. 通过使用with语法,将连接的建立和关闭隐藏起来,使得代码更加简洁。前提是在类定义中加入__enter__和__exit__方法的实现。

2. 将Hive和HBase的查询结果统一转化为字典格式,两者的数据结构保持一致,方便在HBase相关的代码中直接使用Hive的查询结果,也便于统计功能的实现。

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值