PythonConnectHbaseThroughThrift
Python通过Thrift连接Hbase操作
安装 Thrift
pip install thrift
pip install hbase-thrift
坑坑坑
- 安装完成之后第一次运行,报如下错误:
  ... in <module>
    from hbase import Hbase
  File "C:\Users\tianxiao\AppData\Local\Programs\Python\Python36\lib\site-packages\hbase\Hbase.py", line 2066
    except IOError, io:
                  ^
SyntaxError: invalid syntax
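原因是 Python 版本带来的语法兼容性问题:hbase-thrift 包中的代码使用了 Python 2 的写法(如 `except IOError, io:`),在 Python 3 下无法解析。解决办法是下载适配 Python 3 的 Hbase.py 和 ttypes.py,替换已安装的 hbase 包目录(例如 /usr/local/lib/python3.6/dist-packages/hbase/)下的同名文件。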
下载链接:https://github.com/626626cdllp/infrastructure/tree/master/hbase
- 运行以下代码出错
from thrift.transport import TSocket,TTransport
from thrift.protocol import TBinaryProtocol
from hbase import Hbase
# Reproduction snippet: as written it fails with
# "TTransportException: TSocket read 0 bytes" when the server does not speak
# the binary protocol used here.
# The Thrift server listens on port 9090 by default.
socket = TSocket.TSocket('192.1.1.1',9090)
# Socket timeout; 5000 is presumably in milliseconds — confirm against TSocket.setTimeout.
socket.setTimeout(5000)
# Wrap the raw socket in a buffered transport.
transport = TTransport.TBufferedTransport(socket)
# Encode/decode messages with the binary protocol.
protocol = TBinaryProtocol.TBinaryProtocol(transport)
# HBase client bound to that protocol.
client = Hbase.Client(protocol)
# Open the connection.
socket.open()
# List the table names, then read cell 'cf:a' of row 'row1' in table 'test'.
print(client.getTableNames())
print(client.get('test','row1','cf:a'))
报另一个错误:thrift.transport.TTransport.TTransportException: TSocket read 0 bytes
原因是 Thrift 的 server 端和 client 端使用的协议不匹配:此处服务端使用的是 compact 协议,因此 Python 客户端要使用 TCompactProtocol,而不能使用 TBinaryProtocol。
连接 Thrift
from thrift import Thrift
from thrift.transport import TSocket
from thrift.transport import TTransport
from thrift.protocol import TBinaryProtocol
# Generated HBase Thrift bindings; required for Hbase.Client below.
from hbase import Hbase
# Row-key prefix helper; the prefix rule is used to avoid region hot-spotting.
from md5_rowkey import getHashID

# Address and port of the HBase Thrift server.
host = '127.0.0.1'
port = 10012

# Make the socket.
socket = TSocket.TSocket(host, port)
# Buffering is critical: raw sockets are very slow.
transport = TTransport.TBufferedTransport(socket)
# Wrap the transport in a protocol.
# NOTE(review): the protocol must match the one the Thrift server was started
# with; a mismatch surfaces as "TSocket read 0 bytes". Switch to
# TCompactProtocol if the server runs in compact mode.
protocol = TBinaryProtocol.TBinaryProtocol(transport)
# Create a client that uses the protocol encoder.
client = Hbase.Client(protocol)
# Connect. Opening the buffered transport opens the underlying socket.
transport.open()

# Table that the subsequent operations work on.
tableName = 'prefix_recom_item_v1'
### 具体操作可直接调用Hbase库即可
## 主要可直接调用以下函数:
def scannerOpenWithPrefix(self, tableName, startAndPrefix, columns)
def getRowWithColumns(self, tableName, row, columns)
def send_getRowTs(self, tableName, row, timestamp)
# Look up the stored location of every weng id.
# Expects `client`, `tableName`, `weng_ids`, `weng_location` (dict), `nonum`
# (int counter), `tqdm` and `getHashID` to be defined by the surrounding script.
for weng_id in tqdm(weng_ids):
    # Build the salted row key first; it is needed for the lookup itself.
    row = getHashID('weng_' + weng_id)
    # Ask for every column that is read below, in addition to the original ones.
    result = client.getRowWithColumns(
        tableName,
        row,
        ['item:stit', 'item:tags', 'item:url', 'item:lat', 'item:lng'],
    )
    if not result:
        print('此 wengid 不存在,', result)
        nonum += 1
        continue
    # The Thrift call returns a list of TRowResult; a single-row get yields
    # one entry.
    record = result[0]
    lat = record.columns['item:lat'].value
    lng = record.columns['item:lng'].value
    # Store the row key followed by the [lat, lng] pair.
    # NOTE(review): the original assigned the row key and then called
    # .append([lat, lng]) on it, which cannot work on a string key — confirm
    # this is the intended shape.
    weng_location[weng_id] = [record.row, [lat, lng]]
附加文件:md5_rowkey.py
'''
主要是 对hbase 的hash ,
同Java 保持一致
'''
from hashlib import md5
import hashlib
def getHashID(rowkey):
    """Return *rowkey* prefixed with a deterministic MD5-derived bucket.

    The UTF-8 encoding of ``rowkey`` is hashed with MD5. Each digest byte is
    added up by magnitude as a signed 8-bit value (bytes above 128 contribute
    ``256 - byte``), and the sum is folded into a prefix between 1000 and 1499.
    The module header states this must stay consistent with the Java side, so
    the arithmetic must not be changed.

    :param rowkey: the original row key (``str``)
    :return: ``'<prefix>_<rowkey>'``, or ``None`` (after printing a message)
        when ``rowkey`` is not a string
    """
    if not isinstance(rowkey, str):
        print('You input data is not string.')
        return None

    digest = hashlib.md5(rowkey.encode('utf-8')).digest()
    # 128 itself falls in the "else" arm and contributes 128, exactly as in
    # the original comparison (j > 128).
    total = sum(256 - byte if byte > 128 else byte for byte in digest)
    prefix = 1000 + total % 500
    return str(prefix) + '_' + rowkey
if __name__ == '__main__':
    # Demo: print the prefixed key for a sample UUID-style id.
    print(getHashID('CC05FE29-ADD2-46FA-B81B-40F9B4288747'))