用一个简单粗暴的方法，把MySQL数据库中的数据抽取到HBase的实现过程：

rowKey：直接使用MySQL表的主键ID作为HBase的行键（rowKey）。

code:

####

#!/usr/bin/env python

#coding=utf-8

import sys

reload(sys)

sys.setdefaultencoding('utf-8')

import MySQLdb

import datetime,time

sys.path.append('/usr/lib/python2.6/site-packages/hbase')

from thrift import Thrift

from thrift.transport import TSocket

from thrift.transport import TTransport

from thrift.protocol import TBinaryProtocol

from hbase import Hbase

from hbase.ttypes import *

import csv

from hbase.ttypes import ColumnDescriptor, Mutation, BatchMutation, TRegionInfo

from hbase.ttypes import IOError, AlreadyExists


def client_conn(host="172.16.10.87", port=9090):
    """Open a Thrift connection to HBase and return the client.

    Args:
        host: Address of the HBase Thrift server. Defaults to the address
            this script was originally hard-wired to.
        port: TCP port of the HBase Thrift server.

    Returns:
        An ``Hbase.Client`` whose buffered transport has already been opened.

    NOTE(review): the transport is not returned, so callers have no handle to
    close it; the connection lives until the process exits.
    """
    sock = TSocket.TSocket(host, port)
    # Wrap the raw socket in a buffered transport before building the protocol.
    transport = TTransport.TBufferedTransport(sock)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = Hbase.Client(protocol)
    transport.open()
    return client

if __name__ == "__main__":
    # Copy rows of the MySQL table ca_record into the HBase table 'ca_record'.
    # Field 0 of each fetched row is used as the HBase row key; fields 1..6
    # are written as cells named "<field_name>:<index>" (index 0..5).
    field_names = ["user_id", "ca_result", "ca_time", "real_name", "id_card", "sex"]

    client = client_conn()

    # NOTE(review): connection credentials are hard-coded here; consider
    # loading them from configuration instead of keeping them in source.
    conn = MySQLdb.connect(host="172.161.110.10", user="dlan", passwd="root123",
                           port=5029, db='coolqi', charset='utf8')
    try:
        cur = conn.cursor()
        try:
            sql = "select * from ca_record where ca_time>=STR_TO_DATE('20170720','%Y%m%d')"
            # Parenthesised single-argument print works on Python 2 and 3.
            print(sql)
            cur.execute(sql)

            for record in cur.fetchall():
                rowKey = record[0]
                print(rowKey)

                # Build every cell of this row up front so the row is written
                # with a single mutateRow call instead of one call per column.
                # Values are stringified exactly as before (NULL -> 'None').
                mutations = [
                    Mutation(column=name + ':' + str(idx), value=str(value))
                    for idx, (name, value) in enumerate(zip(field_names, record[1:7]))
                ]
                client.mutateRow('ca_record', str(rowKey), mutations)
        finally:
            # Release the cursor even if the query or an HBase write fails.
            cur.close()
    finally:
        conn.close()

### 注：遍历查询结果时，可以用 enumerate() 函数（或直接遍历 data）代替 for k in xrange(len(data)) 这种按下标取值的写法，从而简化上面的循环。