sqoop 导入 mysql 数据到 hbase 效率不高,并且不能实时导入数据,这里使用 kafka-thrift 方案。
原理:mysql 需要开启 binlog,配置 maxwell 监听 binlog,这样 mysql 任何数据变动都会反映在 binlog 文件中,由 maxwell 解析成 json 格式,传入消息队列 kafka;然后通过自己写 python 程序作为 kafka 消费者,将数据通过 happybase 包存储在 hbase 中(happybase 包依赖 thrift)。这种方式可以实时将 mysql 数据导入 hbase 中。
Phoenix安装 (便于查询 hbase, 安装可选)
由于HBase数据库只提供了Java和shell两种接口,并不支持SQL,所以诞生了Phoenix 工具
# On ubuntu1
cd /opt
wget http://mirror.bit.edu.cn/apache/phoenix/apache-phoenix-4.15.0-HBase-1.4/bin/apache-phoenix-4.15.0-HBase-1.4-bin.tar.gz
tar zvxf apache-phoenix-4.15.0-HBase-1.4-bin.tar.gz
mv apache-phoenix-4.15.0-HBase-1.4-bin phoenix-4.15.0
# Copy phoenix-4.15.0-HBase-1.4-server.jar into the hbase lib directory on
# every node (the original comment said phoenix-5.0.0-HBase-2.0-server.jar,
# which does not match the 4.15.0 files used below)
cp /opt/phoenix-4.15.0/phoenix-4.15.0-HBase-1.4-server.jar /opt/hbase-1.4.12/lib/
scp /opt/phoenix-4.15.0/phoenix-4.15.0-HBase-1.4-server.jar hadoop-2:/opt/hbase-1.4.12/lib/
scp /opt/phoenix-4.15.0/phoenix-4.15.0-HBase-1.4-server.jar hadoop-3:/opt/hbase-1.4.12/lib/
# Configure environment variables
vim /etc/profile.d/hbase-1.4.12.sh
export PHOENIX_HOME=/opt/phoenix-4.15.0
export PATH=$PATH:$PHOENIX_HOME/bin
source /etc/profile
# Restart hbase so it picks up the phoenix server jar
stop-hbase.sh
start-hbase.sh
# Start the phoenix SQL shell
sqlline.py
在 sqlline 中输入 !table 命令,能列出系统表即验证安装成功
kafka 安装
# On every node
cd /opt
wget https://mirrors.tuna.tsinghua.edu.cn/apache/kafka/2.4.0/kafka_2.13-2.4.0.tgz
tar zvxf kafka_2.13-2.4.0.tgz
mv kafka_2.13-2.4.0 kafka-2.4.0
# Edit the kafka config on each node individually
vim kafka-2.4.0/config/server.properties
broker.id=1 # must be different on every node; any integer works
log.dirs=/opt/kafka-2.4.0/kafka-logs
listeners=PLAINTEXT://10.0.0.10:9092 # change to each node's private IP
zookeeper.connect=hadoop-1:2181,hadoop-2:2181,hadoop-3:2181
# Start the broker
cd /opt/kafka-2.4.0
./bin/kafka-server-start.sh -daemon config/server.properties
# Create the topic; set --partitions according to the number of consumers you plan to run
./bin/kafka-topics.sh --create --bootstrap-server 10.0.0.10:9092 --replication-factor 2 --partitions 4 --topic mysql2hbase
# Increase the partition count later if needed
./kafka-topics.sh --bootstrap-server 10.0.0.10:9092 --alter --partitions 8 --topic mysql2hbase
# The partition count must be >= the number of consumers so they can consume
# in parallel; with a single partition only one consumer will do any work
# Verify the topic exists
./bin/kafka-topics.sh --list --bootstrap-server 10.0.0.10:9092
# Inspect consumer-group lag / progress
./kafka-consumer-groups.sh --bootstrap-server hadoop-1:9092 --describe --group testgroup
mysql 开启 binlog
vim /etc/mysql/my.cnf
[mysqld]
server-id=1
log-bin=/data/mysql/mysql-bin
binlog_format=row
systemctl restart mysql
# Create the account maxwell connects with and grant the replication privileges it needs
mysql> CREATE USER 'maxwell'@'%' IDENTIFIED BY 'maxwell';
mysql> GRANT ALL ON maxwell.* TO 'maxwell'@'%';
mysql> GRANT SELECT, REPLICATION CLIENT,REPLICATION SLAVE on *.* to 'maxwell'@'%';
mysql> flush privileges;
maxwell 部署
# On hadoop-1
cd /opt
wget https://github.com/zendesk/maxwell/releases/download/v1.24.1/maxwell-1.24.1.tar.gz
tar zvxf maxwell-1.24.1.tar.gz
cd maxwell-1.24.1
cp config.properties.example config.properties
vim config.properties
kafka.bootstrap.servers=hadoop-1:9092,hadoop-2:9092,hadoop-3:9092
# mysql login info
host=10.0.0.5
user=maxwell
password=maxwell
kafka_topic=mysql2hbase
kafka.acks=all
####### change partition_by to avoid skewed kafka partitions ######
# Details: https://www.jianshu.com/p/a102577bfc8f
# The zabbix history_uint table has no primary key; use transaction_id for it instead
kafka_partition_hash=murmur3
producer_partition_by=primary_key
# Start; maxwell reads the config file from the current working directory by default
./bin/maxwell --daemon
thrift 安装
apt install -y automake bison flex g++ git libboost1.55 libevent-dev libssl-dev libtool make pkg-config
cd /opt
wget http://mirrors.hust.edu.cn/apache/thrift/0.13.0/thrift-0.13.0.tar.gz
tar zvxf thrift-0.13.0.tar.gz
cd thrift-0.13.0
./configure --with-cpp --with-boost --with-python --without-csharp --with-java --without-erlang --without-perl --with-php --without-php_extension --without-ruby --without-haskell --without-go
make && make install
# Raise the hbase thrift server read timeout so long-running batch writes
# are not dropped mid-import
vim /opt/hbase-1.4.12/conf/hbase-site.xml
<property>
<name>hbase.thrift.server.socket.read.timeout</name>
<value>6000000</value>
<description>eg:milisecond</description>
</property>
# Start the hbase thrift server (happybase connects to it on port 9090)
hbase-daemon.sh start thrift
测试表结构:总共 342169 条数据,itemid 与 clock 为联合主键。该表数据不会新增,我们要将该表的全部数据都导入到 hbase 中。
我们将该表备份,然后删除表(drop table 不会写 binlog,还原表会),再还原表,这样 binlog 就能产生记录,maxwell 能解析并生产到 kafka 中。
如果该表数据在不断新增,那么这种方法只能导入新增的数据,并且下面的脚本中,要将 consumer_timeout_ms 删掉。
mysql to hbase
# Install dependencies (snappy headers are needed to build python-snappy)
apt-get install libsnappy-dev
# yum install libsnappy-devel
# Creating the python virtual environment is omitted here
pip install python-snappy pykafka happybase
python 消费者
# 使用脚本前, 先配置好 mysql->maxwell->kafka 数据流, hbase 安装并运行 thrift
# 然后将目标数据库导入mysql, 这样全部数据在 kafka 中, 核对 kafka 中 lag 是否与mysql表总数一样
# 执行该脚本, 脚本会持续消费 kafka 消息, 如果30s没有收到新的消息, 表示已全部消费完, 则表示全部数据已导入 hbase
# 该脚本可以执行多次, 以达到多进程多消费者同时消费的目的, 也可以使用下面的多进程版本
import json
import time
from pykafka import KafkaClient
import happybase
class mysqlToHbase():
    """Consume Maxwell binlog events from Kafka and load them into HBase.

    The consumer is a pykafka *balanced* consumer, so several copies of this
    script can run at once and share the topic's partitions (one consumer per
    process).  Rows are written through a local HBase thrift server via
    happybase.

    NOTE(review): the lowercase class name is kept because the __main__ block
    instantiates ``mysqlToHbase``; PEP 8 would prefer ``MysqlToHbase``.
    """

    def __init__(self):
        self.client = KafkaClient(
            hosts="hadoop-1:9092,hadoop-2:9092,hadoop-3:9092")
        self.topic = self.client.topics['mysql2hbase']
        self.consumer = self.topic.get_balanced_consumer(
            consumer_group='testgroup',
            # Stop once Kafka has had no new messages for 30 s; remove this
            # parameter for continuous (real-time) import.
            consumer_timeout_ms=30000,
            auto_commit_enable=True,
            zookeeper_connect='hadoop-1:2181,hadoop-2:2181,hadoop-3:2181')
        # Local HBase thrift server (started with: hbase-daemon.sh start thrift).
        self.conn = happybase.Connection(host="127.0.0.1",
                                         port=9090,
                                         autoconnect=True)

    def batchTokafka(self, mysql_table_name, mysql_table_id, include_column,
                     table_name, families):
        """Drain the Kafka topic and batch-write matching rows into HBase.

        :param mysql_table_name: source MySQL table whose events are accepted
        :param mysql_table_id: primary-key column names; their stringified
            values are concatenated to form the HBase row key
        :param include_column: ``['*']`` for all columns, or an explicit list
        :param table_name: destination HBase table name
        :param families: HBase column family used as the qualifier prefix
        """
        start_time = time.time()
        table = self.conn.table(table_name)
        i = 1
        # batch_size must match real row sizes: at 1024 * 1024 the zabbix
        # ``trends`` table only commits about every 26200 rows.
        with table.batch(batch_size=1024 * 1024) as bat:
            for m in self.consumer:
                # Parse each Maxwell JSON payload once (the original decoded
                # and parsed the same message three times).
                event = json.loads(m.value.decode('utf-8'))
                database = event["database"]
                name = event["table"]
                row_data = event["data"]
                if database == 'zabbix' and name == mysql_table_name:
                    # Row key = concatenated (composite) primary-key values.
                    table_id = ''.join(
                        [str(row_data[col]) for col in mysql_table_id])
                    # Key columns live in the row key, not as cells.
                    for key_col in mysql_table_id:
                        del row_data[key_col]
                    row_data = add_prefix(row_data, families, include_column)
                    # put() flushes automatically once batch_size is reached.
                    bat.put(table_id, row_data)
                    if i % 1000 == 0:
                        print("===========插入了" + str(i) + "数据!============")
                        print("===========累计耗时:" +
                              str(time.time() - start_time) + "s=============")
                    i += 1
        print('********** 总共插入: %d *************' % (i - 1))
        # ``mysql_table_counts`` is a module-level value set in __main__.
        print('********** mysql表总共: %d *************' % mysql_table_counts)

    def createTable(self, table_name, families):
        """Create ``table_name`` with the given families unless it exists."""
        if bytes(table_name, 'utf-8') in self.conn.tables():
            return
        self.conn.create_table(table_name, families)
        print("==========create table %s successful==========" % table_name)

    def deletTable(self, table_name, flag):
        """Delete ``table_name``; ``flag=True`` disables the table first."""
        self.conn.delete_table(table_name, flag)
        print("==========delete table %s successful==========" % table_name)

    def insertData(self, table_name, row, data):
        """Insert a single row (ad-hoc testing helper)."""
        table = self.conn.table(table_name)
        table.put(row=row, data=data)

    def getRow(self, table_name):
        """Print every row of ``table_name`` and the total row count."""
        table = self.conn.table(table_name)
        i = 0
        # Fixed: the original also printed table.scan() itself, which only
        # showed a useless generator repr.
        for key, value in table.scan():
            print('key: ', key, 'value: ', value)
            i += 1
        print("==========counts: %d ==========" % i)

    def close(self):
        """Close the HBase connection and drop the Kafka consumer."""
        self.conn.close()
        del self.consumer
def add_prefix(row_data, prefix, include_column):
    """Map MySQL columns to HBase qualified columns (``family:column``).

    With ``['*']`` every column of ``row_data`` is taken; otherwise only the
    listed columns.  Every value is stringified; a listed column missing from
    ``row_data`` yields the string ``'None'``.
    """
    selected = row_data if '*' in include_column else include_column
    return {
        prefix + ':' + column: str(row_data.get(column))
        for column in selected
    }
if __name__ == '__main__':
    # Source MySQL table and its composite primary key; the HBase row key is
    # the concatenation 'clock + itemid'.
    mysql_table_name = 'trends'
    mysql_table_id = ['clock', 'itemid']
    # Expected row count; batchTokafka prints it for verification.
    mysql_table_counts = 342169
    # ['*'] imports every column; use e.g. ['column1', 'column2'] for a subset.
    include_column = ['*']
    table_name = 'mysql_hbase_trends'
    families = 'info'

    htb = mysqlToHbase()
    htb.createTable(table_name, {families: {}})
    # Optional helpers: htb.getRow(table_name) dumps rows,
    # htb.deletTable(table_name, True) drops the table.
    htb.batchTokafka(mysql_table_name, mysql_table_id, include_column,
                     table_name, families)
    htb.close()
测试
单进程测试,30w 条记录大概耗时 1 分钟。342169 条数据全部导入 hbase 中,hbase 中验证:
多进程版本
# 使用脚本前, 先配置好 mysql->maxwell->kafka 数据流, hbase 安装并运行 thrift
# thrift 要设置超时时间 https://blog.51cto.com/13103353/2107257
# 然后将目标数据库导入mysql, 这样全部数据在 kafka 中, 核对 kafka 中 lag 是否与mysql表总数一样
# 执行该脚本, 脚本会持续消费 kafka 消息, 如果 5min 没有收到新的消息, 表示该消费者已无法从 kafka 获取消息, 终止相关进程
# 该脚本为多进程版本,每个进程一个消费者,kafka 相应 topic 的分区必须大于等于消费者总数才行
# 并且一定要确保生产者是在同时往 kafka 多个分区中同时写数据
# 进程数不能超过 kafka 中lag 不为0的分区数, 否则程序报错 RuntimeError: cannot join current thread, 该报错可忽略
import concurrent.futures
import json
from multiprocessing import cpu_count
import os
import time
from pykafka import KafkaClient
from pykafka.exceptions import ConsumerStoppedException
import happybase
class mysqlToHbase():
    """Consume Maxwell binlog events from Kafka and load them into HBase.

    Multiprocess variant: each worker process builds its own instance, so
    each owns one balanced consumer (one partition set) and one happybase
    connection to the local HBase thrift server.

    NOTE(review): the lowercase class name is kept because ``execute`` and
    __main__ instantiate ``mysqlToHbase``; PEP 8 would prefer
    ``MysqlToHbase``.
    """

    def __init__(self):
        self.client = KafkaClient(
            hosts="hadoop-1:9092,hadoop-2:9092,hadoop-3:9092")
        self.topic = self.client.topics['mysql2hbase']
        self.consumer = self.topic.get_balanced_consumer(
            consumer_group='testgroup',
            # Stop after 5 min without new messages.  For < 1M rows 30 s is
            # enough; larger data sets need more, but never exceed the hbase
            # thrift read timeout configured in hbase-site.xml.
            consumer_timeout_ms=5 * 60 * 1000,
            auto_commit_enable=True,
            zookeeper_connect='hadoop-1:2181,hadoop-2:2181,hadoop-3:2181')
        # Local HBase thrift server (started with: hbase-daemon.sh start thrift).
        self.conn = happybase.Connection(host="127.0.0.1",
                                         port=9090,
                                         autoconnect=True)

    def batchTokafka(self, mysql_table_name, mysql_table_id, include_column,
                     table_name, families, batch_size):
        """Drain this consumer's partitions and batch-write rows into HBase.

        :param mysql_table_name: source MySQL table whose events are accepted
        :param mysql_table_id: primary-key column names; their stringified
            values are concatenated to form the HBase row key
        :param include_column: ``['*']`` for all columns, or an explicit list
        :param table_name: destination HBase table name
        :param families: HBase column family used as the qualifier prefix
        :param batch_size: rows between explicit ``bat.send()`` commits
        """
        start_time = time.time()
        table = self.conn.table(table_name)
        i = 1
        with table.batch() as bat:
            try:
                for m in self.consumer:
                    # Parse each Maxwell JSON payload once (the original
                    # decoded and parsed the same message three times).
                    event = json.loads(m.value.decode('utf-8'))
                    database = event["database"]
                    name = event["table"]
                    row_data = event["data"]
                    if database == 'zabbix' and name == mysql_table_name:
                        # Row key = concatenated (composite) key values.
                        table_id = ''.join(
                            [str(row_data[col]) for col in mysql_table_id])
                        # Key columns live in the row key, not as cells.
                        for key_col in mysql_table_id:
                            del row_data[key_col]
                        row_data = add_prefix(row_data, families,
                                              include_column)
                        bat.put(table_id, row_data)
                        if i % batch_size == 0:
                            # Explicit commit every batch_size rows.
                            bat.send()
                            print(
                                "[%d]: " % os.getpid(),
                                "===========插入了" + str(i) + "数据!============")
                            print(
                                "[%d]: " % os.getpid(), "===========累计耗时:" +
                                str(time.time() - start_time) +
                                "s=============")
                        i += 1
            except ConsumerStoppedException:
                # Raised when the balanced consumer shuts down (e.g. after a
                # rebalance); treat it as a normal end of input.
                pass
        print("[%d]: " % os.getpid(),
              '********** 总共插入: %d *************' % (i - 1))

    def createTable(self, table_name, families):
        """Create ``table_name`` with the given families unless it exists."""
        if bytes(table_name, 'utf-8') in self.conn.tables():
            return
        self.conn.create_table(table_name, families)
        print("==========create table %s successful==========" % table_name)

    def deletTable(self, table_name, flag):
        """Delete ``table_name``; ``flag=True`` disables the table first."""
        self.conn.delete_table(table_name, flag)
        print("==========delete table %s successful==========" % table_name)

    def insertData(self, table_name, row, data):
        """Insert a single row (ad-hoc testing helper)."""
        table = self.conn.table(table_name)
        table.put(row=row, data=data)

    def getRow(self, table_name):
        """Print every row of ``table_name`` and the total row count."""
        table = self.conn.table(table_name)
        i = 0
        # Fixed: the original also printed table.scan() itself, which only
        # showed a useless generator repr.
        for key, value in table.scan():
            print('key: ', key, 'value: ', value)
            i += 1
        print("==========counts: %d ==========" % i)

    def close(self):
        """Close the HBase connection and drop the Kafka consumer."""
        self.conn.close()
        del self.consumer
def add_prefix(row_data, prefix, include_column):
    """Map MySQL columns to HBase qualified columns (``family:column``).

    With ``['*']`` every column of ``row_data`` is taken; otherwise only the
    listed columns.  Every value is stringified; a listed column missing from
    ``row_data`` yields the string ``'None'``.
    """
    selected = row_data if '*' in include_column else include_column
    return {
        prefix + ':' + column: str(row_data.get(column))
        for column in selected
    }
def execute(mysql_table_name, mysql_table_id, include_column, table_name,
            families, batch_size):
    """Worker entry point: build one consumer and drain its partitions.

    Runs inside a child process of the ProcessPoolExecutor; each process
    therefore owns its own Kafka consumer and HBase connection.
    """
    worker = mysqlToHbase()
    try:
        worker.batchTokafka(mysql_table_name, mysql_table_id, include_column,
                            table_name, families, batch_size)
    finally:
        # Fixed: the original leaked the HBase connection when batchTokafka
        # raised; always release it.
        worker.close()
if __name__ == '__main__':
    cpus = cpu_count()
    # Source MySQL table and its composite primary key; the HBase row key is
    # the concatenation 'clock + itemid'.
    mysql_table_name = 'trends'
    mysql_table_id = ['clock', 'itemid']
    mysql_table_counts = 342169
    # Rows between explicit batch commits.
    batch_size = 10000
    # ['*'] imports every column; use e.g. ['column1', 'column2'] for a subset.
    include_column = ['*']
    table_name = 'mysql_hbase_trends'
    families = 'info'

    # Create the HBase table once, before the workers start.
    creator = mysqlToHbase()
    creator.createTable(table_name, {families: {}})
    creator.close()

    # One consumer process per CPU; the topic's partition count must be >= cpus.
    with concurrent.futures.ProcessPoolExecutor(max_workers=cpus) as executor:
        futures = [
            executor.submit(execute, mysql_table_name, mysql_table_id,
                            include_column, table_name, families, batch_size)
            for _ in range(cpus)
        ]
        for fut in concurrent.futures.as_completed(futures):
            fut.result()
    print('********** mysql表总共: %d *************' % mysql_table_counts)
8核服务器运行,耗时大概10s