Kafka Connect 非常强大,但也有局限性,无法进行个性化定制;如果需要了解 Kafka Connect,可以参考我的另一篇博客(博客地址)。
用 Python 实现起来其实也很简单,就是利用消费者导出、生产者导入,而且效率也很不错。
代码介绍
下面是一个从某个topic某个分区读数据,然后写到另外一个topic的完整代码
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import sys
import time
from kafka import KafkaConsumer, TopicPartition, KafkaProducer
class Kafka_producer():
    """Wrapper around ``KafkaProducer`` that sends JSON values to one topic.

    Args:
        kafkahost: Broker host name or address (without the port).
        kafkaport: Broker port.
        kafkatopic: Topic that every message is sent to.
    """

    def __init__(self, kafkahost, kafkaport, kafkatopic):
        self.producer = KafkaProducer(
            bootstrap_servers=["{}:{}".format(kafkahost, kafkaport)],
            # Each value is dumped to JSON text and encoded as UTF-8 bytes.
            value_serializer=lambda v: json.dumps(v).encode('utf-8')
        )
        self.kafkatopic = kafkatopic

    def sendjsondata(self, msg):
        """Pass ``msg`` to the underlying producer's ``send`` for the topic.

        Args:
            msg: Value to send; it must be serializable by ``json.dumps``.

        A ``KeyboardInterrupt`` raised during the send is printed and
        suppressed, so the caller keeps running.
        """
        try:
            self.producer.send(topic=self.kafkatopic, value=msg)
        except KeyboardInterrupt as e:
            # ``except X as e`` / ``print(e)`` work on Python 2.6+ and 3;
            # the old ``except X, e`` / ``print e`` forms are Python-2-only.
            print(e)
class Kafka_consumer():
    """Wrapper around ``KafkaConsumer`` that reads one partition of a topic.

    Args:
        kafkahost: Broker host name or address (without the port).
        kafkaport: Broker port.
        kafkatopic: Topic to read from.
        partition_num: Partition of ``kafkatopic`` to read.
        groupid: Consumer group id passed to ``KafkaConsumer``.
        start: Offset to seek to before consuming. A value <= 0 skips the
            seek; the starting position is then left to the consumer itself
            (presumably the group's committed offset or the offset-reset
            policy -- confirm against the kafka-python documentation).
        end: Last offset to yield (inclusive). A value <= 0 means no upper
            bound.
        consumer_timeout_ms: Passed through to ``KafkaConsumer``.
    """

    def __init__(self, kafkahost, kafkaport, kafkatopic, partition_num, groupid,
                 start=0, end=0, consumer_timeout_ms=10000):
        self.consumer = KafkaConsumer(
            group_id=groupid,
            bootstrap_servers=["{}:{}".format(kafkahost, kafkaport)],
            value_deserializer=json.loads,
            consumer_timeout_ms=consumer_timeout_ms
        )
        start = int(start)
        # ``sys.maxsize`` exists on Python 2.6+ and 3 (``sys.maxint`` is
        # Python-2-only). A non-positive ``end`` is treated as "unbounded"
        # so the default ``end=0`` no longer stops before the first message.
        self.end = end if end > 0 else sys.maxsize
        # Always assign the partition: without an assignment the consumer
        # has nothing to iterate, which made the default ``start=0`` unusable.
        partition = TopicPartition(kafkatopic, partition_num)
        self.consumer.assign([partition])
        if start > 0:
            # Begin consuming from the requested offset.
            self.consumer.seek(partition, start)

    def consume_data(self):
        """Yield messages from the consumer until one is past ``self.end``.

        Yields:
            Each message whose ``offset`` is <= ``self.end``, in the order
            the underlying consumer produces them.

        Iteration stops at the first message whose offset exceeds
        ``self.end``, or when the underlying consumer's iterator ends. A
        ``KeyboardInterrupt`` is printed and ends the generator instead of
        propagating.
        """
        try:
            for message in self.consumer:
                if message.offset > self.end:
                    break
                yield message
        except KeyboardInterrupt as e:
            print(e)
def getSysMills():
    """Return the current ``time.time()`` value as whole milliseconds."""
    now_ms = time.time() * 1000
    return int(round(now_ms))
def main():
    """Copy an offset range of one source topic partition to another topic.

    Reads messages from ``source_topic`` / ``source_partition_num`` starting
    at ``start_offset`` and sends each message value to ``dest_topic``,
    printing progress every 1000 messages and a summary at the end. The
    producer and consumer are closed even if the copy loop raises.
    """
    start_offset = 2127489  # starting offset
    end_offset = 4044141  # ending offset (consumer stops once an offset exceeds it)
    consumer_timeout_ms_global = 10000
    source_hosts = "xxxx"  # host only, no port
    source_topic = "topic_xxxx"
    source_partition_num = 0
    group = source_topic + '_xxxx_group'
    dest_hosts = "xxxxxx"
    dest_topic = "xxxxxxxx"
    print("source_hosts:{},source_topic:{},source_partition_num:{},start_offset:{},end_offset:{}".format(source_hosts, source_topic, source_partition_num, start_offset, end_offset))
    start_time = getSysMills()
    consumer = Kafka_consumer(source_hosts, 9092, source_topic, source_partition_num, group, start=start_offset, end=end_offset, consumer_timeout_ms=consumer_timeout_ms_global)
    producer = None
    count = 0
    try:
        messages = consumer.consume_data()  # export side
        print("dest_hosts:{},dest_topic:{}".format(dest_hosts, dest_topic))
        producer = Kafka_producer(dest_hosts, 9092, dest_topic)
        for message in messages:
            producer.sendjsondata(message.value)  # import side
            print(message.value)
            count = count + 1
            if count % 1000 == 0:
                print("migrate already completed count:{}".format(count))
    finally:
        # Release both clients even on failure; the producer is closed first,
        # matching the original shutdown order.
        try:
            if producer is not None:
                producer.producer.close()
        finally:
            consumer.consumer.close()
    end_time = getSysMills()
    # print() with a single argument behaves the same on Python 2 and 3.
    print("migrate complete,cost:{},count:{}".format(end_time - start_time, count))
# Run the migration only when this file is executed as a script.
if __name__ == '__main__':
    main()