【Python实现】Kafka批量导入导出

Kafka Connect非常强大,但是也有局限性,不能进行个性化的定制;如果需要定制,可以参考我的另一篇博客(博客地址见文内链接)。

Python实现起来其实也很简单,就是利用消费者导出、生产者导入,而且效率也很不错。

代码介绍

下面是一个从某个topic某个分区读数据,然后写到另外一个topic的完整代码

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import json
import sys
import time

from kafka import KafkaConsumer, TopicPartition, KafkaProducer


class Kafka_producer():
    """Kafka producer wrapper that JSON-serializes every message.

    Each value passed to :meth:`sendjsondata` is encoded with
    ``json.dumps`` (UTF-8) and sent to the topic fixed at construction.
    """

    def __init__(self, kafkahost, kafkaport, kafkatopic):
        # bootstrap_servers expects "host:port" strings.
        self.producer = KafkaProducer(
            bootstrap_servers=["{}:{}".format(kafkahost, kafkaport)],
            value_serializer=lambda v: json.dumps(v).encode('utf-8')
        )
        self.kafkatopic = kafkatopic

    def sendjsondata(self, msg):
        """Asynchronously send *msg* (any JSON-serializable value) to the topic.

        A Ctrl-C during the send is reported instead of propagated so a
        long-running migration loop can wind down gracefully.
        """
        try:
            self.producer.send(topic=self.kafkatopic, value=msg)
        # Fix: ``except KeyboardInterrupt, e`` / ``print e`` are Python 2-only
        # syntax and a SyntaxError on Python 3; this form works on 2.6+ and 3.x.
        except KeyboardInterrupt as e:
            print(e)

class Kafka_consumer():
    """Kafka consumer wrapper that reads a JSON-decoded offset range.

    When ``start`` > 0 the consumer is pinned to one partition of
    ``kafkatopic`` and seeks to ``start``; iteration then stops once an
    offset greater than ``end`` is seen (``end`` is inclusive).  With the
    default ``start=0`` it consumes via the group subscription until
    ``consumer_timeout_ms`` elapses with no new messages.
    """

    def __init__(self, kafkahost, kafkaport, kafkatopic, partition_num, groupid, start=0, end=0, consumer_timeout_ms=10000):
        self.consumer = KafkaConsumer(
            group_id=groupid,
            bootstrap_servers=["{}:{}".format(kafkahost, kafkaport)],
            value_deserializer=json.loads,
            consumer_timeout_ms=consumer_timeout_ms
        )
        # Default upper bound: effectively unbounded, stop on the timeout.
        # Fix: sys.maxint was removed in Python 3; sys.maxsize exists on
        # Python 2.6+ and 3.x alike.
        self.end = sys.maxsize
        # Consume from a specific offset range when a start offset is given.
        if start > 0:
            self.end = end
            partition = TopicPartition(kafkatopic, partition_num)
            self.consumer.assign([partition])
            self.consumer.seek(partition, int(start))

    def consume_data(self):
        """Yield messages until the end offset (inclusive) has been passed."""
        try:
            for message in self.consumer:
                if message.offset > self.end:
                    break
                yield message
        # Fix: Python 2-only ``except X, e`` / ``print e`` syntax.
        except KeyboardInterrupt as e:
            print(e)


def getSysMills():
    """Return the current wall-clock time in whole milliseconds."""
    millis = time.time() * 1000
    return int(round(millis))

def main():
    """Migrate an offset range of one topic partition to another topic.

    Consumes ``[start_offset, end_offset]`` from one partition of the
    source topic and re-publishes every message value to the destination
    topic, printing progress every 1000 messages.
    """
    start_offset = 2127489  # first offset to read (inclusive)
    end_offset = 4044141    # last offset to read (inclusive)
    consumer_timeout_ms_global = 10000

    source_hosts = "xxxx"  # host only; the port is supplied separately
    source_topic = "topic_xxxx"
    source_partition_num = 0
    group = source_topic + '_xxxx_group'

    dest_hosts = "xxxxxx"
    dest_topic = "xxxxxxxx"

    print("source_hosts:{},source_topic:{},source_partition_num:{},start_offset:{},end_offset:{}".format(source_hosts, source_topic, source_partition_num, start_offset, end_offset))

    start_time = getSysMills()

    consumer = Kafka_consumer(source_hosts, 9092, source_topic, source_partition_num, group, start=start_offset, end=end_offset, consumer_timeout_ms=consumer_timeout_ms_global)
    messages = consumer.consume_data()  # export side (lazy generator)

    print("dest_hosts:{},dest_topic:{}".format(dest_hosts, dest_topic))
    producer = Kafka_producer(dest_hosts, 9092, dest_topic)

    count = 0
    for message in messages:
        producer.sendjsondata(message.value)  # import side
        print(message.value)
        count = count + 1
        if count % 1000 == 0:
            print("migrate already completed count:{}".format(count))

    # close() flushes any pending async sends before shutting down.
    producer.producer.close()
    consumer.consumer.close()
    end_time = getSysMills()
    # Fix: ``print "..."`` statement is a SyntaxError on Python 3.
    print("migrate complete,cost:{},count:{}".format(end_time - start_time, count))


if __name__ == '__main__':
    main()
  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值