Kafka Producer

Kafka producer overview
Python clients:
confluent-kafka-python: maintained by Confluent, built on top of the C client librdkafka
kafka-python: maintained by the open source community, written in pure Python

Next, I will walk through the kafka producer workflow using kafka-python:
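Before diving into the internals, here is a minimal usage sketch of the API this walkthrough dissects. The broker address and topic name are placeholder assumptions, not part of the walkthrough itself:

from kafka import KafkaProducer

kafka_config = {
    'bootstrap_servers': 'localhost:9092',  # assumed broker address
    'acks': 'all',       # wait for all in-sync replicas to acknowledge
    'retries': 3,
}
producer = KafkaProducer(**kafka_config)
# send() is asynchronous and returns a future; get() blocks for the result
future = producer.send('my-topic', key=b'my-key', value=b'hello')
record_metadata = future.get(timeout=10)
print(record_metadata.topic, record_metadata.partition, record_metadata.offset)
producer.flush()
producer.close()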

Initialize

# Create the producer object by passing the config dictionary
producer = KafkaProducer(**kafka_config)
# In the __init__ method of KafkaProducer, the following important objects are created:
# A network client for asynchronous request/response network I/O.
# This is an internal class used to implement the user-facing producer and consumer clients. This class is not thread-safe!
client = KafkaClient(metrics=self._metrics, metric_group_prefix='producer', wakeup_timeout_ms=self.config['max_block_ms'], **self.config)
# This class maintains a deque per TopicPartition that accumulates messages into MessageSets to be sent to the server.
# The accumulator attempts to bound memory use, and append calls will block when that memory is exhausted.
self._accumulator = RecordAccumulator(message_version=message_version, metrics=self._metrics, **self.config)
self._metadata = client.cluster
# The background thread that handles the sending of produce requests to the Kafka cluster. This thread makes metadata requests to renew its view of the cluster and then sends produce requests to the appropriate nodes.
self._sender = Sender(client, self._metadata, self._accumulator, self._metrics, guarantee_message_order=guarantee_message_order, **self.config)
# then start sender as a separate daemon thread:
self._sender.daemon = True
self._sender.start()
# In __init__ of KafkaClient:
# Check the broker version if it is not set explicitly; this tries to connect to a bootstrap server and may raise an exception
if self.config['api_version'] is None:
    check_timeout = self.config['api_version_auto_timeout_ms'] / 1000
    self.config['api_version'] = self.check_version(timeout=check_timeout)
# Create a connected pair of extra sockets (AF_UNIX) used to wake up selector.select() (e.g. EpollSelector):
self._wake_r, self._wake_w = socket.socketpair()
self._wake_r.setblocking(False)
self._wake_w.settimeout(self.config['wakeup_timeout_ms'] / 1000.0)
self._selector.register(self._wake_r, selectors.EVENT_READ)
def wakeup(self):
    with self._wake_lock:
        try:
            self._wake_w.sendall(b'x')
        except socket.timeout:
            log.warning('Timeout to send to wakeup socket!')
            raise Errors.KafkaTimeoutError()
        except socket.error:
            log.warning('Unable to send to wakeup socket!')
# In _poll(self, timeout) of KafkaClient:
ready = self._selector.select(timeout)
for key, events in ready:
    if key.fileobj is self._wake_r:
        self._clear_wake_fd()
        continue
    elif not (events & selectors.EVENT_READ):
        continue
    conn = key.data
    processed.add(conn)
def _clear_wake_fd(self):
    # reading from wake socket should only happen in a single thread
    while True:
        try:
            self._wake_r.recv(1024)
        except socket.error:
            break 
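To make the wakeup mechanism concrete, here is a self-contained sketch of the same socketpair trick (a toy demo, not kafka-python code): a selector blocked in select() is woken early by writing one byte to the paired socket, exactly as wakeup() interrupts the client's I/O loop above.

import selectors
import socket
import threading
import time

sel = selectors.DefaultSelector()
wake_r, wake_w = socket.socketpair()
wake_r.setblocking(False)
sel.register(wake_r, selectors.EVENT_READ)

def wake_later():
    time.sleep(0.5)
    wake_w.sendall(b'x')  # interrupts the 10-second select() below

threading.Thread(target=wake_later, daemon=True).start()

start = time.time()
events = sel.select(timeout=10)  # returns after ~0.5s instead of 10s
print('woke after %.2f secs, %d event(s)' % (time.time() - start, len(events)))

# Drain the wakeup byte(s) so the next select() does not fire immediately,
# mirroring _clear_wake_fd above
while True:
    try:
        wake_r.recv(1024)
    except BlockingIOError:
        break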

Get cluster metadata

# When the Sender thread is started, its run method is called, which loops: run_once -> self._client.poll(poll_timeout_ms) -> self._maybe_refresh_metadata()
# If _need_update is not set, cluster metadata is refreshed every metadata_max_age_ms
ttl = self.cluster.ttl()
def ttl(self):
    """Milliseconds until metadata should be refreshed"""
    now = time.time() * 1000
    if self._need_update:
        ttl = 0
    else:
        metadata_age = now - self._last_successful_refresh_ms
        ttl = self.config['metadata_max_age_ms'] - metadata_age
    retry_age = now - self._last_refresh_ms
    next_retry = self.config['retry_backoff_ms'] - retry_age
    return max(ttl, next_retry, 0)
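As a quick worked example of this arithmetic (the timestamps are made-up assumptions; 300000 and 100 are the kafka-python defaults for metadata_max_age_ms and retry_backoff_ms):

import time

now = time.time() * 1000
last_successful_refresh_ms = now - 290000  # last refresh 290s ago
last_refresh_ms = now - 290000
metadata_max_age_ms = 300000
retry_backoff_ms = 100

ttl = metadata_max_age_ms - (now - last_successful_refresh_ms)  # 10000
next_retry = retry_backoff_ms - (now - last_refresh_ms)         # negative
print(max(ttl, next_retry, 0))  # ~10000 ms until the next refresh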
    
# In the send function of KafkaProducer, _wait_on_metadata is called first to check whether partitions exist for the topic. If not, it sets _need_update on ClusterMetadata and waits up to max_block_ms; if the partitions still cannot be found, it raises KafkaTimeoutError:
def _wait_on_metadata(self, topic, max_wait):
    # add topic to metadata topic list if it is not there already.
    self._sender.add_topic(topic)
    begin = time.time()
    elapsed = 0.0
    metadata_event = None
    while True:
        partitions = self._metadata.partitions_for_topic(topic)
        if partitions is not None:
            return partitions

        if not metadata_event:
            metadata_event = threading.Event()

        log.debug("Requesting metadata update for topic %s", topic)

        metadata_event.clear()
        future = self._metadata.request_update()
        future.add_both(lambda e, *args: e.set(), metadata_event)
        self._sender.wakeup()
        metadata_event.wait(max_wait - elapsed)
        elapsed = time.time() - begin
        if not metadata_event.is_set():
            raise Errors.KafkaTimeoutError(
                "Failed to update metadata after %.1f secs." % (max_wait,))
        elif topic in self._metadata.unauthorized_topics:
            raise Errors.TopicAuthorizationFailedError(topic)
        else:
            log.debug("_wait_on_metadata woke after %s secs.", elapsed)

Serialize

# In the send function of KafkaProducer
key_bytes = self._serialize(self.config['key_serializer'], topic, key)
value_bytes = self._serialize(self.config['value_serializer'], topic, value)
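Both serializers default to None, in which case keys and values must already be bytes. A hedged example of configuring them (the JSON choice and broker address are assumptions; any callable returning bytes works):

import json
from kafka import KafkaProducer

producer = KafkaProducer(
    bootstrap_servers='localhost:9092',  # assumed broker address
    key_serializer=lambda k: k.encode('utf-8') if k is not None else None,
    value_serializer=lambda v: json.dumps(v).encode('utf-8'),
)
producer.send('my-topic', key='user-42', value={'event': 'login'})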

Partition

# In the send function of KafkaProducer, partition (int, optional) optionally specifies a partition. If not set, the partition will be selected using the configured 'partitioner'.
def _partition(self, topic, partition, key, value,
               serialized_key, serialized_value):
    if partition is not None:
        assert partition >= 0
        assert partition in self._metadata.partitions_for_topic(topic), 'Unrecognized partition'
        return partition

    all_partitions = sorted(self._metadata.partitions_for_topic(topic))
    available = list(self._metadata.available_partitions_for_topic(topic))
    return self.config['partitioner'](serialized_key,
                                      all_partitions,
                                      available)
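For keyless messages the default partitioner picks a random available partition; for keyed messages it hashes the serialized key with murmur2, so the same key always maps to the same partition. A small sketch calling kafka-python's DefaultPartitioner directly (the partition lists are made-up assumptions):

from kafka.partitioner import DefaultPartitioner

partitioner = DefaultPartitioner()
all_partitions = [0, 1, 2, 3]
available = [0, 1, 3]  # e.g. partition 2's leader is currently down

print(partitioner(b'user-42', all_partitions, available))  # deterministic: same key -> same partition
print(partitioner(None, all_partitions, available))        # random choice among available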