Kafka producer overview
python clients:
confluent-kafka-python : created by Confluent which is based on C client librdkafka
kafka-python: created by open source community which is fully written by python
Next, I will show you the kafka producer workflow using kafka-python:
Initialize
# Create the producer object by passing the config dictionary
producer = KafkaProducer(**kafka_config)
# In the __init__ method of KafkaProducer, it creates below import objects:
# A network client for asynchronous request/response network I/O.
# This is an internal class used to implement the user-facing producer and consumer clients. This class is not thread-safe!
client = KafkaClient(metrics=self._metrics, metric_group_prefix='producer', wakeup_timeout_ms=self.config['max_block_ms'], **self.config)
# This class maintains a dequeue per TopicPartition that accumulates messages into MessageSets to be sent to the server.
# The accumulator attempts to bound memory use, and append calls will block when that memory is exhausted.
self._accumulator = RecordAccumulator(message_version=message_version, metrics=self._metrics, **self.config)
self._metadata = client.cluster
# The background thread that handles the sending of produce requests to the Kafka cluster. This thread makes metadata requests to renew its view of the cluster and then sends produce requests to the appropriate nodes.
self._sender = Sender(client, self._metadata, self._accumulator, self._metrics, guarantee_message_order=guarantee_message_order, **self.config)
# then start sender as a separate daemon thread:
self._sender.daemon = True
self._sender.start()
# In __init__ of KafkaClient:
# Check Broker Version if not set explicitly, it will try to connect to bootstrap server and may raise Exception
if self.config['api_version'] is None:
check_timeout = self.config['api_version_auto_timeout_ms'] / 1000
self.config['api_version'] = self.check_version(timeout=check_timeout)
# Create a pair connected extra socketa(AF_UNIX) used to wake up selector.select(EpollSelector):
self._wake_r, self._wake_w = socket.socketpair()
self._wake_r.setblocking(False)
self._wake_w.settimeout(self.config['wakeup_timeout_ms'] / 1000.0)
self._selector.register(self._wake_r, selectors.EVENT_READ)
def wakeup(self):
with self._wake_lock:
try:
self._wake_w.sendall(b'x')
except socket.timeout:
log.warning('Timeout to send to wakeup socket!')
raise Errors.KafkaTimeoutError()
except socket.error:
log.warning('Unable to send to wakeup socket!')
# In _poll(self, timeout) of KafkaClient:
ready = self._selector.select(timeout)
for key, events in ready:
if key.fileobj is self._wake_r:
self._clear_wake_fd()
continue
elif not (events & selectors.EVENT_READ):
continue
conn = key.data
processed.add(conn)
def _clear_wake_fd(self):
# reading from wake socket should only happen in a single thread
while True:
try:
self._wake_r.recv(1024)
except socket.error:
break
Get cluster meta data
# When Sender thread is started, its run method is called, then run_once->self._client.poll(poll_timeout_ms)->self._maybe_refresh_metadata()
# If _need_update is not set, Cluster meta data is refreshed every metadata_max_age_ms
ttl = self.cluster.ttl()
def ttl(self):
"""Milliseconds until metadata should be refreshed"""
now = time.time() * 1000
if self._need_update:
ttl = 0
else:
metadata_age = now - self._last_successful_refresh_ms
ttl = self.config['metadata_max_age_ms'] - metadata_age
retry_age = now - self._last_refresh_ms
next_retry = self.config['retry_backoff_ms'] - retry_age
return max(ttl, next_retry, 0)
# In the send function of KafkaProducer, _wait_on_metadata is first called to check if partitions exist for the topic, if not, it will set _need_update of ClusterMetadata and wait for max_block_ms, if still can't find, it will raise KafkaTimeoutError:
def _wait_on_metadata(self, topic, max_wait):
# add topic to metadata topic list if it is not there already.
self._sender.add_topic(topic)
begin = time.time()
elapsed = 0.0
metadata_event = None
while True:
partitions = self._metadata.partitions_for_topic(topic)
if partitions is not None:
return partitions
if not metadata_event:
metadata_event = threading.Event()
log.debug("Requesting metadata update for topic %s", topic)
metadata_event.clear()
future = self._metadata.request_update()
future.add_both(lambda e, *args: e.set(), metadata_event)
self._sender.wakeup()
metadata_event.wait(max_wait - elapsed)
elapsed = time.time() - begin
if not metadata_event.is_set():
raise Errors.KafkaTimeoutError(
"Failed to update metadata after %.1f secs." % (max_wait,))
elif topic in self._metadata.unauthorized_topics:
raise Errors.TopicAuthorizationFailedError(topic)
else:
log.debug("_wait_on_metadata woke after %s secs.", elapsed)
Serialize
# In send function of KafkaProducer
key_bytes = self._serialize(self.config['key_serializer'], topic, key)
value_bytes = self._serialize(self.config['value_serializer'], topic, value)
Partition
# In send function of KafkaProducer, use partition (int, optional) optionally specify a partition. If not set, the partition will be selected using the configured 'partitioner'.
def _partition(self, topic, partition, key, value,
serialized_key, serialized_value):
if partition is not None:
assert partition