kafka-python:https://github.com/dpkp/kafka-python
kafka-python 文档:https://kafka-python.readthedocs.io/en/master/apidoc/modules.html
kafka 官方文档:http://kafka.apache.org/documentation.html
Python 操作 Kafka 的通俗总结(kafka-python):https://zhuanlan.zhihu.com/p/279784873
译:Kafka 和 Unix 管道的示例:http://zqhxuyuan.github.io/2016/01/05/2016-01-05-Kafka-Unix/
一、基本概念
- Topic:一组消息数据的标记符;
- Producer:生产者,用于生产数据,可将生产后的消息送入指定的 Topic;
- Consumer:消费者,获取数据,可消费指定的 Topic 里面的数据
- Group:消费者组,同一个 group 可以有多个消费者,一条消息在一个 group 中,只会被一个消费者获取;
- Partition:分区,为了保证 kafka 的吞吐量,一个 Topic 可以设置多个分区。在同一个消费者组内,一个分区同一时刻只会被其中一个消费者消费(不同组的消费者可以同时消费同一分区)。
二、安装 kafka-python
pip 命令:pip install kafka-python
三、生产者(Producer)与 消费者(Consumer)
生产者 示例:
# -*- coding: utf-8 -*-
# Standard library
import json

# Third-party
import msgpack
from loguru import logger
from kafka import KafkaProducer
from kafka.errors import KafkaError
def kfk_produce_1():
    """Send one JSON-serialized message to ``test_topic``.

    The producer JSON-encodes every value via ``value_serializer``.
    ``send()`` is asynchronous: without flushing, the process may exit
    before the buffered record ever reaches the broker, silently losing
    the message.  We therefore close the producer (which flushes by
    default) before returning.

    :return: None
    """
    producer = KafkaProducer(
        bootstrap_servers='ip:9092',  # TODO: replace with a real broker address
        value_serializer=lambda v: json.dumps(v).encode('utf-8'),
    )
    try:
        producer.send('test_topic', {'key1': 'value1'})
    finally:
        # Blocks until buffered records are delivered, then releases resources.
        producer.close()
def kfk_produce_2():
    """Send a JSON string to partition 0 of ``test_topic``.

    No ``value_serializer`` is configured on this producer, so the value
    handed to ``send()`` must be ``bytes``.  The original passed a ``str``,
    which kafka-python rejects (AssertionError); we encode to UTF-8.

    :return: None
    """
    producer = KafkaProducer(bootstrap_servers='xxxx:x')  # TODO: real broker address
    data_dict = {
        "name": 'king',
        'age': 100,
        "msg": "Hello World",
    }
    # Encode to bytes: without a serializer the producer only accepts bytes.
    msg = json.dumps(data_dict).encode('utf-8')
    producer.send('test_topic', msg, partition=0)
    producer.close()
def kfk_produce_3():
    """Demonstrate the main KafkaProducer usage patterns.

    Covers: synchronous sends via ``future.get()``, keyed messages for
    hashed partitioning, msgpack/JSON value serializers, fire-and-forget
    async sends, and callback-based async sends.

    :return: None
    """
    producer = KafkaProducer(bootstrap_servers=['broker1:1234'])

    # send() is asynchronous by default and returns a future.
    future = producer.send('my-topic', b'raw_bytes')

    # Block for a 'synchronous' send.
    try:
        record_metadata = future.get(timeout=10)
    except KafkaError as exc:
        # Log the actual exception instance (the original logged the
        # KafkaError *class*), and bail out: record_metadata is undefined
        # on this path, so falling through would raise NameError below.
        logger.error(exc)
        return

    # A successful result carries the assigned partition and offset.
    print(record_metadata.topic)
    print(record_metadata.partition)
    print(record_metadata.offset)

    # Produce keyed messages to enable hashed partitioning.
    producer.send('my-topic', key=b'foo', value=b'bar')

    # Encode objects via msgpack.
    producer = KafkaProducer(value_serializer=msgpack.dumps)
    producer.send('msgpack-topic', {'key': 'value'})

    # Produce JSON messages.
    producer = KafkaProducer(value_serializer=lambda m: json.dumps(m).encode('ascii'))
    producer.send('json-topic', {'key': 'value'})

    # Produce asynchronously (fire-and-forget).
    for _ in range(100):
        producer.send('my-topic', b'msg')

    def on_send_success(record_metadata=None):
        # Success callback: report where the record landed.
        print(record_metadata.topic)
        print(record_metadata.partition)
        print(record_metadata.offset)

    def on_send_error(excp=None):
        # Error callback: log the exception handed over by kafka-python.
        logger.error('I am an errback', exc_info=excp)

    # Produce asynchronously with callbacks.
    producer.send('my-topic', b'raw_bytes') \
        .add_callback(on_send_success) \
        .add_errback(on_send_error)

    # Block until all async messages are sent.
    producer.flush()

    # Configure multiple retries.
    producer = KafkaProducer(retries=5)
if __name__ == '__main__':
    # Demo entry point: run the two simple producer examples.
    kfk_produce_1()
    kfk_produce_2()
消费者 示例:
# -*- coding: utf-8 -*-
import json
import msgpack
from kafka import KafkaConsumer

# Consume the latest messages and auto-commit offsets.
consumer = KafkaConsumer(
    'my-topic',
    group_id='my-group',
    bootstrap_servers=['localhost:9092'],
)
for message in consumer:
    # message value and key are raw bytes -- decode if necessary!
    # e.g., for unicode: `message.value.decode('utf-8')`
    print(
        f'{message.topic}:{message.partition}:{message.offset}: '
        f'key={message.key}, value={message.value}'
    )

# Consume the earliest available messages; do not commit offsets.
KafkaConsumer(auto_offset_reset='earliest', enable_auto_commit=False)

# Consume JSON messages.
KafkaConsumer(value_deserializer=lambda m: json.loads(m.decode('ascii')))

# Consume msgpack.
KafkaConsumer(value_deserializer=msgpack.unpackb)

# Raise StopIteration if no message arrives within 1 second.
KafkaConsumer(consumer_timeout_ms=1000)

# Subscribe to a regex topic pattern.
consumer = KafkaConsumer()
consumer.subscribe(pattern='^awesome.*')

# Use multiple consumers in parallel with 0.9 kafka brokers --
# typically each would run on a different server / process / CPU.
consumer1 = KafkaConsumer('my-topic',
                          group_id='my-group',
                          bootstrap_servers='my.server.com')
consumer2 = KafkaConsumer('my-topic',
                          group_id='my-group',
                          bootstrap_servers='my.server.com')
简单封装:
# -*- coding: utf-8 -*-
import time
import json
import ujson
import random
from loguru import logger