pykafka生产消费常用api
pykafka基本生产消费常用api
生产者
案例
# coding=utf-8
import time

from pykafka import KafkaClient


class KafkaTest(object):
    """Exercise the commonly used pykafka producer APIs against a live broker.

    Every method talks to the cluster configured in ``__init__`` and prints
    what it observes; nothing is returned.
    """

    def __init__(self, host="192.168.237.129:9092"):
        """Create the Kafka client.

        :param host: broker address(es) handed to ``KafkaClient(hosts=...)``.
        """
        self.host = host
        self.client = KafkaClient(hosts=self.host)

    def producer_partition(self):
        """Show the topic's partitions and how offsets move when producing.

        :return: None
        """
        topic = self.client.topics["test_topic".encode()]
        partitions = topic.partitions
        print(u"查看所有分区 {}".format(partitions))

        earliest_offset = topic.earliest_available_offsets()
        print(u"获取最早可用的offset {}".format(earliest_offset))

        # Offsets before anything is produced.
        last_offset = topic.latest_available_offsets()
        print(u"最近可用offset {}".format(last_offset))

        # Produce synchronously. The context manager stops the producer on
        # exit so its worker threads and connections are released.
        with topic.get_producer(sync=True) as p:
            p.produce(str(time.time()).encode())

        # Offsets after producing, for comparison.
        last_offset = topic.latest_available_offsets()
        print(u"最近可用offset {}".format(last_offset))

    def producer_designated_partition(self):
        """Write a message to a specific partition.

        Steering a message requires a partitioner function when the producer
        is created, plus a ``partition_key`` when the message is produced.

        :return: None
        """

        def assign_partition(pid, key):
            """Always choose the first partition (id=0).

            :param pid: the list of candidate partitions.
            :param key: the partition key supplied to ``produce``.
            :return: the first entry of ``pid``.
            """
            print("为消息分配partition {} {}".format(pid, key))
            return pid[0]

        topic = self.client.topics["test_topic".encode()]
        with topic.get_producer(sync=True, partitioner=assign_partition) as p:
            p.produce(str(time.time()).encode(), partition_key=b"partition_key_0")

    def async_produce_message(self):
        """Produce asynchronously and time how long delivery takes.

        Asynchronous messages are pushed onto a queue; another thread sends
        them in a batch once the queue reaches ``min_queued_messages`` or
        ``linger_ms`` has elapsed (5 s by default).

        :return: None
        """
        topic = self.client.topics["kafka_test".encode()]
        last_offset = topic.latest_available_offsets()
        print("最近的偏移量 offset {}".format(last_offset))

        # Remember the starting offset of partition 0.
        old_offset = last_offset[0].offset[0]

        with topic.get_producer(sync=False, partitioner=lambda pid, key: pid[0]) as p:
            p.produce(str(time.time()).encode())
            s_time = time.time()
            # Poll once per second until partition 0's offset moves, i.e. the
            # queued message has actually reached the broker.
            while True:
                last_offset = topic.latest_available_offsets()
                print("最近可用offset {}".format(last_offset))
                if last_offset[0].offset[0] != old_offset:
                    e_time = time.time()
                    print('cost time {}'.format(e_time - s_time))
                    break
                time.sleep(1)

    def get_produce_message_report(self):
        """Fetch the delivery report of an asynchronously produced message.

        By default the report only becomes available after roughly 5 s.

        :return: None
        """
        topic = self.client.topics["kafka_test".encode()]
        last_offset = topic.latest_available_offsets()
        print("最近的偏移量 offset {}".format(last_offset))

        with topic.get_producer(sync=False, delivery_reports=True,
                                partitioner=lambda pid, key: pid[0]) as p:
            p.produce(str(time.time()).encode())
            s_time = time.time()
            delivery_report = p.get_delivery_report()
            e_time = time.time()
            print('等待{}s, 递交报告{}'.format(e_time - s_time, delivery_report))

        last_offset = topic.latest_available_offsets()
        print("最近的偏移量 offset {}".format(last_offset))


if __name__ == '__main__':
    kafka_ins = KafkaTest()
    # Other demos that can be run instead:
    # kafka_ins.producer_partition()
    # kafka_ins.producer_designated_partition()
    # kafka_ins.async_produce_message()
    kafka_ins.get_produce_message_report()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
|
#coding=utf-8
import
time
from
pykafka
import
KafkaClient
class KafkaTest(object):
    """Exercise the commonly used pykafka producer APIs against a live broker.

    Every method talks to the cluster configured in ``__init__`` and prints
    what it observes; nothing is returned.
    """

    def __init__(self, host="192.168.237.129:9092"):
        """Create the Kafka client.

        :param host: broker address(es) handed to ``KafkaClient(hosts=...)``.
        """
        self.host = host
        self.client = KafkaClient(hosts=self.host)

    def producer_partition(self):
        """Show the topic's partitions and how offsets move when producing.

        :return: None
        """
        topic = self.client.topics["test_topic".encode()]
        partitions = topic.partitions
        print(u"查看所有分区 {}".format(partitions))

        earliest_offset = topic.earliest_available_offsets()
        print(u"获取最早可用的offset {}".format(earliest_offset))

        # Offsets before anything is produced.
        last_offset = topic.latest_available_offsets()
        print(u"最近可用offset {}".format(last_offset))

        # Produce synchronously. The context manager stops the producer on
        # exit so its worker threads and connections are released.
        with topic.get_producer(sync=True) as p:
            p.produce(str(time.time()).encode())

        # Offsets after producing, for comparison.
        last_offset = topic.latest_available_offsets()
        print(u"最近可用offset {}".format(last_offset))

    def producer_designated_partition(self):
        """Write a message to a specific partition.

        Steering a message requires a partitioner function when the producer
        is created, plus a ``partition_key`` when the message is produced.

        :return: None
        """

        def assign_partition(pid, key):
            """Always choose the first partition (id=0).

            :param pid: the list of candidate partitions.
            :param key: the partition key supplied to ``produce``.
            :return: the first entry of ``pid``.
            """
            print("为消息分配partition {} {}".format(pid, key))
            return pid[0]

        topic = self.client.topics["test_topic".encode()]
        with topic.get_producer(sync=True, partitioner=assign_partition) as p:
            p.produce(str(time.time()).encode(), partition_key=b"partition_key_0")

    def async_produce_message(self):
        """Produce asynchronously and time how long delivery takes.

        Asynchronous messages are pushed onto a queue; another thread sends
        them in a batch once the queue reaches ``min_queued_messages`` or
        ``linger_ms`` has elapsed (5 s by default).

        :return: None
        """
        topic = self.client.topics["kafka_test".encode()]
        last_offset = topic.latest_available_offsets()
        print("最近的偏移量 offset {}".format(last_offset))

        # Remember the starting offset of partition 0.
        old_offset = last_offset[0].offset[0]

        with topic.get_producer(sync=False, partitioner=lambda pid, key: pid[0]) as p:
            p.produce(str(time.time()).encode())
            s_time = time.time()
            # Poll once per second until partition 0's offset moves, i.e. the
            # queued message has actually reached the broker.
            while True:
                last_offset = topic.latest_available_offsets()
                print("最近可用offset {}".format(last_offset))
                if last_offset[0].offset[0] != old_offset:
                    e_time = time.time()
                    print('cost time {}'.format(e_time - s_time))
                    break
                time.sleep(1)

    def get_produce_message_report(self):
        """Fetch the delivery report of an asynchronously produced message.

        By default the report only becomes available after roughly 5 s.

        :return: None
        """
        topic = self.client.topics["kafka_test".encode()]
        last_offset = topic.latest_available_offsets()
        print("最近的偏移量 offset {}".format(last_offset))

        with topic.get_producer(sync=False, delivery_reports=True,
                                partitioner=lambda pid, key: pid[0]) as p:
            p.produce(str(time.time()).encode())
            s_time = time.time()
            delivery_report = p.get_delivery_report()
            e_time = time.time()
            print('等待{}s, 递交报告{}'.format(e_time - s_time, delivery_report))

        last_offset = topic.latest_available_offsets()
        print("最近的偏移量 offset {}".format(last_offset))
if __name__ == '__main__':
    # Exactly one demo runs per invocation. The alternatives are:
    #   producer_partition(), producer_designated_partition(),
    #   async_produce_message()
    demo = KafkaTest()
    demo.get_produce_message_report()
|
注意要点: 多进程共享同一个pykafka client时,只有一个进程能够正常写入数据;如果使用了delivery_reports(包括同步生产),会导致子进程彻底阻塞、不可用
消费者
pykafka消费者分为simple和balanced两种
simple适用于需要消费指定分区、且不需要自动重分配(由使用者自行指定分区)的场景
balanced适用于需要自动分配分区的场景
案例
# coding=utf-8
from pykafka import KafkaClient


class KafkaTest(object):
    """Exercise pykafka's simple and balanced consumers against a live broker."""

    def __init__(self, host="192.168.237.129:9092"):
        """Create the Kafka client.

        :param host: broker address(es) handed to ``KafkaClient(hosts=...)``.
        """
        self.host = host
        self.client = KafkaClient(hosts=self.host)

    def simple_consumer(self, offset=0):
        """Consume up to three messages from the first partition.

        :param offset: offset the consumer is reset to before consuming.
        :return: None
        """
        topic = self.client.topics["kafka_test".encode()]
        partitions = topic.partitions
        last_offset = topic.latest_available_offsets()
        print("最近可用offset {}".format(last_offset))

        # Consume from a single, explicitly chosen partition.
        consumer = topic.get_simple_consumer(b"simple_consumer_group",
                                             partitions=[partitions[0]])
        try:
            # Offsets currently held for the partitions this consumer owns.
            offset_list = consumer.held_offsets
            print("当前消费者分区offset情况{}".format(offset_list))

            # Move the consumer to the requested offset.
            consumer.reset_offsets([(partitions[0], offset)])

            for _ in range(3):
                msg = consumer.consume()
                if msg is None:
                    # No message was returned; stop rather than crash on
                    # ``None.value``.
                    break
                print("消费 :{}".format(msg.value.decode()))

            held = consumer.held_offsets
            print("当前消费者分区offset情况{}".format(held))
        finally:
            # Always release the consumer, even if consuming failed.
            consumer.stop()

    def balance_consumer(self, offset=0):
        """Consume ``kafka_test`` indefinitely with a balanced consumer.

        :param offset: unused; kept so existing callers keep working.
        :return: None (loops until an exception interrupts it).
        """
        topic = self.client.topics["kafka_test".encode()]
        # managed=True selects the newer rebalancing method that does not
        # need ZooKeeper; managed=False rebalances through ZooKeeper.
        consumer = topic.get_balanced_consumer(b"consumer_group_balanced2", managed=True)
        try:
            partitions = topic.partitions
            print("分区 {}".format(partitions))
            earliest_offsets = topic.earliest_available_offsets()
            print("最早可用offset {}".format(earliest_offsets))
            last_offsets = topic.latest_available_offsets()
            print("最近可用offset {}".format(last_offsets))
            held = consumer.held_offsets
            print("当前消费者分区offset情况{}".format(held))

            while True:
                msg = consumer.consume()
                if msg is None:
                    # Nothing was returned this round; try again.
                    continue
                held = consumer.held_offsets
                print("{}, 当前消费者分区offset情况{}".format(msg.value.decode(), held))
        finally:
            # Leave the consumer group cleanly when the loop is interrupted.
            consumer.stop()


if __name__ == '__main__':
    kafka_ins = KafkaTest()
    # Alternative demo:
    # kafka_ins.simple_consumer()
    kafka_ins.balance_consumer()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
|
<
br
/
>
\
#coding=utf-8
from
pykafka
import
KafkaClient
class KafkaTest(object):
    """Exercise pykafka's simple and balanced consumers against a live broker."""

    def __init__(self, host="192.168.237.129:9092"):
        """Create the Kafka client.

        :param host: broker address(es) handed to ``KafkaClient(hosts=...)``.
        """
        self.host = host
        self.client = KafkaClient(hosts=self.host)

    def simple_consumer(self, offset=0):
        """Consume up to three messages from the first partition.

        :param offset: offset the consumer is reset to before consuming.
        :return: None
        """
        topic = self.client.topics["kafka_test".encode()]
        partitions = topic.partitions
        last_offset = topic.latest_available_offsets()
        print("最近可用offset {}".format(last_offset))

        # Consume from a single, explicitly chosen partition.
        consumer = topic.get_simple_consumer(b"simple_consumer_group",
                                             partitions=[partitions[0]])
        try:
            # Offsets currently held for the partitions this consumer owns.
            offset_list = consumer.held_offsets
            print("当前消费者分区offset情况{}".format(offset_list))

            # Move the consumer to the requested offset.
            consumer.reset_offsets([(partitions[0], offset)])

            for _ in range(3):
                msg = consumer.consume()
                if msg is None:
                    # No message was returned; stop rather than crash on
                    # ``None.value``.
                    break
                print("消费 :{}".format(msg.value.decode()))

            held = consumer.held_offsets
            print("当前消费者分区offset情况{}".format(held))
        finally:
            # Always release the consumer, even if consuming failed.
            consumer.stop()

    def balance_consumer(self, offset=0):
        """Consume ``kafka_test`` indefinitely with a balanced consumer.

        :param offset: unused; kept so existing callers keep working.
        :return: None (loops until an exception interrupts it).
        """
        topic = self.client.topics["kafka_test".encode()]
        # managed=True selects the newer rebalancing method that does not
        # need ZooKeeper; managed=False rebalances through ZooKeeper.
        consumer = topic.get_balanced_consumer(b"consumer_group_balanced2", managed=True)
        try:
            partitions = topic.partitions
            print("分区 {}".format(partitions))
            earliest_offsets = topic.earliest_available_offsets()
            print("最早可用offset {}".format(earliest_offsets))
            last_offsets = topic.latest_available_offsets()
            print("最近可用offset {}".format(last_offsets))
            held = consumer.held_offsets
            print("当前消费者分区offset情况{}".format(held))

            while True:
                msg = consumer.consume()
                if msg is None:
                    # Nothing was returned this round; try again.
                    continue
                held = consumer.held_offsets
                print("{}, 当前消费者分区offset情况{}".format(msg.value.decode(), held))
        finally:
            # Leave the consumer group cleanly when the loop is interrupted.
            consumer.stop()
if __name__ == '__main__':
    # Runs the balanced-consumer demo; simple_consumer() is the alternative.
    demo = KafkaTest()
    demo.balance_consumer()
|