CMD>conda activate python36
CMD>pip install kafka-python==2.0.2
参考一python-kafka之理论篇
参考二kafka-python的API简单介绍
参考三Python脚本消费kafka数据
1 消费者和生产者
1.1 consumer.py
from kafka import KafkaConsumer

# NOTE(review): the original script defined module-level globals
# `false = null = true = ''` as a workaround for eval() on JSON text,
# but eval() is never called here; if the payload must be parsed,
# use json.loads() instead. The workaround has been removed.

# Join consumer group "group1"; when the group has no committed offset,
# start from the latest message instead of replaying history.
consumer = KafkaConsumer(
    bootstrap_servers=['IP:9092'],
    auto_offset_reset='latest',
    group_id="group1",
)
# Topics are given as a list; add more entries to consume several topics.
consumer.subscribe(['t2'])

# Polling loop: iterating the consumer transparently handles group
# coordination, partition rebalancing, heartbeats and fetching.
for msg in consumer:
    try:
        if msg.value is not None:
            # msg.value is raw bytes; decode to a UTF-8 string.
            data_json = msg.value.decode()
            print(data_json)
    except Exception as e:
        # Best effort: report the error and keep consuming.
        print(e)

# Only reachable if consumer_timeout_ms is set; otherwise the loop blocks forever.
print("finish")
除非设置了 consumer_timeout_ms 让迭代超时退出,否则 print("finish") 永远不会执行(for 循环会一直阻塞等待新消息)。
循环for里面添加time.sleep(10),可以控制消费的速度。
消息轮询是消费者API的核心,通过一个简单的轮询向服务器请求数据。
一旦消费者订阅了主题,轮询就会处理所有的细节:
包括群组协调,分区再均衡,发送心跳和获取数据,
开发者只需要使用一组简单的API来处理从分区返回的数据。
上面的代码是一个简单的利用for循环的轮询。
消费多个topic,可以写成列表。
1.2 producer.py
# coding: utf-8
"""Minimal producer: publish a single JSON message to topic 't2'."""
import json

from kafka import KafkaProducer

# Create one KafkaProducer instance for delivering messages to Kafka.
# A single 'host:port' string works too:
# producer = KafkaProducer(bootstrap_servers='10.23.241.179:9092')
producer = KafkaProducer(bootstrap_servers=['10.23.241.179:9092'])

payload = {
    "name": "lucy",
    "age": 28,
}
# Kafka transports bytes, so serialize the dict to UTF-8 encoded JSON.
encoded = json.dumps(payload, ensure_ascii=False).encode('utf-8')
producer.send('t2', encoded)
# close() flushes any pending record before shutting the client down.
producer.close()
1.3 多线程消费和生产
生产者kafkaProducer是线程安全对象,建议KafkaProducer采用单例模式,多个线程共享一个实例。
(1)不必每次关闭consumer和producer的连接,在进程中维持长连接即可。
(2)多线程必须结合多个分区,否则永远都会只有一个线程去消费。
(3)指定5个分区,可以开辟5个线程去消费。
import threading
import json
import os

from kafka import KafkaProducer
from kafka import KafkaConsumer

# KafkaProducer is thread-safe: share a single instance across all threads.
producer = KafkaProducer(bootstrap_servers=['10.23.241.179:9092'])


def long_task():
    """Consume topic 't3' forever and forward a fixed JSON message to 'tt'.

    Each thread builds its own KafkaConsumer (consumers are NOT
    thread-safe); all threads share the module-level producer.
    """
    # threading.currentThread() is deprecated; use current_thread().
    t = threading.current_thread()
    print("进程ID", os.getpid(), "线程ID", t.ident)
    # NOTE(review): the original set globals false/null/true as an eval()
    # workaround for JSON text, but eval() is never used in this script,
    # so the workaround is removed; parse JSON with json.loads() instead.
    consumer = KafkaConsumer(
        bootstrap_servers=['10.23.241.179:9092'],
        auto_offset_reset='latest',
        group_id="group1",
    )
    consumer.subscribe('t3')
    for msg in consumer:
        try:
            if msg.value is not None:
                json_data = msg.value.decode()
                print("进程ID", os.getpid(), "线程ID", t.ident, json_data)
                msg_dict = {
                    "name": "lucy",
                    "age": 28,
                }
                # Forward to Kafka. Use a fresh name instead of rebinding
                # the loop variable `msg`, which the original shadowed.
                out_msg = json.dumps(msg_dict, ensure_ascii=False).encode('utf-8')
                producer.send('tt', out_msg)
        except Exception as e:
            print(e)


# With 5 partitions on 't3', five threads in the same group can each own one;
# with a single partition only one thread would ever receive messages.
for i in range(5):
    t = threading.Thread(target=long_task)
    t.start()
输出
进程ID 15376 线程ID 5148
线程名 Thread-1
进程ID 15376 线程ID 11160
线程名 Thread-2
进程ID 15376 线程ID 21052
线程名 Thread-3
进程ID 15376 线程ID 19324
线程名 Thread-4
进程ID 15376 线程ID 9684
线程名 Thread-5
进程ID 15376 线程ID 9684 {"name": "lucy", "age": 0}
进程ID 15376 线程ID 9684 {"name": "lucy", "age": 1}
进程ID 15376 线程ID 5148 {"name": "lucy", "age": 2}
进程ID 15376 线程ID 21052 {"name": "lucy", "age": 3}
进程ID 15376 线程ID 21052 {"name": "lucy", "age": 4}
进程ID 15376 线程ID 21052 {"name": "lucy", "age": 5}
进程ID 15376 线程ID 11160 {"name": "lucy", "age": 6}
注意获取线程ID的方式
t = threading.currentThread()
print("进程ID", os.getpid(), "线程ID", t.ident)
print('线程名', t.getName())
1.4 多进程消费和生产
(1)不必每次关闭consumer和producer的连接,在进程中维持长连接即可。
(2)多进程必须结合多个分区,否则永远都会只有一个进程去消费。
(3)指定5个分区,可以开辟5个进程去消费。
# encoding:utf8
import json
import os
from multiprocessing import Process

import psutil
from kafka import KafkaProducer
from kafka import KafkaConsumer


def long_task():
    """Consume topic 't3' forever and forward a fixed JSON message to 'tt'.

    Runs inside its own process; every process creates its own consumer
    and producer, since Kafka clients must not be shared across forks.
    """
    pid = os.getpid()
    p = psutil.Process(pid)
    print('进程ID', pid)
    print('进程名', p.name())
    # NOTE(review): the original set globals false/null/true as an eval()
    # workaround for JSON text, but eval() is never used in this script,
    # so the workaround is removed; parse JSON with json.loads() instead.
    consumer = KafkaConsumer(
        bootstrap_servers=['10.23.241.179:9092'],
        auto_offset_reset='latest',
        group_id="test1111",
    )
    consumer.subscribe('t3')
    producer = KafkaProducer(bootstrap_servers=['10.23.241.179:9092'])
    for msg in consumer:
        try:
            if msg.value is not None:
                json_data = msg.value.decode()
                print("进程号", os.getpid(), json_data)
                msg_dict = {
                    "name": "lucy",
                    "age": 28,
                }
                # Forward to Kafka. Use a fresh name instead of rebinding
                # the loop variable `msg`, which the original shadowed.
                out_msg = json.dumps(msg_dict, ensure_ascii=False).encode('utf-8')
                producer.send('tt', out_msg)
        except Exception as e:
            print(e)


if __name__ == "__main__":
    # Four worker processes; with 4+ partitions each can own at least one.
    workers = [Process(target=long_task) for _ in range(4)]
    for w in workers:
        w.start()
注意:
import psutil
pid = os.getpid()
p = psutil.Process(pid)
print('进程ID', pid)
print('进程名', p.name())
1.5 断点重连
注意:客户端程序,只会在程序启动时,去连接kafka,如果没有broker启动的情况下,会报NoBrokersAvailable错误。
一旦客户端程序连接上了kafka,就会监听有无消息,broker的启停现在对客户端程序无影响,可以人为将broker关闭一段时间,然后再启动broker,客户端程序不会异常退出,它只看有无消息到来。
1.6 指定偏移量消费
kafka提供了偏移量的概念,允许消费者根据偏移量消费之前遗漏的内容,这基于kafka名义上的全量存储,可以保留大量的历史数据,历史保存时间是可配置的,一般是7天,如果偏移量定位到了已删除的位置那也会有问题,但是这种情况可能很小。
每个保存的数据文件都是以偏移量命名的,当前要查的偏移量减去文件名就是数据在该文件的相对位置。
要指定偏移量消费数据,需要先指定该消费者要消费的分区,否则代码会找不到分区而无法消费。
每个分区都有自己的一套偏移量,都是从0开始向上逐渐增加。
from kafka import KafkaConsumer
from kafka.structs import TopicPartition
import time

# (1) Connect to Kafka.
consumer = KafkaConsumer(group_id='group22', bootstrap_servers=['10.70.70.74:9092'])
# (2) Show the partition ids of the topic.
print(consumer.partitions_for_topic("ttt"))
# (3) Manually assign partitions to this consumer. seek() requires an
# explicit assignment; assign() and subscribe() are mutually exclusive.
consumer.assign([
    TopicPartition(topic='ttt', partition=0),
    TopicPartition(topic='ttt', partition=1),
    TopicPartition(topic='ttt', partition=2),
])
print(consumer.assignment())  # the partitions actually assigned
# (4) First available offset of each assigned partition.
print(consumer.beginning_offsets(consumer.assignment()))
# (5) Position every partition at offset 500. The original repeated the
# same seek() call three times; a loop removes the duplication.
for part in range(3):
    consumer.seek(TopicPartition(topic='ttt', partition=part), 500)
for msg in consumer:
    recv = "%s:%d:%d: key=%s value=%s" % (msg.topic, msg.partition, msg.offset, msg.key, msg.value)
    print(recv)
    # Throttle consumption a little.
    time.sleep(0.1)
consumer.seek会随机的逐个消费分区中的数据,直到最新的数据,然后切换到另一个分区从指定位置消费,因为一直有新数据进来,所以会不断的切换分区消费,不会一直在一个分区上持续消费该分区。
输出显示
数据
1.7 指定分区生产数据
import json
import time

from kafka import KafkaProducer

# Producer that writes to partitions 0-2 explicitly, one round per second.
producer = KafkaProducer(bootstrap_servers=['10.70.70.74:9092'])
try:
    while True:
        for i in range(3):
            msg_dict = {
                "partition": i,
            }
            msg = json.dumps(msg_dict, ensure_ascii=False).encode('utf-8')
            # partition=i routes the record to that exact partition.
            producer.send('test', msg, partition=i)
        time.sleep(1)
finally:
    # BUG FIX: in the original, close() sat after `while True` and was
    # unreachable; the finally block runs it on Ctrl-C or any exit.
    producer.close()
2 类KafkaConsumer
2.1 KafkaConsumer的构造参数:
*topics ,要订阅的主题
bootstrap_servers :kafka节点或节点的列表,不一定需要罗列所有的kafka节点。格式为: ‘host[:port]’ 。默认值是:localhost:9092
client_id (str) : 客户端id,默认值: ‘kafka-python-{version}’
group_id (str or None):分组id
key_deserializer (callable) :key反序列化函数
value_deserializer (callable):value反序列化函数
fetch_min_bytes:服务器应每次返回的最小数据量
fetch_max_wait_ms (int): 服务器应每次返回的最大等待时间
fetch_max_bytes (int) :服务器应每次返回的最大数据量
max_partition_fetch_bytes (int) :
request_timeout_ms (int) retry_backoff_ms (int)
reconnect_backoff_ms (int)
reconnect_backoff_max_ms (int)
max_in_flight_requests_per_connection (int)
auto_offset_reset (str) enable_auto_commit (bool)
auto_commit_interval_ms (int)
default_offset_commit_callback (callable)
check_crcs (bool)
metadata_max_age_ms (int)
partition_assignment_strategy (list)
max_poll_records (int)
max_poll_interval_ms (int)
session_timeout_ms (int)
heartbeat_interval_ms (int)
receive_buffer_bytes (int)
send_buffer_bytes (int)
socket_options (list)
consumer_timeout_ms (int)
skip_double_compressed_messages (bool)
security_protocol (str)
ssl_context (ssl.SSLContext)
ssl_check_hostname (bool)
ssl_cafile (str) –
ssl_certfile (str)
ssl_keyfile (str)
ssl_password (str)
ssl_crlfile (str)
api_version (tuple)
2.2 KafkaConsumer的函数
assign(partitions):手动为该消费者分配一个topic分区列表。
assignment():获取当前分配给该消费者的topic分区。
beginning_offsets(partitions):获取给定分区的第一个偏移量。
close(autocommit=True):关闭消费者
commit(offsets=None):提交偏移量,直到成功或错误为止。
commit_async(offsets=None, callback=None):异步提交偏移量。
committed(partition):获取给定分区的最后一个提交的偏移量。
end_offsets(partitions):获取分区的最大偏移量
highwater(partition):分区最大的偏移量
metrics(raw=False):返回消费者性能指标
next():返回下一条数据
offsets_for_times(timestamps):根据时间戳获取分区偏移量
partitions_for_topic(topic):返回topic的partition列表,返回一个set集合
pause(*partitions):停止获取指定分区的数据
paused():返回已停止获取的分区
poll(timeout_ms=0, max_records=None):获取数据
position(partition):获取分区的偏移量
resume(*partitions):恢复抓取指定的分区
seek(partition, offset):seek偏移量
seek_to_beginning(*partitions):搜索最旧的偏移量
seek_to_end(*partitions):搜索最近可用的偏移量
subscribe(topics=(), pattern=None, listener=None):订阅topics
subscription():返回当前消费者消费的所有topic
topics():返回当前消费者消费的所有topic,返回的是unicode
unsubscribe():取消订阅所有的topic
3 偏移量
3.1 获取每个分区的最新偏移量
from kafka import KafkaConsumer, TopicPartition

consumer = KafkaConsumer(bootstrap_servers=['10.12.83.64:9092'])
# List every topic known to the cluster.
topics_set = consumer.topics()
print("已有topic", topics_set)
for topic in topics_set:
    print(topic)
# Partition ids of one specific topic.
partitions_set = consumer.partitions_for_topic("E2ALARM_IN")
print("获取指定topic的分区", partitions_set)
# Latest (end) offset of every partition of the topic.
zz = [TopicPartition('E2ALARM_IN', p) for p in partitions_set]
topic_offset_dict = consumer.end_offsets(zz)
offset_value = list(topic_offset_dict.values())
print(offset_value)
转变为函数
from kafka import KafkaConsumer, TopicPartition


def get_newest_offset(broker_list, topic):
    """Return the end (newest) offset of every partition of *topic*.

    broker_list: list of 'host:port' bootstrap servers, e.g. ['10.12.83.64:9092'].
    topic: topic name to inspect, e.g. "E2ALARM_IN".
    Returns a list of int offsets, one per partition.
    """
    consumer = KafkaConsumer(bootstrap_servers=broker_list)
    partitions_set = consumer.partitions_for_topic(topic)
    # BUG FIX: the original hard-coded 'E2ALARM_IN' here instead of using
    # the `topic` parameter, so the function only worked for that topic.
    tps = [TopicPartition(topic, p) for p in partitions_set]
    topic_offset_dict = consumer.end_offsets(tps)
    return list(topic_offset_dict.values())
3.2 获取指定消费者组的偏移量
3.2.1 方式一
import socket

from kafka import BrokerConnection
# Explicit import instead of the original wildcard `import *`.
from kafka.protocol.commit import OffsetFetchRequest_v3

group = 'datagroup'

# Open a raw connection to one broker and send an OffsetFetch request;
# topics=None asks for every topic/partition the group has committed.
bc = BrokerConnection('10.12.83.64', 9092, socket.AF_INET)
bc.connect_blocking()
fetch_offset_request = OffsetFetchRequest_v3(group, None)
future = bc.send(fetch_offset_request)
# Pump the connection until the response future resolves.
while not future.is_done:
    for resp, f in bc.recv():
        f.success(resp)
# Response payload: [(topic_name, [(partition, offset, ...), ...]), ...]
for topic in future.value.topics:
    print('offsets for {0}'.format(topic[0]))
    for partition in topic[1]:
        print('- partition {0}, offset: {1}'.format(partition[0], partition[1]))
3.2.2 方式二
from kafka import KafkaConsumer, TopicPartition


def get_group_offset(brokers, group_id, topic):
    """Return the offsets *group_id* has committed for every partition of *topic*.

    brokers: bootstrap server(s) as 'host:port' or a list of them.
    group_id: consumer group to inspect.
    topic: topic name.
    Returns a list of int offsets, one per partition.
    """
    consumer = KafkaConsumer(
        bootstrap_servers=brokers,
        group_id=group_id,
    )
    pts = [TopicPartition(topic=topic, partition=i)
           for i in consumer.partitions_for_topic(topic)]
    # NOTE(review): _coordinator is private kafka-python API and may break
    # between versions; consumer.committed(tp) per partition is the public
    # alternative — confirm before relying on this in production.
    result = consumer._coordinator.fetch_committed_offsets(pts)
    return [r.offset for r in result.values()]


print(get_group_offset('10.12.83.64:9092', 'datagroup', 'E2ALARM_IN'))
3.3 获取消费者组的延迟
from kafka import KafkaConsumer, TopicPartition
import numpy as np  # moved up from the bottom of the original script


def get_newest_offset(broker_list, topic):
    """Return the end (newest) offset of every partition of *topic*."""
    consumer = KafkaConsumer(bootstrap_servers=broker_list)
    partitions_set = consumer.partitions_for_topic(topic)
    # BUG FIX: the original hard-coded 'E2ALARM_IN' here instead of using
    # the `topic` parameter, so the function only worked for that topic.
    tps = [TopicPartition(topic, p) for p in partitions_set]
    topic_offset_dict = consumer.end_offsets(tps)
    return list(topic_offset_dict.values())


def get_group_offset(brokers, group_id, topic):
    """Return the offsets *group_id* has committed for every partition of *topic*."""
    consumer = KafkaConsumer(
        bootstrap_servers=brokers,
        group_id=group_id,
    )
    pts = [TopicPartition(topic=topic, partition=i)
           for i in consumer.partitions_for_topic(topic)]
    # NOTE(review): _coordinator is private kafka-python API; the public
    # alternative is consumer.committed(tp) per partition.
    result = consumer._coordinator.fetch_committed_offsets(pts)
    return [r.offset for r in result.values()]


broker_list = ['10.12.83.64:9092']
topic = "E2ALARM_IN"
committed = get_group_offset(broker_list, 'datagroup', topic)
newest = get_newest_offset(broker_list, topic)
# Consumer-group lag per partition = newest offset - committed offset.
print(np.array(newest) - np.array(committed))