A wrapper/listener built on top of the stock kafka-python logic, based on an understanding of the original library.
# -*- coding:utf-8 -*-
# author: cyz
# time: 2021/1/28 17:51
# https://github.com/dpkp/kafka-python
# https://kafka-python.readthedocs.io/
import os, sys
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
# os.chdir(os.path.dirname(os.path.abspath(__file__)))
import json
import errno
import hashlib
import warnings
from functools import wraps
from kafka import KafkaProducer, KafkaConsumer, BrokerConnection, KafkaAdminClient, KafkaClient
from kafka.admin.new_topic import NewTopic
from kafka.structs import TopicPartition, OffsetAndMetadata
import pandas as pd
import threading
import time
from datetime import datetime
# Kafka configuration
# KAFKA_HOST = "1.1.1.1"
# KAFKA_PORT = 1111
# bootstrap_servers = [f'''{KAFKA_HOST}:{KAFKA_PORT}''']
KAFKA_TOPIC = "abc.efg.topic"
key = b'spiderResult'  # message key
bootstrap_servers = ["1.1.1.1:1111", "1.1.1.2:1111", "1.1.1.3:1111"]
# https://github.com/dpkp/kafka-python/issues/1308
class ConnectKafka(object):
    # https://yshblog.com/blog/169
    # https://cloud.tencent.com/developer/article/1563195
    __instance = None

    def __new__(cls, *args, **kwargs):
        # Create the singleton instance on first call only
        if cls.__instance is None:
            # object.__new__ must not be passed the extra args, or it raises TypeError
            cls.__instance = object.__new__(cls)
        # Always return the cached instance
        return cls.__instance
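    # A minimal sketch of the singleton guarantee (hypothetical usage):
    #   a = ConnectKafka()
    #   b = ConnectKafka()
    #   assert a is b  # every instantiation returns the same pooled object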
    def __init__(self, **kwargs):
        self.bootstrap_servers = bootstrap_servers
        self.compression_type = kwargs.get("compression_type", "gzip")
        # __init__ runs on every instantiation of the singleton; guard the pools
        # so re-initialization does not discard existing connections
        if not hasattr(self, 'kafka_consumer_pool'):
            self.kafka_consumer_pool = {}
        if not hasattr(self, 'kafka_producer_pool'):
            self.kafka_producer_pool = {}
    def _listens_for(num_retries, retry_interval):
        # Retry decorator (plain function at class-body time): num_retries == 0 means retry forever
        # https://blog.51cto.com/yishi/2354752
        def decorate(func):
            @wraps(func)
            def wrapper(*args, **kw):
                retry = 0
                while True:
                    try:
                        return func(*args, **kw)
                    except Exception:
                        print("disconnection error, retrying operation. retry times: {}".format(retry))
                        if num_retries != 0 and retry > num_retries:
                            raise
                        time.sleep(retry_interval)
                        retry += 1
            return wrapper
        return decorate
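    # Usage sketch: connect() below is wrapped with @_listens_for(0, 5), i.e.
    # retry forever with a 5-second pause between attempts; a bounded variant
    # such as @_listens_for(3, 5) would re-raise after roughly 3 retries.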
    def _on_send_success(self, record_metadata):
        # record_metadata carries .topic / .partition / .offset
        try:
            print(f'{datetime.now().strftime("%B Week %w %A: [%Y-%m-%d %H:%M:%S %f]")}, kafka send msg result ->',
                  record_metadata, flush=True)
        except IOError as e:
            if e.errno == errno.EPIPE:
                pass

    def _on_send_error(self, excp):
        print('I am an errback', excp)
    @_listens_for(0, 5)
    def connect(self, type: str, **kwargs):
        if type == "admin":
            admin_client = KafkaAdminClient(bootstrap_servers=self.bootstrap_servers)
            return admin_client
        if type == "producer":
            acks = kwargs.get("acks", 1)
            linger_ms = kwargs.get("linger_ms", 0)
            # Check whether a live producer already exists in the pool
            kafka_producer_pool_key = self._createPoolKey("producer", bootstrap_servers=self.bootstrap_servers)
            if kafka_producer_pool_key in self.kafka_producer_pool.keys():
                producer = self.kafka_producer_pool[kafka_producer_pool_key]
                need_reconnect = not producer.bootstrap_connected()
            else:
                need_reconnect = True
            if need_reconnect:
                producer = KafkaProducer(bootstrap_servers=self.bootstrap_servers,
                                         compression_type=self.compression_type,
                                         acks=acks,
                                         linger_ms=linger_ms,
                                         buffer_memory=80*1024*1024,
                                         max_request_size=80*1024*1024,
                                         request_timeout_ms=100000  # client request timeout in milliseconds
                                         )
                self.kafka_producer_pool[kafka_producer_pool_key] = producer
            return producer
        if type == "consumer":
            topic = kwargs.get("topic", [None])
            group_id = kwargs.get("group_id", None)
            client_id = kwargs.get("client_id", None)
            partition = kwargs.get("partition", [0])  # partition id(s)
            max_poll_records = kwargs.get("max_poll_records", 500)
            enable_auto_commit = kwargs.get("enable_auto_commit", True)
            auto_commit_interval_ms = kwargs.get("auto_commit_interval_ms", 5000)
            consumer_timeout_ms = kwargs.get("consumer_timeout_ms", 5000)
            session_timeout_ms = kwargs.get("session_timeout_ms", 10000)
            auto_offset_reset = kwargs.get("auto_offset_reset", 'latest')
            max_partition_fetch_bytes = kwargs.get("max_partition_fetch_bytes", 1 * 1024 * 1024)
            # Normalize arguments to lists
            if not isinstance(topic, list):
                topic = [topic]
            if not isinstance(partition, list):
                partition = [partition]
            assert len(topic) == len(partition), "topic and partition must have the same number of items"
            tp_list = [TopicPartition(topic[i], partition[i]) for i in range(len(topic))]
            # Check whether a live consumer already exists in the pool
            kafka_consumer_pool_key = self._createPoolKey("consumer", topic=topic, partition=partition, group_id=group_id)
            if kafka_consumer_pool_key in self.kafka_consumer_pool.keys():
                consumer = self.kafka_consumer_pool[kafka_consumer_pool_key]
                need_reconnect = not consumer.bootstrap_connected()
            else:
                need_reconnect = True
            if need_reconnect:
                consumer = KafkaConsumer(bootstrap_servers=self.bootstrap_servers,
                                         consumer_timeout_ms=consumer_timeout_ms,
                                         group_id=group_id,
                                         client_id=client_id,
                                         max_poll_records=max_poll_records,
                                         enable_auto_commit=enable_auto_commit,
                                         auto_commit_interval_ms=auto_commit_interval_ms,
                                         session_timeout_ms=session_timeout_ms,
                                         auto_offset_reset=auto_offset_reset,
                                         max_partition_fetch_bytes=max_partition_fetch_bytes)
                consumer.assign(tp_list)  # manually assign the TopicPartition list to this consumer
                self.kafka_consumer_pool[kafka_consumer_pool_key] = consumer
            consumer_type = kwargs.get("consumer_type", None)  # extra control flag
            if consumer_type is None:
                return consumer
            else:
                offsetM_list = kwargs.get("offset", [None])
                if not isinstance(offsetM_list, list):
                    offsetM_list = [offsetM_list]
                assert len(tp_list) == len(offsetM_list), "topic and offset must have the same number of items"
                for i in range(len(offsetM_list)):
                    tp = tp_list[i]
                    offsetM = offsetM_list[i]
                    if consumer_type == "receive":
                        assert isinstance(offsetM, int) and offsetM >= 0, "offset must be >= 0"
                        consumer.seek(tp, offset=offsetM)  # manually set the fetch offset for this TopicPartition
                    if consumer_type == "receive_group":
                        bo = consumer.beginning_offsets([tp])[tp]
                        eo = consumer.end_offsets([tp])[tp]
                        # Prefer a manually supplied offset; otherwise fall back to the
                        # offset last committed to Kafka for this group
                        if offsetM is None:
                            offset = consumer.committed(tp)  # query the most recently committed offset
                        else:
                            # https://www.runoob.com/python3/python3-assert.html
                            assert isinstance(offsetM, int) and offsetM >= 0, "offset must be >= 0"
                            if (offsetM < bo) or (offsetM > eo):
                                warnings.warn(f"topic offset is between ({bo},{eo}), but input {offsetM}")
                            offset = offsetM
                            consumer.seek(tp, offset=offset)  # manually set the fetch offset for this TopicPartition
                        # If Kafka has no committed offset, or the committed offset is out of
                        # range, reset the position to the beginning offset
                        if (offset is None) or (offset < bo) or (offset > eo):
                            if offset is not None:
                                warnings.warn(f"{tp} maybe lost data num: {abs(offset - bo)}")
                            offset = bo
                            consumer.seek(tp, offset=offset)
                return consumer
        if type == "client":
            # KafkaClient is the internal networking class underlying the
            # user-facing producer and consumer clients
            client = KafkaClient(bootstrap_servers=self.bootstrap_servers)
            return client
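    # Usage sketch: connect() is the single entry point for all four client
    # types, and pooled producers/consumers are reused across calls
    # (topic/group names here are hypothetical):
    #   producer = ck.connect(type="producer", acks="all")
    #   consumer = ck.connect(type="consumer", topic="test1", partition=0, group_id="g1")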
    def _createPoolKey(self, key_type=None, **kwargs):
        '''
        Build the connection-pool key; also usable by external callers
        :param key_type: str, type of key
        :param bootstrap_servers: str, variable part of a producer key
        :param topic: str, variable part of a consumer key, the topic
        :param partition: int, variable part of a consumer key, the partition
        :param group_id: str, variable part of a consumer key, the consumer group
        '''
        assert key_type in ["consumer", "producer"], "key_type must be in ['consumer', 'producer']"
        if key_type == "producer":
            bootstrap_servers = kwargs.get("bootstrap_servers", self.bootstrap_servers)
            key = hashlib.md5(str(bootstrap_servers).encode()).hexdigest()
            return key
        else:
            topic = kwargs.get("topic", None)
            partition = kwargs.get("partition", None)
            group_id = kwargs.get("group_id", None)
            key = hashlib.md5(str([topic, partition, group_id]).encode()).hexdigest()
            return key
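    # Pool-key sketch (hypothetical values): consumers created for the same
    # (topic, partition, group_id) triple hash to the same md5 key and hence
    # share one KafkaConsumer from the pool:
    #   self._createPoolKey("consumer", topic=["test1"], partition=[0], group_id="g1")
    #   == hashlib.md5(str([["test1"], [0], "g1"]).encode()).hexdigest()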
    def send(self, topic: str, send_message: str, **kwargs):
        assert isinstance(topic, str), "topic must be str"
        assert isinstance(send_message, (str, bytes)), "send_message must be str or bytes"
        key = kwargs.get("key", None)
        headers = kwargs.get("headers", None)
        partition = kwargs.get("partition", 0)
        timestamp_ms = kwargs.get("timestamp_ms", None)
        conn = kwargs.get("conn", None)
        is_print = kwargs.get("is_print", True)
        is_asyn = kwargs.get("is_asyn", False)
        if isinstance(send_message, str):
            send_message = send_message.encode()
        if conn is None:
            conn = self.connect(type="producer")
        if is_asyn:  # asynchronous send
            future = conn.send(topic, key=key, value=send_message, partition=partition, headers=headers,
                               timestamp_ms=timestamp_ms)
            if is_print:
                future.add_callback(self._on_send_success)
                future.add_errback(self._on_send_error)
            conn.flush()
        else:  # synchronous send
            future = conn.send(topic, key=key, value=send_message, partition=partition, headers=headers, timestamp_ms=timestamp_ms)
            result = future.get(timeout=100)  # block until the broker acknowledges, returning RecordMetadata
            if is_print:
                try:
                    print(f'{datetime.now().strftime("%B Week %w %A: [%Y-%m-%d %H:%M:%S %f]")}, kafka send msg result ->', result, flush=True)
                except IOError as e:
                    if e.errno == errno.EPIPE:
                        pass
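    # Usage sketch (topic and payload are hypothetical):
    #   ck.send("test1", '{"col": "value"}', key=b'spiderResult')                # synchronous
    #   ck.send("test1", '{"col": "value"}', key=b'spiderResult', is_asyn=True)  # asynchronous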
    def asynSend(self, topic: str, send_messages: list, **kwargs):
        # Asynchronous batch send
        assert isinstance(topic, str), "topic must be str"
        assert isinstance(send_messages, list), "send_messages must be list"
        key = kwargs.get("key", None)
        headers = kwargs.get("headers", None)
        partition = kwargs.get("partition", 0)
        timestamp_ms = kwargs.get("timestamp_ms", None)
        conn = kwargs.get("conn", None)
        is_print = kwargs.get("is_print", True)
        if conn is None:
            conn = self.connect(type="producer", acks="all")
        for message in send_messages:
            # Coerce each message to bytes before sending
            if not isinstance(message, bytes):
                message = str(message).encode()
            # Callback style follows
            # https://kafka-python.readthedocs.io/en/master/usage.html?highlight=callback
            # https://stackoverflow.com/questions/46388116/how-to-add-a-failure-callback-for-kafka-python-kafka-kafkaproducersend
            future = conn.send(topic, key=key, value=message, partition=partition, headers=headers, timestamp_ms=timestamp_ms)
            if is_print:
                future.add_callback(self._on_send_success)
                future.add_errback(self._on_send_error)
        conn.flush()
    def receiveBroadcast(self, topic, **kwargs):
        '''
        Receive data in broadcast mode (a background thread appends to self.receive_queue)
        :param topic: str, topic name
        :param consumer_timeout_ms: int, idle seconds after which the consumer disconnects
        :return:
        '''
        self.receive_queue = []
        consumer_timeout_ms = kwargs.get("consumer_timeout_ms", 10)
        consumer_timeout_ms = consumer_timeout_ms * 1000

        def _receive(topic, consumer_timeout_ms):
            conn = self.connect(type="consumer", topic=topic, consumer_timeout_ms=consumer_timeout_ms)
            for msg in conn:
                self.receive_queue.append(msg)

        thread = threading.Thread(target=_receive, args=(topic, consumer_timeout_ms))  # consume in a background thread
        thread.start()
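    # Usage sketch: start the background listener, produce, then inspect the
    # queue (the sleep is illustrative; messages arrive asynchronously):
    #   ck.receiveBroadcast("test1")
    #   ck.send("test1", "payload", key=b'spiderResult')
    #   time.sleep(1)
    #   print(ck.receive_queue)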
    def receive(self, topic, offset, **kwargs):
        '''
        Fetch data at explicit offsets (query mode)
        :param topic: str, topic name
        :param partition: int, topic partition
        :param offset: int, offset to start reading from
        :param max_poll_records: int, maximum number of records per request
        :param consumer_timeout_ms: int, consumer poll timeout in seconds (multiplied by 1000 internally)
        :return: list, [ConsumerRecord_0, ConsumerRecord_1], where each ConsumerRecord is a namedtuple.
        eg:
        (topic='test1', partition=0, offset=22, timestamp=1647932703558, timestamp_type=0, key=b'spiderResult',
         value=b"[{'col': 'value11', 'col1': 'value12'}, {'col': 'value21', 'col1': 'value22'}]",
         headers=[], checksum=None, serialized_key_size=12, serialized_value_size=78, serialized_header_size=-1)
        '''
        max_poll_records = kwargs.get("max_poll_records", 1)  # maximum number of records to fetch
        consumer_timeout_ms = kwargs.get("consumer_timeout_ms", 300)
        consumer_timeout_ms = consumer_timeout_ms * 1000
        partition = kwargs.get("partition", [0])  # partition id(s)
        max_partition_fetch_bytes = kwargs.get("max_partition_fetch_bytes", 1 * 1024 * 1024)
        if not isinstance(topic, list):
            topic = [topic]
        if not isinstance(partition, list):
            partition = [partition]
        assert len(topic) == len(partition), "topic and partition must have the same number of items"
        if not isinstance(offset, list):
            offset = [offset]
        assert len(topic) == len(offset), "topic and offset must have the same number of items"
        conn = kwargs.get("conn", None)
        if conn is None:
            conn = self.connect(topic=topic,
                                partition=partition,
                                type="consumer",
                                consumer_timeout_ms=consumer_timeout_ms,
                                max_poll_records=max_poll_records,
                                max_partition_fetch_bytes=max_partition_fetch_bytes,
                                consumer_type="receive",
                                offset=offset)
        source = conn.poll(timeout_ms=consumer_timeout_ms, max_records=max_poll_records)
        result = []
        for tp in source.keys():
            result.extend(source[tp])
        return result
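    # Usage sketch (offset is hypothetical): read up to 10 records starting at
    # offset 63 of test1 partition 0:
    #   records = ck.receive("test1", 63, partition=0, max_poll_records=10)
    #   values = [r.value for r in records]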
    def receiveGroup(self, group_id: str, client_id, topic: str or list, **kwargs):
        '''
        Fetch data as part of a consumer group
        :param group_id: str, consumer group
        :param client_id: str, client within the group
        :param topic: str, topic name
        :param block: int, whether to block; default 1 (non-blocking); 0 = block, 1 = do not block
        :param noack: int, whether to skip committing; default 0 (auto-commit); 0 = commit, 1 = do not commit
        :param partition: int, topic partition
        :param offset: int, offset to start reading from
        :param max_poll_records: int, maximum number of records per request
        :param consumer_timeout_ms: int, consumer poll timeout in seconds (multiplied by 1000 internally)
        :return: list, [ConsumerRecord_0, ConsumerRecord_1], where each ConsumerRecord is a namedtuple.
        eg:
        (topic='test1', partition=0, offset=22, timestamp=1647932703558, timestamp_type=0, key=b'spiderResult',
         value=b"[{'col': 'value11', 'col1': 'value12'}, {'col': 'value21', 'col1': 'value22'}]",
         headers=[], checksum=None, serialized_key_size=12, serialized_value_size=78, serialized_header_size=-1)
        '''
        # Internal group-control parameters
        block = kwargs.get("block", 1)  # whether to block; defaults to non-blocking
        noack = kwargs.get("noack", 0)  # whether to skip committing; defaults to auto-commit
        conn = kwargs.get("conn", None)  # an externally created connection
        # Parameters used to build the connection
        offset = kwargs.get("offset", [None])  # manually supplied offsets, default None
        max_poll_records = kwargs.get("max_poll_records", 1)  # maximum number of records to fetch
        consumer_timeout_ms = kwargs.get("consumer_timeout_ms", 30)
        consumer_timeout_ms = consumer_timeout_ms * 1000
        partition = kwargs.get("partition", [0])  # partition id(s)
        auto_offset_reset = kwargs.get("auto_offset_reset", "latest")
        max_partition_fetch_bytes = kwargs.get("max_partition_fetch_bytes", 1 * 1024 * 1024)
        if not isinstance(topic, list):
            topic = [topic]
        if not isinstance(partition, list):
            partition = [partition]
        assert len(topic) == len(partition), "topic and partition must have the same number of items"
        if not isinstance(offset, list):
            offset = [offset]
        assert len(topic) == len(offset), "topic and offset must have the same number of items"
        tp_list = [TopicPartition(topic[i], partition[i]) for i in range(len(topic))]
        enable_auto_commit = not noack
        if conn is None:
            conn = self.connect(topic=topic,
                                partition=partition,
                                type="consumer",
                                group_id=group_id,
                                client_id=client_id,
                                enable_auto_commit=enable_auto_commit,
                                consumer_timeout_ms=consumer_timeout_ms,
                                max_poll_records=max_poll_records,
                                auto_offset_reset=auto_offset_reset,
                                max_partition_fetch_bytes=max_partition_fetch_bytes,
                                consumer_type="receive_group",
                                offset=offset
                                )
        result = []
        while True:
            poll_records = max_poll_records - len(result)  # number of records still needed
            source = conn.poll(timeout_ms=consumer_timeout_ms, max_records=poll_records)
            if source == {}:
                if block:
                    if enable_auto_commit:
                        conn.commit()  # auto-commit only flushes on close, so emulate it manually here
                    return result
                else:
                    # Handle the edge case where an offset that was valid a moment
                    # ago has since been purged: snap back to the beginning offset
                    for i, tp in enumerate(tp_list):
                        tmp_bo = conn.beginning_offsets([tp])[tp]
                        if offset[i] is not None and offset[i] < tmp_bo:
                            offset[i] = tmp_bo
                            conn.seek(tp, offset=offset[i])  # manually set the fetch offset
            else:
                if enable_auto_commit:
                    conn.commit()  # auto-commit only flushes on close, so emulate it manually here
                for tp in source.keys():
                    result.extend(source[tp])
                if len(result) < max_poll_records:
                    continue
                else:
                    return result
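    # Usage sketch (ids are hypothetical): fetch up to 10 records without
    # committing (noack=1), so a crash before processing re-delivers them:
    #   records = ck.receiveGroup("python-admin", "client-1", "test1",
    #                             noack=1, max_poll_records=10)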
    def getKafkaVersion(self, is_print=False):
client = self.connect(type="client")
broker_version = client.check_version()
api_versions = client.get_api_versions()
if is_print:
print("broker version: "+str(broker_version))
print("api version: "+str(api_versions))
client.close()
return broker_version, api_versions
    def commit(self, conn, topic: str, offset: int, **kwargs):
        '''
        Manually commit an offset (the committed position is offset + 1)
        :param conn: an open Kafka consumer connection
        :param topic: str, topic
        :param partition: int, partition
        :param offset: int, offset of the last processed record
        :return: None (exceptions propagate on failure)
        '''
        partition = kwargs.get("partition", 0)  # partition id
        tp = TopicPartition(topic, partition)
        conn.commit({tp: OffsetAndMetadata(offset=offset + 1, metadata='')})
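    # Usage sketch, pairing with receiveGroup(noack=1) above: after the
    # records are safely processed, commit the last offset manually:
    #   conn = ck.connect(type="consumer", topic="test1", group_id="python-admin")
    #   if records:
    #       ck.commit(conn, "test1", records[-1].offset, partition=0)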
    def _createTopics(self, need_create_topics: list):
        '''
        Create new topics
        :param need_create_topics: list(dict)
            name (string): topic name
            num_partitions (int): number of partitions, or -1 if replica_assignments is given
            replication_factor (int): replication factor, or -1 if replica_assignments is given
            replica_assignments (dict of int: [int]): mapping of partition id to the replicas assigned to it
            topic_configs (dict of str: str): mapping of config keys to values for the topic
        :return: True on success (exceptions propagate on failure)
        '''
        # need_create_topics = [{"name": "test", "num_partitions": 1, "replication_factor": 1}]
        topic_list = []
        for topic in need_create_topics:
            topic_list.append(NewTopic(name=topic.get("name"),
                                       num_partitions=topic.get("num_partitions"),
                                       replication_factor=topic.get("replication_factor"),
                                       replica_assignments=topic.get("replica_assignments"),
                                       topic_configs=topic.get("topic_configs")
                                       )
                              )
        conn = self.connect(type="admin")
        conn.create_topics(new_topics=topic_list)
        return True
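    # Usage sketch:
    #   ck._createTopics([{"name": "test1", "num_partitions": 1, "replication_factor": 1}])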
    def _deleteTopics(self, need_delete_topics: list):
        '''
        Delete topics
        :param need_delete_topics: list of topic names (str)
        :return: True on success (exceptions propagate on failure)
        '''
        self.connect(type="admin").delete_topics(need_delete_topics)
        return True
    def _listTopics(self):
        '''
        List all topic names
        :return: list of topic names
        '''
        return self.connect(type="admin").list_topics()
def _describeTopics(self):
return self.connect(type="admin").describe_topics()
def _describeCluster(self):
return self.connect(type="admin").describe_cluster()
# # Admin client
# admin_client = KafkaAdminClient(bootstrap_servers=bootstrap_servers)
# # admin_client.create_topics(["test"])
# admin_client.delete_topics(["test"])
# admin_client.list_topics()
# admin_client.describe_topics()
# admin_client.describe_cluster()

# # Consumer
# consumer = KafkaConsumer('test1', bootstrap_servers=bootstrap_servers)
# # consumer.bootstrap_connected()
# for msg in consumer:
#     print("get data:", msg.key.decode(), msg.value.decode())
# consumer.topics()  # list all topics
# consumer.subscription()
# consumer.close()

# # Producer
# producer = KafkaProducer(bootstrap_servers=bootstrap_servers, compression_type='gzip')
# producer.send("test1", key=key, value=send_message, partition=0)
# producer.bootstrap_connected()
# producer.partitions_for("test")
# producer.close()
if __name__ == "__main__":
    print("initializing")
    # ck = ConnectKafka(KAFKA_HOST, KAFKA_PORT)
    ck = ConnectKafka()
    # print("fetching server info")
    # _, _ = ck.getKafkaVersion(is_print=True)
    # print("creating topics")
    # need_create_topics = [{"name": "test1", "num_partitions": 1, "replication_factor": 1}]
    # ck._createTopics(need_create_topics)
    # print("querying topics")
    # print("list topics:")
    # print(ck._listTopics())
    # print("describe topics:")
    # print(ck._describeTopics())
    # print("describe cluster:")
    # print(ck._describeCluster())
    #
    # print("deleting topics")
    # need_delete_topics = ["test1"]
    # ck._deleteTopics(need_delete_topics)
    # print("querying topics")
    # print("list topics:")
    # print(ck._listTopics())
    print("start fetching data")
    topic = "test1"
    topic1 = "test2"
    topic2 = "test3"
    # ck.receive(topic, consumer_timeout_ms=10)  # simple receive
    time.sleep(1)
    client_id = "1"
    source = pd.DataFrame([["value11", "value12"], ["value21", "value22"]], columns=["col", "col1"])
    conn = ck.connect(type="producer", acks="all")
    # ck.send(topic, str(source.to_dict(orient="records")), key=key)  # synchronous send
    # ck.send(topic1, str(source.to_dict(orient="records")), key=key)  # synchronous send
    # ck.asynSend(topic, source.to_dict(orient="records"), conn=conn, is_print=True)  # asynchronous send
    # a = ck.receive(topic, group_id="python-admin", partition=0, consumer_timeout_ms=10, offset=63, max_poll_records=10)  # query mode
    # a = ck.receive([topic, topic1], group_id="python-admin", partition=[0, 0], consumer_timeout_ms=10, offset=[63, 0], max_poll_records=10)  # query mode
    # b = ck.receiveGroup(group_id="python-admin", topic=topic, client_id="asdasd", noack=1, offset=0)  # listening mode
    b = ck.receiveGroup(group_id="python-admin", partition=[0, 0, 0], topic=[topic2, topic, topic1], client_id="asdasd", noack=1, offset=[0, 0, 0], max_poll_records=10)  # listening mode
    print(b)
    b = ck.receiveGroup(group_id="python-admin", partition=[0, 0, 0], topic=[topic2, topic, topic1], client_id="asdasd", noack=1, offset=[0, 0, 0], max_poll_records=10)  # listening mode
    print(b)
    # time.sleep(5)
    # print(ck.receive_queue)
    # ck.receiveBroadcast(topic)
    # ck.send(topic, str(source.to_dict(orient="records")), key=key)
    # print(ck.receive_queue)
A version adapted from the above to fit actual business scenarios and requirements.