python kafka获取对应时间范围内的消息实例代码

最新推荐文章于 2025-07-25 17:11:04 发布

原创最新推荐文章于 2025-07-25 17:11:04 发布 · 3.4k 阅读

0 ·

CC 4.0 BY-SA版权

kafka消息队列专栏收录该内容

1 篇文章

订阅专栏

本文详细介绍了如何在Kafka中使用offsetsForTimes函数，确保数据的时间戳与偏移量正确对应。需注意Kafka版本至少为0.11，且生产者配置与log.message.timestamp.type参数开启。文章通过示例代码展示了如何设置消费者，查找特定时间窗口内的消息偏移量。

使用offsets_for_times的前提是kafka版本为0.11及以上。

并且跟producer配置也有关系

使用KafkaConsumer.offsetsForTimes前，要确认集群已开启log.message.timestamp.type参数，并且生产端要使用0.10.*及以上版本的客户端（client）发送数据，因为消息格式与0.9不同了。

具体可看官方文档。在kafka节点上，除了.index和.log文件之外，还会多出一个记录时间戳与offset对应关系的文件（.timeindex）。

flow_test.py

# coding: utf-8
import sys
import datetime
import time

reload(sys)
sys.setdefaultencoding('utf-8')
import json
import hashlib
import ConfigParser
import OaDataSystem
import sys

sys.setdefaultencoding('utf-8')
import CrmOperator
from kafka import KafkaConsumer
from kafka.structs import TopicPartition, BrokerMetadata
from kafka.errors import KafkaError
import json
import send_email
from threading import Thread
from time import ctime, sleep
import redis
import traceback


class kafkaprocess():
    """Handles batches of records pulled from Kafka."""

    # Teacher id singled out by the debugging output below.
    WATCHED_TEACHER_ID = 6557

    def msg_process(self, message_set, class_set):
        """Log a batch of records and print the key of each watched record.

        Args:
            message_set: iterable of consumed records. Each record must
                expose ``value`` (UTF-8 encoded JSON bytes) and ``key``
                (bytes, or ``None`` for records produced without a key).
            class_set: mapping whose ``'kafka_log'`` entry provides a
                ``logOperator(level, text)`` method.

        Raises:
            ValueError: if a record value is not valid JSON.
            KeyError, TypeError: if the payload has no ``data.teacher_id``.
        """
        class_set['kafka_log'].logOperator(1, str(message_set))
        for msg in message_set:
            payload = json.loads(msg.value.decode('utf-8'))
            if int(payload['data']['teacher_id']) != self.WATCHED_TEACHER_ID:
                continue
            # Keys are optional in Kafka; calling decode() on None would
            # abort the whole batch with AttributeError.
            key_text = None if msg.key is None else msg.key.decode('utf-8')
            print("test0===== %s" % (key_text,))

def getmd5(filename):
    """Return the hexadecimal MD5 digest of the file at *filename*.

    The file is read as raw bytes so the digest reflects the exact on-disk
    content (no newline translation or text decoding), and in fixed-size
    chunks so large files are not loaded into memory at once.

    Args:
        filename: path of the file to hash.

    Returns:
        The 32-character lowercase hex digest.

    Raises:
        IOError / OSError: if the file cannot be opened or read.
    """
    digest = hashlib.md5()
    # The context manager closes the handle even when a read fails.
    with open(filename, "rb") as fd:
        for chunk in iter(lambda: fd.read(65536), b""):
            digest.update(chunk)
    return digest.hexdigest()


def working(config):
    """Poll para.conf forever and reload *config* when its content changes.

    Every five seconds the MD5 of ``para.conf`` is compared with the
    module-level ``conf_md5``; on a mismatch the file is re-read into
    *config* and ``conf_md5`` is updated. The function never returns, so it
    is meant to run in its own thread.

    Args:
        config: parser object exposing ``read(path)``; it is updated in
            place.
    """
    global conf_md5
    while True:
        sleep(5)
        try:
            conf_ing_md5 = getmd5('para.conf')
        except (IOError, OSError) as err:
            # The file can be briefly missing or unreadable while it is being
            # rewritten. Without this guard the watcher thread would die and
            # no later change would ever be picked up.
            print("para.conf unreadable, keeping current config: %s" % (err,))
            continue
        if conf_md5 != conf_ing_md5:
            config.read("para.conf")
            conf_md5 = conf_ing_md5

def get_offset_time_window(consumer, partitions_structs, begin_time, end_time):
    """Seek *consumer* to the first offset at or after *begin_time*.

    For every partition the broker is asked (via ``offsets_for_times``) for
    the offset matching *begin_time* and the offset matching *end_time*.
    Partitions with a begin offset are repositioned with ``consumer.seek``;
    partitions with none are left at their current position.

    Args:
        consumer: object providing ``offsets_for_times(dict)`` and
            ``seek(partition, offset)``.
        partitions_structs: iterable of partition keys to look up.
        begin_time: window start, either integer epoch milliseconds or a
            string accepted by ``__str_to_timestamp``.
        end_time: window end, same forms as *begin_time*.

    Returns:
        ``(consumer, e_offset)``. ``e_offset`` is the end offset of the LAST
        partition iterated, or the string ``'null'`` when that partition has
        no end offset or when there were no partitions at all. It does not
        describe the other partitions.
    """
    # Function-scope import keeps this block self-contained.
    import numbers

    def to_epoch_ms(moment):
        # numbers.Integral also covers Python 2 ``long``, the type that
        # millisecond timestamps have on 32-bit builds.
        if isinstance(moment, numbers.Integral):
            return moment
        return __str_to_timestamp(moment)

    # Convert once; the value is the same for every partition.
    begin_ms = to_epoch_ms(begin_time)
    end_ms = to_epoch_ms(end_time)

    begin_search = {partition: begin_ms for partition in partitions_structs}
    begin_offset = consumer.offsets_for_times(begin_search)
    print("test31===== %s" % (begin_search,))
    print("test32======= %s" % (begin_offset,))

    end_search = {partition: end_ms for partition in partitions_structs}
    end_offset = consumer.offsets_for_times(end_search)

    # Pre-bound so the return below cannot hit an unbound local when the
    # lookup comes back empty.
    e_offset = 'null'
    for topic_partition, offset_and_timestamp in begin_offset.items():
        b_offset = 'null' if offset_and_timestamp is None else offset_and_timestamp[0]
        end_hit = end_offset.get(topic_partition)
        e_offset = 'null' if end_hit is None else end_hit[0]
        print('Between {0} and {1}, {2} offset range = [{3}, {4}]'.format(begin_time, end_time, topic_partition, b_offset, e_offset))
        if b_offset != 'null':
            print("test33======== %s %s" % (topic_partition, b_offset))
            consumer.seek(topic_partition, b_offset)
    return consumer, e_offset

def __str_to_timestamp(str_time, format_type='%Y-%m-%d %H:%M:%S'):
    """Convert a formatted time string into epoch milliseconds.

    The string is parsed with ``time.strptime`` and turned into seconds by
    ``time.mktime``, which interprets the fields as local time. Sub-second
    precision is dropped before scaling, so the result is always a multiple
    of 1000.

    Args:
        str_time: the time text to parse.
        format_type: ``strptime`` pattern describing *str_time*.

    Returns:
        Integer milliseconds since the epoch.
    """
    parsed = time.strptime(str_time, format_type)
    whole_seconds = int(time.mktime(parsed))
    return whole_seconds * 1000

def workline1():
    """Consume the records of the configured topic inside a fixed time window.

    Steps:
      1. Build a consumer from the ``[db]`` section of the module-level
         ``config`` and assign every partition of ``main_topic_id`` to it.
      2. Call ``get_offset_time_window`` to seek each partition to the first
         offset at or after ``begin_time``.
      3. Resolve an exclusive stop offset per partition: the first offset
         at or after ``end_time``, or the current log end when no such
         record exists.
      4. Poll in batches of at most 20 records and hand the records below
         the stop offset to ``kp.msg_process``. A partition is paused once
         its position reaches its stop offset, and the function returns when
         every partition is done.

    Relies on the module-level names ``config``, ``kp`` and ``class_set``.
    Errors are printed rather than raised, matching the original
    best-effort behaviour.
    """
    consumer = None
    try:
        begin_time = '2019-08-01 10:54:15'
        end_time = '2019-08-20 19:01:15'
        topic = str(config.get("db", "main_topic_id"))

        # NOTE(review): SASL credentials are hard-coded in source; move them
        # to para.conf or the environment before sharing this file.
        consumer = KafkaConsumer(group_id=config.get("db", "main_group_id"),
                                 sasl_plain_username='xes_oa', sasl_plain_password='CnYN88zKd44tV7ng',
                                 security_protocol='SASL_PLAINTEXT', sasl_mechanism='PLAIN',
                                 bootstrap_servers=config.get("db", "bootstrap_servers")
                                 )

        partition_ids = consumer.partitions_for_topic(topic)
        if not partition_ids:
            # partitions_for_topic yields nothing when the topic is unknown.
            print("no partitions found for topic %s" % (topic,))
            return
        tps = [TopicPartition(topic, p) for p in sorted(partition_ids)]
        print("test30====== %s %s" % (consumer.config['api_version'], tps))
        consumer.assign(tps)

        # Positions every partition at the start of the window. The second
        # return value describes a single partition only, so the per-partition
        # bounds are resolved below instead.
        consumer, _ = get_offset_time_window(consumer, tps, begin_time, end_time)

        begin_ms = __str_to_timestamp(begin_time)
        end_ms = __str_to_timestamp(end_time)
        first_in_window = consumer.offsets_for_times({tp: begin_ms for tp in tps})
        first_after_window = consumer.offsets_for_times({tp: end_ms for tp in tps})
        log_ends = consumer.end_offsets(tps)

        stop_at = {}   # partition -> exclusive upper offset bound
        pending = set()
        for tp in tps:
            end_hit = first_after_window.get(tp)
            # No record at/after end_time: the window runs to the log end
            # as it is right now.
            stop_at[tp] = log_ends[tp] if end_hit is None else end_hit[0]
            begin_hit = first_in_window.get(tp)
            if begin_hit is not None and begin_hit[0] < stop_at[tp]:
                pending.add(tp)
            else:
                # Empty window for this partition: never fetch from it.
                consumer.pause(tp)

        message_sets = []
        while pending:
            try:
                # A non-zero timeout avoids busy-spinning on an idle topic.
                polled = consumer.poll(timeout_ms=1000, max_records=20)
                # poll() maps each partition to a LIST of records.
                for tp, records in polled.items():
                    limit = stop_at[tp]
                    message_sets.extend(r for r in records if r.offset < limit)
                if message_sets:
                    kp.msg_process(message_sets, class_set)
                for tp in list(pending):
                    if consumer.position(tp) >= stop_at[tp]:
                        pending.discard(tp)
                        consumer.pause(tp)
            except Exception as e:
                traceback.print_exc()
                print("%s :================ %s" % (Exception, e))
            finally:
                # Always drop the batch so a failing record is not retried
                # forever and the list cannot grow without bound.
                del message_sets[:]
    except Exception as e:
        print("%s : %s" % (Exception, e))
    finally:
        if consumer is not None:
            consumer.close()

# Fingerprint of para.conf as loaded; the watcher thread compares against it.
conf_md5 = getmd5('para.conf')
# Load the runtime settings shared by the worker threads.
config = ConfigParser.ConfigParser()
config.read("para.conf")
print('config...')

if __name__ == '__main__':
    print(time.time())

    # Shared helpers looked up by the consumer thread at run time.
    class_set = {'kafka_log': CrmOperator.CrmOperator('kafka_flow_test', 'all-type')}
    kp = kafkaprocess()

    # Daemon thread that reloads the configuration when para.conf changes.
    t1 = Thread(target=working, args=(config,))
    t1.daemon = True
    t1.start()

    # Daemon thread that runs the Kafka consumer.
    lineT1 = Thread(target=workline1)
    lineT1.daemon = True
    lineT1.start()

    print('main thread running')

    # Keep the main thread alive; the daemon threads end with the process.
    while 1:
        sleep(5)

para.conf


[table_keys]
# index key of each table; a table may have multiple keys
xes_student_live_op_logs=id

[db]
bootstrap_servers=nodelist

main_group_id=flow_test_englive_2019082001
# If this parameter is set, then when the program restarts (group_id oa_maxwell_20170209) the starting offset is the latest offset of main_topic_id minus main_restart_offset (e.g. latest - 300)
main_restart_offset=0
main_topic_id=xxxx