快速提取NGINX/apache 给定时间段的日志

ikko

已于 2023-05-05 14:52:58 修改

阅读量911

点赞数

分类专栏： web服务器故障排查文章标签： python 运维 centos linux

于 2023-05-05 14:49:32 首次发布

本文链接：https://blog.csdn.net/mineo/article/details/130505929

版权

web服务器故障排查专栏收录该内容

2 篇文章 0 订阅

订阅专栏

本文介绍了在大量日志中快速定位和提取所需信息的方法。通过编写Python脚本，利用二分查找法在日志中高效地找到指定时间范围的内容，提高了对几百GB乃至TB级别日志的处理速度。首先模拟生成Nginx日志，然后展示了一个使用二分查找法的日志提取脚本，该脚本能在大型日志中准确且快速地提取出特定时间段的日志记录。

摘要由CSDN通过智能技术生成

使用场景说明

生产环境有时候会因为访问量很大，记录下来超大的日志，晚高峰出现问题的时候要通过检查日志来排查问题。面对几百G以上甚至T级别的日志，使用less 查看和搜索会非常慢，最好的方法是通过行编辑工具（grep、awk）将需要的日志提取出来查看，但是行编辑工具有一个缺陷，就是要按行读取，从首行开始读取到晚高峰的日志也需要耗费大量的读IO以及时间。
为了提高效率我写了这个脚本：通过二分查找法比较少量日志行的时间，迅速定位到需要提取的日志首行和尾行，将需要的日志提取出来。
因为是二分查找法，所以理论上只要日志有记录时间，且时间是有序的，那么任何日志都可以用这个脚本完成日志提取，调整相应时间格式即可。这里举例使用的是nginx日志。

模拟生成日志

为了避免不必要的麻烦，这里我用一个脚本模拟生成一份NGINX日志，已经有日志的可以跳过。

#!/usr/bin/env python3
# encoding: utf-8
# ikko@foxmail.com

# 模拟生成千万行nginx 日志
# 36.111.88.33|-|[22/Jun/2020:10:12:45 +0800]|GET /misc.php?mod=patch&actio
# 180.163.28.55|-|[21/Mar/2023:15:58:02 +0800]|GET /search.php?mod=forum&searchid=4&orderby

import datetime
import random

# Start and end of the simulated time window used by the generator loop.
time_start = datetime.datetime(year=2023, month=3, day=22, hour=6, minute=0)
time_end = datetime.datetime(year=2023, month=3, day=22, hour=23, minute=0)

def sim_ip():
    """Return a random IPv4 address in dotted-quad notation.

    The address is drawn as an integer between 16843009 (1.1.1.1) and
    4294967295 (255.255.255.255), inclusive, and then split into its
    four octets, most significant first.
    """
    value = random.randint(16843009, 4294967295)
    octets = []
    for shift in (24, 16, 8, 0):
        octets.append(str((value >> shift) & 0xFF))
    return '.'.join(octets)


def sim_return_code():
    """Return an HTTP status code string picked by weighted random draw.

    '200' has weight 1000; '502', '504', '503', '404' and '403' have
    weight 1 each, so error codes appear only occasionally.
    """
    weighted_codes = (
        ('200', 1000),
        ('502', 1),
        ('504', 1),
        ('503', 1),
        ('404', 1),
        ('403', 1),
    )
    total_weight = sum(weight for _, weight in weighted_codes)
    pick = random.randint(0, total_weight - 1)
    # Walk the cumulative weights until the drawn number falls inside a bucket.
    upper_bound = 0
    for code, weight in weighted_codes:
        upper_bound += weight
        if pick < upper_bound:
            return code


def sim_uri():
    """Return a random request string for a simulated log line.

    One of three candidates is chosen at random:

    * a forum search with a random ``searchid`` in 1..1000,
    * a ``misc.php`` patch request with a random ``actio`` value in 1..10,
    * a fixed stylesheet request.
    """
    candidates = (
        'GET /search.php?mod=forum&searchid={cid}'.format(cid=random.randint(1, 1000)),
        'GET /misc.php?mod=patch&actio={aid}'.format(aid=random.randint(1, 10)),
        # Fixed URI with no placeholder, so it is used as-is rather than
        # being passed through a no-op .format() call.
        'GET /data/cache/style_1_forum_moderator.css?Z9J HTTP/2.0',
    )
    return random.choice(candidates)


def sim_ua():
    """Return one of three fixed browser User-Agent strings, chosen at random."""
    sogou_on_windows = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0'
    firefox_on_ubuntu = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:36.0) Gecko/20100101 Firefox/36.0'
    edge_on_windows = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63'
    return random.choice([sogou_on_windows, firefox_on_ubuntu, edge_on_windows])


def sim_log(now_time):
    """Build one simulated access-log line for the timestamp string *now_time*.

    The line consists of seven '|'-separated fields: client IP, a literal
    '-', the timestamp, the request, the status code, a random integer in
    1024..90000 (presumably the response size), and the User-Agent.
    """
    client_ip = sim_ip()
    request = sim_uri()
    status = sim_return_code()
    size = str(random.randint(1024, 90000))
    agent = sim_ua()
    fields = (client_ip, '-', now_time, request, status, size, agent)
    return '|'.join(fields)


if __name__ == '__main__':
    f_name = 'd:\\tmp\\nginx.log'
    # Main loop: write 900-1200 log lines for every simulated second; during
    # the evening peak (18:00-22:00) the per-second volume is multiplied by 2.4.
    time_formate = '[%d/%b/%Y:%H:%M:%S +0800]'
    one_second = datetime.timedelta(seconds=1)
    now = time_start
    with open(f_name, 'w', encoding='utf-8', buffering=409600) as f:
        while now < time_end:
            # Format the timestamp once per second instead of once per line.
            stamp = now.strftime(time_formate)
            print(stamp)  # progress indicator
            lines_this_second = random.randint(900, 1200)
            if 18 <= now.hour < 22:
                lines_this_second = int(lines_this_second * 2.4)
            f.writelines(sim_log(stamp) + '\n' for _ in range(lines_this_second))
            # Advance after writing so the window is [time_start, time_end).
            now += one_second

日志提取脚本

脚本说明

脚本会按照给定的target 时间截取日志，保存到/tmp/need.log。因为测试和演示环境都是Windows，脚本中的日志路径使用的是Windows格式，可以自行调整。
演示脚本提取的是2023年3月22日21点10分至30分之间的日志，会多提取前后一分钟。

#!/usr/bin/env python3
# encoding: utf-8
# ikko
# 有一个NGINX 日志，截取21:10~21:30 之间的日志（前后各多取一分钟）
# 通过二分法查找实现

import os
import datetime

# strptime pattern of the timestamp field found in each log line.
time_formate = '[%d/%b/%Y:%H:%M:%S +0800]'
# Lower and upper time bounds handed to the binary search.
time_target_left = datetime.datetime(year=2023, month=3, day=22, hour=21, minute=9)
time_target_right = datetime.datetime(year=2023, month=3, day=22, hour=21, minute=31)


def dichotomy_find(log_file, target_time, time_format=None):
    """Binary-search a time-ordered log file for a timestamp boundary.

    The file is opened in binary mode so that seeking to arbitrary byte
    offsets is well defined. Only a logarithmic number of lines is read.

    Args:
        log_file: Path of the log file. Each non-blank line must hold
            '|'-separated fields with the timestamp in the third field, and
            timestamps must be in non-decreasing order.
        target_time: ``datetime.datetime`` to look for.
        time_format: ``strptime`` pattern of the timestamp field. Defaults
            to the module-level ``time_formate``.

    Returns:
        The byte offset of the start of the first line whose timestamp is
        greater than or equal to ``target_time``. This is 0 when every line
        qualifies and the file size when no line qualifies, so the search
        always terminates even if the exact time is absent from the log.

    Raises:
        ValueError: If a probed line has no parsable timestamp field.
        OSError: If the file cannot be read.
    """
    fmt = time_formate if time_format is None else time_format
    f_size = os.path.getsize(log_file)

    def parse_time(raw):
        # Extract and parse the timestamp field of one raw log line.
        text = raw.decode('utf-8', errors='replace').strip()
        try:
            return datetime.datetime.strptime(text.split('|')[2], fmt)
        except (IndexError, ValueError) as err:
            raise ValueError('cannot parse timestamp from log line: %r' % text) from err

    with open(log_file, 'rb') as f:

        def line_at_or_after(pos):
            # Return (offset, raw_line) of the first non-blank line that
            # starts at or after byte ``pos``; raw_line is b'' at EOF.
            if pos > 0:
                # Step back one byte: if it is a newline, ``pos`` is already
                # a line start; otherwise the rest of the cut line is skipped.
                f.seek(pos - 1)
                f.readline()
            else:
                f.seek(0)
            start = f.tell()
            raw = f.readline()
            while raw and not raw.strip():
                start = f.tell()
                raw = f.readline()
            return start, raw

        # Find the smallest offset whose following line is >= target_time.
        left, right = 0, f_size
        while left < right:
            cursor = (left + right) // 2
            _, raw = line_at_or_after(cursor)
            if raw and parse_time(raw) < target_time:
                left = cursor + 1
            else:
                right = cursor
        start, raw = line_at_or_after(left)

    print('find result:', start, raw.decode('utf-8', errors='replace').strip())
    return start


def get_need_log(log_file, cursor_left, cursor_right, out_file='/tmp/need.log',
                 chunk_size=1024 * 1024):
    """Copy the byte range [cursor_left, cursor_right) of a log file.

    Both files are opened in binary mode so that the cursors are plain byte
    offsets and the copied bytes are written unchanged.

    Args:
        log_file: Path of the source log file.
        cursor_left: Byte offset where copying starts.
        cursor_right: Byte offset where copying stops (exclusive). Nothing
            is copied when it is not greater than ``cursor_left``.
        out_file: Path of the file to write; it is created or truncated.
        chunk_size: Maximum number of bytes read per iteration.

    Raises:
        OSError: If either file cannot be opened, read or written.
    """
    remaining = cursor_right - cursor_left
    with open(log_file, 'rb') as src, open(out_file, 'wb') as dst:
        src.seek(cursor_left, 0)
        while remaining > 0:
            # Never read past cursor_right, even on the last chunk.
            data = src.read(min(chunk_size, remaining))
            if not data:
                break  # source ended before cursor_right
            dst.write(data)
            remaining -= len(data)
    print('日志提取完成并保存到' + out_file)


if __name__ == '__main__':
    log_file = 'd:\\tmp\\nginx.log'
    # Locate the seek offsets for the left and right time bounds, in that order.
    cursor_left, cursor_right = [
        dichotomy_find(log_file, boundary)
        for boundary in (time_target_left, time_target_right)
    ]

    # Copy the region between the two offsets into the output file.
    print('提取日志, from: ', cursor_left, ' to: ', cursor_right)
    get_need_log(log_file, cursor_left, cursor_right)