python爬取网页，统计，写到excle表格中

最新推荐文章于 2024-01-18 07:54:56 发布
piece_66
最新推荐文章于 2024-01-18 07:54:56 发布
阅读量286
点赞数
分类专栏：运维 python 文章标签： python
本文链接：https://blog.csdn.net/piece_66/article/details/110634950
版权
运维同时被 2 个专栏收录
9 篇文章 0 订阅
订阅专栏
python
4 篇文章 0 订阅
订阅专栏
背景：作为运维，因经常需要对最近的报警进行分析，需要将某段时间的报警按照某个指标有序罗列，统计。简单的事情需要搞得很麻烦。所以敲了这几行代码。（运维简单使用python，python大神请多多指点）
# -*- coding: utf-8 -*    

import requests
from collections import OrderedDict
import xlsxwriter
import datetime
import os


def crawling():
    # 1、爬取内容到he列表中
    global he, xu_date, page, response
    xu_date = input("统计哪天的报警汇集信息(例:2020-02-02)：")
    # 需要打印哪几页
    qy = int(input("起始页："))
    zy = int(input("中止页："))
    he = []
    for i in range(qy, zy+1):
        # 1、爬取网页
        try:
            # 1、获取网页信息
            prefix_url = 'http://***..com/api/v2monitor/alarmlist/?format=json&page='
            di_ye = str(i)
            response = requests.get(prefix_url + di_ye)
            response.raise_for_status()
            response.encoding = response.apparent_encoding
            # 此时response.text为str格式
            key_null = response.text.replace('null', '"none"')
            # 转换为字典用eval
            all_info = eval(key_null)
            # 本页所有报警字典组成的列表。即字典对应的"results":[***]
            # respon.json()把json文件转换为python认识的格式
            # all_info = response.json()

            all_list = all_info["results"]
            # he即为最终想要的列表，未跑去时间状态跟时间
            he += all_list
        except:
            print("爬取失败")

    return he


def unique():
    # 2、清洗数据
    # 定义变量，全局变量和局部变量
    global problem_unique, OK_unique,problem_p_unique
    date_he = []
    problem_list = []
    ok_list = []
    # 如果截取时间==需统计时间，则为所需日期当天的所有报警
    for x in he:
        date = x['create_at'].split(' ')[0]
        if date == xu_date:
            date_he.append(x)

    # 页面中problem与OK信息分离
    for i in date_he:
        if i["status"] != 'OK':
            problem_list.append(i)
        else:
            ok_list.append(i)
    # 将problem列表所有的信息反向排列，即从0：00开始；OK列表时间不动，是倒数。
    problem_list.reverse()

    #  problem信息去重（相同主题的去重）+ 统计定义主题出现次数time
    # 帮助https://blog.csdn.net/chong_yun/article/details/80775909?depth_1-utm_source=distribute.pc_relevant_right.none-task&utm_source=distribute.pc_relevant_right.none-task
    a = OrderedDict() 
    for item in problem_list:
        a.setdefault(item['subject'],{**item, 'time': 0})['time'] += 1
    problem_unique = list(a.values())

    # 把problem_list与problem_unique中主题相同的实例+= 到problem_unique实例中
    for i in problem_list:
        for u in problem_unique:
            if i['subject'] == u['subject']:
                if i['instance'] not in u['instance']:
                    u['instance'] = '%s,%s' % (u['instance'], i['instance'])

    #   OK信息去重（相同主题的去重）
    b = OrderedDict()
    for item in ok_list:
        b.setdefault(item['subject'], {**item, 'time': 0})['time'] += 1
    ok_unique = list(b.values())

    # 计算恢复时间，将恢复时间写入problem_unique列表中
    for a in problem_unique:
        for b in ok_unique:
            if a['subject'].replace('[Alerting] ', '') == b['subject'].replace('[OK] ', ''):
                a['hfsj'] = b['create_at']
            # 若没有恢复时间则设置为''空
            # else:
            #     a['hfsj'] = ''

    # 计算出间隔时间，并写入problem_unique列表中.
    for i in problem_unique:
        if 'hfsj' in i:

            # 将时间格式的str转换为时间格式，先在顶部导入了datetime模块
            baojingshijian = datetime.datetime.strptime(i['create_at'], '%Y-%m-%d %H:%M:%S')
            huifushijian = datetime.datetime.strptime(i['hfsj'], '%Y-%m-%d %H:%M:%S')

            # 计算恢复时间-创建时间的间隔时间
            jiangemiao = (huifushijian - baojingshijian).seconds

            # 将计算出的秒转换为h、min     帮助自：https://blog.csdn.net/qq562029186/article/details/81136096
            m, s = divmod(jiangemiao, 60)
            h, m = divmod(m, 60)
            jgsj = '%d:%d:%d' % (h, m, s)
            i['jgsj'] = jgsj
        else:
            i['hfsj'] = ''
            i['jgsj'] = ''

    # 根据报警级别P 统计报警次数，合并报警主题，统计报警时间和恢复时间
    new_list = OrderedDict()
    for item in problem_unique:
        new_list.setdefault(item['levels'], {**item, 'time': 0})['time'] += item['time']
    problem_p_unique = list(new_list.values())

    # 根据级别列出ok_p_unique列表4个list，用来计算problem_p_unique的恢复时间
    ok_p = OrderedDict()
    for item in ok_list:
        ok_p.setdefault(item['levels'], {**item, 'time': 0})['time'] += 1
    ok_p_unique = list(ok_p.values())

    # 把problem_unique与problem_p_unique中P级别相同的主题+= 到problem_p_unique实例中
    for i in problem_unique:
        for x in problem_p_unique:
            if i['levels'] == x['levels'] and i['subject'] not in x['subject']:
                x['subject'] += i['subject']
                x['subject'] = '%s,%s' % (x['subject'], i['subject'])

    # problem_unique列表，级别相同挑选恢复时间最大的，作为problem_p_unique的恢复时间

    for a in problem_p_unique:
        for b in ok_p_unique:
            if a['levels'] == b['levels']:
                a['hfsj'] = b ['create_at']

    # for i in problem_p_unique:
    #     print(i)

    return problem_unique, ok_unique, problem_p_unique


def to_table():
    # 3、开始搞到excle表的sheet1和sheet2表格中
    #while true
    filename = 'python生成日报.xlsx'
    if os.path.exists(filename):
        os.remove(filename)
    workbook = xlsxwriter.Workbook('./%s' % filename)
    worksheet = workbook.add_worksheet()
    worksheet2 = workbook.add_worksheet()

    # 设定格式，等号左边格式名称自定义，字典中格式为指定选项
    # bold：加粗，num_format:数字格式
    bold_format = workbook.add_format({'bold': True,
                                       'font_size': 12,
                                       'valign': 'vcenter',
                                       'align': 'center'
                                       })
    item_style = workbook.add_format({'font_size': 12,
                                     'valign': 'vcenter',
                                     'align': 'center'})
    riqi_style = workbook.add_format({'font_size': 12,
                                      'valign': 'vcenter',
                                      'align': 'center'})
    money_format = workbook.add_format({'num_format': '$#,##0'})
    date_format = workbook.add_format({'num_format': 'mmmm d yyyy'})

    # 将二行二列设置宽度为15(从0开始)
    # worksheet.set_column(0, 1, 14.88)
    # worksheet.set_column(1, 2, 14.88)
    # worksheet.set_column(2, 3, 8.38)
    # worksheet.set_column(3, 4, 25.13)
    # worksheet.set_column(4, 5, 25.13)
    # worksheet.set_column(5, 6, 25.13)
    # worksheet.set_column(6, 7, 23.13)
    # worksheet.set_column(7, 8, 23.13)
    # worksheet.set_column(8, 9, 14.88)
    # worksheet.set_column(9, 10, 8.38)
    #
    # worksheet2.set_column(0, 1, 9)
    # worksheet2.set_column(1, 2, 9)
    # worksheet2.set_column(2, 3, 60)
    # worksheet2.set_column(3, 4, 15)
    # worksheet2.set_column(4, 5, 15)
    # worksheet2.set_column(5, 6, 15)
    # worksheet2.set_column(6, 7, 15)
    # worksheet2.set_column(7, 8, 15)
    # worksheet2.set_column(8, 9, 15)
    # worksheet2.set_column(9, 10, 24)
    # worksheet2.set_column(10, 11, 24)

    # # 用符号标记位置，例如：A列1行
    worksheet.write('A1', '统计日期', bold_format)
    worksheet.write('B1', '报警来源', bold_format)
    worksheet.write('C1', '报警级别', bold_format)
    worksheet.write('D1', '报警实例', bold_format)
    worksheet.write('E1', '报警主题', bold_format)
    worksheet.write('F1', '报警详情', bold_format)
    worksheet.write('G1', '开始时间', bold_format)
    worksheet.write('H1', '恢复时间', bold_format)
    worksheet.write('I1', '报警总时长', bold_format)
    worksheet.write('J1', '报警总次数', bold_format)

    # 定义邮件文件格式
    worksheet2.merge_range('A1:K1', '监控日报')
    worksheet2.write('A1', '监控日报', bold_format)
    worksheet2.write('A2', '报警级别P', bold_format)
    worksheet2.write('B2', '报警次数', bold_format)
    worksheet2.write('C2', '报警分布', bold_format)
    worksheet2.write('D2', '是否影响业务', bold_format)
    worksheet2.write('E2', '影响内容', bold_format)
    worksheet2.write('F2', '事件级别L', bold_format)
    worksheet2.write('G2', '时间开始时间', bold_format)
    worksheet2.write('H2', '事件恢复时间', bold_format)
    worksheet2.write('I2', '事件时长', bold_format)
    worksheet2.write('J2', '报警开始时间', bold_format)
    worksheet2.write('K2', '报警恢复时间', bold_format)
    worksheet2.write('A3', 'P0', bold_format)
    worksheet2.write('A4', 'P1', bold_format)
    worksheet2.write('A5', 'P2', bold_format)
    worksheet2.write('A6', 'P3', bold_format)
    worksheet2.write('A7', 'P4', bold_format)
    worksheet2.write('A8', 'P5', bold_format)
    worksheet2.write('A9', 'P9', bold_format)
    worksheet2.write('A10', 'P10', bold_format)
    worksheet2.write('A11', '报警总次数', bold_format)

    row = 1
    col = 0
    for item in problem_unique:
        # 使用write_string方法，指定数据格式写入数据
        worksheet.write_string(row, col, str(item['create_at'].split(' ')[0]), riqi_style)
        worksheet.write_string(row, col + 1, str(item['origin']), item_style)
        worksheet.write_string(row, col + 2, str('P'+str(item['levels'])), item_style)
        worksheet.write_string(row, col + 3, str(item['instance']), item_style)
        worksheet.write_string(row, col + 4, str(item['subject']), item_style)
        worksheet.write_string(row, col + 5, str(item['content']), item_style)
        worksheet.write_string(row, col + 6, str(item['create_at'].replace('-', '/')), item_style)
        worksheet.write_string(row, col + 7, str(item['hfsj'].replace('-', '/')), item_style)
        worksheet.write_string(row, col + 8, str(item['jgsj']), item_style)
        worksheet.write_string(row, col + 9, str(item['time']), item_style)
        row += 1

    row = 2
    col = 0
    for a in (0, 1, 2, 3, 4, 5, 9, 10):
        for i in problem_p_unique:
            if a == i['levels']:
                worksheet2.write_string(row, col + 1, str(i['time']), item_style)
                worksheet2.write_string(row, col + 2, str(i['subject']), item_style)
                worksheet2.write_string(row, col + 3, '否', item_style)
                worksheet2.write_string(row, col + 4, '否', item_style)
                worksheet2.write_string(row, col + 9, str(i['create_at']), item_style)
                worksheet2.write_string(row, col + 10, str(i['hfsj']), item_style)
            else:
                pass
        row += 1

    workbook.close()


if __name__ == '__main__':
    crawling()
    unique()
    to_table()