第一个还不太成熟的爬虫项目(dubbo-monitor)

最近两天为了爬取 dubbo-monitor 中各个服务的运行情况,特地去学习了一下 Python 基础,然后花了点时间写了这个爬虫项目。

from io import StringIO
from urllib import request
import re
from bs4 import BeautifulSoup


def urlopen(data):
    """Fetch ``data`` and return the response body decoded as UTF-8 text.

    ``data`` is handed unchanged to ``urllib.request.urlopen``. The response
    is closed when the ``with`` block exits, including when reading or
    decoding raises.
    """
    with request.urlopen(data) as response:
        return response.read().decode('utf-8')


def match(pattern, date):
    """Return all non-overlapping matches of ``pattern`` in ``date``.

    ``date`` is the text that gets searched. The result has the shape that
    ``re.findall`` produces: a list of whole matches when the pattern has no
    group, of group strings when it has one group, and of tuples when it has
    several groups.
    """
    return re.findall(pattern, date)


class Statistics(object):
    """Statistics of one method: a name plus five consumer/provider pairs.

    The constructor stores every argument unchanged on the attribute of the
    same name; no conversion or validation is performed.
    """

    def __init__(self, methodName, consumerSuccess, providerSuccess, consumerFailure, providerFailure,
                 consumerAvgElapsed, providerAvgElapsed, consumerMaxElapsed, providerMaxElapsed,
                 consumerMaxConcurrent, providerMaxConcurrent):
        self.methodName = methodName
        self.consumerSuccess = consumerSuccess
        self.providerSuccess = providerSuccess
        self.consumerFailure = consumerFailure
        self.providerFailure = providerFailure
        self.consumerAvgElapsed = consumerAvgElapsed
        self.providerAvgElapsed = providerAvgElapsed
        self.consumerMaxElapsed = consumerMaxElapsed
        self.providerMaxElapsed = providerMaxElapsed
        self.consumerMaxConcurrent = consumerMaxConcurrent
        self.providerMaxConcurrent = providerMaxConcurrent

    def __str__(self):
        """Return all fields joined by tab characters, in constructor order.

        Every value is passed through ``str()`` first, so fields holding
        numbers are rendered instead of raising ``TypeError`` the way plain
        ``+`` concatenation would. For string fields the result is the same
        as concatenating them with tabs.
        """
        fields = (
            self.methodName,
            self.consumerSuccess,
            self.providerSuccess,
            self.consumerFailure,
            self.providerFailure,
            self.consumerAvgElapsed,
            self.providerAvgElapsed,
            self.consumerMaxElapsed,
            self.providerMaxElapsed,
            self.consumerMaxConcurrent,
            self.providerMaxConcurrent,
        )
        return "\t".join(str(value) for value in fields)


# Text that separates the two values inside one statistics cell,
# e.g. "117 --> 73" (first value goes to the consumer field, second to the
# provider field).
SEPARATOR = " --> "
# Captures the relative link of every statistics page listed on the index.
HREF_PATTERN = '<a href=\"(statistics.html.+?)\"'
DOMAIN = 'https://monitor.yangjiachang.com/'
URL = DOMAIN + 'services.html'
# Query-string fragment appended to every statistics link.
date = "&date=20181023"
# Number of leading characters dropped from a link to obtain the service
# name. Presumably the length of "statistics.html?service=" -- confirm
# against the markup of services.html.
SERVICE_NAME_OFFSET = 24

data = urlopen(URL)
# Deliberately not called `list`, which would shadow the builtin for the
# rest of the module.
service_links = match(HREF_PATTERN, data)
output = StringIO()

# Open the statistics page of each service in turn.
for service in service_links:
    serviceName = service[SERVICE_NAME_OFFSET:]
    # Fetch one statistics page.
    statistics_data = urlopen(DOMAIN + service + date)
    # Name the parser explicitly so the parse result does not depend on which
    # optional parsers happen to be installed (and bs4 does not warn).
    soup = BeautifulSoup(statistics_data, "html.parser")
    for row in soup.find_all("tr"):
        # Rows whose <tr> tag carries no attributes are skipped.
        if not row.attrs:
            continue
        cells = row.find_all("td")
        # A row is expected to hold exactly 6 cells: the method name followed
        # by success, failure, average elapsed, max elapsed and max
        # concurrent, e.g. "baofooDownloadCheckFile", "1 --> 1", "0 --> 0",
        # "117 --> 73", "117 --> 73", "1 --> 1".
        methodName = cells[0].get_text()

        # Flatten the five "<consumer> --> <provider>" cells into the
        # positional order Statistics expects. Plain indexing is kept so a
        # malformed row still fails loudly with IndexError.
        values = []
        for index in range(1, 6):
            parts = cells[index].get_text().split(SEPARATOR)
            values.append(parts[0])
            values.append(parts[1])

        statistics = Statistics(methodName, *values)

        output.write(serviceName + "\t" + str(statistics))
        output.write("\n")

# The pages were decoded as UTF-8, so write the report as UTF-8 too instead
# of relying on the platform's default encoding.
with open('1.txt', 'w', encoding='utf-8') as fd:
    fd.write(output.getvalue())
  • 2
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值