最近两天,为了爬取 dubbo-monitor 中各个服务的运行情况,我特地去看了一下 Python 基础,然后花了点时间写了个爬虫项目,代码如下:
import re
from io import StringIO
from urllib import request

from bs4 import BeautifulSoup


def urlopen(data):
    """Fetch the URL *data* and return the response body decoded as UTF-8.

    Args:
        data: The URL (or ``urllib.request.Request``) to open.

    Returns:
        The response body as a ``str``.
    """
    with request.urlopen(data) as response:
        return response.read().decode('utf-8')


def match(pattern, date):
    """Return all non-overlapping matches of the regex *pattern* in *date*.

    Args:
        pattern: Regular expression source string.
        date: The text to search. The parameter name is kept as-is for
            backward compatibility with keyword callers.

    Returns:
        The list produced by ``re.findall``.
    """
    return re.findall(pattern, date)


class Statistics(object):
    """Per-method figures scraped from one row of a service statistics page.

    Each metric is stored twice, as the first ("consumer") and second
    ("provider") value of a ``"<a> --> <b>"`` table cell. All values are
    kept as the strings read from the page; nothing is converted to numbers.
    """

    def __init__(self, methodName, consumerSuccess, providerSuccess,
                 consumerFailure, providerFailure, consumerAvgElapsed,
                 providerAvgElapsed, consumerMaxElapsed, providerMaxElapsed,
                 consumerMaxConcurrent, providerMaxConcurrent):
        self.methodName = methodName
        self.consumerSuccess = consumerSuccess
        self.providerSuccess = providerSuccess
        self.consumerFailure = consumerFailure
        self.providerFailure = providerFailure
        self.consumerAvgElapsed = consumerAvgElapsed
        self.providerAvgElapsed = providerAvgElapsed
        self.consumerMaxElapsed = consumerMaxElapsed
        self.providerMaxElapsed = providerMaxElapsed
        self.consumerMaxConcurrent = consumerMaxConcurrent
        self.providerMaxConcurrent = providerMaxConcurrent

    def __str__(self):
        """Return all eleven fields joined by tabs, method name first."""
        return "\t".join((
            self.methodName,
            self.consumerSuccess,
            self.providerSuccess,
            self.consumerFailure,
            self.providerFailure,
            self.consumerAvgElapsed,
            self.providerAvgElapsed,
            self.consumerMaxElapsed,
            self.providerMaxElapsed,
            self.consumerMaxConcurrent,
            self.providerMaxConcurrent,
        ))


SEPARATOR = " --> "
HREF_PATTERN = '<a href=\"(statistics.html.+?)\"'
DOMAIN = 'https://monitor.yangjiachang.com/'
URL = DOMAIN + 'services.html'
# Query-string suffix appended to every statistics URL (hard-coded day).
date = "&date=20181023"

# Number of leading href characters dropped to obtain the service name.
# 24 == len("statistics.html?service="), presumably the prefix of every
# matched href -- TODO confirm against the services page markup.
_SERVICE_NAME_OFFSET = 24

# A data row is expected to hold exactly 6 <td> cells, e.g.
#   <td>baofooDownloadCheckFile</td>, <td>1 --> 1</td>, <td>0 --> 0</td>,
#   <td>117 --> 73</td>, <td>117 --> 73</td>, <td>1 --> 1</td>
_CELLS_PER_ROW = 6


def _split_pair(text):
    """Split a ``"<consumer> --> <provider>"`` cell into its two values.

    If the separator is missing, the whole text is returned as the first
    value and the second is an empty string instead of raising IndexError.
    """
    parts = text.split(SEPARATOR)
    if len(parts) < 2:
        return parts[0], ""
    return parts[0], parts[1]


def _parse_row(row):
    """Build a ``Statistics`` from one ``<tr>`` tag, or return ``None``.

    Rows without any attributes are skipped, as are rows with fewer than
    six ``<td>`` cells, which cannot be data rows.
    """
    if not row.attrs:
        return None
    cells = row.find_all("td")
    if len(cells) < _CELLS_PER_ROW:
        return None
    values = []
    # Cells 1..5 each contribute a (consumer, provider) pair, in the same
    # order as the Statistics constructor parameters.
    for cell in cells[1:_CELLS_PER_ROW]:
        values.extend(_split_pair(cell.get_text()))
    return Statistics(cells[0].get_text(), *values)


def main():
    """Crawl every service statistics page and dump the rows to ``1.txt``.

    Output is one tab-separated line per method: the service name followed
    by the eleven ``Statistics`` fields. The file is written only after all
    pages have been fetched.
    """
    services_page = urlopen(URL)
    service_links = match(HREF_PATTERN, services_page)
    buffer = StringIO()
    # Open each service's statistics page in turn.
    for service in service_links:
        service_name = service[_SERVICE_NAME_OFFSET:]
        statistics_page = urlopen(DOMAIN + service + date)
        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(statistics_page, "html.parser")
        for row in soup.find_all("tr"):
            statistics = _parse_row(row)
            if statistics is None:
                continue
            buffer.write(service_name + "\t" + str(statistics) + "\n")
    with open('1.txt', 'w', encoding='utf-8') as fd:
        fd.write(buffer.getvalue())


if __name__ == "__main__":
    main()