Performance Comparison of Three Web Scraping Methods

The script below compares three ways of extracting the country details from a page on example.webscraping.com: regular expressions, BeautifulSoup, and lxml. The page is downloaded once, each scraper is run 1,000 times against the same HTML, and the total time for each is printed. (The code targets Python 2; urllib2 and the print statement do not exist in Python 3.)

import re
import time
import urllib2

from bs4 import BeautifulSoup
import lxml.html

FIELDS = ('area', 'population', 'iso', 'country', 'capital', 'continent',
          'tld', 'currency_code', 'currency_name', 'phone',
          'postal_code_format', 'postal_code_regex', 'languages')


def re_scraper(html):
    # extract each field with a regular expression
    results = {}
    for field in FIELDS:
        results[field] = re.search(
            '<tr id="places_%s__row">.*?<td class="w2p_fw">(.*?)</td>' % field,
            html).group(1)
    return results


def bs_scraper(html):
    # extract each field by navigating the parse tree with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    results = {}
    for field in FIELDS:
        table = soup.find('table')
        tr = table.find('tr', id='places_%s__row' % field)
        td = tr.find('td', class_='w2p_fw')
        results[field] = td.text
    return results


def lxml_scraper(html):
    # extract each field with an lxml CSS selector
    tree = lxml.html.fromstring(html)
    results = {}
    for field in FIELDS:
        td = tree.cssselect('tr#places_%s__row > td.w2p_fw' % field)[0]
        results[field] = td.text_content()
    return results


NUM_ITERATIONS = 1000
url = 'http://example.webscraping.com/places/view/United-Kingdom-239'
html = urllib2.urlopen(url).read()

for name, scraper in [('Regular expressions', re_scraper),
                      ('BeautifulSoup', bs_scraper),
                      ('Lxml', lxml_scraper)]:
    # record start time of scrape
    start = time.time()
    for i in range(NUM_ITERATIONS):
        if scraper == re_scraper:
            # clear the compiled-pattern cache so the regex scraper
            # pays the full compilation cost on every iteration
            re.purge()
        result = scraper(html)
        # check scraped result is as expected
        assert(result['area'] == '244,820 square kilometres')
    # record end time of scrape and output the total
    end = time.time()
    print '%s: %.2f seconds' % (name, end - start)

Output:

C:\Anaconda\python.exe C:/Users/Administrator/PythonWorkSpace/python03.py
Regular expressions: 4.02 seconds
BeautifulSoup: 32.48 seconds
Lxml: 15.10 seconds

Process finished with exit code 0

In this run the regular expressions are fastest, lxml takes roughly four times as long, and BeautifulSoup with the pure-Python html.parser backend is about eight times slower than the regexes.
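If you want to run the comparison on Python 3, where urllib2 no longer exists, the sketch below is a minimal port of the download-and-timing harness; it assumes the three scraper functions above are defined unchanged and that the test page is still reachable.

import re
import time
import urllib.request  # urllib2 was folded into urllib.request in Python 3

NUM_ITERATIONS = 1000
url = 'http://example.webscraping.com/places/view/United-Kingdom-239'
# decode the downloaded bytes so the scrapers receive a str, as on Python 2
html = urllib.request.urlopen(url).read().decode('utf-8')

for name, scraper in [('Regular expressions', re_scraper),
                      ('BeautifulSoup', bs_scraper),
                      ('Lxml', lxml_scraper)]:
    start = time.time()
    for _ in range(NUM_ITERATIONS):
        if scraper == re_scraper:
            re.purge()  # drop the compiled-pattern cache each iteration
        result = scraper(html)
        assert result['area'] == '244,820 square kilometres'
    print('%s: %.2f seconds' % (name, time.time() - start))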
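A large part of BeautifulSoup's disadvantage here comes from the pure-Python html.parser backend rather than from its navigation API. As a variation that is not part of the original benchmark: if the lxml package is installed, BeautifulSoup can use it as its parser, which typically narrows the gap. A hypothetical bs_lxml_scraper along those lines:

from bs4 import BeautifulSoup

def bs_lxml_scraper(html):
    # same extraction logic as bs_scraper above, but with the C-based
    # lxml parser backend instead of the pure-Python 'html.parser'
    soup = BeautifulSoup(html, 'lxml')
    results = {}
    for field in FIELDS:  # FIELDS as defined at the top of the script
        tr = soup.find('tr', id='places_%s__row' % field)
        results[field] = tr.find('td', class_='w2p_fw').text
    return results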