# -*- coding: UTF-8 -*-
"""Three ways to scrape country data from the same page.

Demonstrates, against http://example.webscraping.com, the trade-offs
between three extraction techniques:

  1. regular expressions -- quick to write, but brittle: breaks as soon
     as the page markup changes;
  2. BeautifulSoup       -- tolerant tree navigation;
  3. lxml CSS selectors  -- fast, concise selector syntax.

NOTE(review): originally a Python 2 script (urllib2, print statements).
The scraper functions below are version-neutral; only main() depends on
the Python 2-only urllib2 module.
"""
import re

URL = 'http://example.webscraping.com/places/view/United-Kingdom-239'

# Field names matching the row ids on the country page.
# NOTE(review): the site's row ids use a DOUBLE underscore before "row"
# (e.g. "places_area__row", as the lxml demo in the original correctly
# used); the original mixed single- and double-underscore ids, so the
# regex and BeautifulSoup variants targeted non-existent rows. All ids
# are normalized to "places_%s__row" here -- confirm against a live page.
FIELDS = ('area', 'population', 'iso', 'country', 'capital', 'continent',
          'tld', 'currency_code', 'currency_name', 'phone',
          'postal_code_format', 'postal_code_regex', 'languages')


def re_scraper(html):
    """Extract all FIELDS values from *html* with one regex per field.

    Fragile by design: depends on exact attribute order and markup.
    Raises AttributeError if any field's row is missing from *html*.
    """
    results = {}
    for field in FIELDS:
        # Fixed: the original pattern closed the capture with </tr>,
        # which dragged the closing </td> tag into the captured value.
        results[field] = re.search(
            '<tr id="places_%s__row">.*?<td class="w2p_fw">(.*?)</td>'
            % field, html).groups()[0]
    return results


def bs_scraper(html):
    """Extract all FIELDS values by walking the parsed BeautifulSoup tree."""
    from bs4 import BeautifulSoup  # local import: third-party dependency
    soup = BeautifulSoup(html, 'html.parser')
    results = {}
    for field in FIELDS:
        results[field] = soup.find('table').find(
            'tr', id='places_%s__row' % field).find(
            'td', class_='w2p_fw').text
    return results


def lxml_scraper(html):
    """Extract all FIELDS values with lxml CSS selectors."""
    import lxml.html  # local import: third-party dependency
    tree = lxml.html.fromstring(html)
    results = {}
    for field in FIELDS:
        results[field] = tree.cssselect(
            'table > tr#places_%s__row > td.w2p_fw' % field
        )[0].text_content()
    return results


def main():
    """Fetch the demo page once and run each extraction technique on it."""
    # urllib2 is Python 2 only; keeping the import local leaves the
    # scraper functions above importable on any interpreter.
    import urllib2
    # Fetch once and reuse (the original downloaded the page three times).
    html = urllib2.urlopen(URL).read()

    # 1. Regular expression: a shortcut for grabbing data, but too
    #    brittle -- prone to breaking after the page is updated.
    print(re.findall('<td class="w2p_fw">(.*?)</td>', html))

    # 2. BeautifulSoup: navigate to the "area" cell step by step.
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    print(soup.prettify())
    tr = soup.find(attrs={'id': 'places_area__row'})
    td = tr.find(attrs={'class': 'w2p_fw'})
    area = td.text
    print(area)

    # 3. lxml CSS selector.
    import lxml.html
    tree = lxml.html.fromstring(html)
    td = tree.cssselect('tr#places_area__row > td.w2p_fw')[0]
    area = td.text_content()
    print(area)


if __name__ == '__main__':
    main()
三种网页抓取方法
最新推荐文章于 2024-08-21 03:28:22 发布