三种网页抓取方法

#-*- coding:UTF-8 -*-
# 1. Regular expressions
import re
import urllib2

url = 'http://example.webscraping.com/places/view/United-Kingdom-239'
# Download the raw page source.
html = urllib2.urlopen(url).read()
# Collect the inner text of every <td class="w2p_fw"> cell on the page.
cell_pattern = re.compile('<td class="w2p_fw">(.*?)</td>')
print(cell_pattern.findall(html))
# Regular expressions are a quick way to scrape data, but the approach is
# brittle and tends to break as soon as the page markup changes.
# 2. BeautifulSoup
from bs4 import BeautifulSoup
import urllib2

url = 'http://example.webscraping.com/places/view/United-Kingdom-239'
html = urllib2.urlopen(url).read()
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed (same parser as bs_scraper uses).
soup = BeautifulSoup(html, 'html.parser')
print(soup.prettify())
# The row id is written with a double underscore before "row"
# ("places_area__row"), matching the selector used in the lxml example;
# with a single underscore find() returns None and the next line fails.
tr = soup.find(attrs={'id': 'places_area__row'})
# Within that row, the value lives in the cell with class "w2p_fw".
td = tr.find(attrs={'class': 'w2p_fw'})
area = td.text
print(area)
# 3. lxml with CSS selectors
import lxml.html
import urllib2

url = 'http://example.webscraping.com/places/view/United-Kingdom-239'
html = urllib2.urlopen(url).read()
tree = lxml.html.fromstring(html)
# Select the "w2p_fw" cell that is a direct child of the area row and
# take the first match.
area_cells = tree.cssselect('tr#places_area__row > td.w2p_fw')
td = area_cells[0]
area = td.text_content()
print(area)
# Scraping results
# Names of the fields to extract. Each scraper looks a field up through the
# id of its table row, which has the form "places_<field>__row".
FIELDS = ('area', 'population', 'iso', 'country', 'capital',
          'continent', 'tld', 'currency_code', 'currency_name',
          'phone', 'postal_code_format', 'postal_code_regex', 'languages')
import re


def re_scraper(html):
    """Extract every field in FIELDS from *html* using regular expressions.

    For each field, the row ``<tr id="places_<field>__row">`` is located and
    the contents of its ``<td class="w2p_fw">`` cell are captured.

    Returns a dict mapping each field name to the captured cell contents.

    Raises AttributeError if a field's row/cell is not found, because
    ``re.search`` then returns None.
    """
    results = {}
    for field in FIELDS:
        # Double underscore before "row" matches the page's row ids, and the
        # capture stops at the closing </td> so that only the cell contents
        # are returned, not the rest of the row's markup.
        pattern = '<tr id="places_%s__row">.*?<td class="w2p_fw">(.*?)</td>' % field
        results[field] = re.search(pattern, html).groups()[0]
    return results

from bs4 import BeautifulSoup
def bs_scraper(html):
    """Extract every field in FIELDS from *html* using BeautifulSoup.

    The first <table> in the document is searched for the row with id
    ``places_<field>__row``; the text of that row's ``<td class="w2p_fw">``
    cell becomes the field's value.

    Returns a dict mapping each field name to the cell text.

    Raises AttributeError if the table, a row, or a cell is missing, because
    ``find`` then returns None.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # The table lookup does not depend on the field, so do it only once.
    table = soup.find('table')
    results = {}
    for field in FIELDS:
        # Double underscore before "row" matches the page's row ids.
        row = table.find('tr', id='places_%s__row' % field)
        results[field] = row.find('td', class_='w2p_fw').text
    return results

import lxml.html
def lxml_scraper(html):
    """Extract every field in FIELDS from *html* using lxml CSS selectors.

    For each field, the first ``td.w2p_fw`` cell that is a direct child of
    the row ``tr#places_<field>__row`` (itself a direct child of a table) is
    selected and its text content becomes the field's value.

    Returns a dict mapping each field name to the cell's text content.

    Raises IndexError if the selector matches nothing for a field.
    """
    tree = lxml.html.fromstring(html)
    results = {}
    for field in FIELDS:
        # Double underscore before "row" matches the page's row ids.
        selector = 'table > tr#places_%s__row > td.w2p_fw' % field
        results[field] = tree.cssselect(selector)[0].text_content()
    return results




评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值