Python3: Several Ways to Scrape Page Elements

Method 1: XPath

from lxml import etree
import requests
from requests import exceptions


def get_response(url, headers=None, timeout=None):
    """
    Send a GET request and return the response, or None if the request fails.
    To verify the status code, an assert can be used (it fails when the condition is false):
    assert response.status_code == 200, 'Request failed, status code: %s' % str(response.status_code)
    """
    response = None
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except (exceptions.ConnectTimeout, exceptions.ReadTimeout, exceptions.Timeout,
            exceptions.HTTPError, exceptions.ProxyError, exceptions.RequestException):
        # Timeouts, HTTP errors, proxy errors and other request failures: return None
        response = None
    except Exception:
        # Any other unexpected error also results in None
        response = None
    return response
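For example, the helper could be called like this (a minimal usage sketch; the URL and User-Agent header below are placeholders, not the author's param configuration):

headers = {'User-Agent': 'Mozilla/5.0'}  # placeholder header, assumed for illustration
response = get_response('https://example.com', headers=headers, timeout=10)
assert response is not None and response.status_code == 200, \
    'Request failed, status code: %s' % (response.status_code if response else 'none')
print(len(response.text))  # size of the fetched HTML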

def get_content(etree_html, xpath):
    result = []
    content = etree_html.xpath(xpath)  # xpath() returns a list
    for each in content:
        # Strip newlines and spaces from each matched text node
        re_each = each.replace('\n', '').replace(' ', '')
        if re_each == '':
            continue
        # print(re_each)
        result.append(re_each)
    return result

Then:

def get_spider_content_xpath(self):
    # func01: fetch the page source and extract data via XPath
    html = get_response(self.url, param.headers, 10)
    etree_html = etree.HTML(html.text)
    result = get_content(etree_html, param.xpath)  # param.xpath is the XPath expression wrapped in the param config
    return result

result is the data obtained via XPath.

As a simple example, here is an XPath expression I wrote:

xpath = "//div[@class='article-item-box csdn-tracking-statistics']/h4/a/text()"

Method 2: Regular expression matching

This method reuses the same imports and the get_response helper from Method 1; the only additional import it needs is the standard-library re module:

import re

Then:

def get_spider_content_re(self):
    # func02: fetch the page source and extract data by regex-matching the raw HTML
    # Required parameters: the url and the User-Agent header, both configured in param
    html = get_response(self.url, param.headers, 10)
    html_text = html.text
    # Match every URL in the page
    urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                      html_text, re.S)
    # Deduplicate the list -- list({}.fromkeys(list).keys())
    urls = list({}.fromkeys(urls).keys())
    return urls

urls is the matched content.

The regular expression for matching URLs is serviceable; after trying many versions, I find this one fairly reliable:

'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
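A minimal, self-contained sketch of that expression in action (the HTML string below is made up for illustration):

import re

html_text = '<a href="https://example.com/a">x</a> <img src="http://example.com/img.png"> <a href="https://example.com/a">y</a>'
pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
urls = re.findall(pattern, html_text, re.S)
urls = list({}.fromkeys(urls).keys())  # dict.fromkeys keeps insertion order while dropping duplicates
print(urls)  # ['https://example.com/a', 'http://example.com/img.png']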

Method 3: Tags with BeautifulSoup find / find_all

from urllib import request, parse
from bs4 import BeautifulSoup

word = '周杰伦'
url = 'http://www.baidu.com.cn/s?wd=' + parse.quote(word) + '&pn=0'
page = request.urlopen(url).read()
soup = BeautifulSoup(page, 'lxml')
tagh3 = soup.find_all('a')  # find_all returns a list of matching tags
hrefs = []
for h3 in tagh3:
    # href = h3.find('a').get('href')
    try:
        href = h3.get('href')
    except Exception:
        pass
    else:
        hrefs.append(href)

hrefs is the content obtained via the tags; here it is every URL (href) found on the page.
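To see the same find_all pattern without depending on a live Baidu results page, here is a self-contained sketch on an inline HTML snippet (the markup is invented for illustration):

from bs4 import BeautifulSoup

sample_html = '''
<h3><a href="https://example.com/1">Result one</a></h3>
<h3><a href="https://example.com/2">Result two</a></h3>
<a>no href here</a>
'''

soup = BeautifulSoup(sample_html, 'lxml')
hrefs = [a.get('href') for a in soup.find_all('a') if a.get('href')]
print(hrefs)  # ['https://example.com/1', 'https://example.com/2']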

Method 4: Tags with BeautifulSoup select

from urllib import request, parse
from bs4 import BeautifulSoup

word = '周杰伦'
url = 'http://www.baidu.com/s?wd=' + parse.quote(word) + '&pn=0'
page = request.urlopen(url).read()
soup = BeautifulSoup(page, 'lxml')
# tagh3 = soup.select('h3 > a[href]')
tags = soup.select('a[href]')  # select returns a list; a tag and its attribute selector need no space between them
hrefs = []
for tag in tags:
    hrefs.append(tag.get('href'))  # extract the href attribute
hrefs = list({}.fromkeys(hrefs).keys())  # deduplicate

hrefs is the content obtained via the select method; here it is every URL on the page.
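And a self-contained sketch of select with the child-selector variant from the commented-out line above, again on an invented HTML snippet:

from bs4 import BeautifulSoup

sample_html = '''
<h3 class="t"><a href="https://example.com/1">Result one</a></h3>
<h3 class="t"><a href="https://example.com/2">Result two</a></h3>
<a href="https://example.com/2">duplicate link outside an h3</a>
'''

soup = BeautifulSoup(sample_html, 'lxml')
hrefs = [tag.get('href') for tag in soup.select('h3 > a[href]')]  # only <a href> directly under <h3>
hrefs = list({}.fromkeys(hrefs).keys())  # same dict.fromkeys deduplication trick
print(hrefs)  # ['https://example.com/1', 'https://example.com/2']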

For detailed usage of BeautifulSoup's find_all and select, see the descriptions at the following two links (reposted):

Original article: https://blog.csdn.net/weixin_39568072/article/details/107014701
