Method 1: XPath
from lxml import etree
import requests
from requests import exceptions
def get_response(url, headers=None, timeout=None):
    """Return the Response for url, or None if the request fails.

    To fail loudly instead, assert on the status code (the assertion
    fails when the condition is false):
    assert response.status_code == 200, 'Request failed, status code: %s' % response.status_code
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except exceptions.RequestException:
        # Timeout, ConnectTimeout, ReadTimeout, HTTPError and ProxyError are all
        # subclasses of RequestException, so one handler covers every case.
        response = None
    return response
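A quick usage sketch (the URL and user-agent below are placeholders I chose for illustration, not part of the original code):

resp = get_response('https://example.com', headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
if resp is not None:
    print(resp.status_code, len(resp.text))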
def get_content(etree_html, xpath):
    result = []
    content = etree_html.xpath(xpath)  # returns a list
    for each in content:
        # Strip newlines and spaces from the matched text
        re_each = each.replace('\n', '').replace(' ', '')
        if re_each == '':
            continue
        result.append(re_each)
    return result
# Then:
def get_spider_content_xpath(self):
    # func01: fetch the page source and extract data via XPath
    html = get_response(self.url, param.headers, 10)
    etree_html = etree.HTML(html.text)
    result = get_content(etree_html, param.xpath)  # param.xpath holds the configured XPath expression
    return result
result is the data obtained via XPath.
As a simple example, here is an XPath expression I used:
xpath = "//div[@class='article-item-box csdn-tracking-statistics']/h4/a/text()"
Method 2: Regular expression matching
import re
import requests
from requests import exceptions
# get_response() is the same function defined in Method 1.
# Then:
def get_spider_content_re(self):
    # func02: fetch the page source and regex-match data straight out of it
    # Requires url and the user-agent header, both configured in param
    html = get_response(self.url, param.headers, 10)
    html_text = html.text
    # Regex-match every URL on the page
    urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                      html_text, re.S)
    # Deduplicate the list while preserving order: list({}.fromkeys(list).keys())
    urls = list({}.fromkeys(urls).keys())
    return urls
urls is the matched content.
The regex for matching URLs is serviceable; I tried many versions, and this one strikes me as the most reliable:
'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
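A quick sanity check of the pattern on a made-up HTML snippet, including the order-preserving dedup idiom used above:

import re

pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
sample = '<a href="http://example.com">x</a> see http://example.com and https://blog.csdn.net/weixin_39568072'
urls = re.findall(pattern, sample)
# ['http://example.com', 'http://example.com', 'https://blog.csdn.net/weixin_39568072']
urls = list({}.fromkeys(urls).keys())  # dedup, order preserved
# ['http://example.com', 'https://blog.csdn.net/weixin_39568072']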
Method 3: Tags with BeautifulSoup find / find_all
from urllib import request, parse
from bs4 import BeautifulSoup

word = '周杰伦'  # search keyword
url = 'http://www.baidu.com.cn/s?wd=' + parse.quote(word) + '&pn=0'
page = request.urlopen(url).read()
soup = BeautifulSoup(page, 'lxml')
tags = soup.find_all('a')  # returns a list of Tag objects
hrefs = []
for tag in tags:
    href = tag.get('href')  # None when the tag has no href attribute
    if href:
        hrefs.append(href)
hrefs is the content obtained via tags; here I am collecting every URL on the page.
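On the find / find_all distinction in this method's heading: find returns the first matching Tag (or None), while find_all returns a list of all matches. A tiny illustration on an inline snippet of my own:

from bs4 import BeautifulSoup

snippet = '<div><h3><a href="/a">one</a></h3><h3><a href="/b">two</a></h3></div>'
soup = BeautifulSoup(snippet, 'lxml')
print(soup.find('a').get('href'))                    # '/a' -- first match only
print([a.get('href') for a in soup.find_all('a')])   # ['/a', '/b'] -- all matches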
Method 4: Tags with BeautifulSoup select
from urllib import request, parse
from bs4 import BeautifulSoup

word = '周杰伦'  # search keyword
url = 'http://www.baidu.com/s?wd=' + parse.quote(word) + '&pn=0'
page = request.urlopen(url).read()
soup = BeautifulSoup(page, 'lxml')
# tags = soup.select('h3 > a[href]')
tags = soup.select('a[href]')  # returns a list; a compound selector on one tag takes no space
hrefs = []
for tag in tags:
    hrefs.append(tag.get('href'))  # extract the href attribute
hrefs = list({}.fromkeys(hrefs).keys())  # deduplicate, preserving order
hrefs is the content obtained via the select method; again, I am collecting every URL on the page.
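As with Method 3, a small self-contained illustration of the selector syntax (the snippet is made up): 'h3 > a[href]' matches only direct children, while 'h3 a[href]' (space-separated) matches any descendant:

from bs4 import BeautifulSoup

snippet = '<h3><a href="/child">direct</a><span><a href="/nested">nested</a></span></h3>'
soup = BeautifulSoup(snippet, 'lxml')
print([a['href'] for a in soup.select('h3 > a[href]')])  # ['/child'] -- direct children only
print([a['href'] for a in soup.select('h3 a[href]')])    # ['/child', '/nested'] -- any descendant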
For details on how BeautifulSoup's find_all and select work, see the write-up linked below (reposted):
Original article: https://blog.csdn.net/weixin_39568072/article/details/107014701