今天在爬诗词名句网站上的三国演义内容时,遇到了反爬,采用requests.get(url=url, headers=headers)会产生以下报错:
Traceback (most recent call last):
File "F:\anaconda3\envs\Python3.8\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
conn = connection.create_connection(
File "F:\anaconda3\envs\Python3.8\lib\site-packages\urllib3\util\connection.py", line 84, in create_connection
raise err
File "F:\anaconda3\envs\Python3.8\lib\site-packages\urllib3\util\connection.py", line 74, in create_connection
sock.connect(sa)
socket.timeout: timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "F:\anaconda3\envs\Python3.8\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
httplib_response = self._make_request(
File "F:\anaconda3\envs\Python3.8\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
self._validate_conn(conn)
File "F:\anaconda3\envs\Python3.8\lib\site-packages\urllib3\connectionpool.py", line 976, in _validate_conn
conn.connect()
File "F:\anaconda3\envs\Python3.8\lib\site-packages\urllib3\connection.py", line 308, in connect
conn = self._new_conn()
File "F:\anaconda3\envs\Python3.8\lib\site-packages\urllib3\connection.py", line 164, in _new_conn
raise ConnectTimeoutError(
urllib3.exceptions.ConnectTimeoutError: (<urllib3.connection.HTTPSConnection object at 0x000001F94710B970>, 'Connection to www.shicimingju.com timed out. (connect timeout=5)')
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "F:\anaconda3\envs\Python3.8\lib\site-packages\requests\adapters.py", line 439, in send
resp = conn.urlopen(
File "F:\anaconda3\envs\Python3.8\lib\site-packages\urllib3\connectionpool.py", line 724, in urlopen
retries = retries.increment(
File "F:\anaconda3\envs\Python3.8\lib\site-packages\urllib3\util\retry.py", line 439, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='www.shicimingju.com', port=443): Max retries exceeded with url: /book/lunyu/3.html (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000001F94710B970>, 'Connection to www.shicimingju.com timed out. (connect timeout=5)'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "E:/Demo/Python_demo/爬虫/数据解析/bs4解析实战.py", line 32, in <module>
detail_page_html = requests.get(url=detail_url, headers=headers,timeout=5).text
File "F:\anaconda3\envs\Python3.8\lib\site-packages\requests\api.py", line 76, in get
return request('get', url, params=params, **kwargs)
File "F:\anaconda3\envs\Python3.8\lib\site-packages\requests\api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "F:\anaconda3\envs\Python3.8\lib\site-packages\requests\sessions.py", line 530, in request
resp = self.send(prep, **send_kwargs)
File "F:\anaconda3\envs\Python3.8\lib\site-packages\requests\sessions.py", line 643, in send
r = adapter.send(request, **kwargs)
File "F:\anaconda3\envs\Python3.8\lib\site-packages\requests\adapters.py", line 504, in send
raise ConnectTimeout(e, request=request)
requests.exceptions.ConnectTimeout: HTTPSConnectionPool(host='www.shicimingju.com', port=443): Max retries exceeded with url: /book/lunyu/3.html (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000001F94710B970>, 'Connection to www.shicimingju.com timed out. (connect timeout=5)'))
试了很多方法后,包括每次循环后添加 time.sleep(3) 延时也未能解决问题,最后通过创建会话(Session)并挂载重传适配器解决:
s = requests.Session()
s.mount('http://', HTTPAdapter(max_retries=3))   # 访问http协议时,设置重传请求最多三次
s.mount('https://', HTTPAdapter(max_retries=3))  # 访问https协议时,设置重传请求最多三次
注意:挂载之后必须用 s.get() 发送请求(而不是 requests.get()),否则请求不会经过该会话,上面设置的重传适配器也就不会生效。
这样就可以啦!
以下为完整代码:
import time

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
# Scrape every chapter of the Analects (论语) from shicimingju.com and
# save them to a local text file, one chapter per line.
url = 'https://www.shicimingju.com/book/lunyu.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
}

# Build ONE session up front and mount retry adapters on it.  Mounting
# HTTPAdapter(max_retries=3) makes urllib3 retry a failed connection up to
# three times before raising — this is the fix for the ConnectTimeout shown
# above.  All requests below MUST go through s.get(); calling requests.get()
# would bypass the session and the retry configuration entirely (that was
# the bug in the original code, which also re-created the Session on every
# loop iteration for no benefit).
s = requests.Session()
s.mount('http://', HTTPAdapter(max_retries=3))   # retry up to 3x over http
s.mount('https://', HTTPAdapter(max_retries=3))  # retry up to 3x over https

# Fetch the table-of-contents page; timeout so a dead connection fails fast
# and the mounted adapter can retry it.
html = s.get(url=url, headers=headers, timeout=5).text
bs = BeautifulSoup(html, 'lxml')
# One <li> per chapter in the book's table of contents.
# (Renamed from `list`, which shadowed the builtin.)
chapter_items = bs.select('.book-mulu > ul > li')

# `with` guarantees the file is flushed and closed even if a request raises.
with open('./论语.txt', 'w', encoding='utf-8') as fp:
    for li in chapter_items:
        # Chapter title and absolute URL of its detail page.
        title = li.a.string
        detail_url = 'https://www.shicimingju.com' + li.a['href']
        # Go through the session so the retry adapters apply.
        detail_page_html = s.get(url=detail_url, headers=headers, timeout=5).text
        detail_soup = BeautifulSoup(detail_page_html, 'lxml')
        # Chapter body lives in <div class="chapter_content">.
        div_tag = detail_soup.find('div', class_="chapter_content")
        content = div_tag.text
        fp.write(title + ':' + content + '\n')
        print(title, '爬取成功!')