简单又暴力的方法,自己可以试试:
直接看代码:
from bs4 import BeautifulSoup, Comment
import requests
from retrying import retry
@retry(stop_max_attempt_number=5)
def _get_url_three_content(requests_url):
    """Fetch ``requests_url`` with browser-like request headers.

    The ``retry`` decorator is configured with ``stop_max_attempt_number=5``,
    so any exception raised here (network error, timeout, unexpected status)
    causes another attempt until that limit is reached.

    Args:
        requests_url: Absolute URL to request with HTTP GET.

    Returns:
        The ``requests`` response object when the server answers 200,
        or the empty string ``''`` when it answers 404.

    Raises:
        AssertionError: If the status code is neither 200 nor 404.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        # 'br' is intentionally not advertised: requests only decodes Brotli
        # when an optional Brotli package is installed; without it the body
        # comes back still compressed and cannot be decoded as text.
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        # No explicit 'Host' header: the HTTP stack derives it from the URL.
        # A hard-coded host would be wrong for every other site requested.
        'Upgrade-Insecure-Requests': '1',
    }
    html = requests.get(requests_url, headers=headers, timeout=6)
    if html.status_code == 404:
        return ''
    if html.status_code != 200:
        # Raised explicitly (rather than via ``assert``) so the check still
        # runs, and still triggers a retry, under ``python -O``.
        raise AssertionError(
            'unexpected HTTP status %s for %s' % (html.status_code, requests_url)
        )
    return html
def run():
    """Download a sample search-result page and parse its ``<body>`` with lxml.

    The document head is cut off before parsing, which is the workaround this
    snippet demonstrates for BeautifulSoup/lxml failing with
    "input conversion failed due to input error".

    Returns:
        The parsed ``BeautifulSoup`` object, or ``None`` when the fetch helper
        returned an empty result (it does so for HTTP 404).
    """
    requests_url = 'https://www.baidu.com/s?wd=BeautifulSoup%E6%8A%A5%E9%94%99input%20conversion%20failed%20due%20to%20input%20error&rsv_spt=1&rsv_iqid=0xfeb80b100001c5bc&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&rqlang=cn&tn=baiduhome_pg&rsv_enter=1&rsv_t=c3b8j6jMfVRrUK9Ad2ZUcxta09Cvd%2FPSY%2F5CwKC47Rb7tjEQJoY55RMx02dspeHzzQdv&oq=encoding%2520error%2520%253A%2520input%2520conversion%2520failed%2520due%2520to%2520input%2520error%252C%2520bytes%25200x9D%25200x%2526gt%253B6&inputT=1475&rsv_sug3=51&rsv_pq=a7659e250004e379&rsv_sug1=2&rsv_sug7=001&rsv_n=2&bs=encoding%20error%20%3A%20input%20conversion%20failed%20due%20to%20input%20error%2C%20bytes%200x9D%200xE6'
    html = _get_url_three_content(requests_url=requests_url)
    if not html:
        # The fetch helper returns '' for a 404; there is nothing to parse.
        return None

    # Drop undecodable byte sequences instead of failing on them.
    html_str = html.content.decode('utf-8', 'ignore')

    # Keep everything from the FIRST '<body' onwards. Splitting on the last
    # occurrence would throw away most of the page whenever the substring
    # shows up again inside a script or comment.
    _, marker, remainder = html_str.partition('<body')
    if marker:
        html_str = marker + remainder
    # If there is no '<body' at all, the document is parsed unchanged.

    # With the head removed, BeautifulSoup no longer raises the error internally.
    soup = BeautifulSoup(html_str, 'lxml')
    return soup
# Script entry point: run the demo only when executed directly, not on import.
if __name__ == '__main__':
    run()