问题:在爬取某个网页时,返回状态码为“521”,如何解决?
Code:
import requests
def get_one_page(url, *, timeout=10):
    """Fetch ``url`` and return its body, or ``None`` on a non-200 status.

    The status code and the raw body are printed for every response so that
    non-200 answers can be inspected.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises; ``None`` restores the old wait-forever behaviour.

    Returns:
        The response text when the status code is 200, otherwise ``None``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36',
    }
    # Without an explicit timeout the call can block indefinitely when the
    # server stops responding.
    response = requests.get(url, headers=headers, timeout=timeout)
    print(response.status_code)
    print(response.text)
    if response.status_code == 200:
        return response.text
    return None
def main():
    """Request the search page and print whatever ``get_one_page`` returns."""
    target = 'http://sou.chinanews.com/search.do?q=社会责任报告'
    print(get_one_page(target))


# Run the demo only when the file is executed as a script.
if __name__ == "__main__":
    main()
521
<script>document.cookie=('_')+('_')+('j')+('s')+('l')+('_')+('c')+('l')+('e')+('a')+('r')+('a')+('n')+('c')+('e')+('=')+(-~[]+'')+([2]*(3)+'')+(+!+[]+'')+(2+6+'')+(0+1+0+1+'')+((+true)+'')+((1|2)+'')+(3+3+'')+((1+[0])/[2]+'')+(~~[]+'')+('.')+(-~0+'')+((+[])+'')+(0+1+0+1+'')+('|')+('-')+(-~[]+'')+('|')+('v')+('Q')+((+true)+'')+('%')+(1+1+'')+('F')+(9+'')+('o')+('U')+('C')+('e')+('k')+('F')+('j')+('M')+('g')+('u')+('I')+('S')+('U')+('c')+(-~[6]+'')+('t')+('p')+('Z')+((1+[4]>>1)+'')+('T')+('w')+('M')+('%')+(1+2+'')+('D')+(';')+('m')+('a')+('x')+('-')+('a')+('g')+('e')+('=')+((1+[2]>>2)+'')+(6+'')+((+false)+'')+((+false)+'')+(';')+('p')+('a')+('t')+('h')+('=')+('/');location.href=location.pathname+location.search</script>
解决:
- 521 响应的正文是一段 JavaScript:它先写入 `__jsl_clearance` Cookie,再重新加载当前页面;`requests` 不会执行这段脚本,因此需要在 `headers` 中手动设置 Cookie
具体Cookie值可在此处查找
import requests
def get_one_page(url, *, cookie=None, timeout=10):
    """Fetch ``url`` with a session Cookie and return its body.

    The status code and the raw body are printed for every response so that
    non-200 answers can be inspected.

    Args:
        url: Address of the page to download.
        cookie: Value for the ``Cookie`` request header. When omitted, the
            value captured in the original snippet is used.
            NOTE(review): that captured value contains a ``__jsl_clearance``
            token, which the challenge page issues with ``max-age=3600``, so
            it has presumably expired -- pass a fresh value copied from a
            browser session.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises; ``None`` restores the old wait-forever behaviour.

    Returns:
        The response text when the status code is 200, otherwise ``None``.
    """
    if cookie is None:
        # Fallback kept so existing ``get_one_page(url)`` callers behave as before.
        cookie = 'cnsuuid=592bc212-6657-6d5a-3976-edbd5b8d16d91776.8981184307827_1618149399374; __jsluid_h=559aef92e2b4d5042d87311134785652; Hm_lvt_0da10fbf73cda14a786cd75b91f6beab=1618149644; Hm_lpvt_0da10fbf73cda14a786cd75b91f6beab=1618149644; zycna=wTwYuJHZjO4BATr5cC7Bsjiw; __jsl_clearance=1618212323.32|0|JdK%2FRKjgYJevFTeA6GHCHdyN%2Bd8%3D; JSESSIONID=F2521D76E6131521ACDD4458E233CEBD'
    headers = {
        'Cookie': cookie,
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36',
    }
    # Without an explicit timeout the call can block indefinitely when the
    # server stops responding.
    response = requests.get(url, headers=headers, timeout=timeout)
    print(response.status_code)
    print(response.text)
    if response.status_code == 200:
        return response.text
    return None
def main():
    """Request the search page and print whatever ``get_one_page`` returns."""
    search_url = 'http://sou.chinanews.com/search.do?q=社会责任报告'
    page = get_one_page(search_url)
    print(page)


# Run the demo only when the file is executed as a script.
if __name__ == "__main__":
    main()