关于Python爬取网页返回521状况码的解决方案


# 项目场景: Python3.8

问题描述:

在使用Python爬虫从列表页进入详情页抓取时,服务器返回状态码521,得到的详情页HTML内容不完整(数据长度明显偏短,只包含一段混淆的JavaScript)。


原因分析:

频繁爬取目标网站,触发了目标网站的反爬虫措施(521是其JS质询机制返回的状态码)


解决方案:

如果解决不了,你可以把要爬取网页的源码先保存下来,进行后续的处理。
在这里插入图片描述

方法一:

更换出口IP(例如换一个VPN/代理),或者换一台网络环境不同的电脑执行程序

方法二:

复制目标网页的Headers添加到代码中
在这里插入图片描述

根据目标情况不同修改

def askURL(url):
    """Fetch *url* and return its decoded HTML body (empty string on error).

    Sends a browser-like header set, including a previously captured Cookie
    (with a valid ``__jsl_clearance``), so the target server's anti-bot layer
    treats the request as a normal browser visit.

    :param url: page URL to fetch
    :return: the UTF-8 decoded HTML, or "" when the request failed
    """
    head = {  # Browser-like request headers sent to the target server.
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'mfw_uuid=61dc38ef-2c67-45ce-ed26-c30fa04f2418; oad_n=a:3:{s:3:"oid";i:1029;s:2:"dm";s:15:"www.mafengwo.cn";s:2:"ft";s:19:"2022-01-10+21:47:27";}; __jsluid_h=aa6e6e4350e2fd0e52cc227da10e26b5; __omc_chl=; __omc_r=; __mfwc=direct; uva=s:78:"a:3:{s:2:"lt";i:1641822448;s:10:"last_refer";s:6:"direct";s:5:"rhost";s:0:"";}";; __mfwurd=a:3:{s:6:"f_time";i:1641822448;s:9:"f_rdomain";s:0:"";s:6:"f_host";s:3:"www";}; __mfwuuid=61dc38ef-2c67-45ce-ed26-c30fa04f2418; UM_distinctid=17e443e711c512-05dd7ff73ec639-5e181552-144000-17e443e711dc58; login=mafengwo; mafengwo=16a582a6e0ca5f6c73654cb640343886_35627906_61e15d7be119c7.29366428_61e15d7be11a11.54996187; PHPSESSID=lf7mtvr2mgj3fhnfd7sn9br1c2; mfw_uid=35627906; Hm_lvt_8288b2ed37e5bc9b4c9f7008798d2de0=1642218839,1642238624,1642341547,1642381972; CNZZDATA30065558=cnzz_eid=1067569765-1641819345-&ntime=1642380961; __jsl_clearance=1642381970.541|0|cYxjLrAJMIg1j5y/qJP9hLaEN7M=; __mfwa=1641822449293.40635.15.1642341546971.1642381972692; __mfwlv=1642381972; __mfwvn=11; bottom_ad_status=0; __mfwb=b5923a0d408d.8.direct; __mfwlt=1642382984; Hm_lpvt_8288b2ed37e5bc9b4c9f7008798d2de0=1642382985',
        'Host': 'www.mafengwo.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55',
    }
    # User-Agent tells the server what kind of machine/browser we are
    # (effectively, what level of content we can accept).

    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError instances expose .code; plain URLErrors only .reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
    # NOTE: the original snippet ended with a stray '}' here, which is a
    # Python syntax error (likely a copy/paste artifact) — removed.

方法三:

两次访问目标详情页

代码一

import execjs
import requests
import re

head = {  # Browser-like request headers, sent to the target server.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'www.mafengwo.cn',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55',
}
url = 'http://www.mafengwo.cn/poi/5423409.html'  # target detail page

# response = requests.get(url)
# # cookie1
# cookie1 = response.cookies
# # JS code of the challenge page
# js_code = response.text




def get_521_content(url,head):
    """First request against *url*.

    Returns a tuple ``(script_js, cookie_str)``: the concatenated inline
    <script> bodies of the 521 challenge page, and the server-set cookies
    serialized as a ``name=value; name=value`` header string.
    """
    resp = requests.get(url, headers=head)

    pairs = []
    for name, value in resp.cookies.items():
        pairs.append(name + '=' + value)
    cookie_str = '; '.join(pairs)

    script_js = ''.join(re.findall('<script>(.*?)</script>', resp.text))
    return (script_js, cookie_str)


def fixed_fun(function):
    """Execute the 521 challenge JS and extract the '__jsl_clearance=...' pair.

    :param function: the <script> JS captured by get_521_content
    :return: the '__jsl_clearance=<value>' cookie fragment
    """
    # Turn `eval` into `return` so the packed JS hands back its payload
    # instead of executing it in place.
    func_return = function.replace('eval', 'return')
    content = execjs.compile(func_return)

    # NOTE(review): `content` above is never used before being reassigned
    # below, and a *fresh* request is issued here instead of using the
    # passed-in `function` — presumably leftover from an earlier revision.
    req = requests.get(url, headers=head)
    evaled_func = ''.join(re.findall('<script>(.*?)</script>', req.text))
    # print(js_con)
    # fn = js_con.split('=').split(' ')
    # evaled_func = content.call(fn)

    # print(evaled_func)
    # Strip statements that only work in a real browser (phantom checks,
    # DOM event hooks, the captcha redirect timer) and turn the
    # `document.cookie=` assignment into a return value.
    mode_func = evaled_func.replace('while(window._phantom||window.__phantomas){};', ''). \
        replace('document.cookie=', 'return').replace(';if((function(){try{return !!window.addEventListener;}', ''). \
        replace("catch(e){return false;}})()){document.addEventListener('DOMContentLoaded',l,false);}", ''). \
        replace("else{document.attachEvent('onreadystatechange',l);}", '').replace(
        r"setTimeout('location.href=location.href.replace(/[\?|&]captcha-challenge/,\'\')',1500);", '')
    content = execjs.compile(mode_func)
    # 'l' is the challenge function name in this variant of the page.
    cookies = content.call('l')
    __jsl_clearance = cookies.split(';')[0]
    return __jsl_clearance


def cookie_dict(js, id):
    """Merge two 'name=value' cookie strings into a single dict.

    :param js: the computed clearance pair, e.g. '__jsl_clearance=...'
    :param id: the server-issued pair, e.g. '__jsluid_h=...'
              (name kept for backward compatibility; it shadows builtin id)
    :return: dict mapping both cookie names to their values
    """
    # split('=', 1): cookie *values* may themselves contain '=' — the
    # base64-padded __jsl_clearance value typically ends with '=' — so we
    # must only split on the first separator (the original split('=')
    # silently truncated such values).
    result = {}
    js_name, js_value = js.split('=', 1)
    id_name, id_value = id.split('=', 1)
    result[js_name] = js_value
    result[id_name] = id_value
    return result


if __name__ == '__main__':
    # Step 1: first request returns (challenge JS, server cookie string).
    func = get_521_content(url,head)
    content = func[0]

    cookie_id = func[1]
    # Step 2: run the challenge JS to compute the __jsl_clearance pair.
    cookie_js = fixed_fun(func[0])
    dicted_cookie = cookie_dict(cookie_js, cookie_id)

    head = {  # Browser-like headers for the second request.
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.mafengwo.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55',
        # Server cookies + computed clearance in one Cookie header.
        'Cookie': cookie_id + ';' + cookie_js
    }
    # Step 3: retry with the combined cookie; 200 means the challenge passed.
    req = requests.get(url, headers=head)
    print(req.status_code)

代码二

# resouce:https://blog.csdn.net/qq_41879417/article/details/101701120?spm=1001.2101.3001.6661.1&utm_medium=distribute.pc_relevant_t0.none-task-blog-2%7Edefault%7ECTRLIST%7Edefault-1.pc_relevant_default&depth_1-utm_source=distribute.pc_relevant_t0.none-task-blog-2%7Edefault%7ECTRLIST%7Edefault-1.pc_relevant_default&utm_relevant_index=1
# -*- coding: utf-8 -*-
# @Time : 2022/1/16 9:11
# @Author : sherlock
# @File : creeper_2_521.py
# @Project : creeper

import execjs

import re

import requests

url = 'http://www.mafengwo.cn/poi/5423409.html'  # target detail page

head = {  # Browser-like request headers, sent to the target server.
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'www.mafengwo.cn',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55',
}




def get_521_content(url):
    """First pass: hit *url*, and if the server answers 521, solve the JS
    challenge and return a cookie dict usable for the second request.

    Returns None when the response is not 521, no challenge script is
    found, or the challenge could not be solved.
    """
    resp = requests.get(url, headers=head, timeout=5)
    print(resp.status_code, resp.text)
    if resp.status_code != 521:
        return None

    cookie_map = dict(resp.cookies.items())
    print(cookie_map)
    challenge_js = ''.join(re.findall('<script>(.*?)</script>', resp.text))
    if not challenge_js:
        return None

    clearance = fixed_fun(challenge_js, url)
    if not clearance:
        return None
    name, value = clearance.split('=')
    cookie_map[name] = value
    return cookie_map


# 执行js代码获取cookies 的__jsl_clearance的键值
# Execute the challenge JS and obtain the __jsl_clearance cookie key/value.
def fixed_fun(js_con, url):  # js_con: the JS captured by the first request
    """Run the 521 challenge JavaScript and return '__jsl_clearance=...'.

    :param js_con: inline <script> content from the 521 response
    :param url: page URL, substituted for the DOM-derived href in the JS
    :return: '__jsl_clearance=<value>' on success, None when the rewritten
             JS fails to execute
    """
    # Replace `eval(` with `return(` so the outer packer returns its payload.
    func_return = js_con.replace('eval(', 'return(')
    print('第一次替换eval==》return后:  ', func_return)
    content = execjs.compile(func_return)
    # fn = js_con.split('=')[0].split(' ')[1]
    # (only ['document.cookie'] at this point)
    fn = js_con.split('=')[0].split(' ')[1]
    evaled_func = content.call(fn)
    print('第一次执行js代码后: ', evaled_func)
    fn = evaled_func.split('=')[0].split(' ')[1]  # dynamic challenge fn name
    aa = evaled_func.split("<a href=\\'/\\'>")  # content of the <a> tag
    aa = aa[1].split("</a>")[0] if len(aa) >= 2 else ''
    # Strip browser-only statements (captcha redirect timer, DOM event
    # hooks) and replace the <a>-element href trick with the literal url.
    mode_func = evaled_func. \
        replace(
        "setTimeout('location.href=location.pathname+location.search.replace(/[\\?|&]captcha-challenge/,\\'\\')',1500);document.cookie=",
        'return'). \
        replace(';if((function(){try{return !!window.addEventListener;}', ''). \
        replace(
        "}catch(e){return false;}})()){document.addEventListener('DOMContentLoaded'," + fn + ",false)}else{document.attachEvent('onreadystatechange'," + fn + ")",
        ''). \
        replace(
        "if((function(){try{return !!window.addEventListener;}catch(e){return false;}})()){document.addEventListener('DOMContentLoaded'," + fn + ",false)}else{document.attachEvent('onreadystatechange'," + fn + ")",
        ''). \
        replace("return'__jsl_clearance", "var window={};return '__jsl_clearance"). \
        replace(
        "var " + fn + "=document.createElement('div');" + fn + ".innerHTML='<a href=\\'/\\'>" + aa + "</a>';" + fn + "=" + fn + ".firstChild.href",
        "var " + fn + "='" + url + "'")
    print('第二次替换后的js代码:', mode_func)
    try:
        content = execjs.compile(mode_func)
        cookies = content.call(fn)
        __jsl_clearance = cookies.split(';')[0]
        print(__jsl_clearance)
        return __jsl_clearance
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit; narrowed to Exception while keeping best-effort intent.
        print('js执行错误:', mode_func)
        return None


# 携带解密后的cookies第二次爬取详情页
# Second crawl of the detail page, carrying the solved cookies.
def con_spider(cookies, url):
    """Fetch *url* with the solved *cookies*; return the Response on 200,
    otherwise print the failing status code and return None."""
    resp = requests.get(url, headers=head, cookies=cookies, timeout=5)
    if resp.status_code != 200:
        print('第二次爬取错误状态码:', resp.status_code)
        return None
    resp.encoding = 'utf-8'
    print(resp.status_code)
    print(resp.text)
    return resp


if __name__ == "__main__":
    cookies = get_521_content(url)
    con_spider(cookies, url)

代码三

# resource:https://www.cnblogs.com/gongs/p/10524710.html

import execjs

import re

import requests

url = 'http://www.mafengwo.cn/poi/5423409.html'  # target detail page

head = {  # Browser-like request headers, sent to the target server.
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",

    "Host": "www.mafengwo.cn",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55",
}





def getResponse():
    """Issue the initial GET against the module-level ``url`` with the
    module-level ``head`` headers.

    :return: the requests.Response of the first request
    """
    return requests.get(url, headers=head)


def getJslid(response):
    """Serialize the response's cookies into a Cookie-header string.

    :param response: object exposing ``.cookies`` with a dict-like items()
    :return: cookies joined as ``name=value; name=value``
    """
    pairs = ('%s=%s' % (name, value)
             for name, value in response.cookies.items())
    return '; '.join(pairs)


def getClearance(response):
    """Run the 521 challenge JS found in *response* and return the
    '__jsl_clearance=<value>' cookie fragment.

    :param response: the first (status 521) requests.Response
    :return: cookie fragment 'name=value'
    """
    # Collect the inline <script> bodies holding the challenge JS, then
    # turn `eval` into `return` so the packer returns its payload.
    txt = ''.join(re.findall('<script>(.*?)</script>', response.text))
    func_return = txt.replace('eval', 'return')

    content = execjs.compile(func_return)

    print("accurate error")
    # error
    # NOTE(review): this assumes the packed challenge function is named
    # 'x' — other variants of the challenge use different names; the
    # author's own markers above suggest this call is where it breaks.
    eval_func = content.call('x')
    print(1)
    name = re.findall(r'var (.*?)=function.*', eval_func)[0]  # dynamic fn name
    print(2)
    # Strip browser-only statements (phantom check, DOM listeners, captcha
    # redirect timer) and turn the cookie assignment into a return value.
    mode_func = eval_func.replace('while(window._phantom||window.__phantomas){};', ''). \
        replace('document.cookie=', 'return').replace('if((function(){try{return !!window.addEventListener;}', ''). \
        replace("catch(e){return false;}})()){document.addEventListener('DOMContentLoaded',%s,false)}" % name, ''). \
        replace("else{document.attachEvent('onreadystatechange',%s)}" % name, '').replace(
        r"setTimeout('location.href=location.pathname+location.search.replace(/[\?|&]captcha-challenge/,\'\')',1500);",
        '')

    content = execjs.compile(mode_func)
    cookies = content.call(name)
    # print(cookies)
    clearance = cookies.split(';')[0]

    return clearance


def structurehead(cook, clearance):
    """Build the headers for the second request: the module-level ``head``
    plus a combined 'cookie' entry (server cookies + clearance).

    :param cook: serialized server cookies ('name=value; ...')
    :param clearance: the '__jsl_clearance=...' fragment
    :return: new header dict (module-level ``head`` is not mutated)
    """
    merged = dict(head)
    merged['cookie'] = cook + ';' + clearance
    return merged

def main():
    """Drive the full flow: first request, extract cookies, solve the JS
    challenge, and print the headers for the second request."""
    response = getResponse()
    cook = getJslid(response)
    print("error")
    # this step has some error about exejcss
    clearance = getClearance(response)
    print("2 error")
    # Renamed local from `dict`, which shadowed the builtin type.
    new_head = structurehead(cook, clearance)
    print(new_head)

if __name__ == '__main__':
    main()

代码四

# -*- coding: utf-8 -*-
# @Time : 2022/1/18 13:32
# @Author : sherlock
# @File : creeper_4_521.py
# @Project : creeper

# coding=utf-8
# author=zhangjingyuan
# python3
from html.parser import HTMLParser
import lxml
import requests
from lxml import etree
import urllib.request
import urllib.parse
import re
import time
import io
import gzip
import random
import codecs
import execjs
import requests
import re

# Candidate target pages (only url1 is fetched below).
url1 = 'http://www.mafengwo.cn/poi/5423409.html'

url2 = 'https://jobs.51job.com/haikou/135562401.html'

url3 = 'https://movie.douban.com/subject/1292052/'

head = {  # Browser-like request headers; most are disabled for this test run.
    # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    # "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    # "Cache-Control": "max-age=0",
    # "Connection": "keep-alive",
    # "Cookie": 'mfw_uuid=61dc38ef-2c67-45ce-ed26-c30fa04f2418; oad_n=a:3:{s:3:"oid";i:1029;s:2:"dm";s:15:"www.mafengwo.cn";s:2:"ft";s:19:"2022-01-10+21:47:27";}; __jsluid_h=aa6e6e4350e2fd0e52cc227da10e26b5; __omc_chl=; __omc_r=; __mfwc=direct; uva=s:78:"a:3:{s:2:"lt";i:1641822448;s:10:"last_refer";s:6:"direct";s:5:"rhost";s:0:"";}";; __mfwurd=a:3:{s:6:"f_time";i:1641822448;s:9:"f_rdomain";s:0:"";s:6:"f_host";s:3:"www";}; __mfwuuid=61dc38ef-2c67-45ce-ed26-c30fa04f2418; UM_distinctid=17e443e711c512-05dd7ff73ec639-5e181552-144000-17e443e711dc58; login=mafengwo; mafengwo=16a582a6e0ca5f6c73654cb640343886_35627906_61e15d7be119c7.29366428_61e15d7be11a11.54996187; __jsl_clearance=1642341544.979|0|fafiHNHGZB+baEyxg5NVjPfVXm0=; PHPSESSID=s4foj9fhkm3mq8rs64omagvvp2; mfw_uid=35627906; __mfwa=1641822449293.40635.14.1642238623523.1642341546971; __mfwlv=1642341546; __mfwvn=10; Hm_lvt_8288b2ed37e5bc9b4c9f7008798d2de0=1642215122,1642218839,1642238624,1642341547; CNZZDATA30065558=cnzz_eid=1067569765-1641819345-&ntime=1642337760; bottom_ad_status=0; uol_throttle=35627906; __mfwb=8cc49c72508e.10.direct; __mfwlt=1642343676; Hm_lpvt_8288b2ed37e5bc9b4c9f7008798d2de0=1642343676',
    # "Host": "www.mafengwo.cn",
    # "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55",
}


def getResponse():
    """Perform the initial GET request against module-level ``url1``.

    :return: the requests.Response (expected to carry the 521 challenge)
    """
    resp = requests.get(url1, headers=head)
    return resp


def getJslid(response):
    """Join the response's cookies into one 'name=value; ...' string.

    :param response: object exposing ``.cookies`` with a dict-like items()
    :return: serialized cookie string
    """
    items = response.cookies.items()
    return '; '.join(k + '=' + v for k, v in items)


def getClearance(response):
    """Run the 521 challenge JS found in *response* and return the
    '__jsl_clearance=<value>' cookie fragment.

    :param response: the first (status 521) requests.Response
    :return: cookie fragment 'name=value'
    """
    # Collect the inline <script> bodies, then turn `eval` into `return`
    # so the packed challenge returns its payload instead of running it.
    txt = ''.join(re.findall('<script>(.*?)</script>', response.text))
    func_return = txt.replace('eval', 'return')
    print(func_return)

    content = execjs.compile(func_return)
    print(type(content))
    # content = open("jsdom_document").read()
    # print(content)
    # execjs._exceptions.ProgramError: ReferenceError: document is not defined
    # NOTE(review): assumes the packed challenge function is named 'x';
    # other variants of the challenge page use different names.
    eval_func = content.call('x')

    name = re.findall(r'var (.*?)=function.*', eval_func)[0]  # dynamic fn name

    # Strip browser-only statements (phantom check, DOM listeners, captcha
    # redirect timer) and turn the cookie assignment into a return value.
    mode_func = eval_func.replace('while(window._phantom||window.__phantomas){};', ''). \
        replace('document.cookie=', 'return').replace('if((function(){try{return !!window.addEventListener;}', ''). \
        replace("catch(e){return false;}})()){document.addEventListener('DOMContentLoaded',%s,false)}" % name, ''). \
        replace("else{document.attachEvent('onreadystatechange',%s)}" % name, '').replace(
        r"setTimeout('location.href=location.pathname+location.search.replace(/[\?|&]captcha-challenge/,\'\')',1500);",
        '')

    content = execjs.compile(mode_func)
    cookies = content.call(name)
    # print(cookies)
    clearance = cookies.split(';')[0]

    return clearance


def structureCookie(cook, clearance):
    """Concatenate the server cookie string and the clearance fragment
    into a single Cookie-header value (also printed for debugging).

    :param cook: serialized server cookies ('name=value; ...')
    :param clearance: the '__jsl_clearance=...' fragment
    :return: combined cookie string 'cook;clearance'
    """
    combined = '%s;%s' % (cook, clearance)
    print(combined)
    return combined


if __name__ == '__main__':
    # Pass 1: trigger the 521 challenge, harvest cookies + clearance.
    response = getResponse()
    clearance = getClearance(response)
    cook = getJslid(response)

    head = {  # Browser-like headers for the second request.
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.mafengwo.cn',
        # NOTE(review): only `cook` is sent; the computed `clearance` is
        # never added to the Cookie header — verify this is intentional.
        'Cookie': cook,
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55',
    }


    # NOTE(review): url2 (51job) is fetched here although the cookies came
    # from url1 (mafengwo) — confirm this cross-site use is intentional.
    request = urllib.request.Request(url2, headers=head)

    html = ""
    try:
        response = urllib.request.urlopen(request)
        # errors="ignore": tolerate bytes that are not valid UTF-8.
        html = response.read().decode(encoding="utf-8", errors="ignore")
        print(html)
    except urllib.error.URLError as e:
        # HTTPError exposes .code; a plain URLError only has .reason.
        if hasattr(e, "code"):
            print("状态码:%s" % (e.code))
        if hasattr(e, "reason"):
            print("原因:%s" % (e.reason))

代码五

# -*- coding: utf-8 -*-
# @Time : 2022/1/18 17:43
# @Author : sherlock
# @File : creeper_5_seleu.py
# @Project : creeper


# -*-  coding:utf-8 -*-

import requests
from bs4 import BeautifulSoup
import redis
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import smtplib
import email.utils
from email.mime.text import MIMEText
import time

# Site used to discover our current public IP address.
url = 'https://www.ipip.net'


def driver_chrome():
    """Create a headless Chrome WebDriver suitable for server environments.

    :return: a configured selenium.webdriver.Chrome instance
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920x1080")
    # `chrome_options=` is deprecated and removed in Selenium 4; the
    # `options=` keyword is the supported spelling and also works on
    # Selenium 3.8+.
    driver = webdriver.Chrome(options=chrome_options)
    return driver


def mymail(content):
    """Wrap *content* in a plain-text e-mail message with fixed metadata.

    :param content: message body (the detected public IP)
    :return: a MIMEText message ready for smtplib
    """
    message = MIMEText(content, _subtype='plain', _charset='utf8')
    message['From'] = email.utils.formataddr(('Author', '989989797@qq.com'))
    message['To'] = email.utils.formataddr(('Recipient', '8979879879@me.com'))
    message['date'] = time.strftime('%a, %d %b %Y %H:%M:%S %z')
    message['Subject'] = 'Your ip address'
    return message


# Look up the previously stored IP, fetch the current one via a headless
# browser, then e-mail and persist it when it has changed.
r = redis.Redis(host='localhost', port=6379, decode_responses=True)
myip = r.get('myip')
driver = driver_chrome()
driver.get(url)
cookies = driver.get_cookies()
new_cookies = {}  # NOTE(review): never used — candidate for removal.
# Re-install the cookies the site set, then reload so the page renders
# with a valid session.
for i in cookies:
    driver.add_cookie({'name': i.get('name'), 'value': i.get('value')})
driver.get(url)
soup = BeautifulSoup(driver.page_source, features='lxml')
myres = soup.find_all('div', attrs={'class': 'yourInfo'})
trueip = myres[0].find_all('a')[0].text  # first <a> holds the public IP
msg = mymail(trueip)

with smtplib.SMTP_SSL('smtp.qq.com', 465) as server:
    server.login('80988988@qq.com', '9jsdfhjhfio')
    if myip != trueip:
        # IP changed: persist the new value and notify by e-mail.
        r.set('myip', trueip)
        server.sendmail('98198397@qq.com', '9879878798@me.com', msg.as_string())

driver.close()
driver.quit()

Test代码

# coding=utf-8
# author=zhangjingyuan
# python3
from html.parser import HTMLParser
import lxml
import requests
from lxml import etree
import urllib.request
import urllib.parse
import re
import time
import io
import gzip
import random
import codecs

# Candidate target pages (only url1 is fetched below).
url1 = 'http://www.mafengwo.cn/poi/5423409.html'

url2 = 'https://jobs.51job.com/haikou/135562401.html'

url3 ='https://movie.douban.com/subject/1292052/'

url4 = 'http://www.mafengwo.cn/search/q.php?q=%E6%B3%89%E5%B7%9E'

head = {  # Browser-like headers, including a captured Cookie with a valid __jsl_clearance.
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Cookie": 'mfw_uuid=61dc38ef-2c67-45ce-ed26-c30fa04f2418; oad_n=a:3:{s:3:"oid";i:1029;s:2:"dm";s:15:"www.mafengwo.cn";s:2:"ft";s:19:"2022-01-10+21:47:27";}; __jsluid_h=aa6e6e4350e2fd0e52cc227da10e26b5; __omc_chl=; __omc_r=; __mfwc=direct; uva=s:78:"a:3:{s:2:"lt";i:1641822448;s:10:"last_refer";s:6:"direct";s:5:"rhost";s:0:"";}";; __mfwurd=a:3:{s:6:"f_time";i:1641822448;s:9:"f_rdomain";s:0:"";s:6:"f_host";s:3:"www";}; __mfwuuid=61dc38ef-2c67-45ce-ed26-c30fa04f2418; UM_distinctid=17e443e711c512-05dd7ff73ec639-5e181552-144000-17e443e711dc58; login=mafengwo; mafengwo=16a582a6e0ca5f6c73654cb640343886_35627906_61e15d7be119c7.29366428_61e15d7be11a11.54996187; __jsl_clearance=1642341544.979|0|fafiHNHGZB+baEyxg5NVjPfVXm0=; PHPSESSID=s4foj9fhkm3mq8rs64omagvvp2; mfw_uid=35627906; __mfwa=1641822449293.40635.14.1642238623523.1642341546971; __mfwlv=1642341546; __mfwvn=10; Hm_lvt_8288b2ed37e5bc9b4c9f7008798d2de0=1642215122,1642218839,1642238624,1642341547; CNZZDATA30065558=cnzz_eid=1067569765-1641819345-&ntime=1642337760; bottom_ad_status=0; uol_throttle=35627906; __mfwb=8cc49c72508e.10.direct; __mfwlt=1642343676; Hm_lpvt_8288b2ed37e5bc9b4c9f7008798d2de0=1642343676',
    "Host": "www.mafengwo.cn",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55",
}

# # Print the status code of the target page
# req = requests.get(url, headers=head).status_code
# print(req)

# Fetch url1 with the captured headers and dump the HTML.
request = urllib.request.Request(url1, headers=head)

html = ""
try:
    response = urllib.request.urlopen(request)
    # errors="ignore": tolerate bytes that are not valid UTF-8.
    html = response.read().decode(encoding="utf-8", errors="ignore")
    print(html)
except urllib.error.URLError as e:
    # HTTPError exposes .code; a plain URLError only has .reason.
    if hasattr(e, "code"):
        print("状态码:%s"%(e.code))
    if hasattr(e, "reason"):
        print("原因:%s"%(e.reason))

# response = requests.get(url1)
# print(response)

# # cookie1
# cookie1 = response.cookies
# print(cookie1)
# # JS code of the challenge page
# js_code = response.text
# print(js_code)

  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值