# Project environment: Python 3.8

Problem:
When crawling the detail pages linked from a list page, the detail-page HTML that comes back is truncated: the response contains far less data than the real page.

Cause:
Crawling the target site too frequently triggered its anti-crawler measures.

Solutions:
If none of the methods below resolve it, save whatever page source you can fetch and do the rest of the processing offline.
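Caching the source can be as simple as a pair of helpers; the file path and helper names below are my own illustration, not from the original post:

```python
from pathlib import Path

def save_html(html, path):
    # Persist the raw page source so it can be re-parsed later
    # without hitting the site again.
    Path(path).write_text(html, encoding="utf-8")

def load_html(path):
    # Reload the cached source for offline processing.
    return Path(path).read_text(encoding="utf-8")

if __name__ == "__main__":
    save_html("<html><body>demo</body></html>", "cached_page.html")
    print(load_html("cached_page.html"))  # <html><body>demo</body></html>
```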
Method 1:
Switch VPNs (in effect, run the program from a different machine) so the requests originate from a different IP.
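Changing machines is just a roundabout way of changing the outbound IP; routing requests through a proxy achieves the same effect. A minimal sketch with the standard library (the proxy endpoints are placeholders you must replace with working proxies):

```python
import random
import urllib.request

# Placeholder proxy endpoints -- substitute real, working proxies.
PROXIES = [
    "http://127.0.0.1:8001",
    "http://127.0.0.1:8002",
]

def pick_proxy(pool):
    # urllib expects a scheme -> proxy-URL mapping.
    proxy = random.choice(pool)
    return {"http": proxy, "https": proxy}

if __name__ == "__main__":
    # Each run can go out through a different IP.
    opener = urllib.request.build_opener(
        urllib.request.ProxyHandler(pick_proxy(PROXIES)))
    print(opener.open("http://www.mafengwo.cn/poi/5423409.html", timeout=5).status)
```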
Method 2:
Copy the request headers of the target page (from the browser's developer tools) into the code, adjusting them to match your target:
```python
import urllib.request
import urllib.error

def askURL(url):
    head = {  # Mimic browser headers for the request
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'mfw_uuid=61dc38ef-2c67-45ce-ed26-c30fa04f2418; oad_n=a:3:{s:3:"oid";i:1029;s:2:"dm";s:15:"www.mafengwo.cn";s:2:"ft";s:19:"2022-01-10+21:47:27";}; __jsluid_h=aa6e6e4350e2fd0e52cc227da10e26b5; __omc_chl=; __omc_r=; __mfwc=direct; uva=s:78:"a:3:{s:2:"lt";i:1641822448;s:10:"last_refer";s:6:"direct";s:5:"rhost";s:0:"";}";; __mfwurd=a:3:{s:6:"f_time";i:1641822448;s:9:"f_rdomain";s:0:"";s:6:"f_host";s:3:"www";}; __mfwuuid=61dc38ef-2c67-45ce-ed26-c30fa04f2418; UM_distinctid=17e443e711c512-05dd7ff73ec639-5e181552-144000-17e443e711dc58; login=mafengwo; mafengwo=16a582a6e0ca5f6c73654cb640343886_35627906_61e15d7be119c7.29366428_61e15d7be11a11.54996187; PHPSESSID=lf7mtvr2mgj3fhnfd7sn9br1c2; mfw_uid=35627906; Hm_lvt_8288b2ed37e5bc9b4c9f7008798d2de0=1642218839,1642238624,1642341547,1642381972; CNZZDATA30065558=cnzz_eid=1067569765-1641819345-&ntime=1642380961; __jsl_clearance=1642381970.541|0|cYxjLrAJMIg1j5y/qJP9hLaEN7M=; __mfwa=1641822449293.40635.15.1642341546971.1642381972692; __mfwlv=1642381972; __mfwvn=11; bottom_ad_status=0; __mfwb=b5923a0d408d.8.direct; __mfwlt=1642382984; Hm_lpvt_8288b2ed37e5bc9b4c9f7008798d2de0=1642382985',
        'Host': 'www.mafengwo.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55',
    }
    # The User-Agent tells the server what kind of machine and browser
    # are asking (essentially, what content we can accept).
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
```
Method 3:
Request the detail page twice. The first request is answered with the anti-bot JS challenge (HTTP 521) instead of the page; executing the returned JavaScript yields a `__jsl_clearance` cookie, and the second request, sent with that cookie, receives the real page.
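Before the full scripts, the non-JS half of that flow can be sketched on its own; the sample HTML and helper names below are fabricated for illustration:

```python
import re

def extract_challenge_js(html):
    # Pull the inline challenge script out of the 521 response body.
    return ''.join(re.findall(r'<script>(.*?)</script>', html, re.S))

def merge_cookie_header(base_cookies, clearance):
    # The second request must carry both the first-response cookies
    # and the computed __jsl_clearance value.
    return base_cookies + '; ' + clearance

if __name__ == "__main__":
    fake_521_body = "<html><script>var x='demo';</script></html>"
    print(extract_challenge_js(fake_521_body))  # var x='demo';
    print(merge_cookie_header('__jsluid_h=abc', '__jsl_clearance=xyz'))
```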
Code 1
```python
import re

import execjs
import requests

head = {  # Mimic browser headers for the request
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'www.mafengwo.cn',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55',
}
url = 'http://www.mafengwo.cn/poi/5423409.html'

# response = requests.get(url)
# # first-response cookies
# cookie1 = response.cookies
# # challenge JS
# js_code = response.text

def get_521_content(url, head):
    # First request: collect the initial cookies and the inline challenge JS.
    req = requests.get(url, headers=head)
    cookies = req.cookies
    cookies = '; '.join(['='.join(item) for item in cookies.items()])
    txt_521 = req.text
    txt_521 = ''.join(re.findall('<script>(.*?)</script>', txt_521))
    return (txt_521, cookies)

def fixed_fun(function):
    # Rewrite the challenge JS so it runs outside a browser, then execute it.
    func_return = function.replace('eval', 'return')
    content = execjs.compile(func_return)
    req = requests.get(url, headers=head)
    evaled_func = ''.join(re.findall('<script>(.*?)</script>', req.text))
    # print(js_con)
    # fn = js_con.split('=').split(' ')
    # evaled_func = content.call(fn)
    # print(evaled_func)
    mode_func = evaled_func.replace('while(window._phantom||window.__phantomas){};', ''). \
        replace('document.cookie=', 'return').replace(';if((function(){try{return !!window.addEventListener;}', ''). \
        replace("catch(e){return false;}})()){document.addEventListener('DOMContentLoaded',l,false);}", ''). \
        replace("else{document.attachEvent('onreadystatechange',l);}", '').replace(
        r"setTimeout('location.href=location.href.replace(/[\?|&]captcha-challenge/,\'\')',1500);", '')
    content = execjs.compile(mode_func)
    cookies = content.call('l')
    __jsl_clearance = cookies.split(';')[0]
    return __jsl_clearance

def cookie_dict(js, id):
    # Combine the two 'key=value' strings into one cookie dict.
    result = {}
    js = js.split('=')
    id = id.split('=')
    result[js[0]] = js[1]
    result[id[0]] = id[1]
    return result

if __name__ == '__main__':
    func = get_521_content(url, head)
    content = func[0]
    cookie_id = func[1]
    cookie_js = fixed_fun(func[0])
    dicted_cookie = cookie_dict(cookie_js, cookie_id)
    head = {  # Second request: same headers plus the computed cookie
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.mafengwo.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55',
        'Cookie': cookie_id + ';' + cookie_js
    }
    req = requests.get(url, headers=head)
    print(req.status_code)
```
Code 2
```python
# resource: https://blog.csdn.net/qq_41879417/article/details/101701120?spm=1001.2101.3001.6661.1&utm_medium=distribute.pc_relevant_t0.none-task-blog-2%7Edefault%7ECTRLIST%7Edefault-1.pc_relevant_default&depth_1-utm_source=distribute.pc_relevant_t0.none-task-blog-2%7Edefault%7ECTRLIST%7Edefault-1.pc_relevant_default&utm_relevant_index=1
# -*- coding: utf-8 -*-
# @Time : 2022/1/16 9:11
# @Author : sherlock
# @File : creeper_2_521.py
# @Project : creeper
import re

import execjs
import requests

url = 'http://www.mafengwo.cn/poi/5423409.html'
head = {  # Mimic browser headers for the request
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'www.mafengwo.cn',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55',
}

def get_521_content(url):
    # First request: expect HTTP 521 plus the challenge JS.
    req = requests.get(url, headers=head, timeout=5)
    print(req.status_code, req.text)
    cookies = dict(req.cookies.items())
    if req.status_code == 521:
        print(cookies)
        js_con = ''.join(re.findall('<script>(.*?)</script>', req.text))
        if js_con:
            __jsl_clearance = fixed_fun(js_con, url)
            if __jsl_clearance:
                key, value = __jsl_clearance.split('=')
                cookies[key] = value
    return cookies

# Execute the JS to obtain the __jsl_clearance key/value for the cookies
def fixed_fun(js_con, url):  # js_con: JS returned by the first request
    func_return = js_con.replace('eval(', 'return(')
    print('After replacing eval with return: ', func_return)
    content = execjs.compile(func_return)
    # js_con.split('=')[0].split(' ')[1] yields only ['document.cookie']
    fn = js_con.split('=')[0].split(' ')[1]
    evaled_func = content.call(fn)
    print('After the first JS execution: ', evaled_func)
    fn = evaled_func.split('=')[0].split(' ')[1]  # name of the dynamic function
    aa = evaled_func.split("<a href=\\'/\\'>")  # contents of the <a> tag
    aa = aa[1].split("</a>")[0] if len(aa) >= 2 else ''
    mode_func = evaled_func. \
        replace(
        "setTimeout('location.href=location.pathname+location.search.replace(/[\\?|&]captcha-challenge/,\\'\\')',1500);document.cookie=",
        'return'). \
        replace(';if((function(){try{return !!window.addEventListener;}', ''). \
        replace(
        "}catch(e){return false;}})()){document.addEventListener('DOMContentLoaded'," + fn + ",false)}else{document.attachEvent('onreadystatechange'," + fn + ")",
        ''). \
        replace(
        "if((function(){try{return !!window.addEventListener;}catch(e){return false;}})()){document.addEventListener('DOMContentLoaded'," + fn + ",false)}else{document.attachEvent('onreadystatechange'," + fn + ")",
        ''). \
        replace("return'__jsl_clearance", "var window={};return '__jsl_clearance"). \
        replace(
        "var " + fn + "=document.createElement('div');" + fn + ".innerHTML='<a href=\\'/\\'>" + aa + "</a>';" + fn + "=" + fn + ".firstChild.href",
        "var " + fn + "='" + url + "'")
    print('JS after the second rewrite:', mode_func)
    try:
        content = execjs.compile(mode_func)
        cookies = content.call(fn)
        __jsl_clearance = cookies.split(';')[0]
        print(__jsl_clearance)
        return __jsl_clearance
    except Exception:
        print('JS execution error:', mode_func)
        return None

# Second crawl of the detail page, carrying the decrypted cookies
def con_spider(cookies, url):
    response = requests.get(url, headers=head, cookies=cookies, timeout=5)
    if response.status_code == 200:
        response.encoding = 'utf-8'
        print(response.status_code)
        print(response.text)
        return response
    else:
        print('Second request failed with status:', response.status_code)
        return None

if __name__ == "__main__":
    cookies = get_521_content(url)
    con_spider(cookies, url)
```
Code 3
```python
# resource: https://www.cnblogs.com/gongs/p/10524710.html
import re

import execjs
import requests

url = 'http://www.mafengwo.cn/poi/5423409.html'
head = {  # Mimic browser headers for the request
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Host": "www.mafengwo.cn",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55",
}

def getResponse():
    """Fetch the first response (expected to be the 521 challenge)."""
    response = requests.get(url, headers=head)
    return response

def getJslid(response):
    """Join the first-response cookies into a 'k=v; k=v' string."""
    cook = response.cookies
    ans = '; '.join(['='.join(item) for item in cook.items()])
    return ans

def getClearance(response):
    """Run the challenge JS to compute __jsl_clearance."""
    txt = ''.join(re.findall('<script>(.*?)</script>', response.text))
    func_return = txt.replace('eval', 'return')
    content = execjs.compile(func_return)
    # NOTE: the author reported an execjs error at this call
    eval_func = content.call('x')
    name = re.findall(r'var (.*?)=function.*', eval_func)[0]
    mode_func = eval_func.replace('while(window._phantom||window.__phantomas){};', ''). \
        replace('document.cookie=', 'return').replace('if((function(){try{return !!window.addEventListener;}', ''). \
        replace("catch(e){return false;}})()){document.addEventListener('DOMContentLoaded',%s,false)}" % name, ''). \
        replace("else{document.attachEvent('onreadystatechange',%s)}" % name, '').replace(
        r"setTimeout('location.href=location.pathname+location.search.replace(/[\?|&]captcha-challenge/,\'\')',1500);",
        '')
    content = execjs.compile(mode_func)
    cookies = content.call(name)
    # print(cookies)
    clearance = cookies.split(';')[0]
    return clearance

def structurehead(cook, clearance):
    """Build a new header dict carrying the full cookie."""
    cookie = {
        'cookie': cook + ';' + clearance
    }
    return dict(head, **cookie)

def main():
    response = getResponse()
    cook = getJslid(response)
    # NOTE: this step raised an execjs error for the author
    clearance = getClearance(response)
    new_head = structurehead(cook, clearance)
    print(new_head)

if __name__ == '__main__':
    main()
```
Code 4
```python
# -*- coding: utf-8 -*-
# @Time : 2022/1/18 13:32
# @Author : sherlock
# @File : creeper_4_521.py
# @Project : creeper
# author=zhangjingyuan
# python3
from html.parser import HTMLParser
import urllib.request
import urllib.parse
import urllib.error
import re
import time
import io
import gzip
import random
import codecs

import execjs
import lxml
import requests
from lxml import etree

url1 = 'http://www.mafengwo.cn/poi/5423409.html'
url2 = 'https://jobs.51job.com/haikou/135562401.html'
url3 = 'https://movie.douban.com/subject/1292052/'
head = {  # Mimic browser headers for the request
    # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    # "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    # "Cache-Control": "max-age=0",
    # "Connection": "keep-alive",
    # "Cookie": 'mfw_uuid=61dc38ef-2c67-45ce-ed26-c30fa04f2418; oad_n=a:3:{s:3:"oid";i:1029;s:2:"dm";s:15:"www.mafengwo.cn";s:2:"ft";s:19:"2022-01-10+21:47:27";}; __jsluid_h=aa6e6e4350e2fd0e52cc227da10e26b5; __omc_chl=; __omc_r=; __mfwc=direct; uva=s:78:"a:3:{s:2:"lt";i:1641822448;s:10:"last_refer";s:6:"direct";s:5:"rhost";s:0:"";}";; __mfwurd=a:3:{s:6:"f_time";i:1641822448;s:9:"f_rdomain";s:0:"";s:6:"f_host";s:3:"www";}; __mfwuuid=61dc38ef-2c67-45ce-ed26-c30fa04f2418; UM_distinctid=17e443e711c512-05dd7ff73ec639-5e181552-144000-17e443e711dc58; login=mafengwo; mafengwo=16a582a6e0ca5f6c73654cb640343886_35627906_61e15d7be119c7.29366428_61e15d7be11a11.54996187; __jsl_clearance=1642341544.979|0|fafiHNHGZB+baEyxg5NVjPfVXm0=; PHPSESSID=s4foj9fhkm3mq8rs64omagvvp2; mfw_uid=35627906; __mfwa=1641822449293.40635.14.1642238623523.1642341546971; __mfwlv=1642341546; __mfwvn=10; Hm_lvt_8288b2ed37e5bc9b4c9f7008798d2de0=1642215122,1642218839,1642238624,1642341547; CNZZDATA30065558=cnzz_eid=1067569765-1641819345-&ntime=1642337760; bottom_ad_status=0; uol_throttle=35627906; __mfwb=8cc49c72508e.10.direct; __mfwlt=1642343676; Hm_lpvt_8288b2ed37e5bc9b4c9f7008798d2de0=1642343676',
    # "Host": "www.mafengwo.cn",
    # "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55",
}

def getResponse():
    """Fetch the first response (expected to be the 521 challenge)."""
    response = requests.get(url1, headers=head)
    return response

def getJslid(response):
    """Join the first-response cookies into a 'k=v; k=v' string."""
    cook = response.cookies
    return '; '.join(['='.join(item) for item in cook.items()])

def getClearance(response):
    """Run the challenge JS to compute __jsl_clearance."""
    txt = ''.join(re.findall('<script>(.*?)</script>', response.text))
    func_return = txt.replace('eval', 'return')
    print(func_return)
    content = execjs.compile(func_return)
    print(type(content))
    # content = open("jsdom_document").read()
    # print(content)
    # execjs._exceptions.ProgramError: ReferenceError: document is not defined
    eval_func = content.call('x')
    name = re.findall(r'var (.*?)=function.*', eval_func)[0]
    mode_func = eval_func.replace('while(window._phantom||window.__phantomas){};', ''). \
        replace('document.cookie=', 'return').replace('if((function(){try{return !!window.addEventListener;}', ''). \
        replace("catch(e){return false;}})()){document.addEventListener('DOMContentLoaded',%s,false)}" % name, ''). \
        replace("else{document.attachEvent('onreadystatechange',%s)}" % name, '').replace(
        r"setTimeout('location.href=location.pathname+location.search.replace(/[\?|&]captcha-challenge/,\'\')',1500);",
        '')
    content = execjs.compile(mode_func)
    cookies = content.call(name)
    # print(cookies)
    clearance = cookies.split(';')[0]
    return clearance

def structureCookie(cook, clearance):
    """Join the base cookies and the clearance into one Cookie value."""
    cookie = cook + ';' + clearance
    print(cookie)
    return cookie

if __name__ == '__main__':
    response = getResponse()
    clearance = getClearance(response)
    cook = getJslid(response)
    head = {  # Second request: headers plus the first-response cookie
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.mafengwo.cn',
        'Cookie': cook,
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55',
    }
    request = urllib.request.Request(url2, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode(encoding="utf-8", errors="ignore")
        print(html)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print("Status code: %s" % (e.code))
        if hasattr(e, "reason"):
            print("Reason: %s" % (e.reason))
```
Code 5
```python
# -*- coding: utf-8 -*-
# @Time : 2022/1/18 17:43
# @Author : sherlock
# @File : creeper_5_seleu.py
# @Project : creeper
import email.utils
import smtplib
import time
from email.mime.text import MIMEText

import redis
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

url = 'https://www.ipip.net'

def driver_chrome():
    # Headless Chrome, so the challenge JS runs in a real browser engine.
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920x1080")
    driver = webdriver.Chrome(options=chrome_options)
    return driver

def mymail(content):
    # Wrap the detected IP in a plain-text mail message.
    msg = MIMEText(content, _subtype='plain', _charset='utf8')
    msg['From'] = email.utils.formataddr(('Author', '989989797@qq.com'))
    msg['To'] = email.utils.formataddr(('Recipient', '8979879879@me.com'))
    msg['date'] = time.strftime('%a, %d %b %Y %H:%M:%S %z')
    msg['Subject'] = 'Your ip address'
    return msg

r = redis.Redis(host='localhost', port=6379, decode_responses=True)
myip = r.get('myip')
driver = driver_chrome()
driver.get(url)
cookies = driver.get_cookies()
new_cookies = {}
for i in cookies:
    driver.add_cookie({'name': i.get('name'), 'value': i.get('value')})
# Second visit, now carrying the cookies set on the first visit.
driver.get(url)
soup = BeautifulSoup(driver.page_source, features='lxml')
myres = soup.find_all('div', attrs={'class': 'yourInfo'})
trueip = myres[0].find_all('a')[0].text
msg = mymail(trueip)
with smtplib.SMTP_SSL('smtp.qq.com', 465) as server:
    server.login('80988988@qq.com', '9jsdfhjhfio')
    if myip != trueip:
        r.set('myip', trueip)
        server.sendmail('98198397@qq.com', '9879878798@me.com', msg.as_string())
driver.close()
driver.quit()
```
Test code
```python
# coding=utf-8
# author=zhangjingyuan
# python3
from html.parser import HTMLParser
import urllib.request
import urllib.parse
import urllib.error
import re
import time
import io
import gzip
import random
import codecs

import lxml
import requests
from lxml import etree

url1 = 'http://www.mafengwo.cn/poi/5423409.html'
url2 = 'https://jobs.51job.com/haikou/135562401.html'
url3 = 'https://movie.douban.com/subject/1292052/'
url4 = 'http://www.mafengwo.cn/search/q.php?q=%E6%B3%89%E5%B7%9E'
head = {  # Mimic browser headers for the request
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Cookie": 'mfw_uuid=61dc38ef-2c67-45ce-ed26-c30fa04f2418; oad_n=a:3:{s:3:"oid";i:1029;s:2:"dm";s:15:"www.mafengwo.cn";s:2:"ft";s:19:"2022-01-10+21:47:27";}; __jsluid_h=aa6e6e4350e2fd0e52cc227da10e26b5; __omc_chl=; __omc_r=; __mfwc=direct; uva=s:78:"a:3:{s:2:"lt";i:1641822448;s:10:"last_refer";s:6:"direct";s:5:"rhost";s:0:"";}";; __mfwurd=a:3:{s:6:"f_time";i:1641822448;s:9:"f_rdomain";s:0:"";s:6:"f_host";s:3:"www";}; __mfwuuid=61dc38ef-2c67-45ce-ed26-c30fa04f2418; UM_distinctid=17e443e711c512-05dd7ff73ec639-5e181552-144000-17e443e711dc58; login=mafengwo; mafengwo=16a582a6e0ca5f6c73654cb640343886_35627906_61e15d7be119c7.29366428_61e15d7be11a11.54996187; __jsl_clearance=1642341544.979|0|fafiHNHGZB+baEyxg5NVjPfVXm0=; PHPSESSID=s4foj9fhkm3mq8rs64omagvvp2; mfw_uid=35627906; __mfwa=1641822449293.40635.14.1642238623523.1642341546971; __mfwlv=1642341546; __mfwvn=10; Hm_lvt_8288b2ed37e5bc9b4c9f7008798d2de0=1642215122,1642218839,1642238624,1642341547; CNZZDATA30065558=cnzz_eid=1067569765-1641819345-&ntime=1642337760; bottom_ad_status=0; uol_throttle=35627906; __mfwb=8cc49c72508e.10.direct; __mfwlt=1642343676; Hm_lpvt_8288b2ed37e5bc9b4c9f7008798d2de0=1642343676',
    "Host": "www.mafengwo.cn",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.55",
}

# # Print the status code of the target page
# req = requests.get(url, headers=head).status_code
# print(req)

request = urllib.request.Request(url1, headers=head)
html = ""
try:
    response = urllib.request.urlopen(request)
    html = response.read().decode(encoding="utf-8", errors="ignore")
    print(html)
except urllib.error.URLError as e:
    if hasattr(e, "code"):
        print("Status code: %s" % (e.code))
    if hasattr(e, "reason"):
        print("Reason: %s" % (e.reason))

# response = requests.get(url1)
# print(response)
# # first-response cookies
# cookie1 = response.cookies
# print(cookie1)
# # challenge JS
# js_code = response.text
# print(js_code)
```