urllib库：页面获取类

最新推荐文章于 2022-06-10 13:56:18 发布

总想转行

最新推荐文章于 2022-06-10 13:56:18 发布

阅读量154

点赞数

分类专栏： python基础 urllib

本文链接：https://blog.csdn.net/qq_46020608/article/details/113111705

版权

python基础同时被 2 个专栏收录

21 篇文章 0 订阅

订阅专栏

urllib

4 篇文章 0 订阅

订阅专栏

使用的两种方式：

1.urllib.request.urlopen()方法直接使用

urllib.request.urlopen(url)

2.urllib.request.Request类：将url变为一个对象进行操作

urlR=urllib.request.Request(url)
urllib.request.urlopen(urlR)  #打开的是封装了url的Request对象

urllib实例操作一：urllib.error的三种异常处理方式

urllib.error 中常用的异常类有 URLError 及其子类 HTTPError（HTTPError 继承自 URLError）

import urllib.request
from urllib import error
import sys,os,re

##### Basic usage: fetch a page with urlopen() and extract its <title> #####
url = "https://shimo.im/docs/CHQt86xJvgp3R8yp"
# The response object is a context manager; leaving the block closes the
# underlying connection.
with urllib.request.urlopen(url) as res:
    print(res)
    # read() consumes the response stream: a second read() would return b''.
    # Read once and reuse the bytes for both printing and decoding.
    body = res.read()
print(body)
# decode: bytes -> str, encode: str -> bytes
html = body.decode('utf-8')
title = re.findall('<title>(.*?)</title', html)
print(title)

######### Using the Request class, with three error-handling styles #########
if __name__ == '__main__':
    url = "https://shimo.im/docs/CHQt86xJvgp3R8yp"
    # Send a browser-like User-Agent so the server treats us as a normal client.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'}
    try:
        # No `data` argument is given, so this is sent as a GET request.
        # Wrap the URL and the headers into a single Request object.
        urlR = urllib.request.Request(url, headers=headers)
        # Open the Request object; comparable to requests.get(url, headers=headers).
        with urllib.request.urlopen(urlR) as resp:
            # Read the page body and decode it to text.
            html = resp.read().decode('utf-8')

    ############# Style 1 (does NOT work, kept only for reference) ########
    # `error` here is the urllib.error module, not an exception class, so
    # `except error:` cannot catch anything, and the attributes below are
    # looked up on the classes rather than on the raised instance.
    # except error:
    #     if error.URLError:
    #         print("url地址有问题"+error.URLError.reason)
    #     elif error.HTTPError:
    #         print("错误原因："+error.HTTPError.reason)
    #         print("状态码："+error.HTTPError.code)  #404
    #     else:
    #         print(error)
    ###################### Style 2: the most common one ####
    # HTTPError is a subclass of URLError, so the subclass must come first.
    # `reason` may be an exception object and `code` is an int, hence str().
    # except error.HTTPError as e:
    #     print("错误原因：" + str(e.reason))
    #     print("状态码：" + str(e.code))  #404
    # except error.URLError as e:
    #     print("url地址有问题" + str(e.reason))

    ############### Style 3: catch everything and inspect attributes ########
    except Exception as e:
        # hasattr() checks which attributes the exception carries:
        # HTTPError has both `code` and `reason`, URLError only `reason`.
        if hasattr(e, 'code'):
            print("HTTPError" + str(e.reason))
            print(e.code)
        elif hasattr(e, 'reason'):
            # `reason` can be an OSError instance, so convert before concatenating.
            print("URLError" + str(e.reason))
        else:
            # Not a urllib error (e.g. a decoding failure): do not hide it.
            raise
    else:
        # Only parse the page when the request succeeded and `html` exists.
        title = re.findall('<title>(.*?)</title', html)
        print(title)

urllib实例操作二：urllib.parse处理链接中的参数拼接

urllib.parse 中的 urlencode 函数操作

#当需要拼接url时：如下url=
#https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=fiddler%E5%AE%89%E8%A3%85&fenlei=256&oq=fiddler%25E4%25B8%258B%25E8%25BD%25BD&rsv_pq=fcd8477700004214&rsv_t=1c661ksw6PxN371S62fgFQICO5MCASzR%2F2DNJRQd4JenyReCJUGpip9hwjg&rqlang=cn&rsv_enter=0&rsv_dl=tb&sug=fiddler&rsv_btype=t&rsv_sug3=16&rsv_sug1=11&rsv_sug7=100&prefixsug=fiddler%25E5%25AE%2589%25E8%25A3%2585&rsp=0&inputT=9322&rsv_sug4=10990
#拆分为：
#https://www.baidu.com/s
# ?
# ie=utf-8
# f=8
# rsv_bp=1
# rsv_idx=1
# tn=baidu
# wd=fiddler%E5%AE%89%E8%A3%85
# fenlei=256
# oq=fiddler%25E4%25B8%258B%25E8%25BD%25BD
# rsv_pq=fcd8477700004214
# ...等 这里太多了 省略
# #rsv_sug4=10990
from urllib.parse import urlencode  # turns a dict of parameters into a query string

# urlencode() percent-encodes every value itself, so the dict must hold the
# RAW (decoded) values. Passing values that are already percent-encoded would
# encode them a second time ('%' -> '%25') and produce a different URL.
para = {
    'ie': 'utf-8',
    'f': '8',
    'rsv_bp': '1',
    'rsv_idx': '1',
    'tn': 'baidu',
    # Encoded by urlencode() as fiddler%E5%AE%89%E8%A3%85
    'wd': 'fiddler安装',
    'fenlei': '256',
    # In the target URL this value appears double-encoded (%25E4...), i.e. the
    # raw value is itself a percent-encoded string; one urlencode() pass over
    # it reproduces fiddler%25E4%25B8%258B%25E8%25BD%25BD.
    'oq': 'fiddler%E4%B8%8B%E8%BD%BD',
    'rsv_pq': 'fcd8477700004214',
}
url = 'https://www.baidu.com/s?' + urlencode(para)
print(url)

urllib实例操作三：urllib.request.HTTPCookieProcessor记录登录的cookie

from urllib import request,parse
from http import cookiejar


######### Build an opener that records cookies across requests #########
cookie = cookiejar.CookieJar()  # in-memory cookie store
cookie_handle = request.HTTPCookieProcessor(cookie)  # handler that reads/writes the jar
opener = request.build_opener(cookie_handle)  # opener that sends and stores cookies

############### Log in ###########
login_url = "https://**/sso/user/login"
data = {
    'mode': '',
    'login_type': '0',
    'loginTypePage': '1',
    'username': '**',
    'password': 'GjBC1Ew+ItcqIEeT6cf4JRHJk1jnN0MCjdFNzWaP4cm0na4zh+zbTuxh6I+yW1tdJ1IGfv/CzXT5Zt***',
    'from_url': '**'}
data = parse.urlencode(data)
# A POST body must be bytes; encode once and reuse the result.
body = data.encode('utf-8')

# Content-Length is intentionally not set here: urllib derives it from the
# bytes body, which avoids a mismatch between character count and byte count.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
}

# Supplying `data` makes this a POST request.
req = request.Request(login_url, data=body, headers=headers)
# Use opener.open() instead of request.urlopen() so that the cookies returned
# by the server are stored in the jar and sent again on later requests.
resp = opener.open(req)
# The login response body is not needed; release the connection.
resp.close()

########## Fetch another page with the same opener #####
url = 'http://bbm.scm.adc.com/branchManager/feature/list'
req = request.Request(url)
with opener.open(req) as resp:
    print(resp)
    print(resp.read().decode('utf-8'))