使用的两种方式:
1.urllib.request.urlopen()方法直接使用
urllib.request.urlopen(url)
2.urllib.request.Request类:将url变为一个对象进行操作
urlR=urllib.request.Request(url)
urllib.request.urlopen(urlR) #打开的是url的对象
urllib实例操作一:urllib.error的三种异常处理方式
分别有HTTPError 和URLError两种子类
import urllib.request
from urllib import error
import sys,os,re
##### Basic usage: call urlopen directly #####
# Fetch a page and pull out its <title>.
url = "https://shimo.im/docs/CHQt86xJvgp3R8yp"
res = urllib.request.urlopen(url)
print(res)
# BUG FIX: the original called res.read() twice (once to print, once to
# decode).  An HTTP response body can only be consumed once, so the second
# read() returned empty bytes and `html` ended up as ''.  Read once, reuse.
raw = res.read()
print(raw)
# decode = bytes -> str, encode = str -> bytes
html = raw.decode('utf-8')
# FIX: the closing-tag pattern was missing its '>' ('</title'); the match
# result is the same, but the corrected pattern states the intent.
title = re.findall('<title>(.*?)</title>', html)
print(title)
######### Using the Request class #########
if __name__ == '__main__':
    url = "https://shimo.im/docs/CHQt86xJvgp3R8yp"
    # Spoof a browser User-Agent so the server treats us as a normal client.
    headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'}
    try:
        ### The default request method here is GET ###
        # Bundle the url and the header info into a single Request object.
        urlR = urllib.request.Request(url, headers=headers)
        # Open the Request object — equivalent to requests.get(url, headers=headers).
        resp = urllib.request.urlopen(urlR)
        # Read the body and decode it to text.
        html = resp.read().decode('utf-8')
    ############ Exception style 1 (author's original sketch) ############
    # NOTE(review): `except error:` cannot work — a module is not an
    # exception class; kept only as the note it originally was.
    # except error:
    #     if error.URLError:
    #         print("bad url: " + error.URLError.reason)   # only the reason is available
    #     elif error.HTTPError:
    #         print("reason: " + error.HTTPError.reason)
    #         print("status code:", error.HTTPError.code)  # e.g. 404
    #     else:
    #         print(error)
    ############ Exception style 2: the common idiom ############
    # HTTPError is a subclass of URLError, so the subclass goes first:
    # except error.HTTPError as e:
    #     print("reason: " + str(e.reason))
    #     print("status code:", e.code)   # e.g. 404
    # except error.URLError as e:
    #     print("bad url: " + str(e.reason))
    ############ Exception style 3: guaranteed to handle ############
    except Exception as e:
        # hasattr() tells HTTPError (has .code) apart from plain URLError.
        if hasattr(e, 'code'):
            # FIX: wrap e.reason in str() — it is not guaranteed to be a str.
            print("HTTPError" + str(e.reason))
            print(e.code)
        elif hasattr(e, 'reason'):
            print("URLError" + str(e.reason))
    else:
        # FIX: parse only on success.  In the original, this line ran even
        # after an exception, where `html` was never assigned (NameError).
        title = re.findall('<title>(.*?)</title>', html)
urllib实例操作二:urllib.parse处理链接中的参数拼接
urllib.parse 中的 urlencode 函数操作(urlencode 是函数,不是子类)
#当需要拼接url时:如下url=
#https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=fiddler%E5%AE%89%E8%A3%85&fenlei=256&oq=fiddler%25E4%25B8%258B%25E8%25BD%25BD&rsv_pq=fcd8477700004214&rsv_t=1c661ksw6PxN371S62fgFQICO5MCASzR%2F2DNJRQd4JenyReCJUGpip9hwjg&rqlang=cn&rsv_enter=0&rsv_dl=tb&sug=fiddler&rsv_btype=t&rsv_sug3=16&rsv_sug1=11&rsv_sug7=100&prefixsug=fiddler%25E5%25AE%2589%25E8%25A3%2585&rsp=0&inputT=9322&rsv_sug4=10990
#拆分为:
#https://www.baidu.com/s
# ?
# ie=utf-8
# f=8
# rsv_bp=1
# rsv_idx=1
# tn=baidu
# wd=fiddler%E5%AE%89%E8%A3%85
# fenlei=256
# oq=fiddler%25E4%25B8%258B%25E8%25BD%25BD
# rsv_pq=fcd8477700004214
# ...等 这里太多了 省略
# #rsv_sug4=10990
from urllib.parse import urlencode #处理url的参数为需要的格式
# urlencode() percent-encodes every value itself, so the dict must hold the
# *raw* (decoded) strings.  The original stored already-encoded values such
# as 'fiddler%E5%AE%89%E8%A3%85'; urlencode would then double-encode them
# ('%' -> '%25'), producing a URL different from the one being rebuilt.
para = {
    'ie': 'utf-8',
    'f': '8',
    'rsv_bp': '1',
    'rsv_idx': '1',
    'tn': 'baidu',
    'wd': 'fiddler安装',                 # encodes to fiddler%E5%AE%89%E8%A3%85
    'fenlei': '256',
    'oq': 'fiddler%E4%B8%8B%E8%BD%BD',  # the source URL carried this value pre-encoded once
    'rsv_pq': 'fcd8477700004214',
}
# Join the base url with the encoded query string.
url = 'https://www.baidu.com/s?' + urlencode(para)
print(url)
urllib实例操作三:urllib.request.HTTPCookieProcessor记录登录的cookie
from urllib import request,parse
from http import cookiejar
######### Build a cookie-aware opener that records the login cookie #########
cookie = cookiejar.CookieJar()                       # container for the cookies
cookie_handle = request.HTTPCookieProcessor(cookie)  # handler that stores/sends cookies
opener = request.build_opener(cookie_handle)         # opener carrying the cookie handler
############### Log in ###########
login_url = "https://**/sso/user/login"
data = {
    'mode': '',
    'login_type': '0',
    'loginTypePage': '1',
    'username': '**',
    'password': 'GjBC1Ew+ItcqIEeT6cf4JRHJk1jnN0MCjdFNzWaP4cm0na4zh+zbTuxh6I+yW1tdJ1IGfv/CzXT5Zt***',
    'from_url': '**',
}
data = parse.urlencode(data)
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    # FIX: header values should be strings — the original passed a bare int.
    # (urllib would also fill in Content-Length automatically if omitted.)
    'Content-Length': str(len(data)),
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
}
# Supplying data= turns the request into a POST; the body must be bytes.
# (str.encode is the idiomatic form of bytes(data, encoding='utf-8').)
req = request.Request(login_url, data=data.encode('utf-8'), headers=headers)
# resp = request.urlopen(req)
# opener.open records the server's Set-Cookie, so later requests made through
# this same opener are already authenticated — no need to resend headers.
resp = opener.open(req)
########## Fetch another page with the logged-in opener #####
url = 'http://bbm.scm.adc.com/branchManager/feature/list'
req = request.Request(url)
resp = opener.open(req)
print(resp)
print(resp.read().decode('utf-8'))