urllib_demos（urllib.request类；HTTPResponse类对象构造及属性方法；自定义opener/设置代理服务器等）

最新推荐文章于 2022-07-26 11:47:09 发布
血皇敖天
最新推荐文章于 2022-07-26 11:47:09 发布
阅读量492
点赞数
分类专栏：爬虫相关
本文链接：https://blog.csdn.net/weixin_44015805/article/details/103149844
版权
爬虫相关专栏收录该内容
14 篇文章 2 订阅
订阅专栏
"""
urllib_demos.py
Demonstrations of urllib.request, the HTTPResponse object and its methods,
custom openers, proxy configuration and urllib error handling.


Overview:
# The urllib package contains four modules:
# urllib.request        : sending requests
# urllib.error          : exceptions raised by urllib.request
# urllib.parse          : URL parsing
# urllib.robotparser    : robots.txt parsing

1. ##### urllib.request
1.1 urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)
# sends an HTTP request and returns the response.
# url: either a string or a urllib.request.Request object.

# data: the body to send.
# Note: data must be a bytes object.
# Note: urllib.parse.urlencode() converts key/value pairs into the standard
#       form format; encode the resulting text to obtain bytes.
# Note: with data=None the request is sent as GET; when data is supplied
#       the request is sent as POST.

# timeout: optional timeout, in seconds.
# cafile/capath/cadefault: CA-certificate options for trusted HTTPS requests. Rarely needed.
# context: SSL configuration for encrypted transport. Rarely needed.


2. ##### The HTTPResponse object
# urllib.request.urlopen() sends the HTTP request and wraps the response in
# an HTTPResponse object; that class lives in the http.client module.
# It offers methods for the URL, the status code, the body and so on:
# geturl(): URL of the retrieved resource; useful for checking whether the
#           request was redirected.
# info(): meta-information (headers) of the page.
# getcode(): HTTP status code of the response.
# read(): reads the body of the response.

2.1 ##### Building a Request object
# urllib.request.Request(url, data=None, headers={}, origin_req_host=None, unverifiable=False, method=None)
# builds a Request object.
# data: defaults to None; holds the form data to submit, and switches the
#       HTTP method from the default GET to POST.
# headers: defaults to empty; a dict of HTTP header key/value pairs to send.

2.2 ##### URL encoding and decoding
# urllib.parse.urlencode(query, doseq=False, safe='', encoding=None, errors=None, quote_via=quote_plus)
# encodes query data: key/value pairs become a 'key=value' string.

# urllib.parse.unquote(string, encoding='utf-8', errors='replace')
# decodes a percent-encoded string such as the output of urlencode().

2.3 ##### Adding specific headers
# Request.add_header() adds a header to the request.
# Request.get_header() returns a header already set on the request.

# After running the program, inspect the HTTP request in Fiddler to see the
# headers that were sent.


3. ###### Proxy servers (configuring a proxy)
# Note: urllib.request.urlopen() is a function that sends the request through
#       a module-level default opener (an urllib.request.OpenerDirector).
# Note: for explicit control over proxies, cookies and other advanced
#       HTTP/HTTPS features, build a custom opener instead of relying on that
#       default one.

3.1 # Configuring a proxy with a custom opener takes three steps:
# 1. Create handler objects for the features needed.
# 2. Pass those handlers to urllib.request.build_opener() to create the
#    custom opener.
# 3. Call the custom opener's open() method to send the request.
# Note: if every request in the program should use the custom opener, call
#       urllib.request.install_opener() to make it the global opener, after
#       which every urlopen() call uses it.

#####
3.2 # urllib.request.ProxyHandler(proxies=None) configures the proxy server;
# build a custom opener from it, then send requests with opener.open().

4. ##### Error handling
# urllib.error.URLError catches network-level failures.
# urllib.error.HTTPError catches error responses returned by the server.

"""

# =============================================================================
# #urllib
# #urllib库是python内置的HTTP请求库，可以看作处理URL的组件集合。
# #
# #urllib库包含四大模块：
# #urllib.request        : 请求模块
# #urllib.error          : 异常处理模块
# #urllib.parse          : URL解析模块
# #urllib.robotparser    : robots.txt解析模块
# =============================================================================

##########
# Explore the urllib package.
# ``import urllib`` on its own does not load the submodules, so each one is
# imported explicitly; otherwise attribute access such as ``urllib.request``
# can raise AttributeError when this file runs as a plain script.
import urllib
import urllib.error
import urllib.parse
import urllib.request
import urllib.response

help(urllib)
print(dir(urllib))      # dir() only echoes in a REPL, so print it in a script

for module in (urllib.request, urllib.parse, urllib.response, urllib.error):
    help(module)

for module in (urllib.request, urllib.response, urllib.parse, urllib.error):
    print(dir(module))


##########
# Commonly used classes, attributes and functions
help(urllib.request.urlopen)


#####
# urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)
# sends an HTTP request and returns the response.
#
# url:     a URL string or a urllib.request.Request object.
# data:    body to send; must be bytes. urllib.parse.urlencode() turns
#          key/value pairs into form-encoded text, which is then encoded.
#          With data=None the request is a GET; supplying data makes it a POST.
# timeout: optional timeout, in seconds.
# cafile/capath/cadefault: CA-certificate options for HTTPS. Rarely needed.
# context: SSL configuration for encrypted transport. Rarely needed.

import urllib.request

# The with-statement closes the connection once the body has been read,
# instead of leaving the socket open until garbage collection.
with urllib.request.urlopen('http://www.baidu.com') as response:
    print(type(response))                       # an HTTPResponse object
    html = response.read().decode('utf-8')      # read() returns the body as bytes
print(type(html))

# Using the data parameter
import urllib.request
import urllib.parse

# urlencode() returns text and encode() already yields bytes, so no extra
# bytes() wrapper is needed.
data = urllib.parse.urlencode({'world': 'hello'}).encode('utf-8')

# Supplying data makes this a POST; the with-statement closes the connection.
with urllib.request.urlopen('http://httpbin.org/post', data=data) as response:
    print(response.read())                  # read() returns the response body
    print(response)


##### The HTTPResponse object
# urllib.request.urlopen() wraps the response in an HTTPResponse object
# (defined in http.client). It exposes the URL, status code, headers and body:
# geturl():  URL of the retrieved resource; shows whether a redirect happened.
# info():    meta-information (headers) of the page.
# getcode(): HTTP status code of the response.
# read():    reads the response body.
# Note: the Python documentation marks geturl()/info()/getcode() as deprecated
# in favour of the url / headers / status attributes; the methods are kept
# here because they are what this section demonstrates.

import urllib.request

# Query everything inside the with-block so the connection is closed afterwards.
with urllib.request.urlopen('http://python.org') as response:
    print(response.geturl())    # URL
    print(response.getcode())   # status code
    print(response.info())      # headers
    print(response.read())      # body



##### Building a Request object
# urllib.request.Request(url, data=None, headers={}, origin_req_host=None, unverifiable=False, method=None)
# data:    defaults to None; the form data to submit. Supplying it switches
#          the HTTP method from the default GET to POST.
# headers: defaults to empty; a dict of HTTP header key/value pairs to send.
import urllib.request
import urllib.parse

url = 'http://www.itcast.cn'
headers = {'User-Agent' : 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}

dict_demo = {'name': 'itcast'}
# urlencode() gives text; encode() already returns bytes, so bytes() is not needed.
data = urllib.parse.urlencode(dict_demo).encode('utf-8')

# Build the Request object, then send it; the with-statement closes the
# connection when the block ends.
request = urllib.request.Request(url, data=data, headers=headers)
with urllib.request.urlopen(request) as response:
    print(response.read().decode('utf-8'))      # body
    print(response.geturl())    # URL
    print(response.getcode())   # status code
    print(response.info())      # headers


##### URL encoding and decoding
# urllib.parse.urlencode(query, doseq=False, safe='', encoding=None, errors=None, quote_via=quote_plus)
# turns key/value pairs into a percent-encoded 'key=value&key=value' string.
#
# urllib.parse.unquote(string, encoding='utf-8', errors='replace')
# reverses the percent-encoding of such a string.
# Note: per the urllib.parse documentation unquote() leaves '+' untouched;
# unquote_plus() is the variant that also turns '+' back into a space.

import urllib.parse

data = dict(a='传播智客', b='黑马程序员')

# Encode the mapping into a query string.
result = urllib.parse.urlencode(data)
print(result)

# Decode the percent-escapes back into readable text.
result2 = urllib.parse.unquote(result)
print(result2)


##### Handling a GET request
# Encode the query, append it to the URL, build a Request with headers, send it.
import urllib.request
import urllib.parse

# Percent-encode the query parameters (non-ASCII text cannot go into a URL raw).
word = urllib.parse.urlencode({'wd': '传播智客'})

url = 'http://www.baidu.com' + '?' + word
headers = {'User-Agent' : 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}

request = urllib.request.Request(url, headers=headers)

# The with-statement closes the connection once the response has been printed.
with urllib.request.urlopen(request) as response:
    print(response.read().decode('utf-8'))      # body
    print(response.geturl())    # URL
    print(response.getcode())   # status code
    print(response.info())      # headers


##### Handling a POST request
# To send a POST, the form body must be passed through the data argument.
# The body has to be bytes: urlencode the form dict, then encode the text.
import urllib.request
import urllib.parse

# 'smartresult=dict' replaces the original 'smartresult-dict', which was not
# a key=value pair like the neighbouring 'smartresult=' parameters.
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=null'
headers = {'User-Agent' : 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}


# Tip: in Fiddler's request pane, the WebForms tab shows the form body that
# the browser sends; the fields below mirror it.

# NOTE(review): 'i' and 'doctype' replace the original keys 'I' and
# 'doctypev', which look like typos — confirm against a captured request.
formdata = {
    'type': 'AUTO',
    'i': 'I love python',
    'doctype': 'json',
    'xmlVersion': '1.8',
    'keyfrom': 'fanyi.web',
    'ue': 'utf-8',
    'action': 'FY_BY_ENTER',
    'typoResult': 'true',
}

# encode() already returns bytes, so no bytes() wrapper is needed.
data = urllib.parse.urlencode(formdata).encode('utf-8')
request = urllib.request.Request(url, data=data, headers=headers)

# The with-statement closes the connection once the response has been printed.
with urllib.request.urlopen(request) as response:
    print(response.read().decode('utf-8'))      # body
    print(response.geturl())    # URL
    print(response.getcode())   # status code
    print(response.info())      # headers


##### Adding specific headers
# Request.add_header() adds a header to the request.
# Request.get_header() returns a header already set on the request.
#
# After running the program, inspect the HTTP request in Fiddler to see the
# headers that were actually sent.
# NOTE(review): urllib's HTTP handler may override 'Connection' on the wire —
# verify with a capture.
import urllib.request

url = 'http://www.itcast.cn'
user_agent = {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'}

request = urllib.request.Request(url, headers=user_agent)
request.add_header('Connection', 'keep-alive')      # add a specific header
print(request.get_header('Connection'))             # read the header back

# The with-statement closes the connection once the response has been printed.
with urllib.request.urlopen(request) as response:
    print(response.code)
    print(response.read().decode('utf-8'))



###### Proxy servers (configuring a proxy)
# urllib.request.urlopen() is a function that sends requests through a
# module-level default opener (an urllib.request.OpenerDirector). For explicit
# control over proxies, cookies and similar features, build a custom opener.
#
# Building a custom opener takes three steps:
# 1. Create handler objects for the features needed.
# 2. Pass them to urllib.request.build_opener() to create the opener.
# 3. Call the opener's open() method to send the request.
# To make every later urlopen() call use the custom opener, install it
# globally with urllib.request.install_opener().

import urllib.request

http_handler = urllib.request.HTTPHandler()             # 1. handler that processes HTTP requests
opener = urllib.request.build_opener(http_handler)      # 2. opener built from that handler

request = urllib.request.Request('http://www.baidu.com')

# 3. Send through the custom opener instead of urllib.request.urlopen();
#    the with-statement closes the connection afterwards.
with opener.open(request) as response:
    print(response.read().decode('utf-8'))


#####
# urllib.request.ProxyHandler(proxies=None) configures the proxy server;
# build a custom opener from it and send requests with opener.open().
import urllib.request
import random

# Pool of candidate proxies; one is picked at random for this run.
proxy_list = [
    {'http': '180.119.141.135:9999'},
    {'http': '117.95.214.24:9999'},
    {'http': '120.83.111.102:9999'},
    {'http': '117.57.91.31:9999'},
    {'http': '27.152.91.68:9999'},
    {'http': '182.34.32.104:9999'},
]
proxy = random.choice(proxy_list)

# Two handlers: one routes through the chosen proxy, the other (empty
# mapping) uses no proxy.
httpproxy_handler = urllib.request.ProxyHandler(proxy)
nullproxy_handler = urllib.request.ProxyHandler({})

# Proxy switch: selects which handler the custom opener is built from.
proxy_switch = False
opener = urllib.request.build_opener(
    httpproxy_handler if proxy_switch else nullproxy_handler
)

request = urllib.request.Request('http://www.baidu.com')

# The with-statement closes the connection once the body has been printed.
with opener.open(request) as response:
    print(response.read().decode('utf-8'))



#####
# urllib.error.URLError: catching network-level failures
# (the host name below is not expected to resolve).
import urllib.request
import urllib.error

request = urllib.request.Request('http://www.ajkfhafwjqh.com')
try:
    response = urllib.request.urlopen(request, timeout=5)
except urllib.error.URLError as err:
    print(err)
else:
    # Should the request unexpectedly succeed, release the connection
    # rather than dropping the response unclosed.
    response.close()

#####
# urllib.error.HTTPError: catching error responses returned by the server.
import urllib.request
import urllib.error

request = urllib.request.Request('http://www.itcast.cn/net')
try:
    response = urllib.request.urlopen(request)
except urllib.error.HTTPError as e:
    # HTTPError is a subclass of URLError, so it must be handled first.
    print(e)
except urllib.error.URLError as e:
    # Network-level failure (DNS, refused connection, ...): report it
    # instead of letting the script crash.
    print(e)
else:
    # Request succeeded: release the connection.
    response.close()