urllib的使用
urllib是Python内置的HTTP请求库
包括:
- urllib.request : 请求模块
- urllib.error : 异常处理模块
- urllib.parse : URL解析模块
- urllib.robotparser :robots.txt解析模块
一个基本的请求
# 导入urllib库
# Import the urllib request module.
import urllib.request

# Open the URL and get back an HTTPResponse object.
response = urllib.request.urlopen('http://www.baidu.com')
# e.g. <http.client.HTTPResponse object at 0x000001387150CC08>
print(response)
# FIX: read() consumes the response body, and an HTTPResponse can only be
# read once — the original called read() twice, so the second call
# returned b''. Read once and reuse the bytes.
content = response.read()
# Raw page content as bytes.
print(content)
# decode() converts the raw bytes to str using the given charset.
print(content.decode('utf-8'))
data参数的使用
import urllib.parse
import urllib.request

# urlencode() turns the mapping into a query string; bytes() encodes it so
# it can be passed as the POST body through urlopen's data parameter.
data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf-8')
print(data)  # b'word=hello'
# Supplying data makes this a POST request.
response = urllib.request.urlopen('http://httpbin.org/post', data=data)
# httpbin echoes back the parameters it received.
print(response.read().decode('utf-8'))
timeout的使用
timeout:设定程序超时的时间
import urllib.request

# timeout aborts the request if the server has not responded in 0.3 s.
target = 'http://httpbin.org/get'
response = urllib.request.urlopen(target, timeout=0.3)
print(response.read().decode('utf-8'))
如果设置为0.1就会报错,
Traceback (most recent call last):
File "C:\Users\10218\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 1317, in do_open
encode_chunked=req.has_header('Transfer-encoding'))
File "C:\Users\10218\AppData\Local\Programs\Python\Python37\lib\http\client.py", line 1244, in request
self._send_request(method, url, body, headers, encode_chunked)
File "C:\Users\10218\AppData\Local\Programs\Python\Python37\lib\http\client.py", line 1290, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
...
对异常进行抓取
import socket
import urllib.request
import urllib.error

# Deliberately short timeout so the request fails quickly.
try:
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as err:
    # A timeout surfaces as a URLError whose reason is a socket.timeout.
    if isinstance(err.reason, socket.timeout):
        print('Time Out')
request
设置Headers
from urllib import request, parse

url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'httpbin.org'
}
# FIX: renamed from `dict` — that name shadowed the builtin `dict` type.
form = {
    'name': 'zhangsan'
}
data = bytes(parse.urlencode(form), encoding='utf-8')
# Request bundles URL, POST body, headers and method into one object.
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
result:
{
"args": {},
"data": "",
"files": {},
"form": {
"name": "zhangsan"
},
"headers": {
"Accept-Encoding": "identity",
"Content-Length": "13",
"Content-Type": "application/x-www-form-urlencoded",
"Host": "httpbin.org",
"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
},
"json": null,
"origin": "111.227.172.92, 111.227.172.92",
"url": "https://httpbin.org/post"
}
第二种方式
from urllib import request, parse

url = 'http://httpbin.org/post'
# FIX: renamed from `dict` — that name shadowed the builtin `dict` type.
form = {
    'name': 'Germey'
}
data = bytes(parse.urlencode(form), encoding='utf8')
req = request.Request(url=url, data=data, method='POST')
# add_header() sets a single header after the Request has been created.
req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
ProxyHandler代理
# FIX: the original imported urllib.request twice; once is enough.
import urllib.request

# ProxyHandler routes requests through the given proxy addresses.
proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743'
})
# build_opener creates an opener that applies the handler to every request.
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('http://httpbin.org/get')
print(response.read())
cookie,HTTPCookieProcessor
import http.cookiejar
import urllib.request

# Collect the cookies the server sets into an in-memory jar.
cookie = http.cookiejar.CookieJar()
processor = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(processor)
response = opener.open('http://www.baidu.com')
# Print each received cookie as name=value.
for item in cookie:
    print(item.name + "=" + item.value)
result:
BAIDUID=B847E5C816CFA63FE58503419CEEC741:FG=1
BIDUPSID=B847E5C816CFA63FE58503419CEEC741
H_PS_PSSID=1465_21097_29519_28519_29098_29567_28830_29221_29460_29588
PSTM=1564109868
delPer=0
BDSVRTM=0
BD_HOME=0
如果想要保存cookie到本地,可以使用http.cookiejar.MozillaCookieJar和http.cookiejar.LWPCookieJar()
http.cookiejar.MozillaCookieJar()方式
import http.cookiejar
import urllib.request

# MozillaCookieJar persists cookies in the Netscape/Mozilla file format.
filename = "cookie.txt"
cookie = http.cookiejar.MozillaCookieJar(filename)
processor = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(processor)
response = opener.open('http://www.baidu.com')
# Also keep session-only (discard) and already-expired cookies in the file.
cookie.save(ignore_discard=True, ignore_expires=True)
http.cookiejar.LWPCookieJar()方式
import http.cookiejar, urllib.request

filename = 'cookie.txt'
# FIX: this section demonstrates LWPCookieJar, but the original code
# instantiated MozillaCookieJar again — use LWPCookieJar here.
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
# save() takes two flags:
# ignore_discard: also save cookies marked to be discarded (session cookies).
# ignore_expires: also save cookies that have already expired.
cookie.save(ignore_discard=True, ignore_expires=True)
result:
# Netscape HTTP Cookie File
# http://curl.haxx.se/rfc/cookie_spec.html
# This is a generated file! Do not edit.
.baidu.com TRUE / FALSE 3711594182 BAIDUID 42B07E3E284A3F95EE69D8D8FF10F975:FG=1
.baidu.com TRUE / FALSE 3711594182 BIDUPSID 42B07E3E284A3F95EE69D8D8FF10F975
.baidu.com TRUE / FALSE H_PS_PSSID 1451_21095_18560_29520_28519_29098_29567_28839_29221_26350
.baidu.com TRUE / FALSE 3711594182 PSTM 1564110535
.baidu.com TRUE / FALSE delPer 0
www.baidu.com FALSE / FALSE BDSVRTM 0
www.baidu.com FALSE / FALSE BD_HOME 0
异常处理
example:
from urllib import request, error

# Requesting a nonexistent page raises URLError; reason describes why.
try:
    response = request.urlopen("http://pythonsite.com/1111.html")
except error.URLError as err:
    print(err.reason)  # result: Not Found
# result: Not Found
URLError:reason
HTTPError有三个属性:reason,code,headers,是URLError的子类
from urllib import request, error

try:
    response = request.urlopen("http://pythonsite.com/1111.html")
# HTTPError (a subclass of URLError) additionally carries code and headers,
# so it must be caught before the more general URLError.
except error.HTTPError as e:
    print(e.reason)
    print(e.code)
    print(e.headers)
except error.URLError as e:
    print(e.reason)
else:
    # FIX: corrected the misspelled message "reqeust successfully".
    print("request successfully")
对e.reason进行改进
import socket
from urllib import error, request

try:
    # An extremely short timeout guarantees the request times out.
    response = request.urlopen("http://www.pythonsite.com/", timeout=0.001)
except error.URLError as err:
    print(type(err.reason))
    # Distinguish a timeout from other kinds of connection failure.
    if isinstance(err.reason, socket.timeout):
        print("time out")
URL解析
urlparse
The URL parsing functions focus on splitting a URL string into its components, or on combining URL components into a URL string.
urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)
from urllib.parse import urlparse

# Split a URL string into its six components.
result = urlparse("http://www.baidu.com/index.html;user?id=5#comment")
# ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html',
#             params='user', query='id=5', fragment='comment')
print(result)
urlunparse
与urlparse相反,它用于拼接URL
from urllib.parse import urlunparse

# Reassemble a URL from its six components (the inverse of urlparse).
data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=123', 'commit']
# -> http://www.baidu.com/index.html;user?a=123#commit
print(urlunparse(data))
urljoin
直接拼接URL字符串
from urllib.parse import urljoin

# urljoin(base, url) resolves url against base; when the second argument
# is itself an absolute URL, it replaces the base entirely.
print(urljoin('http://www.baidu.com', 'FAQ.html'))
print(urljoin('http://www.baidu.com', 'https://pythonsite.com/FAQ.html'))
print(urljoin('http://www.baidu.com/about.html', 'https://pythonsite.com/FAQ.html'))
print(urljoin('http://www.baidu.com/about.html', 'https://pythonsite.com/FAQ.html?question=2'))
print(urljoin('http://www.baidu.com?wd=abc', 'https://pythonsite.com/index.php'))
print(urljoin('http://www.baidu.com', '?category=2#comment'))
print(urljoin('www.baidu.com', '?category=2#comment'))
# FIX: the original line was missing its closing parenthesis (SyntaxError).
print(urljoin('www.baidu.com#comment', '?category=2'))
result:
http://www.baidu.com/FAQ.html
https://pythonsite.com/FAQ.html
https://pythonsite.com/FAQ.html
https://pythonsite.com/FAQ.html?question=2
https://pythonsite.com/index.php
http://www.baidu.com?category=2#comment
www.baidu.com?category=2#comment
www.baidu.com?category=2
urlencode
用于将字典转换为URL
from urllib.parse import urlencode

# Convert a mapping into a URL query string and append it to the base URL.
params = {
    "name": "zhaofan",
    "age": 23,
}
base_url = "http://www.baidu.com?"
url = base_url + urlencode(params)
print(url)  # http://www.baidu.com?name=zhaofan&age=23
总结:
urllib.request
负责处理请求
请求参数:
- data参数:有data参数;则是POST请求,无data参数,则是get请求
- timeout参数
设置请求头
通过 request.Request
可以处理设置请求URL、请求data参数、请求头,请求方式
通过req.add_header
可以设置请求头
ProxyHandler负责代理
Cookie
cookie,HTTPCookieProcessor可以保存Cookie信息
利用http.cookiejar.MozillaCookieJar
和http.cookiejar.LWPCookieJar()
可以保存Cookie到本地
异常处理
urllib.error为异常处理包
异常类型:URLError、HTTPError(code,reason,headers)
HTTPError为URLError的子类
URL解析
urlparse: 用于URL的分解
urlunparse: 可用于将URL组成部分拼接为完整URL
urljoin: 直接拼接URL
urlencode: 将字段转换为URL参数