Usage of the urllib library
urlopen
urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)
import urllib.request
# GET request: no data argument is needed
response = urllib.request.urlopen('http://www.baidu.com')
print(response.read().decode('utf-8'))
import urllib.request
import urllib.parse
# POST request: data must be supplied
data = bytes(urllib.parse.urlencode({'world': 'hello'}), encoding='utf8')
response = urllib.request.urlopen('http://httpbin.org/post', data=data)
print(response.read())
import urllib.request
# timeout: the response must arrive within this many seconds, otherwise an exception is raised
response = urllib.request.urlopen('http://httpbin.org/get', timeout=1)
print(response.read())
import urllib.request
import socket
import urllib.error
# if the request takes longer than 0.1 s, a URLError wrapping socket.timeout is raised
try:
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
# TIME OUT
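The remaining parameters in the signature above (cafile, capath, cadefault, context) control HTTPS certificate verification. A minimal sketch passing an ssl context (the URL is just an example):
import ssl
import urllib.request
# create_default_context() loads the system's trusted CA certificates;
# passing it as context= controls how the HTTPS certificate is verified
context = ssl.create_default_context()
response = urllib.request.urlopen('https://www.python.org', context=context)
print(response.status)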
response
Type of the response
import urllib.request
response = urllib.request.urlopen('https://www.python.org')
print(type(response))
# <class 'http.client.HTTPResponse'>
Status code and response headers
import urllib.request
response = urllib.request.urlopen('https://www.python.org')
print(response.status)
print(response.getheaders())  # get all response headers
print(response.getheader('Server'))  # get a specific response header
import urllib.request
response = urllib.request.urlopen('https://www.python.org')
print(response.status)
print(response.read().decode('utf-8'))
request
Construct a Request object so you can customize the headers, data, and request method.
from urllib import request, parse
# build a Request to customize headers, form data, and the request method
url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/4.0(compatible;MSIE 5.5;Windows NT)',
    'Host': 'httpbin.org'
}
dict = {
    'name': 'Germey'
}
data = bytes(parse.urlencode(dict), encoding='utf8')  # form data must be encoded as bytes
req = request.Request(url=url, data=data, headers=headers,
                      method='POST')  # send as a POST request
response = request.urlopen(req)
print(response.status)
# print(response.read())
print(response.read().decode('utf-8'))
Alternatively, call the Request object's add_header method to add headers one by one.
from urllib import request, parse
url = 'http://httpbin.org/post'
dict = {
    'name': 'Germey'
}
# form data must be encoded as bytes
data = bytes(parse.urlencode(dict), encoding='utf8')
req = request.Request(url=url, data=data, method='POST')  # POST request
req.add_header('User-Agent', 'Mozilla/4.0(compatible;MSIE 5.5;Windows NT)')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
handler
Proxy
By switching proxy IPs, requests appear to the server to come from different locations, which helps keep the crawler from being blocked.
import urllib.request
proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://49.76.14.74:61202',
    'https': 'https://49.76.14.74:61202'
})
# switching proxy IPs makes requests appear to come from different places,
# which helps keep the crawler from getting blocked
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('http://www.baidu.com')
print(response.read())
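If every request should go through the proxy, the opener can also be installed globally with install_opener; a small sketch reusing the same placeholder proxy address:
import urllib.request
proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://49.76.14.74:61202',
    'https': 'https://49.76.14.74:61202'
})
opener = urllib.request.build_opener(proxy_handler)
urllib.request.install_opener(opener)  # from now on plain urlopen() also goes through the proxy
response = urllib.request.urlopen('http://www.baidu.com')
print(response.status)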
Cookies: they store the user's login state.
import http.cookiejar
import urllib.request
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')  # Baidu's cookies are captured into the cookie jar
for item in cookie:  # print the cookies
    print(item.name + "=" + item.value)
Cookies can also be saved to a file, so the login information can be loaded again later.
import http.cookiejar
import urllib.request
filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar(filename)  # declare a MozillaCookieJar (Firefox/Netscape cookies.txt format)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_expires=True, ignore_discard=True)
import http.cookiejar
import urllib.request
filename = 'cookie.txt'
cookie = http.cookiejar.LWPCookieJar(filename)  # saved in libwww-perl (Set-Cookie3) format
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_expires=True, ignore_discard=True)
Whatever cookie-jar format was used to save the file, load it with the same class.
import http.cookiejar
import urllib.request
# load cookies with the same cookie-jar class that saved them
cookie = http.cookiejar.LWPCookieJar()
# ignore_discard: keep cookies even if they are marked to be discarded
# ignore_expires: keep cookies even if they have expired; the existing file is overwritten on save
cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))
Exception handling
from urllib import request,error
# catch the exception
try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.URLError as e:
    print(e.reason)
from urllib import request,error
# catch HTTPError first (a subclass of URLError), then the more general URLError
try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.HTTPError as e:
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')
import socket
import urllib.request
import urllib.error
try:
    response = urllib.request.urlopen('http://www.baidu.com', timeout=0.01)
except urllib.error.URLError as e:
    print(type(e.reason))
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
URL parsing
urlparse splits a URL into six components.
urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)
from urllib.parse import urlparse
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(type(result),result)
If the URL does not include a scheme, the scheme argument supplies a default. If the URL already specifies a scheme, the scheme argument has no effect.
from urllib.parse import urlparse
# supply a default scheme for a URL that has none
result = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(result)
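The second case can be verified directly: when the URL already carries a scheme, the scheme argument is ignored (a quick sketch):
from urllib.parse import urlparse
# the URL's own scheme (http) takes precedence over scheme='https'
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(result.scheme)  # http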
If allow_fragments is False, the fragment identifier is not recognized; it is instead parsed as part of the path, params, or query, and fragment is set to the empty string in the return value.
from urllib.parse import urlparse
result = urlparse('www.baidu.com/index.html;user?id=5#comment', allow_fragments=False)
print(result)
urlunparse assembles the six components back into a URL.
from urllib.parse import urlunparse
data = ['http','www.baidu.com','index.html','user','a=6','comment']
print(urlunparse(data))
urljoin resolves a link against a base URL; components present in the second argument take precedence over the base.
from urllib.parse import urljoin
print(urljoin('http://www.baidu.com','https://cuiqingcai.com/FAQ.html'))
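The example above replaces the base with an absolute URL; with a relative link, urljoin resolves it against the base (a quick sketch):
from urllib.parse import urljoin
# a relative link is resolved against the base URL's path
print(urljoin('http://www.baidu.com/about/index.html', 'FAQ.html'))
# http://www.baidu.com/about/FAQ.html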
urlencode converts a dictionary into a GET query string.
from urllib.parse import urlencode
params = {
    'name': 'germey',
    'age': 22
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url)