Documentation: https://docs.python.org/3/library/urllib.html
1. POST request
import urllib.request
import urllib.parse

url = ''  # target URL of the form handler
# Encode the form data with urlencode, then convert the string to UTF-8 bytes
postdata = urllib.parse.urlencode({'name': 'username',
                                   'pass': 'password'}).encode('utf-8')
req = urllib.request.Request(url, postdata)
req.add_header('User-Agent', 'Mozilla/5.0')  # add_header takes a (key, value) pair
data = urllib.request.urlopen(req).read()
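Note: when a data argument is passed to Request, urlopen sends the request as a POST; without it the same call issues a GET.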
2. GET request
A request whose parameters are carried in the URL itself, so that changing the keyword in the URL changes the request, is a GET request.
import urllib.request

url = 'http://www.baidu.com/s?wd='
key = '韦玮老师'
key_code = urllib.request.quote(key)  # percent-encode the non-ASCII keyword
url_all = url + key_code
req = urllib.request.Request(url_all)
data = urllib.request.urlopen(req).read()
fh = open('5.html', 'wb')
fh.write(data)
fh.close()
3. Setting request headers
"""
方一:使用 build_opener() 修改报头
"""
import urllib.request

url = 'https://blog.csdn.net/zjkpy_5/article/details/83352403'
headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36')
opener = urllib.request.build_opener()
opener.addheaders = [headers]  # addheaders expects a list of (key, value) tuples
data = opener.open(url).read()
write_file = open('3.html', 'wb')
write_file.write(data)
write_file.close()
"""
方二:使用 add_header() 添加报头
"""
import urllib.request

url = 'https://blog.csdn.net/zjkpy_5/article/details/83352403'
req = urllib.request.Request(url)
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36')
data = urllib.request.urlopen(req).read()
4. Timeout setting
import urllib.request

for i in range(1, 100):
    try:
        file = urllib.request.urlopen('http://yum.iqianyue.com', timeout=1)
        data = file.read()
        print(len(data))
    except Exception as e:
        print(e)
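Note: when the timeout expires, urlopen typically raises urllib.error.URLError with the underlying socket.timeout as its reason, so catching urllib.error.URLError is a more precise alternative to the broad except Exception above.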
5. Proxy server settings
import urllib.request

def use_proxy(proxy_addr, url):
    # Route HTTP traffic through the given proxy address
    proxy = urllib.request.ProxyHandler({'http': proxy_addr})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)  # make this opener the global default
    data = urllib.request.urlopen(url).read().decode('utf-8')
    return data

proxy_addr = '103.16.61.182:36885'  # example address; public proxies expire quickly
data = use_proxy(proxy_addr, 'http://www.baidu.com')
print(len(data))
6. DebugLog (debug output)
import urllib.request

# debuglevel=1 prints the HTTP(S) traffic of each request to stdout
httphd = urllib.request.HTTPHandler(debuglevel=1)
httpshd = urllib.request.HTTPSHandler(debuglevel=1)
opener = urllib.request.build_opener(httphd, httpshd)
urllib.request.install_opener(opener)
data = urllib.request.urlopen('http://edu.51cto.com')
7. URLError exception handling
import urllib.request
import urllib.error

try:
    html = urllib.request.urlopen('http://blog.csdn.net').read()
    print(html)
except urllib.error.HTTPError as e:  # an HTTP error status was returned
    print(e.code)
    print(e.reason)
except urllib.error.URLError as e:   # cannot reach the server, the remote URL does not exist, or there is no network
    print(e.reason)
"""
改进合并,HTTPError 是 URLError 子类
except urllib.error.URLError as e:
if hasattr(e,'code'):
print(e.code)
if hassttr(e.'reason'):
print(e.reason)
"""
8. Fetch the page source with urllib, then match against it
# request and read the page source (headers and url as defined in section 3)
opener = urllib.request.build_opener()
opener.addheaders = [headers]
data = opener.open(url).read()
data = str(data, encoding='utf-8')  # equivalent to data.decode('utf-8')
- The source comes back as bytes rather than a string, so it must first be decoded into a UTF-8 string before matching.
9. Percent-encoding Chinese text for use in a URL
# percent-encode the keyword
from urllib import parse
city = parse.quote(city)  # city is assumed to hold the Chinese keyword
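A quick round trip for illustration (the keyword here is an arbitrary example):

from urllib.parse import quote, unquote

city = quote('北京')      # '%E5%8C%97%E4%BA%AC'
print(unquote(city))      # back to '北京'
url = 'http://www.baidu.com/s?wd=' + city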
10. Download a page and return its HTML; a 4xx status usually means a client-side problem and a 5xx a server-side one, so 5xx downloads are worth retrying, optionally through a proxy
import urllib.request
from urllib.error import URLError, HTTPError, ContentTooShortError

def download(url, user_agent='wswp', num_retries=2):
    print('Downloading:', url)
    req = urllib.request.Request(url)
    req.add_header('User-agent', user_agent)
    try:
        html = urllib.request.urlopen(req).read()
    except (URLError, HTTPError, ContentTooShortError) as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            # retry only on 5xx server errors
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries - 1)
    return html
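The heading also mentions retrying through a proxy; here is a minimal sketch that combines download() with the ProxyHandler from section 5 (download_via_proxy and the proxy address are illustrative, not part of the original note):

import urllib.request
from urllib.error import URLError, HTTPError, ContentTooShortError

def download_via_proxy(url, proxy_addr, user_agent='wswp', num_retries=2):
    # build an opener that routes HTTP traffic through the given proxy
    proxy = urllib.request.ProxyHandler({'http': proxy_addr})
    opener = urllib.request.build_opener(proxy)
    req = urllib.request.Request(url, headers={'User-agent': user_agent})
    try:
        html = opener.open(req).read()
    except (URLError, HTTPError, ContentTooShortError) as e:
        print('Download error:', e.reason)
        html = None
        # retry only on 5xx server errors
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download_via_proxy(url, proxy_addr, user_agent, num_retries - 1)
    return html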
11. Splitting a URL into its parts
from urllib.parse import urlsplit

com = urlsplit(url)
com.XXX  # access the desired component: scheme, netloc, path, query or fragment
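For example, with the Baidu URL from section 12:

from urllib.parse import urlsplit

com = urlsplit('https://www.baidu.com/s?ie=utf-8&wd=python')
print(com.scheme)  # 'https'
print(com.netloc)  # 'www.baidu.com'
print(com.path)    # '/s'
print(com.query)   # 'ie=utf-8&wd=python'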
12. Extracting parts of a URL, e.g. https://www.baidu.com/s?ie=utf-8&wd=python
- Extract the query string (for the domain itself, use .netloc instead of .query)
from urllib.parse import urlparse
qs = urlparse('https://www.baidu.com/s?ie=utf-8&wd=python').query
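To turn that query string into a dict of parameters, urllib.parse.parse_qs can be applied to the result (an addition for illustration, not in the original note):

from urllib.parse import urlparse, parse_qs

qs = urlparse('https://www.baidu.com/s?ie=utf-8&wd=python').query
params = parse_qs(qs)    # {'ie': ['utf-8'], 'wd': ['python']}
print(params['wd'][0])   # 'python'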
13. Appending query parameters to a URL
from urllib.parse import urlencode

splash_url = 'http://localhost:8050/render.html'
args = {'url': 'http://quotes.toscrape.com/js', 'timeout': 5, 'image': 0}
query_string = urlencode(args)  # 'url=http%3A%2F%2Fquotes.toscrape.com%2Fjs&timeout=5&image=0'
fullurl = splash_url + '?' + query_string