Documentation: https://docs.python.org/3/library/urllib.html
1. POST request
import urllib.request
import urllib.parse

url = ''  # target URL of the form handler
# Encode the form data with urlencode, then convert the string to UTF-8 bytes
postdata = urllib.parse.urlencode({'name': 'username',
                                   'pass': 'password'}).encode('utf-8')
req = urllib.request.Request(url, postdata)
req.add_header('User-Agent', 'Mozilla/5.0')  # add_header takes a (key, value) pair
data = urllib.request.urlopen(req).read()
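Note: when a data argument is passed to Request, urlopen sends the request as a POST; without it the same call issues a GET.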
2. GET request
A request whose parameters are carried in the URL itself, so that changing the keyword in the URL changes the request, is a GET request.
import urllib.request

url = 'http://www.baidu.com/s?wd='
key = '韦玮老师'
key_code = urllib.request.quote(key)  # percent-encode the non-ASCII keyword
url_all = url + key_code
req = urllib.request.Request(url_all)
data = urllib.request.urlopen(req).read()
fh = open('5.html', 'wb')
fh.write(data)
fh.close()
3. Setting request headers
"""
方一:使用 build_opener() 修改报头
"""
import urllib.request

url = 'https://blog.csdn.net/zjkpy_5/article/details/83352403'
headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36')
opener = urllib.request.build_opener()
opener.addheaders = [headers]  # addheaders expects a list of (key, value) tuples
data = opener.open(url).read()
write_file = open('3.html', 'wb')
write_file.write(data)
write_file.close()
"""
方二:使用 add_header() 添加报头
"""
import urllib.request

url = 'https://blog.csdn.net/zjkpy_5/article/details/83352403'
req = urllib.request.Request(url)
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36')
data = urllib.request.urlopen(req).read()
4. Timeout setting
import urllib.request

for i in range(1, 100):
    try:
        file = urllib.request.urlopen('http://yum.iqianyue.com', timeout=1)
        data = file.read()
        print(len(data))
    except Exception as e:
        print(e)
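Note: when the timeout expires, urlopen typically raises urllib.error.URLError with the underlying socket.timeout as its reason, so catching urllib.error.URLError is a more precise alternative to the broad except Exception above.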
5. Proxy server settings
import urllib.request

def use_proxy(proxy_addr, url):
    # Route HTTP traffic through the given proxy address
    proxy = urllib.request.ProxyHandler({'http': proxy_addr})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)  # make this opener the global default
    data = urllib.request.urlopen(url).read().decode('utf-8')
    return data

proxy_addr = '103.16.61.182:36885'  # example address; public proxies expire quickly
data = use_proxy(proxy_addr, 'http://www.baidu.com')
print(len(data))
6. DebugLog (debug output)
import urllib.request

# debuglevel=1 prints the HTTP(S) traffic of each request to stdout
httphd = urllib.request.HTTPHandler(debuglevel=1)
httpshd = urllib.request.HTTPSHandler(debuglevel=1)
opener = urllib.request.build_opener(httphd, httpshd)
urllib.request.install_opener(opener)
data = urllib.request.urlopen('http://edu.51cto.com')
7. URLError exception handling
import urllib.request
import urllib.error

try:
    html = urllib.request.urlopen('http://blog.csdn.net').read()
    print(html)
except urllib.error.HTTPError as e:  # an HTTP error status was returned
    print(e.code)
    print(e.reason)
except urllib.error.URLError as e:   # cannot reach the server, the remote URL does not exist, or there is no network
    print(e.reason)
"""
改进合并,HTTPError 是 URLError 子类
except urllib.error.URLError as e:
if hasattr(e,'code'):
print(e.code)
if hassttr(e.'reason'):
print(e.reason)
"""
8. Fetch the page source with urllib, then match against it
# request and read the page source (headers and url as defined in section 3)
opener = urllib.request.build_opener()
opener.addheaders = [headers]
data = opener.open(url).read()
data = str(data, encoding='utf-8')  # equivalent to data.decode('utf-8')
- The source comes back as bytes rather than a string, so it must first be decoded into a UTF-8 string before matching.
9. Percent-encoding Chinese text for use in a URL
# percent-encode the keyword
from urllib import parse
city = parse.quote(city)  # city is assumed to hold the Chinese keyword
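A quick round trip for illustration (the keyword here is an arbitrary example):

from urllib.parse import quote, unquote

city = quote('北京')      # '%E5%8C%97%E4%BA%AC'
print(unquote(city))      # back to '北京'
url = 'http://www.baidu.com/s?wd=' + city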
10. Download a page and return its HTML; a 4xx status usually means a client-side problem and a 5xx a server-side one, so 5xx downloads are worth retrying, optionally through a proxy
import urllib.request
from urllib.error import URLError, HTTPError, ContentTooShortError

def download(url, user_agent='wswp', num_retries=2):
    print('Downloading:', url)
    req = urllib.request.Request(url)
    req.add_header('User-agent', user_agent)
    try:
        html = urllib.request.urlopen(req).read()
    except (URLError, HTTPError, ContentTooShortError) as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            # retry only on 5xx server errors
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, num_retries - 1)
    return html
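The heading also mentions retrying through a proxy; here is a minimal sketch that combines download() with the ProxyHandler from section 5 (download_via_proxy and the proxy address are illustrative, not part of the original note):

import urllib.request
from urllib.error import URLError, HTTPError, ContentTooShortError

def download_via_proxy(url, proxy_addr, user_agent='wswp', num_retries=2):
    # build an opener that routes HTTP traffic through the given proxy
    proxy = urllib.request.ProxyHandler({'http': proxy_addr})
    opener = urllib.request.build_opener(proxy)
    req = urllib.request.Request(url, headers={'User-agent': user_agent})
    try:
        html = opener.open(req).read()
    except (URLError, HTTPError, ContentTooShortError) as e:
        print('Download error:', e.reason)
        html = None
        # retry only on 5xx server errors
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download_via_proxy(url, proxy_addr, user_agent, num_retries - 1)
    return html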
11. Splitting a URL into its parts
from urllib.parse import urlsplit

com = urlsplit(url)
com.XXX  # access the desired component: scheme, netloc, path, query or fragment
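For example, with the Baidu URL from section 12:

from urllib.parse import urlsplit

com = urlsplit('https://www.baidu.com/s?ie=utf-8&wd=python')
print(com.scheme)  # 'https'
print(com.netloc)  # 'www.baidu.com'
print(com.path)    # '/s'
print(com.query)   # 'ie=utf-8&wd=python'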
12. Extracting parts of a URL, e.g. https://www.baidu.com/s?ie=utf-8&wd=python
- Extract the query string (for the domain itself, use .netloc instead of .query)
from urllib.parse import urlparse
qs = urlparse('https://www.baidu.com/s?ie=utf-8&wd=python').query
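To turn that query string into a dict of parameters, urllib.parse.parse_qs can be applied to the result (an addition for illustration, not in the original note):

from urllib.parse import urlparse, parse_qs

qs = urlparse('https://www.baidu.com/s?ie=utf-8&wd=python').query
params = parse_qs(qs)    # {'ie': ['utf-8'], 'wd': ['python']}
print(params['wd'][0])   # 'python'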
13. Appending query parameters to a URL
from urllib.parse import urlencode

splash_url = 'http://localhost:8050/render.html'
args = {'url': 'http://quotes.toscrape.com/js', 'timeout': 5, 'image': 0}
query_string = urlencode(args)  # 'url=http%3A%2F%2Fquotes.toscrape.com%2Fjs&timeout=5&image=0'
fullurl = splash_url + '?' + query_string