Python 3 Web Scraping for Beginners: Usage of the urllib Library

Usage of the urllib library

urlopen

urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)

import urllib.request

# a GET request needs no data argument
response = urllib.request.urlopen('http://www.baidu.com')
print(response.read().decode('utf-8'))

import urllib.request
import urllib.parse

# a POST request needs a data argument
data = bytes(urllib.parse.urlencode({'world': 'hello'}), encoding='utf8')
response = urllib.request.urlopen('http://httpbin.org/post', data=data)
print(response.read())

import urllib.request

# timeout: the request must complete within this many seconds, otherwise an error is raised
response = urllib.request.urlopen('http://httpbin.org/get', timeout=1)
print(response.read())

import urllib.request
import socket
import urllib.error

# if the request takes longer than 0.1 seconds, a timeout exception is raised
try:
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
# TIME OUT

response

Response type

import urllib.request

response = urllib.request.urlopen('https://www.python.org')
print(type(response))
# <class 'http.client.HTTPResponse'>

Status code and response headers

import urllib.request

response = urllib.request.urlopen('https://www.python.org')
print(response.status)
print(response.getheaders())  # get all response headers
print(response.getheader('Server'))  # get a specific response header

import urllib.request

response = urllib.request.urlopen('https://www.python.org')
print(response.status)
print(response.read().decode('utf-8'))
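As a side note, the response returned by urlopen can also be used as a context manager, which closes the connection automatically. A minimal sketch, not from the original example:

import urllib.request

# the response supports the with-statement; the connection is closed on exit
with urllib.request.urlopen('https://www.python.org') as response:
    print(response.status)
    print(response.read(100))  # read only the first 100 bytes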

request

Construct a Request object to customize headers and data and to set the request method.

from urllib import request, parse

# construct a Request to customize headers, data, and the request method
url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/4.0(compatible;MSIE 5.5;Windows NT)',
    'Host': 'httpbin.org'
}
dict = {
    'name': 'Germey'
}
data = bytes(parse.urlencode(dict), encoding='utf8')  # form data must be encoded as bytes
req = request.Request(url=url, data=data, headers=headers,
                      method='POST')  # send as a POST request
response = request.urlopen(req)
print(response.status)
# print(response.read())
print(response.read().decode('utf-8'))

Alternatively, call the Request object's add_header method to add headers one at a time.

from urllib import request, parse

url = 'http://httpbin.org/post'
dict = {
    'name': 'Germey'
}
# form data must be encoded as bytes
data = bytes(parse.urlencode(dict), encoding='utf8')
req = request.Request(url=url, data=data, method='POST')  # send as a POST request
req.add_header('User-Agent', 'Mozilla/4.0(compatible;MSIE 5.5;Windows NT)')
response = request.urlopen(req)
print(response.read().decode('utf-8'))

handler

Proxies
By switching proxy IPs, requests appear to come from different places, which helps keep the crawler from being banned.

import urllib.request

proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://49.76.14.74:61202',
    'https': 'https://49.76.14.74:61202'
})
# switching proxy IPs makes requests appear to come from different places, helping to avoid bans
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('http://www.baidu.com')
print(response.read())
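Optionally, the opener can be installed as the global default with install_opener, after which plain urlopen calls also go through the proxy. A small sketch, reusing the placeholder proxy address from above:

import urllib.request

proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://49.76.14.74:61202',   # placeholder proxy address
    'https': 'https://49.76.14.74:61202'
})
opener = urllib.request.build_opener(proxy_handler)
urllib.request.install_opener(opener)  # make this opener the process-wide default
response = urllib.request.urlopen('http://www.baidu.com')  # now routed through the proxy
print(response.status)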

Cookies store the user's login state.

import http.cookiejar
import urllib.request

cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')  # Baidu's cookies are captured into the CookieJar
for item in cookie:  # print out the cookies
    print(item.name + "=" + item.value)

Save the cookies to a file so the login state can be loaded again later.

import http.cookiejar
import urllib.request

filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar(filename)  # declared as MozillaCookieJar (Mozilla/Firefox cookie format)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_expires=True, ignore_discard=True)

import http.cookiejar
import urllib.request

filename = 'cookie.txt'
cookie = http.cookiejar.LWPCookieJar(filename)  # stored in LWP (libwww-perl) Cookie 2.0 format
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_expires=True, ignore_discard=True)

Load the cookies with the same CookieJar class that was used to save them.

import http.cookiejar
import urllib.request

# load with the same CookieJar class that was used to save the file
cookie = http.cookiejar.LWPCookieJar()
# ignore_discard: keep cookies even if they are marked to be discarded (session cookies)
# ignore_expires: keep cookies even if they have already expired
cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))

Exception handling

from urllib import request, error
# catch the exception
try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.URLError as e:
    print(e.reason)

from urllib import request, error
# HTTPError is a subclass of URLError, so catch it first
try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.HTTPError as e:
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')

import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('http://www.baidu.com', timeout=0.01)
except urllib.error.URLError as e:
    print(type(e.reason))
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
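An HTTPError also behaves like a response object, so the error page the server returned can be inspected. A minimal sketch using httpbin's status endpoint:

from urllib import request, error

try:
    response = request.urlopen('http://httpbin.org/status/404')
except error.HTTPError as e:
    print(e.code)           # 404
    print(e.read()[:100])   # first bytes of the error page body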

URL parsing

urlparse splits a URL into six components.

urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)

from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(type(result), result)
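For reference, the result is a ParseResult named tuple with six fields, roughly:

# ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')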

If the URL does not specify a scheme, the scheme argument supplies one; if the URL already contains a scheme, the scheme argument has no effect.

from urllib.parse import urlparse
# supply a default scheme
result = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(result)
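Note that because the URL has no '//' prefix, www.baidu.com is treated as part of the path rather than the netloc; the output is roughly:

# ParseResult(scheme='https', netloc='', path='www.baidu.com/index.html', params='user', query='id=5', fragment='comment')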

If the allow_fragments argument is False, fragment identifiers are not recognized; instead they are parsed as part of the path, params, or query, and fragment is set to an empty string in the return value.

from urllib.parse import urlparse

result = urlparse('www.baidu.com/index.html;user?id=5#comment', allow_fragments=False)
print(result)
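With allow_fragments=False the fragment text is folded into the preceding component; here it ends up in the query, roughly:

# ParseResult(scheme='', netloc='', path='www.baidu.com/index.html', params='user', query='id=5#comment', fragment='')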

urlunparse assembles the six components back into a URL.

from urllib.parse import urlunparse

data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))
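The reassembled URL should come out as:

# http://www.baidu.com/index.html;user?a=6#comment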

urljoin resolves one URL against a base URL; fields present in the second argument take precedence over the base.

from urllib.parse import urljoin

print(urljoin('http://www.baidu.com', 'https://cuiqingcai.com/FAQ.html'))
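A couple more illustrative cases (a quick sketch): relative paths are resolved against the base, and fields present in the second argument take precedence:

from urllib.parse import urljoin

print(urljoin('http://www.baidu.com/about.html', 'FAQ.html'))
# http://www.baidu.com/FAQ.html
print(urljoin('http://www.baidu.com', '?category=2#comment'))
# http://www.baidu.com?category=2#comment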

urlencode converts a dict into a URL-encoded query string for GET parameters.

from urllib.parse import urlencode

params = {
    'name': 'germey',
    'age': 22
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url)
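The reverse operation is parse_qs (or parse_qsl for a list of tuples), which turns a query string back into Python data. A small sketch:

from urllib.parse import parse_qs, parse_qsl

query = 'name=germey&age=22'
print(parse_qs(query))   # {'name': ['germey'], 'age': ['22']}
print(parse_qsl(query))  # [('name', 'germey'), ('age', '22')]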