不知有没有空写下去——Python爬虫(二) urllib库

参考课程:崔庆才大神的Python3网络爬虫实战案例

PycharmProject下载:https://download.csdn.net/download/lly1122334/10419435

urllib是Python3的内置库,提供了一系列操作URL的功能

1_1 urllib 三大模块

#coding=utf-8
import socket
import urllib.request
import urllib.error
import urllib.parse

'''
urllib is Python's built-in HTTP request library.

urllib.request      request module
urllib.error        exception-handling module
urllib.parse        URL-parsing module
urllib.robotparser  robots.txt parsing module (rarely used)
'''

# urllib.request: issue a plain GET request
response = urllib.request.urlopen("http://www.baidu.com")
#print(response.read().decode("utf-8"))

# urllib.parse: encode a dict as a POST body; httpbin.org echoes requests for testing
data = bytes(urllib.parse.urlencode({'word':'hello'}),encoding='utf8')
response = urllib.request.urlopen("http://httpbin.org/post",data=data)
#print(response.read())

# urllib.error: handle request failures
response = urllib.request.urlopen("http://httpbin.org/get",timeout=1)#timeout=1 s is long enough for this request to succeed
#print(response.read())
try:
    #timeout=0.1 s is intentionally too short, so this request should time out
    response = urllib.request.urlopen("http://httpbin.org/get",timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason,socket.timeout):
        print("TIME OUT")
    else:
        # Bug fix: the original printed "OK" here, but reaching this branch means
        # the request failed for a NON-timeout reason — report the actual cause.
        print(e.reason)

(此处原为运行结果截图,图片已丢失)

1_2 urllib 响应

#coding=utf-8
import urllib.request

# Inspect an HTTP response: its type, status code, and headers.

# urlopen returns an http.client.HTTPResponse object
response = urllib.request.urlopen("http://www.baidu.com")
print(type(response))

# HTTP status code — use it to check whether the request succeeded
print(response.status)

# All response headers as a list of (name, value) pairs
print(response.getheaders())

(此处原为运行结果截图,图片已丢失)

1_3 urllib Request

#coding=utf-8
from urllib import request,parse

# Request: build a request object with custom headers, body, and method.
url = 'http://httpbin.org/post'
headers = {
    'User-Agent':'Mozilla/4.0(compatible;MSIE 5.5;Windows NT)',
    'Host':'httpbin.org'
}
# Bug fix: the original named this variable `dict`, shadowing the builtin type.
form = {
    'name':'Germey'
}
data = bytes(parse.urlencode(form),encoding='utf8')
req = request.Request(url=url,data=data,headers=headers,method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))

(此处原为运行结果截图,图片已丢失)

1_4 urllib Handler

#coding=utf-8
import urllib.request
import http.cookiejar

# Handler: customize requests with proxies and cookies.

# Proxy: route requests through a local proxy server
proxy_handler = urllib.request.ProxyHandler({
    'http':'http://127.0.0.1:80',           # change the port to your proxy's port
    'https':'https://127.0.0.1:80'          # change the port to your proxy's port
})
opener = urllib.request.build_opener(proxy_handler)
#response = opener.open('http://www.baidu.com')
#print(response.read())

# Cookie: keep session/login state across requests
cookie = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
response = opener.open('http://www.baidu.com')
for item in cookie:
    print(item.name+"="+item.value)

# Cookie: save cookies to a local file so later runs can reuse them
filename1 = "1_4_1 urllib Handler cookie1.txt"
cookie = http.cookiejar.MozillaCookieJar(filename1)  # Mozilla/Netscape file format
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
response = opener.open('http://www.baidu.com')
# ignore_discard: also save session cookies marked to be discarded;
# ignore_expires: write the file even if it already exists / cookies expired
cookie.save(ignore_discard=True, ignore_expires=True)

filename2 = "1_4_2 urllib Handler cookie2.txt"
cookie = http.cookiejar.LWPCookieJar(filename2)  # libwww-perl (LWP) file format
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)

# Cookie: load a previously saved cookie file and attach it to new requests
cookie = http.cookiejar.LWPCookieJar()
cookie.load('1_4_2 urllib Handler cookie2.txt',ignore_discard=True,ignore_expires=True)
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
response = opener.open('http://www.baidu.com')
#print(response.read().decode('utf-8'))

(此处原为运行结果截图,图片已丢失)

(此处原为运行结果截图,图片已丢失)

1_5 urllib 异常处理

#coding=utf-8
from urllib import request, error
import socket

# Exception handling: catch HTTPError before URLError
# (HTTPError is a subclass of URLError, so order matters).

try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')  # a page that does not exist
except error.HTTPError as e:
    # HTTPError carries the status code and response headers
    print(e.reason,e.code,e.headers,sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    # Runs only when no exception was raised
    print('Request Successfully')


try:
    # timeout=0.01 s is intentionally too short, forcing a timeout
    response = request.urlopen('http://www.baidu.com',timeout=0.01)
except error.URLError as e:
    reason = e.reason
    print(type(reason))
    # A timeout surfaces as a URLError whose reason is a socket.timeout
    if isinstance(reason,socket.timeout):
        print('TIME OUT')

(此处原为运行结果截图,图片已丢失)

1_6 urllib URL解析

#coding=utf-8
from urllib.parse import urlparse, urlunparse, urljoin, urlencode

# Signature: urlparse(urlstring, scheme='', allow_fragments=True)
# URL parsing: split a URL into scheme/netloc/path/params/query/fragment
print('URL解析:urlparse')
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(type(result),result)

# scheme= supplies a default protocol when the URL has none
result = urlparse('www.baidu.com/index.html;user?id=5#comment',scheme='https')
print(result)

# ...but an explicit protocol in the URL wins over the scheme= default
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment',scheme='https')
print(result)

# allow_fragments=False folds the #fragment into the preceding component
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment',allow_fragments=False)
print(result)

result = urlparse('http://www.baidu.com/index.html#comment',allow_fragments=False)
print(result)

# URL re-assembly: urlunparse joins six components back into one URL
print('\nURL反解析(拼接):urlunparse')
data = ['http','www.baidu.com','index.html','user','a=6','comment']
print(urlunparse(data))  # http://www.baidu.com/index.html;user?a=6#comment

# URL joining: resolve the second URL against the first (the second takes priority)
print('\nURL填充(后者填充前者):urljoin')
print(urljoin('http://www.baidu.com','FAQ.html'))                                            # http://www.baidu.com/FAQ.html
print(urljoin('http://www.baidu.com','https://XerCis.com/FAQ.html'))                         # https://XerCis.com/FAQ.html
print(urljoin('http://www.baidu.com/about.html','https://XerCis.com/FAQ.html'))              # https://XerCis.com/FAQ.html
print(urljoin('http://www.baidu.com/about.html','https://XerCis.com/FAQ.html?question=2'))   # https://XerCis.com/FAQ.html?question=2
print(urljoin('http://www.baidu.com/about.html?wd=abc','https://XerCis.com/index.php'))      # https://XerCis.com/index.php
print(urljoin('http://www.baidu.com','?category=2#comment'))                                 # http://www.baidu.com?category=2#comment
print(urljoin('www.baidu.com','?category=2#comment'))                                        # www.baidu.com?category=2#comment
print(urljoin('www.baidu.com#comment','?category=2'))                                        # www.baidu.com?category=2

# Query-string building: urlencode turns a dict into key=value pairs
print('\nURL字典转请求参数:urlencode')
params = {
    'name':'germey',
    'age':22
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url)  # http://www.baidu.com?name=germey&age=22

(此处原为运行结果截图,图片已丢失)

阅读更多
个人分类: Python爬虫
想对作者说点什么? 我来说一句

没有更多推荐了,返回首页

关闭
关闭
关闭