网络库
常用库 | 使用场景 |
---|---|
urllib库 | http协议常用库 |
requests库 | http协议常用库 |
BeautifulSoup库 | html/xml格式处理库 |
urllib简单爬虫
# Minimal page fetch with urllib.
from urllib import request

url = 'http://www.baidu.com'
# urlopen() returns a response that keeps a connection open; the context
# manager closes it even if read() or decode() raises.
with request.urlopen(url, timeout=1) as response:
    print(response.read().decode('utf-8'))
处理url超时异常
# Handling a request timeout with urllib.
from urllib import request
import socket
import urllib.error  # explicit: URLError lives in the urllib.error submodule

try:
    # The deliberately tiny timeout is meant to make the request time out.
    response2 = request.urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e:
    # A timeout while sending the request is wrapped in URLError; the
    # underlying exception is exposed as e.reason.
    if isinstance(e.reason, socket.timeout):
        print('Time Out')
    else:
        # Any other failure (DNS, refused connection, ...) is not a timeout;
        # let it propagate instead of hiding it.
        raise
except socket.timeout:
    # A timeout while waiting for the response is raised directly rather than
    # wrapped in URLError, so it needs its own handler.
    print('Time Out')
else:
    # The request succeeded after all: release the connection.
    response2.close()
模拟浏览器请求
# Simulate a browser: POST a form to httpbin with browser-like request headers.
from urllib import request, parse
url = 'http://httpbin.org/post'
# Request headers; the User-Agent identifies the client as Chrome on Windows.
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
# Advertises that compressed response bodies are accepted.
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.9",
"Upgrade-Insecure-Requests": "1",
"Referer": "http://httpbin.org",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
}
# Form fields to submit. NOTE: this name shadows the built-in dict type.
dict = {
"name": "value"
}
# urlencode() produces a str; the request body must be bytes.
data = bytes(parse.urlencode(dict),encoding='utf-8')
req = request.Request(url, data=data, headers=headers, method='post')
response = request.urlopen(req)
# The raw body is decoded as UTF-8 without being decompressed first; the text
# following this snippet reports the UnicodeDecodeError this line produces.
print(response.read().decode('utf-8'))
运行这段代码会报错:
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte
原因:请求头中声明了 "Accept-Encoding": "gzip, deflate",服务器返回的是经过gzip压缩的数据(以 b'\x1f\x8b\x08' 开头的字节),不能直接按utf-8解码,需要先解压。下面导入gzip进行解压:
# POST a form with browser-like headers and decode a possibly compressed reply.
from urllib import request, parse
from io import BytesIO
import gzip
import zlib

url = 'http://httpbin.org/post'
# Request headers; Accept-Encoding tells the server that gzip and deflate
# bodies are acceptable, so the reply has to be decoded accordingly below.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Upgrade-Insecure-Requests": "1",
    "Referer": "http://httpbin.org",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
}
# Named form_fields (not dict) so the built-in dict type is not shadowed.
form_fields = {
    "name": "value",
}
# urlencode() produces a str; the request body must be bytes.
data = parse.urlencode(form_fields).encode('utf-8')
# HTTP method names are case-sensitive, so use the canonical upper-case form.
req = request.Request(url, data=data, headers=headers, method='POST')
# The context manager closes the connection once the body has been read.
with request.urlopen(req) as response:
    html = response.read()
    content_encoding = response.headers.get('Content-Encoding', '').lower()

# Decompress only when the server reports that it compressed the body;
# gunzipping an uncompressed reply would raise instead of printing it.
if content_encoding == 'gzip':
    with gzip.GzipFile(fileobj=BytesIO(html)) as f:
        html = f.read()
elif content_encoding == 'deflate':
    html = zlib.decompress(html)

res = html.decode('utf-8')
print(res)
使用requests库模拟get/post请求
# GET and POST against httpbin using the requests library.
import requests as req

payload = {"key": "value", "abc": "xyz"}

# GET: the payload travels as query-string parameters.
get_reply = req.get('http://httpbin.org/get', params=payload)
print(get_reply.text)

# POST: the same payload travels form-encoded in the request body.
post_reply = req.post('http://httpbin.org/post', data=payload)
print(post_reply.text)
结合正则爬取网页图片链接
# Extract work links and titles from a listing page with requests + re.
import requests as req
import re

context = req.get("http://www.cnu.cc/discoveryPage/hot-人像").text
# Sample markup for one work on the page:
# <div class="grid-item work-thumbnail">
# <a href="http://www.cnu.cc/works/351070" class="thumbnail" target="_blank">
# <div class="title">"广州森林“</div>
# <div class="author>安正</div>
# <img src="http://img.cnu.cc/uploads/images/flow/1904/26/b0bb0ccad62b3b1992f4d4249433c528.jpg?width=2983&height=1678" alt=""广州森林“">
# </a>
# The same markup with the two captured parts replaced by groups:
# <div class="grid-item work-thumbnail">
# <a href="(.*?)" .*?title">(.*?)</div>
# <div class="author>安正</div>
# <img src="http://img.cnu.cc/uploads/images/flow/1904/26/b0bb0ccad62b3b1992f4d4249433c528.jpg?width=2983&height=1678" alt=""广州森林“">
# </a>
# re.S lets '.' match newlines, so one match can span several lines of markup.
pattern = re.compile(r'<a href="(.*?)" .*?title">(.*?)</div>', re.S)
results = pattern.findall(context)
print(results)
for url, name in results:
    # r'\s' matches any whitespace character; remove all of it from the title.
    print(url, re.sub(r'\s', '', name))
BeautifulSoup
简单使用BeautifulSoup
# Basic BeautifulSoup usage on an inline HTML document.
from bs4 import BeautifulSoup as bs

html_doc = """
<html>
<head>
<title>这是个标题</title>
</head>
<body>
<h1>这是一个一个简单的HTML</h1>
<p class="hahaha">Hello World!</p>
<a href="www.baidu.com">1</a>
<a href="www.baidu.com">2</a>
</body>
</html>
"""
soup = bs(html_doc, features='lxml')

# Pretty-printed version of the parsed document.
print(soup.prettify())

# The <title> tag, then just its text content.
title_tag = soup.title
print(title_tag)
print(title_tag.string)

# The first <p> tag, then the value of its class attribute.
first_paragraph = soup.p
print(first_paragraph)
print(first_paragraph['class'])

# The first <a> tag only.
print(soup.a)

# Every <a> tag in the document.
print(soup.find_all('a'))
案例:模拟爬取网页并下载图片
# Image crawler: shared imports, request headers and the page to scan.
from bs4 import BeautifulSoup
import requests as req
import os
import shutil
# Request headers; the User-Agent identifies the client as Chrome on Windows.
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.9",
"Upgrade-Insecure-Requests": "1",
"Referer": "http://httpbin.org",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
}
# Page whose <img> tags are scanned by get_image().
url = 'https://www.infoq.com/presentations'
# Collect every image referenced on a page and download it.
def get_image(url, dir_path=r'C:\Users\yuanshuai\Desktop\下载图片'):
    """Download every image referenced by an <img> tag on the page at *url*.

    Args:
        url: Address of the HTML page to scan.
        dir_path: Directory the images are saved into; it is created when
            missing. Defaults to the path the script originally hard-coded.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin, urlparse

    response = req.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    # exist_ok avoids FileExistsError when the script is run a second time.
    os.makedirs(dir_path, exist_ok=True)
    for img in soup.find_all('img'):
        src = img.get("src")
        if not src:
            # <img> without a src attribute: there is nothing to fetch.
            continue
        # Resolve relative and protocol-relative src values against the page.
        img_url = urljoin(url, src)
        parsed = urlparse(img_url)
        if parsed.scheme not in ('http', 'https'):
            # e.g. inline "data:" images, which cannot be requested over HTTP.
            continue
        # Use only the URL path, so https://host/dir/a.jpg?width=100 is saved
        # as a.jpg and the query string stays out of the file name.
        file_name = os.path.basename(parsed.path)
        if not file_name:
            continue
        image_path = os.path.join(dir_path, file_name)
        download_image(img_url, image_path)
# Download a single image to disk.
def download_image(img_url, image_path, timeout=10):
    """Stream the image at *img_url* into the file *image_path*.

    Args:
        img_url: Absolute URL of the image.
        image_path: Destination file path; an existing file is overwritten.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True when the server answered 200 and the file was written,
        False for any other status code (no file is created in that case).
    """
    # stream=True defers downloading the body; the with-block guarantees the
    # connection is released even if writing the file fails.
    with req.get(img_url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            return False
        # response.raw does not decode gzip/deflate content encodings by
        # default; decode_content=True makes it yield the decoded bytes.
        response.raw.decode_content = True
        with open(image_path, 'wb') as f:
            # Copy the body to the file in chunks instead of loading it whole.
            shutil.copyfileobj(response.raw, f)
    return True
# Run the download for the page URL defined above.
get_image(url)