Download caching
There are many ways to use a cache. For a site with millions of pages, re-crawling everything from scratch is extremely expensive; caching from the very first crawl means each page only needs to be downloaded once.
Adding cache support to the link crawler
We turn the previous chapter's download function into a class so that it can cache results. Using a dict to record visited links is a good way to filter repeat downloads, with the URL as the key. The Downloader class below keeps the core code from the previous chapter.
#!/usr/bin/env python
# encoding: utf-8
from random import choice
import requests


class Downloader:
    """ Downloader class to use cache and requests for downloading pages.
        For constructor, pass:
            delay (int): # of secs delay between requests (default: 5)
            user_agent (str): user agent string (default: 'wswp')
            proxies (list[dict]): list of possible proxies, each
                must be a dict with http / https keys and proxy values
            cache (dict or dict-like obj): keys: urls, values: dicts with keys (html, code)
            timeout (float/int): number of seconds to wait until timeout
    """
    def __init__(self, delay=5, user_agent='wswp', proxies=None, cache=None,
                 timeout=60):
        self.user_agent = user_agent
        self.proxies = proxies
        self.cache = {} if cache is None else cache  # avoid a shared mutable default argument
        self.num_retries = None  # we will set this per request
        self.timeout = timeout

    def __call__(self, url, num_retries=2):
        """ Call the downloader class, which will return HTML from cache
            or download it
            args:
                url (str): url to download
            kwargs:
                num_retries (int): # times to retry if 5xx code (default: 2)
        """
        self.num_retries = num_retries
        try:
            result = self.cache[url]
            print('Loaded from cache:', url)
        except KeyError:
            result = None
        if result and self.num_retries and 500 <= result['code'] < 600:
            # server error so ignore result from cache
            # and re-download
            result = None
        if result is None:
            # result was not loaded from cache, need to download
            proxies = choice(self.proxies) if self.proxies else None
            headers = {'User-Agent': self.user_agent}
            result = self.download(url, headers, proxies)
            self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxies):
        # minimal implementation so the class is runnable; retries on 5xx
        # as described in the __call__ docstring
        print('Downloading:', url)
        try:
            resp = requests.get(url, headers=headers, proxies=proxies,
                                timeout=self.timeout)
            html = resp.text
            code = resp.status_code
            if code >= 400:
                print('Download error:', code)
                html = None
                if self.num_retries and 500 <= code < 600:
                    # temporary server error, so retry the download
                    self.num_retries -= 1
                    return self.download(url, headers, proxies)
        except requests.exceptions.RequestException as e:
            print('Download error:', e)
            html, code = None, 500
        return {'html': html, 'code': code}


if __name__ == "__main__":
    downloader = Downloader()
    downloader('http://example.python-scraping.com')  # first request, nothing cached yet
    downloader('http://example.python-scraping.com')  # served from the cache
    downloader('http://www.baidu.com')                # another uncached URL
When run, the first and third URLs print 'Downloading:', while the second prints 'Loaded from cache:'.
Disk cache
The disk cache described in the book does not save the raw HTML file to disk; it serializes the result dict (HTML plus status code) to a JSON string and runs it through zlib.compress on write and zlib.decompress on read. Cached entries expire after 30 days.
#!/usr/bin/env python
# encoding: utf-8
import os
import json
import re
import zlib
from datetime import datetime, timedelta
from urllib.parse import urlsplit
import requests


class DiskCache:
    """ DiskCache helps store urls and their responses to disk
        Initialization components:
            cache_dir (str): abs file path or relative file path
                for cache directory (default: ../data/cache)
            max_len (int): maximum filename length (default: 255)
            compress (bool): use zlib compression (default: True)
            encoding (str): character encoding for compression (default: utf-8)
            expires (datetime.timedelta): timedelta when content will expire
                (default: 30 days)
    """
    def __init__(self, cache_dir='./data/cache', max_len=255, compress=True,
                 encoding='utf-8', expires=timedelta(days=30)):
        self.cache_dir = cache_dir
        self.max_len = max_len
        self.compress = compress
        self.encoding = encoding
        self.expires = expires

    def url_to_path(self, url):
        """ Return file system path string for given URL """
        components = urlsplit(url)
        # append index.html to empty paths
        path = components.path
        if not path:
            path = '/index.html'
        elif path.endswith('/'):
            path += 'index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters
        filename = re.sub(r'[^/0-9a-zA-Z\-.,;_ ]', '_', filename)
        # restrict maximum number of characters
        filename = '/'.join(seg[:self.max_len] for seg in filename.split('/'))
        return os.path.join(self.cache_dir, filename)

    def __getitem__(self, url):
        """Load data from disk for given URL"""
        path = self.url_to_path(url)
        if os.path.exists(path):
            mode = ('rb' if self.compress else 'r')
            with open(path, mode) as fp:
                if self.compress:
                    data = zlib.decompress(fp.read()).decode(self.encoding)
                    data = json.loads(data)
                else:
                    data = json.load(fp)
            exp_date = data.get('expires')
            if exp_date and datetime.strptime(exp_date,
                    '%Y-%m-%dT%H:%M:%S') <= datetime.utcnow():
                print('Cache expired!', exp_date)
                raise KeyError(url + ' has expired.')
            return data
        else:
            # URL has not yet been cached
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        """Save data to disk for given url"""
        path = self.url_to_path(url)
        folder = os.path.dirname(path)
        if not os.path.exists(folder):
            os.makedirs(folder)
        mode = ('wb' if self.compress else 'w')
        # Note: the timespec argument requires Py3.6+ (if using 3.X you can
        # export using isoformat() and import with '%Y-%m-%dT%H:%M:%S.%f')
        result['expires'] = (datetime.utcnow() + self.expires).isoformat(
            timespec='seconds')
        with open(path, mode) as fp:
            if self.compress:
                data = bytes(json.dumps(result), self.encoding)
                fp.write(zlib.compress(data))
            else:
                json.dump(result, fp)


if __name__ == "__main__":
    dc = DiskCache()
    url = "http://example.python-scraping.com"
    print(dc.url_to_path(url))
    resp = requests.get(url)
    # store the result dict (html + status code) so the cached file holds the page
    dc[url] = {'html': resp.text, 'code': resp.status_code}
    # any JSON-serializable dict works too, e.g.:
    # dc[url] = {"name": "zengraoli", "password": "123456"}
After running the test you can see the cache directory has been created, and the file holds the zlib-compressed JSON for the page.
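Because Downloader only needs dict-like access (__getitem__/__setitem__ plus KeyError on a miss), DiskCache can be plugged straight in. A sketch, assuming the two classes above are saved in modules named downloader and diskcache (hypothetical names):

from downloader import Downloader
from diskcache import DiskCache

downloader = Downloader(cache=DiskCache())
downloader('http://example.python-scraping.com')  # downloads and writes to disk
downloader('http://example.python-scraping.com')  # now served from the disk cache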
Drawbacks of the disk cache
Even after replacing invalid characters, two different URLs can still map to the same filename; the fix is to hash the URL, as sketched below. The other problem is that a site with very many subpages produces a huge directory tree that is slow to search; the fix is to merge multiple pages into one file and index them with another data structure, which is exactly what a key-value store provides.
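A minimal sketch of the hashing idea; url_to_hashed_path is a hypothetical helper, not part of the book's code:

import os
import hashlib

def url_to_hashed_path(url, cache_dir='./data/cache'):
    # hashing the whole URL means sanitizing can never make two
    # distinct URLs collide on the same filename
    digest = hashlib.md5(url.encode('utf-8')).hexdigest()
    # fan out into 256 subdirectories so no single directory grows huge
    return os.path.join(cache_dir, digest[:2], digest + '.json')

print(url_to_hashed_path('http://example.python-scraping.com/places/'))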
Key-value store cache
Implementing the cache with Redis
Redis is an excellent fit for a cache: it supports per-key expiry natively, which replaces our earlier manual expiry handling, although compression is still up to us.
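A quick illustration of that native expiry, assuming a local Redis on the default port:

from datetime import timedelta
from redis import StrictRedis

client = StrictRedis(host='localhost', port=6379, db=0)
client.setex('greeting', timedelta(seconds=30), 'hello')  # Redis deletes the key by itself
print(client.ttl('greeting'))  # seconds left to live, e.g. 30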
The code below exercises Redis as a cache; make sure the redis package is installed first (pip install redis).
#!/usr/bin/env python
# encoding: utf-8
import json
import zlib
import requests
from datetime import timedelta
from redis import StrictRedis


class RedisCache:
    def __init__(self, client=None, expires=timedelta(days=30),
                 encoding='utf-8', compress=True):
        self.client = (StrictRedis(host='localhost', port=6379, db=0)
                       if client is None else client)
        self.expires = expires
        self.encoding = encoding
        self.compress = compress

    def __getitem__(self, url):
        """Load data from Redis for given URL"""
        record = self.client.get(url)
        if record:
            if self.compress:
                record = zlib.decompress(record)
            return json.loads(record.decode(self.encoding))
        else:
            # URL has not yet been cached
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        """Save data to Redis for given url"""
        data = bytes(json.dumps(result), self.encoding)
        if self.compress:
            data = zlib.compress(data)
        self.client.setex(url, self.expires, data)


if __name__ == "__main__":
    dc = RedisCache()
    url = "http://example.python-scraping.com"
    resp = requests.get(url)
    d = {"name": "zengraoli", "password": "123456"}
    html = resp.text
    # dc[url] = html
    dc[url] = d
    print(dc[url])
Exploring requests-cache
requests-cache spares us from implementing a cache class ourselves. It supports several backends: Redis, MongoDB, SQLite, and in-memory. Install it before use:
pip install requests-cache
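If no Redis server is available, the sqlite backend works with no extra services; a minimal sketch (the cache name 'demo_cache' is arbitrary):

import requests_cache

# creates demo_cache.sqlite in the working directory
requests_cache.install_cache('demo_cache', backend='sqlite', expire_after=3600)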
The code below uses requests-cache with the redis backend and a 30-day expiry; once the cache is installed, the second request barely takes any time.

import time
import requests
import requests_cache
from datetime import timedelta

if __name__ == "__main__":
    requests_cache.install_cache(backend='redis', expire_after=timedelta(days=30))
    url = "http://example.python-scraping.com"
    start = time.time()
    resp = requests.get(url)
    print("First request: %.2f secs" % (time.time() - start))   # e.g. about 1 sec
    start = time.time()  # reset the timer; otherwise the second measurement includes the first
    resp = requests.get(url)
    print("Cached request: %.2f secs" % (time.time() - start))  # should be near zero
Concurrent downloads
One million web pages
The "one million pages" come from a zip file hosted on Amazon S3 (Alexa's top-1m list) containing a single CSV. Fetching it with requests is slow, so it is easier to download it with a browser first and then read it from disk.
#!/usr/bin/env python
# encoding: utf-8
import csv
from zipfile import ZipFile
from io import TextIOWrapper, BytesIO
import requests

if __name__ == "__main__":
    # resp = requests.get('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip', stream=True)
    urls = []  # top 1 million URL's will be stored in this list
    # with ZipFile(BytesIO(resp.content)) as zf:
    with ZipFile("top-1m.csv.zip", "r") as zf:  # read the URLs from the local file
        csv_filename = zf.namelist()[0]
        with zf.open(csv_filename) as csv_file:
            for _, website in csv.reader(TextIOWrapper(csv_file)):
                urls.append('http://' + website)
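For completeness, the commented-out lines correspond to the fully in-memory variant, sketched here under the assumption that the S3 URL is still being served:

import csv
from io import BytesIO, TextIOWrapper
from zipfile import ZipFile
import requests

resp = requests.get('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip', stream=True)
urls = []
with ZipFile(BytesIO(resp.content)) as zf:  # unzip straight from memory
    csv_filename = zf.namelist()[0]
    with zf.open(csv_filename) as csv_file:
        for _, website in csv.reader(TextIOWrapper(csv_file)):
            urls.append('http://' + website)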
Multi-threaded crawler
Stripping out the other features, here is a simulation of a bare multi-threaded crawler.
#!/usr/bin/env python
# encoding: utf-8
import csv
from zipfile import ZipFile
from io import TextIOWrapper
import requests
import threading
import socket
import time

SLEEP_TIME = 1
socket.setdefaulttimeout(60)
requests.packages.urllib3.disable_warnings()  # we are not interested in the SSL warnings


def threaded_crawler(start_url, max_threads=5):
    if isinstance(start_url, list):
        crawl_queue = start_url
    else:
        crawl_queue = [start_url]

    def process_queue():
        while crawl_queue:
            try:
                url = crawl_queue.pop()
            except IndexError:
                # another thread emptied the queue between the check and the pop
                break
            headers = {'User-Agent': "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"}
            try:
                html = requests.get(url, headers=headers, verify=False)
                print(url, html.status_code)
            except Exception as ee:
                print(url, " ee:", ee)

    # wait for all download threads to finish
    threads = []
    print(max_threads)
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.daemon = True  # set daemon so main thread can exit w/ ctrl-c
            thread.start()
            threads.append(thread)
        print(threads)
        for thread in threads:
            thread.join()
        time.sleep(SLEEP_TIME)


if __name__ == "__main__":
    urls = []  # top 1 million URL's will be stored in this list
    with ZipFile("top-1m.csv.zip", "r") as zf:  # read the URLs from the local file
        csv_filename = zf.namelist()[0]
        with zf.open(csv_filename) as csv_file:
            for _, website in csv.reader(TextIOWrapper(csv_file)):
                urls.append('http://' + website)
    # take 10 URLs as a multi-threaded crawling example
    url_list = urls[:10]
    threaded_crawler(url_list)
The output prints each URL with its HTTP status code as the threads work through the queue.
Multi-process crawler
Because of the GIL, multiprocessing can give a speedup over multithreading in Python, but separate processes cannot share an in-process download queue, so we use Redis as the shared medium.
#!/usr/bin/env python
# encoding: utf-8
import csv
from zipfile import ZipFile
from io import TextIOWrapper
import requests
import threading
import multiprocessing
import socket
import time
from redis_queue import RedisQueue

SLEEP_TIME = 1
socket.setdefaulttimeout(60)
requests.packages.urllib3.disable_warnings()  # we are not interested in the SSL warnings


def threaded_crawler_rq(start_url, max_threads=5):
    # the queue lives in Redis, so every process sees the same URLs
    crawl_queue = RedisQueue()

    def process_queue():
        while crawl_queue:
            url = crawl_queue.pop()
            if url is None:
                # another thread or process emptied the queue first
                break
            headers = {'User-Agent': "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"}
            try:
                html = requests.get(url, headers=headers, verify=False)
                print(url, html.status_code)
            except Exception as ee:
                print(url, " ee:", ee)

    # wait for all download threads to finish
    threads = []
    print(max_threads)
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.daemon = True  # set daemon so main thread can exit w/ ctrl-c
            thread.start()
            threads.append(thread)
        print(threads)
        for thread in threads:
            thread.join()
        time.sleep(SLEEP_TIME)


def mp_threaded_crawler(*args, **kwargs):
    """ create a multiprocessing threaded crawler """
    processes = []
    num_procs = kwargs.pop('num_procs', None)  # default avoids a KeyError when not passed
    if not num_procs:
        num_procs = multiprocessing.cpu_count()
    for _ in range(num_procs):
        proc = multiprocessing.Process(target=threaded_crawler_rq,
                                       args=args, kwargs=kwargs)
        proc.start()
        processes.append(proc)
    # wait for processes to complete
    for proc in processes:
        proc.join()


if __name__ == "__main__":
    urls = []  # top 1 million URL's will be stored in this list
    crawl_queue = RedisQueue()
    with ZipFile("top-1m.csv.zip", "r") as zf:  # read the URLs from the local file
        csv_filename = zf.namelist()[0]
        with zf.open(csv_filename) as csv_file:
            for _, website in csv.reader(TextIOWrapper(csv_file)):
                urls.append('http://' + website)
    # take 20 URLs as a multi-process crawling example
    url_list = urls[:20]
    crawl_queue.push(url_list)  # push the 20 links into the Redis queue
    start_time = time.time()
    mp_threaded_crawler(url_list, num_procs=4)
    print('Total time: %ss' % (time.time() - start_time))
The companion Redis queue code; note where you save it, since it is imported above as redis_queue (i.e. redis_queue.py in the same directory).
# Based loosely on the Redis Cookbook FIFO Queue: http://www.rediscookbook.org/implement_a_fifo_queue.html
from redis import StrictRedis


class RedisQueue:
    def __init__(self, client=None, db=0, queue_name='wswp'):
        self.client = (StrictRedis(host='localhost', port=6379, db=db)
                       if client is None else client)
        self.name = "queue:%s" % queue_name
        self.seen_set = "seen:%s" % queue_name
        self.depth = "depth:%s" % queue_name

    def __len__(self):
        return self.client.llen(self.name)

    def push(self, element):
        """Push an element to the tail of the queue"""
        if isinstance(element, list):
            element = [e for e in element if not self.already_seen(e)]
            self.client.lpush(self.name, *element)
            self.client.sadd(self.seen_set, *element)
        elif not self.already_seen(element):
            self.client.lpush(self.name, element)
            self.client.sadd(self.seen_set, element)

    def already_seen(self, element):
        """ determine if an element has already been seen """
        return self.client.sismember(self.seen_set, element)

    def set_depth(self, element, depth):
        """ Set the seen hash and depth """
        self.client.hset(self.depth, element, depth)

    def get_depth(self, element):
        """ Get the seen hash and depth """
        dep = self.client.hget(self.depth, element)
        return int(dep) if dep else 0

    def pop(self):
        """Pop an element from the head of the queue"""
        element = self.client.rpop(self.name)
        # rpop returns None on an empty queue; guard before decoding
        return element.decode('utf-8') if element is not None else None
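A quick smoke test of the queue, assuming a local Redis server and a fresh seen-set (previously pushed URLs are filtered out):

q = RedisQueue()
q.push(['http://example.com', 'http://example.org'])
print(len(q))   # 2
print(q.pop())  # 'http://example.com' -- lpush + rpop gives FIFO order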
The output of the multi-process run prints URLs and status codes from all four processes interleaved, followed by the total time.
References
Python 3's __call__ method
Two ways to measure time in Python: time vs. datetime
Working with Python's zipfile module
Handling the InsecureRequestWarning in Python requests