Local Cache
Chapter 2 showed how to crawl pages and save the scraped data, but any time we want to extract something new we have to download the pages all over again. For a large website that re-downloading is a significant time cost, so it pays to store each page the first time it is downloaded instead of fetching it twice.
1. Adding cache support to the link crawler
Before downloading a page, first check whether it is already in the cache.
If it is, check whether the cached response was a server error.
If neither check fails, the cached data can be used directly; otherwise the page must be downloaded afresh.
Note that throttling is only needed when an actual download takes place.
from random import choice

import requests

from ch1_link_crawler import Throttle


class Downloader:
    """ Downloader class to use cache and requests for downloading pages.
        For constructor, pass:
            delay (int): # of secs delay between requests (default: 5)
            user_agent (str): user agent string (default: 'wswp')
            proxies (list[dict]): list of possible proxies, each
                must be a dict with http / https keys and proxy values
            cache (dict or dict-like obj): keys: urls, values: dicts with keys (html, code)
            timeout (float/int): number of seconds to wait until timeout
    """
    def __init__(self, delay=5, user_agent='wswp', proxies=None, cache=None,
                 timeout=60):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        # create the dict here rather than in the signature, to avoid the
        # shared-mutable-default-argument pitfall
        self.cache = cache if cache is not None else {}
        self.num_retries = None  # we will set this per request
        self.timeout = timeout

    def __call__(self, url, num_retries=2):
        """ Call the downloader class, which will return HTML from cache
            or download it
            args:
                url (str): url to download
            kwargs:
                num_retries (int): # times to retry if 5xx code (default: 2)
        """
        self.num_retries = num_retries
        try:
            result = self.cache[url]
            print('Loaded from cache:', url)
        except KeyError:
            result = None
        if result and self.num_retries and 500 <= result['code'] < 600:
            # server error so ignore result from cache
            # and re-download
            result = None
        if result is None:
            # result was not loaded from cache, need to download
            self.throttle.wait(url)
            proxies = choice(self.proxies) if self.proxies else None
            headers = {'User-Agent': self.user_agent}
            result = self.download(url, headers, proxies)
            self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxies):
        """ Download the URL and return the page content
            args:
                url (str): URL
                headers (dict): dict of headers (like user_agent)
                proxies (dict): proxy dict w/ keys 'http'/'https', values
                    are strs (i.e. 'http(s)://IP') (default: None)
        """
        print('Downloading:', url)
        try:
            resp = requests.get(url, headers=headers, proxies=proxies,
                                timeout=self.timeout)
            html = resp.text
            if resp.status_code >= 400:
                print('Download error:', resp.text)
                html = None
                if self.num_retries and 500 <= resp.status_code < 600:
                    # recursively retry 5xx HTTP errors
                    self.num_retries -= 1
                    return self.download(url, headers, proxies)
        except requests.exceptions.RequestException as e:
            print('Download error:', e)
            return {'html': None, 'code': 500}
        return {'html': html, 'code': resp.status_code}
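The Throttle class is imported from the chapter 1 crawler module. A minimal sketch of what it is assumed to do, followed by a quick check using a plain dict as the cache (the URL is the book's sandbox site):

import time
from urllib.parse import urlparse

class Throttle:
    """Minimal sketch of the chapter 1 Throttle: sleep between
    requests to the same domain."""
    def __init__(self, delay):
        self.delay = delay
        self.domains = {}  # domain -> timestamp of last access

    def wait(self, url):
        domain = urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (time.time() - last_accessed)
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = time.time()

cache = {}
D = Downloader(delay=1, cache=cache)
html = D('http://example.webscraping.com/')  # prints 'Downloading: ...'
html = D('http://example.webscraping.com/')  # prints 'Loaded from cache: ...'

The second call never touches the network, which is exactly the behaviour the checklist above describes.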
To support caching, the link_crawler function also needs a cache parameter; its throttling code is removed (the Downloader now handles it) and the download function is replaced by the new Downloader class.
import re
from urllib.parse import urljoin

# get_robots_parser and get_links are assumed to live in the same
# chapter 1 module as Throttle
from ch1_link_crawler import get_robots_parser, get_links


def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp',
                 proxies=None, delay=3, max_depth=4, num_retries=2,
                 cache=None, scraper_callback=None):
    """ Crawl from the given start URL following links matched by link_regex.
        In the current implementation, we do not actually scrape any information.
        args:
            start_url (str): web site to start crawl
            link_regex (str): regex to match for links
        kwargs:
            robots_url (str): url of the site's robots.txt
                (default: start_url + /robots.txt)
            user_agent (str): user agent (default: wswp)
            proxies (list of dicts): a list of possible dicts for
                http / https proxies. For formatting, see the requests library
            delay (int): seconds to throttle between requests
                to one domain (default: 3)
            max_depth (int): maximum crawl depth (to avoid traps) (default: 4)
            num_retries (int): # of retries when 5xx error (default: 2)
            cache (dict or dict-like obj): cache with urls as keys and
                response dicts as values (default: None, meaning a fresh dict)
            scraper_callback: function to be called on url and html content
    """
    crawl_queue = [start_url]
    # keep track of which URLs have been seen before
    seen = {}
    if not robots_url:
        robots_url = '{}/robots.txt'.format(start_url)
    rp = get_robots_parser(robots_url)
    D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies,
                   cache=cache)
    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                print('Skipping %s due to depth' % url)
                continue
            html = D(url, num_retries=num_retries)
            if not html:
                continue
            if scraper_callback:
                links = scraper_callback(url, html) or []
            else:
                links = []
            # filter for links matching our regular expression
            for link in get_links(html) + links:
                if re.match(link_regex, link):
                    abs_link = urljoin(start_url, link)
                    if abs_link not in seen:
                        seen[abs_link] = depth + 1
                        crawl_queue.append(abs_link)
        else:
            print('Blocked by robots.txt:', url)
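As a smoke test, one dict cache can be shared across two crawls so that the second run is served entirely from memory (a minimal sketch; example.webscraping.com is the book's sandbox site):

cache = {}
link_crawler('http://example.webscraping.com', '/places/default/(index|view)',
             cache=cache, max_depth=1)
# every URL fetched above is now returned from `cache` on the second run
link_crawler('http://example.webscraping.com', '/places/default/(index|view)',
             cache=cache, max_depth=1)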
2. Disk Cache
Different file systems impose different restrictions. To keep file paths safe on all of them, file names are restricted to digits, letters, and a few basic symbols, with every other character replaced by '_'.
File and directory name lengths are limited to 255 characters.
A URL ending in '/' would leave an empty string after the '/', which is an illegal file name, so 'index.html' is appended as the file name in that case.
The url_to_path function implements this URL-to-file-name mapping under the restrictions above.
The __setitem__ method writes the data for a given URL to disk, storing the current time plus the expiry interval alongside it.
The __getitem__ method loads the data for a given URL from disk and compares the current time against the timestamp saved by __setitem__ to check whether the entry has expired.
To save disk space, the serialized string is compressed with zlib.
import os
import json
import re
import zlib
from datetime import datetime, timedelta
from urllib.parse import urlsplit


class DiskCache:
    """ DiskCache helps store urls and their responses to disk
        Initialization components:
            cache_dir (str): abs file path or relative file path
                for cache directory (default: cache/)
            max_len (int): maximum filename length (default: 255)
            compress (bool): use zlib compression (default: True)
            encoding (str): character encoding for compression (default: utf-8)
            expires (datetime.timedelta): timedelta for when content
                will expire (default: 30 days)
    """
    def __init__(self, cache_dir='cache/', max_len=255, compress=True,
                 encoding='utf-8', expires=timedelta(days=30)):
        self.cache_dir = cache_dir
        self.max_len = max_len
        self.compress = compress
        self.encoding = encoding
        self.expires = expires

    def url_to_path(self, url):
        """ Return file system path string for given URL """
        components = urlsplit(url)
        # append index.html to empty paths
        path = components.path
        if not path:
            path = '/index.html'
        elif path.endswith('/'):
            path += 'index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters
        filename = re.sub(r'[^/0-9a-zA-Z\-.,;_ ]', '_', filename)
        # restrict maximum number of characters
        filename = '/'.join(seg[:self.max_len] for seg in filename.split('/'))
        return os.path.join(self.cache_dir, filename)

    def __getitem__(self, url):
        """Load data from disk for given URL"""
        path = self.url_to_path(url)
        if os.path.exists(path):
            mode = ('rb' if self.compress else 'r')
            with open(path, mode) as fp:
                if self.compress:
                    data = zlib.decompress(fp.read()).decode(self.encoding)
                    data = json.loads(data)
                else:
                    data = json.load(fp)
            exp_date = data.get('expires')
            if exp_date and datetime.strptime(exp_date,
                    '%Y-%m-%dT%H:%M:%S') <= datetime.utcnow():
                print('Cache expired!', exp_date)
                raise KeyError(url + ' has expired.')
            return data
        else:
            # URL has not yet been cached
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        """Save data to disk for given url"""
        path = self.url_to_path(url)
        folder = os.path.dirname(path)
        if not os.path.exists(folder):
            os.makedirs(folder)
        mode = ('wb' if self.compress else 'w')
        # Note: the timespec argument requires Py3.6+ (on 3.5 or earlier you
        # can export using isoformat() and import with '%Y-%m-%dT%H:%M:%S.%f')
        result['expires'] = (datetime.utcnow() + self.expires).isoformat(
            timespec='seconds')
        with open(path, mode) as fp:
            if self.compress:
                data = bytes(json.dumps(result), self.encoding)
                fp.write(zlib.compress(data))
            else:
                json.dump(result, fp)
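A short roundtrip illustrates both the path mapping and the expiry bookkeeping (a sketch; the HTML payload is a placeholder):

cache = DiskCache()
url = 'http://example.webscraping.com/places/default/view/'
print(cache.url_to_path(url))
# cache/example.webscraping.com/places/default/view/index.html
cache[url] = {'html': '<html>placeholder</html>', 'code': 200}
print(cache[url]['code'])  # 200, decompressed and parsed back from disk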
Because of these file system limits, and because URLs can be longer than 255 characters, different URLs may map to the same file name, as the sketch below makes concrete. Directories and file systems also cap how many files they can hold. A database cache avoids all of these problems.
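Two distinct over-long URLs collapsing to one path shows the collision (hypothetical URLs):

cache = DiskCache(max_len=255)
url_a = 'http://example.com/' + 'a' * 300 + 'x'
url_b = 'http://example.com/' + 'a' * 300 + 'y'
# both path segments are truncated to their first 255 characters,
# so the two URLs map to the same cache file
assert cache.url_to_path(url_a) == cache.url_to_path(url_b)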
3. Database Cache
Creating a TTL index on the timestamp field in MongoDB makes the server delete records automatically once they are older than the configured expiry.
The remaining methods mirror the disk cache implementation.
import json
import zlib
from datetime import datetime, timedelta

from pymongo import MongoClient


class MongoCache:
    def __init__(self, client=None, expires=timedelta(days=30), encoding='utf-8'):
        """
        client: mongo database client
        expires: timedelta of amount of time before a cache entry
            is considered expired
        """
        # if a client object is not passed
        # then try connecting to mongodb at the default localhost port
        self.client = MongoClient('localhost', 27017) if client is None else client
        # create collection to store cached webpages,
        # which is the equivalent of a table in a relational database
        self.db = self.client.cache
        self.encoding = encoding
        # TTL index: build the index on a date/timestamp field and give the
        # retention period in the index options; mongodb then removes
        # "expired" documents automatically once they are older than that
        # period, which amounts to a built-in scheduled clean-up job
        self.db.webpage.create_index(
            'timestamp', expireAfterSeconds=expires.total_seconds())

    def __contains__(self, url):
        try:
            self[url]
        except KeyError:
            return False
        else:
            return True

    def __getitem__(self, url):
        """Load value at this URL"""
        record = self.db.webpage.find_one({'_id': url})
        if record:
            # without compression this would simply be: return record['result']
            return json.loads(
                zlib.decompress(record['result']).decode(self.encoding))
        else:
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        """Save value for this URL"""
        # without compression this would simply be:
        # record = {'result': result, 'timestamp': datetime.utcnow()}
        record = {'result': zlib.compress(bytes(json.dumps(result), self.encoding)),
                  'timestamp': datetime.utcnow()}
        self.db.webpage.update_one({'_id': url}, {'$set': record}, upsert=True)

    def clear(self):
        self.db.webpage.drop()
        print('drop() successful')
Finally, the crawler can be pointed at the MongoDB-backed cache:

link_crawler('http://example.webscraping.com', '/places/default' + '/(index|view)',
             cache=MongoCache(expires=timedelta(seconds=100)))
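A standalone roundtrip can verify the cache independently of the crawler (assumes a MongoDB server on localhost:27017; note that MongoDB's TTL monitor only sweeps expired documents roughly once a minute, so removal is not instantaneous):

cache = MongoCache(expires=timedelta(seconds=100))
cache['http://example.webscraping.com/'] = {'html': '<html>placeholder</html>', 'code': 200}
print('http://example.webscraping.com/' in cache)        # True
print(cache['http://example.webscraping.com/']['code'])  # 200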
Reference: https://blog.csdn.net/u014134180/article/details/55506984