Local Cache
Chapter 2 showed how to crawl pages and save the scraped data, but any time we want to extract something new we have to download the pages all over again. For a large website that re-downloading is a significant time cost, so it pays to store each page the first time it is downloaded instead of fetching it twice.
1. Adding cache support to the link crawler
Before downloading a page, first check whether it is already in the cache.
If it is, check whether the cached response was a server error.
If neither check fails, the cached data can be used directly; otherwise the page must be downloaded afresh.
Note that throttling is only needed when an actual download takes place.
from random import choice

import requests

from ch1_link_crawler import Throttle


class Downloader:
    """ Downloader class to use cache and requests for downloading pages.
        For constructor, pass:
            delay (int): # of secs delay between requests (default: 5)
            user_agent (str): user agent string (default: 'wswp')
            proxies (list[dict]): list of possible proxies, each
                must be a dict with http / https keys and proxy values
            cache (dict or dict-like obj): keys: urls, values: dicts with keys (html, code)
            timeout (float/int): number of seconds to wait until timeout
    """
    def __init__(self, delay=5, user_agent='wswp', proxies=None, cache=None,
                 timeout=60):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        # create the dict here rather than in the signature, to avoid the
        # shared-mutable-default-argument pitfall
        self.cache = cache if cache is not None else {}
        self.num_retries = None  # we will set this per request
        self.timeout = timeout

    def __call__(self, url, num_retries=2):
        """ Call the downloader class, which will return HTML from cache
            or download it
            args:
                url (str): url to download
            kwargs:
                num_retries (int): # times to retry if 5xx code (default: 2)
        """
        self.num_retries = num_retries
        try:
            result = self.cache[url]
            print('Loaded from cache:', url)
        except KeyError:
            result = None
        if result and self.num_retries and 500 <= result['code'] < 600:
            # server error so ignore result from cache
            # and re-download
            result = None
        if result is None:
            # result was not loaded from cache, need to download
            self.throttle.wait(url)
            proxies = choice(self.proxies) if self.proxies else None
            headers = {'User-Agent': self.user_agent}
            result = self.download(url, headers, proxies)
            self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxies):
        """ Download the URL and return the page content
            args:
                url (str): URL
                headers (dict): dict of headers (like user_agent)
                proxies (dict): proxy dict w/ keys 'http'/'https', values
                    are strs (i.e. 'http(s)://IP') (default: None)
        """
        print('Downloading:', url)
        try:
            resp = requests.get(url, headers=headers, proxies=proxies,
                                timeout=self.timeout)
            html = resp.text
            if resp.status_code >= 400:
                print('Download error:', resp.text)
                html = None
                if self.num_retries and 500 <= resp.status_code < 600:
                    # recursively retry 5xx HTTP errors
                    self.num_retries -= 1
                    return self.download(url, headers, proxies)
        except requests.exceptions.RequestException as e:
            print('Download error:', e)
            return {'html': None, 'code': 500}
        return {'html': html, 'code': resp.status_code}
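The Throttle class is imported from the chapter 1 crawler module. A minimal sketch of what it is assumed to do, followed by a quick check using a plain dict as the cache (the URL is the book's sandbox site):

import time
from urllib.parse import urlparse

class Throttle:
    """Minimal sketch of the chapter 1 Throttle: sleep between
    requests to the same domain."""
    def __init__(self, delay):
        self.delay = delay
        self.domains = {}  # domain -> timestamp of last access

    def wait(self, url):
        domain = urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (time.time() - last_accessed)
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = time.time()

cache = {}
D = Downloader(delay=1, cache=cache)
html = D('http://example.webscraping.com/')  # prints 'Downloading: ...'
html = D('http://example.webscraping.com/')  # prints 'Loaded from cache: ...'

The second call never touches the network, which is exactly the behaviour the checklist above describes.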
To support caching, the link_crawler function also needs a cache parameter; its throttling code is removed (the Downloader now handles it) and the download function is replaced by the new Downloader class.
import re
from urllib.parse import urljoin

# get_robots_parser and get_links are assumed to live in the same
# chapter 1 module as Throttle
from ch1_link_crawler import get_robots_parser, get_links


def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp',
                 proxies=None, delay=3, max_depth=4, num_retries=2,
                 cache=None, scraper_callback=None):
    """ Crawl from the given start URL following links matched by link_regex.
        In the current implementation, we do not actually scrape any information.
        args:
            start_url (str): web site to start crawl
            link_regex (str): regex to match for links
        kwargs:
            robots_url (str): url of the site's robots.txt
                (default: start_url + /robots.txt)
            user_agent (str): user agent (default: wswp)
            proxies (list of dicts): a list of possible dicts for
                http / https proxies. For formatting, see the requests library
            delay (int): seconds to throttle between requests
                to one domain (default: 3)
            max_depth (int): maximum crawl depth (to avoid traps) (default: 4)
            num_retries (int): # of retries when 5xx error (default: 2)
            cache (dict or dict-like obj): cache with urls as keys and
                response dicts as values (default: None, meaning a fresh dict)
            scraper_callback: function to be called on url and html content
    """
    crawl_queue = [start_url]
    # keep track of which URLs have been seen before
    seen = {}
    if not robots_url:
        robots_url = '{}/robots.txt'.format(start_url)
    rp = get_robots_parser(robots_url)
    D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies,
                   cache=cache)
    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                print('Skipping %s due to depth' % url)
                continue
            html = D(url, num_retries=num_retries)
            if not html:
                continue
            if scraper_callback:
                links = scraper_callback(url, html) or []
            else:
                links = []
            # filter for links matching our regular expression
            for link in get_links(html) + links:
                if re.match(link_regex, link):
                    abs_link = urljoin(start_url, link)
                    if abs_link not in seen:
                        seen[abs_link] = depth + 1
                        crawl_queue.append(abs_link)
        else:
            print('Blocked by robots.txt:', url)
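As a smoke test, one dict cache can be shared across two crawls so that the second run is served entirely from memory (a minimal sketch; example.webscraping.com is the book's sandbox site):

cache = {}
link_crawler('http://example.webscraping.com', '/places/default/(index|view)',
             cache=cache, max_depth=1)
# every URL fetched above is now returned from `cache` on the second run
link_crawler('http://example.webscraping.com', '/places/default/(index|view)',
             cache=cache, max_depth=1)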
2. Disk Cache
Different file systems impose different restrictions. To keep file paths safe on all of them, file names are restricted to digits, letters, and a few basic symbols, with every other character replaced by '_'.
File and directory name lengths are limited to 255 characters.
A URL ending in '/' would leave an empty string after the '/', which is an illegal file name, so 'index.html' is appended as the file name in that case.
The url_to_path function implements this URL-to-file-name mapping under the restrictions above.
The __setitem__ method writes the data for a given URL to disk, storing the current time plus the expiry interval alongside it.
The __getitem__ method loads the data for a given URL from disk and compares the current time against the timestamp saved by __setitem__ to check whether the entry has expired.
To save disk space, the serialized string is compressed with zlib.
import os
import json
import re
import zlib
from datetime import datetime, timedelta
from urllib.parse import urlsplit


class DiskCache:
    """ DiskCache helps store urls and their responses to disk
        Initialization components:
            cache_dir (str): abs file path or relative file path
                for cache directory (default: cache/)
            max_len (int): maximum filename length (default: 255)
            compress (bool): use zlib compression (default: True)
            encoding (str): character encoding for compression (default: utf-8)
            expires (datetime.timedelta): timedelta for when content
                will expire (default: 30 days)
    """
    def __init__(self, cache_dir='cache/', max_len=255, compress=True,
                 encoding='utf-8', expires=timedelta(days=30)):
        self.cache_dir = cache_dir
        self.max_len = max_len
        self.compress = compress
        self.encoding = encoding
        self.expires = expires

    def url_to_path(self, url):
        """ Return file system path string for given URL """
        components = urlsplit(url)
        # append index.html to empty paths
        path = components.path
        if not path:
            path = '/index.html'
        elif path.endswith('/'):
            path += 'index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters
        filename = re.sub(r'[^/0-9a-zA-Z\-.,;_ ]', '_', filename)
        # restrict maximum number of characters
        filename = '/'.join(seg[:self.max_len] for seg in filename.split('/'))
        return os.path.join(self.cache_dir, filename)

    def __getitem__(self, url):
        """Load data from disk for given URL"""
        path = self.url_to_path(url)
        if os.path.exists(path):
            mode = ('rb' if self.compress else 'r')
            with open(path, mode) as fp:
                if self.compress:
                    data = zlib.decompress(fp.read()).decode(self.encoding)
                    data = json.loads(data)
                else:
                    data = json.load(fp)
            exp_date = data.get('expires')
            if exp_date and datetime.strptime(exp_date,
                    '%Y-%m-%dT%H:%M:%S') <= datetime.utcnow():
                print('Cache expired!', exp_date)
                raise KeyError(url + ' has expired.')
            return data
        else:
            # URL has not yet been cached
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        """Save data to disk for given url"""
        path = self.url_to_path(url)
        folder = os.path.dirname(path)
        if not os.path.exists(folder):
            os.makedirs(folder)
        mode = ('wb' if self.compress else 'w')
        # Note: the timespec argument requires Py3.6+ (on 3.5 or earlier you
        # can export using isoformat() and import with '%Y-%m-%dT%H:%M:%S.%f')
        result['expires'] = (datetime.utcnow() + self.expires).isoformat(
            timespec='seconds')
        with open(path, mode) as fp:
            if self.compress:
                data = bytes(json.dumps(result), self.encoding)
                fp.write(zlib.compress(data))
            else:
                json.dump(result, fp)
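A short roundtrip illustrates both the path mapping and the expiry bookkeeping (a sketch; the HTML payload is a placeholder):

cache = DiskCache()
url = 'http://example.webscraping.com/places/default/view/'
print(cache.url_to_path(url))
# cache/example.webscraping.com/places/default/view/index.html
cache[url] = {'html': '<html>placeholder</html>', 'code': 200}
print(cache[url]['code'])  # 200, decompressed and parsed back from disk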
Because of these file system limits, and because URLs can be longer than 255 characters, different URLs may map to the same file name, as the sketch below makes concrete. Directories and file systems also cap how many files they can hold. A database cache avoids all of these problems.
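Two distinct over-long URLs collapsing to one path shows the collision (hypothetical URLs):

cache = DiskCache(max_len=255)
url_a = 'http://example.com/' + 'a' * 300 + 'x'
url_b = 'http://example.com/' + 'a' * 300 + 'y'
# both path segments are truncated to their first 255 characters,
# so the two URLs map to the same cache file
assert cache.url_to_path(url_a) == cache.url_to_path(url_b)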
3. Database Cache
Creating a TTL index on the timestamp field in MongoDB makes the server delete records automatically once they are older than the configured expiry.
The remaining methods mirror the disk cache implementation.
import json
import zlib
from datetime import datetime, timedelta

from pymongo import MongoClient


class MongoCache:
    def __init__(self, client=None, expires=timedelta(days=30), encoding='utf-8'):
        """
        client: mongo database client
        expires: timedelta of amount of time before a cache entry
            is considered expired
        """
        # if a client object is not passed
        # then try connecting to mongodb at the default localhost port
        self.client = MongoClient('localhost', 27017) if client is None else client
        # create collection to store cached webpages,
        # which is the equivalent of a table in a relational database
        self.db = self.client.cache
        self.encoding = encoding
        # TTL index: build the index on a date/timestamp field and give the
        # retention period in the index options; mongodb then removes
        # "expired" documents automatically once they are older than that
        # period, which amounts to a built-in scheduled clean-up job
        self.db.webpage.create_index(
            'timestamp', expireAfterSeconds=expires.total_seconds())

    def __contains__(self, url):
        try:
            self[url]
        except KeyError:
            return False
        else:
            return True

    def __getitem__(self, url):
        """Load value at this URL"""
        record = self.db.webpage.find_one({'_id': url})
        if record:
            # without compression this would simply be: return record['result']
            return json.loads(
                zlib.decompress(record['result']).decode(self.encoding))
        else:
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        """Save value for this URL"""
        # without compression this would simply be:
        # record = {'result': result, 'timestamp': datetime.utcnow()}
        record = {'result': zlib.compress(bytes(json.dumps(result), self.encoding)),
                  'timestamp': datetime.utcnow()}
        self.db.webpage.update_one({'_id': url}, {'$set': record}, upsert=True)

    def clear(self):
        self.db.webpage.drop()
        print('drop() successful')
Finally, the crawler can be pointed at the MongoDB-backed cache:

link_crawler('http://example.webscraping.com', '/places/default' + '/(index|view)',
             cache=MongoCache(expires=timedelta(seconds=100)))
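A standalone roundtrip can verify the cache independently of the crawler (assumes a MongoDB server on localhost:27017; note that MongoDB's TTL monitor only sweeps expired documents roughly once a minute, so removal is not instantaneous):

cache = MongoCache(expires=timedelta(seconds=100))
cache['http://example.webscraping.com/'] = {'html': '<html>placeholder</html>', 'code': 200}
print('http://example.webscraping.com/' in cache)        # True
print(cache['http://example.webscraping.com/']['code'])  # 200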
Reference: https://blog.csdn.net/u014134180/article/details/55506984