Reading Notes on Web Scraping with Python (Part 2)

Caching downloads

There are many ways to use a cache. For a site with millions of pages, re-crawling everything from scratch is very expensive; caching from the first crawl means each page only has to be downloaded once.

Adding cache support to the link crawler

We turn the download function from the previous chapter into a class and give it caching. Keeping a dict of the links already visited, keyed by URL, is a reasonable way to filter repeat downloads. The Downloader below keeps the core logic from the previous chapter.

#!/usr/bin/env python
# encoding: utf-8

from random import choice

import requests

class Downloader:
    """ Downloader class to use cache and requests for downloading pages.
        For the constructor, pass:
            delay (int): # of secs delay between requests (default: 5)
            user_agent (str): user agent string (default: 'wswp')
            proxies (list[dict]): list of possible proxies, each
                must be a dict with http / https keys and proxy values
            cache (dict or dict-like obj): keys: urls, values: dicts with keys (html, code)
            timeout (float/int): number of seconds to wait until timeout
    """
    def __init__(self, delay=5, user_agent='wswp', proxies=None, cache=None,
                 timeout=60):
        self.user_agent = user_agent
        self.proxies = proxies
        self.cache = cache if cache is not None else {}  # avoid a shared mutable default
        self.num_retries = None  # we will set this per request
        self.timeout = timeout

    def __call__(self, url, num_retries=2):
        """ Call the downloader class, which will return HTML from cache
            or download it
            args:
                url (str): url to download
            kwargs:
                num_retries (int): # times to retry if 5xx code (default: 2)
        """
        self.num_retries = num_retries
        try:
            result = self.cache[url]
            print('Loaded from cache:', url)
        except KeyError:
            result = None
        if result and self.num_retries and 500 <= result['code'] < 600:
            # server error so ignore result from cache
            # and re-download
            result = None
        if result is None:
            # result was not loaded from cache, need to download
            proxies = choice(self.proxies) if self.proxies else None
            headers = {'User-Agent': self.user_agent}
            result = self.download(url, headers, proxies)
            self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxies):
        """ Download the page with requests, retrying 5xx errors up to
            self.num_retries times. Returns a dict with 'html' and 'code' keys,
            matching what the cache stores.
        """
        print('Downloading:', url)
        try:
            resp = requests.get(url, headers=headers, proxies=proxies,
                                timeout=self.timeout)
            html = resp.text
            if resp.status_code >= 400:
                print('Download error:', resp.status_code)
                html = None
                if self.num_retries and 500 <= resp.status_code < 600:
                    # server error, so retry the download
                    self.num_retries -= 1
                    return self.download(url, headers, proxies)
            return {'html': html, 'code': resp.status_code}
        except requests.exceptions.RequestException as e:
            # connection problems are recorded as a 5xx-style result so the
            # cached entry will be re-downloaded on a later attempt
            print('Download error:', e)
            return {'html': None, 'code': 500}

if __name__ == "__main__":
    downloader = Downloader()
    downloader('http://example.python-scraping.com') # first request: nothing in the cache yet
    downloader('http://example.python-scraping.com') # second request: served from the cache
    downloader('http://www.baidu.com') # another URL that is not cached yet

The output is as follows:

[screenshot: console output of the three calls]

Disk cache

The disk cache described in the book does not dump the raw HTML to disk as-is; it serialises each result to a JSON string and runs it through zlib.compress / zlib.decompress for compression. Cached entries expire after 30 days.

#!/usr/bin/env python
# encoding: utf-8

import os
import json
import re
import zlib

from datetime import datetime, timedelta
from urllib.parse import urlsplit

import requests

class DiskCache:
    """ DiskCache helps store urls and their responses to disk
        Initialization components:
            cache_dir (str): abs file path or relative file path
                for cache directory (default: ./data/cache)
            max_len (int): maximum filename length (default: 255)
            compress (bool): use zlib compression (default: True)
            encoding (str): character encoding for compression (default: utf-8)
            expires (datetime.timedelta): timedelta until cached content expires
                (default: 30 days)
    """
    def __init__(self, cache_dir='./data/cache', max_len=255, compress=True,
                 encoding='utf-8', expires=timedelta(days=30)):
        self.cache_dir = cache_dir
        self.max_len = max_len
        self.compress = compress
        self.encoding = encoding
        self.expires = expires

    def url_to_path(self, url):
        """ Return file system path string for given URL """
        components = urlsplit(url)
        # append index.html to empty paths
        path = components.path
        if not path:
            path = '/index.html'
        elif path.endswith('/'):
            path += 'index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters
        filename = re.sub(r'[^/0-9a-zA-Z\-.,;_ ]', '_', filename)
        # restrict maximum number of characters
        filename = '/'.join(seg[:self.max_len] for seg in filename.split('/'))
        return os.path.join(self.cache_dir, filename)

    def __getitem__(self, url):
        """Load data from disk for given URL"""
        path = self.url_to_path(url)
        if os.path.exists(path):
            mode = ('rb' if self.compress else 'r')
            with open(path, mode) as fp:
                if self.compress:
                    data = zlib.decompress(fp.read()).decode(self.encoding)
                    data = json.loads(data)
                else:
                    data = json.load(fp)
            exp_date = data.get('expires')
            if exp_date and datetime.strptime(exp_date,
                                              '%Y-%m-%dT%H:%M:%S') <= datetime.utcnow():
                print('Cache expired!', exp_date)
                raise KeyError(url + ' has expired.')
            return data
        else:
            # URL has not yet been cached
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        """Save data to disk for given url"""
        path = self.url_to_path(url)
        folder = os.path.dirname(path)
        if not os.path.exists(folder):
            os.makedirs(folder)
        mode = ('wb' if self.compress else 'w')
        # Note: the timespec argument requires Py3.6+ (on earlier 3.x you can
        # export using isoformat() and import with '%Y-%m-%dT%H:%M:%S.%f')
        result['expires'] = (datetime.utcnow() + self.expires).isoformat(
            timespec='seconds')
        with open(path, mode) as fp:
            if self.compress:
                data = bytes(json.dumps(result), self.encoding)
                fp.write(zlib.compress(data))
            else:
                json.dump(result, fp)

if __name__ == "__main__":
    dc = DiskCache()
    url = "http://example.python-scraping.com"
    dc.url_to_path(url)
    resp = requests.get(url)
    d = {"name": "zengraoli", "password": "123456"}
    html = resp.text
    # dc[url] = html
    dc[url] = d

After running this test, the cache directory is created and the file inside it contains the compressed HTML of the page.

[screenshot: the generated cache file on disk]
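
The DiskCache exposes the same dict-like interface the Downloader expects (__getitem__ raises KeyError on a miss or expiry, __setitem__ stores the result dict), so the two pieces can be combined. A minimal sketch, assuming the two classes above are saved in importable modules (the module names here are made up):

# hypothetical module names for the two classes defined above
from downloader import Downloader
from disk_cache import DiskCache

downloader = Downloader(cache=DiskCache())
downloader('http://example.python-scraping.com')  # first call downloads and writes to disk
downloader('http://example.python-scraping.com')  # second call is served from the disk cache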

Drawbacks of the disk cache

Even after substituting invalid characters, different URLs can still map to the same filename; the usual fix is to hash the URL instead (see the sketch below). Another problem is that a site with a huge number of sub-pages produces a huge number of cache files, which makes lookups slow; one fix is to combine multiple pages into a single file and index them with another data structure.
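
A minimal sketch of the hashing idea (not the book's code): name each cache file after the MD5 digest of the URL, so collisions caused by character substitution disappear. The helper name url_to_hashed_path is made up for illustration.

import os
from hashlib import md5

def url_to_hashed_path(url, cache_dir='./data/cache'):
    """Map a URL to a cache file path via its MD5 digest."""
    digest = md5(url.encode('utf-8')).hexdigest()
    return os.path.join(cache_dir, digest + '.json')

print(url_to_hashed_path('http://example.python-scraping.com'))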

Key-value store cache

Implementing the cache with Redis

Redis is a very good fit for this cache: it can expire keys natively, which replaces the manual expiry handling we did before, although we still have to handle compression ourselves.

The following code tests using Redis as the cache; make sure the client library is installed first.
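
The client library used below is redis-py (the from redis import StrictRedis line), which can be installed with pip:

pip install redis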

#!/usr/bin/env python
# encoding: utf-8

import json
import zlib
import requests
from datetime import timedelta
from redis import StrictRedis

class RedisCache:
    def __init__(self, client=None, expires=timedelta(days=30), encoding='utf-8', compress=True):
        self.client = (StrictRedis(host='localhost', port=6379, db=0)
                       if client is None else client)
        self.expires = expires
        self.encoding = encoding
        self.compress = compress

    def __getitem__(self, url):
        """Load data from Redis for given URL"""
        record = self.client.get(url)
        if record:
            if self.compress:
                record = zlib.decompress(record)
            return json.loads(record.decode(self.encoding))
        else:
            # URL has not yet been cached
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        """Save data to Redis for given url"""
        data = bytes(json.dumps(result), self.encoding)
        if self.compress:
            data = zlib.compress(data)
        self.client.setex(url, self.expires, data)

if __name__ == "__main__":
    dc = RedisCache()
    url = "http://example.python-scraping.com"
    resp = requests.get(url)
    d = {"name": "zengraoli", "password": "123456"}
    html = resp.text
    # dc[url] = html
    dc[url] = d
    print(dc[url])
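
Because __setitem__ stores the value with SETEX, Redis tracks the expiry itself, which is exactly the manual work the DiskCache had to do. A quick way to confirm this (assuming the same default localhost instance used above) is to ask Redis for the key's remaining TTL:

from redis import StrictRedis

client = StrictRedis(host='localhost', port=6379, db=0)
# TTL is reported in seconds; 30 days is 2592000
print(client.ttl("http://example.python-scraping.com"))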

Exploring requests-cache

requests-cache saves us from implementing a cache class ourselves. It supports several backends: Redis, MongoDB, SQLite, and in-memory. It needs to be installed before use:

pip install requests-cache

The code below uses requests-cache; with the cache installed, the second request to the same URL takes almost no time.

import time
from datetime import timedelta

import requests
import requests_cache

if __name__ == "__main__":
    requests_cache.install_cache(backend='redis', expire_after=timedelta(days=30))
    url = "http://example.python-scraping.com"

    start = time.time()
    resp = requests.get(url)
    print("First request: %.2f s" % (time.time() - start))   # goes over the network

    start = time.time()
    resp = requests.get(url)
    print("Second request: %.2f s" % (time.time() - start))  # answered from the cache

Concurrent downloads

One million web pages

The "one million web pages" are a zip archive hosted on Amazon S3 (the Alexa top 1 million list) containing a single CSV. Fetching it with requests is slow, so it is easier to download it in a browser first and then read it from the local file.

#!/usr/bin/env python
# encoding: utf-8

import csv
from zipfile import ZipFile
from io import TextIOWrapper, BytesIO
import requests

if __name__ == "__main__":
    # resp = requests.get('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip', stream=True)
    urls = []  # the top 1 million URLs will be stored in this list
    content = ""
    # with ZipFile(BytesIO(content)) as zf:
    with ZipFile("top-1m.csv.zip", "r") as zf: # read the URLs from the local zip file
        csv_filename = zf.namelist()[0]
        with zf.open(csv_filename) as csv_file:
            for _, website in csv.reader(TextIOWrapper(csv_file)):
                urls.append('http://' + website)
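
    # quick sanity check (not from the book) that the list loaded correctly
    print(len(urls))   # roughly one million entries
    print(urls[:3])    # a peek at the first few URLs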

Multithreaded crawler

With some of the other features stripped out, here is a simulation of the multithreaded crawler.

#!/usr/bin/env python
# encoding: utf-8

import csv
from zipfile import ZipFile
from io import TextIOWrapper
import requests
import threading
import socket
import time

SLEEP_TIME = 1
socket.setdefaulttimeout(60)
requests.packages.urllib3.disable_warnings() # we are not interested in the SSL warnings

def threaded_crawler(start_url, max_threads=5):
    if isinstance(start_url, list):
        crawl_queue = start_url
    else:
        crawl_queue = [start_url]

    def process_queue():
        while crawl_queue:
            url = crawl_queue.pop()
            headers = {'User-Agent': "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",}
            try:
                html = requests.get(url, headers=headers, verify=False)
                print(url, html.status_code)
            except Exception as ee:
                print(url, " ee:", ee)

    # wait for all download threads to finish
    threads = []
    print(max_threads)
    while threads or crawl_queue:
        for thread in threads[:]:  # iterate over a copy so finished threads can be removed
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.daemon = True  # set daemon so main thread can exit w/ ctrl-c
            thread.start()
            threads.append(thread)
        print(threads)
        for thread in threads:
            thread.join()
        time.sleep(SLEEP_TIME)

if __name__ == "__main__":
    urls = []  # top 1 million URL's will be stored in this list
    content = ""
    with ZipFile("top-1m.csv.zip", "r") as zf: # 从本地中读取url
        csv_filename = zf.namelist()[0]
        with zf.open(csv_filename) as csv_file:
            for _, website in csv.reader(TextIOWrapper(csv_file)):
                urls.append('http://' + website)

    # take the first 10 URLs for the multithreaded crawler demo
    url_list = urls[:10]
    threaded_crawler(url_list)

The output is as follows:

[screenshot: multithreaded crawler output]

Multiprocess crawler

Because of the GIL, multiprocessing can give a speed boost over multithreading in Python, but separate processes cannot read from the same in-memory download queue, so Redis is used as the shared intermediary.

#!/usr/bin/env python
# encoding: utf-8

import csv
from zipfile import ZipFile
from io import TextIOWrapper
import requests
import threading
import multiprocessing
import socket
import time
from redis_queue import RedisQueue

SLEEP_TIME = 1
socket.setdefaulttimeout(60)
requests.packages.urllib3.disable_warnings() # we are not interested in the SSL warnings

def threaded_crawler_rq(start_url, max_threads=5):
    crawl_queue = RedisQueue()
    def process_queue():
        while crawl_queue:
            url = crawl_queue.pop()
            headers = {'User-Agent': "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",}
            try:
                html = requests.get(url, headers=headers, verify=False)
                print(url, html.status_code)
            except Exception as ee:
                print(url, " ee:", ee)

    # wait for all download threads to finish
    threads = []
    print(max_threads)
    while threads or crawl_queue:
        for thread in threads[:]:  # iterate over a copy so finished threads can be removed
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.daemon = True  # set daemon so main thread can exit w/ ctrl-c
            thread.start()
            threads.append(thread)
        print(threads)
        for thread in threads:
            thread.join()
        time.sleep(SLEEP_TIME)

def mp_threaded_crawler(*args, **kwargs):
    """ create a multiprocessing threaded crawler """
    processes = []
    num_procs = kwargs.pop('num_procs', None)
    if not num_procs:
        num_procs = multiprocessing.cpu_count()
    for _ in range(num_procs):
        proc = multiprocessing.Process(target=threaded_crawler_rq,
                                       args=args, kwargs=kwargs)
        proc.start()
        processes.append(proc)
    # wait for processes to complete
    for proc in processes:
        proc.join()

if __name__ == "__main__":
    urls = []  # top 1 million URL's will be stored in this list
    content = ""
    crawl_queue = RedisQueue()
    with ZipFile("top-1m.csv.zip", "r") as zf: # 从本地中读取url
        csv_filename = zf.namelist()[0]
        with zf.open(csv_filename) as csv_file:
            for _, website in csv.reader(TextIOWrapper(csv_file)):
                urls.append('http://' + website)
    # take the first 20 URLs for the multiprocess crawler demo
    url_list = urls[:20]
    crawl_queue.push(url_list) # push the 20 links onto the Redis queue
    # threaded_crawler(url_list)
    start_time = time.time()
    mp_threaded_crawler(url_list, num_procs=4)
    print('Total time: %ss' % (time.time() - start_time))

The accompanying Redis queue code is below; mind where you save it: the import above expects a module named redis_queue (a file redis_queue.py next to the crawler script).

# Based loosely on the Redis Cookbook FIFO Queue: http://www.rediscookbook.org/implement_a_fifo_queue.html
from redis import StrictRedis

class RedisQueue:
    def __init__(self, client=None, db=0, queue_name='wswp'):
        self.client = (StrictRedis(host='localhost', port=6379, db=db)
                       if client is None else client)
        self.name = "queue:%s" % queue_name
        self.seen_set = "seen:%s" % queue_name
        self.depth = "depth:%s" % queue_name

    def __len__(self):
        return self.client.llen(self.name)

    def push(self, element):
        """Push an element to the tail of the queue"""
        if isinstance(element, list):
            element = [e for e in element if not self.already_seen(e)]
            if element:  # lpush/sadd need at least one value
                self.client.lpush(self.name, *element)
                self.client.sadd(self.seen_set, *element)
        elif not self.already_seen(element):
            self.client.lpush(self.name, element)
            self.client.sadd(self.seen_set, element)

    def already_seen(self, element):
        """ determine if an element has already been seen """
        return self.client.sismember(self.seen_set, element)

    def set_depth(self, element, depth):
        """ Set the seen hash and depth """
        self.client.hset(self.depth, element, depth)

    def get_depth(self, element):
        """ Get the seen hash and depth """
        depth = self.client.hget(self.depth, element)
        return int(depth) if depth else 0

    def pop(self):
        """Pop an element from the head of the queue"""
        element = self.client.rpop(self.name)
        return element.decode('utf-8') if element is not None else None

The output is as follows:

[screenshot: multiprocess crawler output]
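
As a standalone check, the RedisQueue can also be exercised directly. A sketch, assuming a local Redis on the default port; note that the seen:wswp set persists between runs, so URLs pushed earlier will be filtered out later:

from redis_queue import RedisQueue

q = RedisQueue()
q.push("http://example.com")
q.push("http://example.org")
q.push("http://example.com")  # already seen, so it is not queued again
print(len(q))                 # number of URLs waiting in the queue
print(q.pop())                # FIFO: the oldest queued URL comes out first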

References

Python 3's __call__ method
Two ways of measuring time in Python: time vs. datetime
Using Python's zipfile module
Silencing the InsecureRequestWarning raised by requests
