Download caching
There are many ways to use a cache. For a site with millions of pages, re-crawling everything from scratch is extremely expensive; caching from the very first crawl means each page only needs to be downloaded once.
Adding cache support to the link crawler
We turn the previous chapter's download function into a class so that it can cache results. Using a dict to record visited links is a good way to filter repeat downloads, with the URL as the key. The Downloader class below keeps the core code from the previous chapter.
#!/usr/bin/env python
# encoding: utf-8
from random import choice
import requests


class Downloader:
    """ Downloader class to use cache and requests for downloading pages.
        For constructor, pass:
            delay (int): # of secs delay between requests (default: 5)
            user_agent (str): user agent string (default: 'wswp')
            proxies (list[dict]): list of possible proxies, each
                must be a dict with http / https keys and proxy values
            cache (dict or dict-like obj): keys: urls, values: dicts with keys (html, code)
            timeout (float/int): number of seconds to wait until timeout
    """
    def __init__(self, delay=5, user_agent='wswp', proxies=None, cache=None,
                 timeout=60):
        self.user_agent = user_agent
        self.proxies = proxies
        self.cache = {} if cache is None else cache  # avoid a shared mutable default argument
        self.num_retries = None  # we will set this per request
        self.timeout = timeout

    def __call__(self, url, num_retries=2):
        """ Call the downloader class, which will return HTML from cache
            or download it
            args:
                url (str): url to download
            kwargs:
                num_retries (int): # times to retry if 5xx code (default: 2)
        """
        self.num_retries = num_retries
        try:
            result = self.cache[url]
            print('Loaded from cache:', url)
        except KeyError:
            result = None
        if result and self.num_retries and 500 <= result['code'] < 600:
            # server error so ignore result from cache
            # and re-download
            result = None
        if result is None:
            # result was not loaded from cache, need to download
            proxies = choice(self.proxies) if self.proxies else None
            headers = {'User-Agent': self.user_agent}
            result = self.download(url, headers, proxies)
            self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxies):
        # minimal implementation so the class is runnable; retries on 5xx
        # as described in the __call__ docstring
        print('Downloading:', url)
        try:
            resp = requests.get(url, headers=headers, proxies=proxies,
                                timeout=self.timeout)
            html = resp.text
            code = resp.status_code
            if code >= 400:
                print('Download error:', code)
                html = None
                if self.num_retries and 500 <= code < 600:
                    # temporary server error, so retry the download
                    self.num_retries -= 1
                    return self.download(url, headers, proxies)
        except requests.exceptions.RequestException as e:
            print('Download error:', e)
            html, code = None, 500
        return {'html': html, 'code': code}


if __name__ == "__main__":
    downloader = Downloader()
    downloader('http://example.python-scraping.com')  # first request, nothing cached yet
    downloader('http://example.python-scraping.com')  # served from the cache
    downloader('http://www.baidu.com')                # another uncached URL
When run, the first and third URLs print 'Downloading:', while the second prints 'Loaded from cache:'.
Disk cache
The disk cache described in the book does not save the raw HTML file to disk; it serializes the result dict (HTML plus status code) to a JSON string and runs it through zlib.compress on write and zlib.decompress on read. Cached entries expire after 30 days.
#!/usr/bin/env python
# encoding: utf-8
import os
import json
import re
import zlib
from datetime import datetime, timedelta
from urllib.parse import urlsplit
import requests


class DiskCache:
    """ DiskCache helps store urls and their responses to disk
        Initialization components:
            cache_dir (str): abs file path or relative file path
                for cache directory (default: ../data/cache)
            max_len (int): maximum filename length (default: 255)
            compress (bool): use zlib compression (default: True)
            encoding (str): character encoding for compression (default: utf-8)
            expires (datetime.timedelta): timedelta when content will expire
                (default: 30 days)
    """
    def __init__(self, cache_dir='./data/cache', max_len=255, compress=True,
                 encoding='utf-8', expires=timedelta(days=30)):
        self.cache_dir = cache_dir
        self.max_len = max_len
        self.compress = compress
        self.encoding = encoding
        self.expires = expires

    def url_to_path(self, url):
        """ Return file system path string for given URL """
        components = urlsplit(url)
        # append index.html to empty paths
        path = components.path
        if not path:
            path = '/index.html'
        elif path.endswith('/'):
            path += 'index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters
        filename = re.sub(r'[^/0-9a-zA-Z\-.,;_ ]', '_', filename)
        # restrict maximum number of characters
        filename = '/'.join(seg[:self.max_len] for seg in filename.split('/'))
        return os.path.join(self.cache_dir, filename)

    def __getitem__(self, url):
        """Load data from disk for given URL"""
        path = self.url_to_path(url)
        if os.path.exists(path):
            mode = ('rb' if self.compress else 'r')
            with open(path, mode) as fp:
                if self.compress:
                    data = zlib.decompress(fp.read()).decode(self.encoding)
                    data = json.loads(data)
                else:
                    data = json.load(fp)
            exp_date = data.get('expires')
            if exp_date and datetime.strptime(exp_date,
                    '%Y-%m-%dT%H:%M:%S') <= datetime.utcnow():
                print('Cache expired!', exp_date)
                raise KeyError(url + ' has expired.')
            return data
        else:
            # URL has not yet been cached
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        """Save data to disk for given url"""
        path = self.url_to_path(url)
        folder = os.path.dirname(path)
        if not os.path.exists(folder):
            os.makedirs(folder)
        mode = ('wb' if self.compress else 'w')
        # Note: the timespec argument requires Py3.6+ (if using 3.X you can
        # export using isoformat() and import with '%Y-%m-%dT%H:%M:%S.%f')
        result['expires'] = (datetime.utcnow() + self.expires).isoformat(
            timespec='seconds')
        with open(path, mode) as fp:
            if self.compress:
                data = bytes(json.dumps(result), self.encoding)
                fp.write(zlib.compress(data))
            else:
                json.dump(result, fp)


if __name__ == "__main__":
    dc = DiskCache()
    url = "http://example.python-scraping.com"
    print(dc.url_to_path(url))
    resp = requests.get(url)
    # store the result dict (html + status code) so the cached file holds the page
    dc[url] = {'html': resp.text, 'code': resp.status_code}
    # any JSON-serializable dict works too, e.g.:
    # dc[url] = {"name": "zengraoli", "password": "123456"}
After running the test you can see the cache directory has been created, and the file holds the zlib-compressed JSON for the page.
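Because Downloader only needs dict-like access (__getitem__/__setitem__ plus KeyError on a miss), DiskCache can be plugged straight in. A sketch, assuming the two classes above are saved in modules named downloader and diskcache (hypothetical names):

from downloader import Downloader
from diskcache import DiskCache

downloader = Downloader(cache=DiskCache())
downloader('http://example.python-scraping.com')  # downloads and writes to disk
downloader('http://example.python-scraping.com')  # now served from the disk cache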
Drawbacks of the disk cache
Even after replacing invalid characters, two different URLs can still map to the same filename; the fix is to hash the URL, as sketched below. The other problem is that a site with very many subpages produces a huge directory tree that is slow to search; the fix is to merge multiple pages into one file and index them with another data structure, which is exactly what a key-value store provides.
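A minimal sketch of the hashing idea; url_to_hashed_path is a hypothetical helper, not part of the book's code:

import os
import hashlib

def url_to_hashed_path(url, cache_dir='./data/cache'):
    # hashing the whole URL means sanitizing can never make two
    # distinct URLs collide on the same filename
    digest = hashlib.md5(url.encode('utf-8')).hexdigest()
    # fan out into 256 subdirectories so no single directory grows huge
    return os.path.join(cache_dir, digest[:2], digest + '.json')

print(url_to_hashed_path('http://example.python-scraping.com/places/'))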
Key-value store cache
Implementing the cache with Redis
Redis is an excellent fit for a cache: it supports per-key expiry natively, which replaces our earlier manual expiry handling, although compression is still up to us.
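A quick illustration of that native expiry, assuming a local Redis on the default port:

from datetime import timedelta
from redis import StrictRedis

client = StrictRedis(host='localhost', port=6379, db=0)
client.setex('greeting', timedelta(seconds=30), 'hello')  # Redis deletes the key by itself
print(client.ttl('greeting'))  # seconds left to live, e.g. 30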
The code below exercises Redis as a cache; make sure the redis package is installed first (pip install redis).
#!/usr/bin/env python
# encoding: utf-8
import json
import zlib
import requests
from datetime import timedelta
from redis import StrictRedis


class RedisCache:
    def __init__(self, client=None, expires=timedelta(days=30),
                 encoding='utf-8', compress=True):
        self.client = (StrictRedis(host='localhost', port=6379, db=0)
                       if client is None else client)
        self.expires = expires
        self.encoding = encoding
        self.compress = compress

    def __getitem__(self, url):
        """Load data from Redis for given URL"""
        record = self.client.get(url)
        if record:
            if self.compress:
                record = zlib.decompress(record)
            return json.loads(record.decode(self.encoding))
        else:
            # URL has not yet been cached
            raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        """Save data to Redis for given url"""
        data = bytes(json.dumps(result), self.encoding)
        if self.compress:
            data = zlib.compress(data)
        self.client.setex(url, self.expires, data)


if __name__ == "__main__":
    dc = RedisCache()
    url = "http://example.python-scraping.com"
    resp = requests.get(url)
    d = {"name": "zengraoli", "password": "123456"}
    html = resp.text
    # dc[url] = html
    dc[url] = d
    print(dc[url])
Exploring requests-cache
requests-cache spares us from implementing a cache class ourselves. It supports several backends: Redis, MongoDB, SQLite, and in-memory. Install it before use:
pip install requests-cache
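If no Redis server is available, the sqlite backend works with no extra services; a minimal sketch (the cache name 'demo_cache' is arbitrary):

import requests_cache

# creates demo_cache.sqlite in the working directory
requests_cache.install_cache('demo_cache', backend='sqlite', expire_after=3600)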
The code below uses requests-cache with the redis backend and a 30-day expiry; once the cache is installed, the second request barely takes any time.

import time
import requests
import requests_cache
from datetime import timedelta

if __name__ == "__main__":
    requests_cache.install_cache(backend='redis', expire_after=timedelta(days=30))
    url = "http://example.python-scraping.com"
    start = time.time()
    resp = requests.get(url)
    print("First request: %.2f secs" % (time.time() - start))   # e.g. about 1 sec
    start = time.time()  # reset the timer; otherwise the second measurement includes the first
    resp = requests.get(url)
    print("Cached request: %.2f secs" % (time.time() - start))  # should be near zero
Concurrent downloads
One million web pages
The "one million pages" come from a zip file hosted on Amazon S3 (Alexa's top-1m list) containing a single CSV. Fetching it with requests is slow, so it is easier to download it with a browser first and then read it from disk.
#!/usr/bin/env python
# encoding: utf-8
import csv
from zipfile import ZipFile
from io import TextIOWrapper, BytesIO
import requests

if __name__ == "__main__":
    # resp = requests.get('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip', stream=True)
    urls = []  # top 1 million URL's will be stored in this list
    # with ZipFile(BytesIO(resp.content)) as zf:
    with ZipFile("top-1m.csv.zip", "r") as zf:  # read the URLs from the local file
        csv_filename = zf.namelist()[0]
        with zf.open(csv_filename) as csv_file:
            for _, website in csv.reader(TextIOWrapper(csv_file)):
                urls.append('http://' + website)
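For completeness, the commented-out lines correspond to the fully in-memory variant, sketched here under the assumption that the S3 URL is still being served:

import csv
from io import BytesIO, TextIOWrapper
from zipfile import ZipFile
import requests

resp = requests.get('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip', stream=True)
urls = []
with ZipFile(BytesIO(resp.content)) as zf:  # unzip straight from memory
    csv_filename = zf.namelist()[0]
    with zf.open(csv_filename) as csv_file:
        for _, website in csv.reader(TextIOWrapper(csv_file)):
            urls.append('http://' + website)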
Multi-threaded crawler
Stripping out the other features, here is a simulation of a bare multi-threaded crawler.
#!/usr/bin/env python
# encoding: utf-8
import csv
from zipfile import ZipFile
from io import TextIOWrapper
import requests
import threading
import socket
import time

SLEEP_TIME = 1
socket.setdefaulttimeout(60)
requests.packages.urllib3.disable_warnings()  # we are not interested in the SSL warnings


def threaded_crawler(start_url, max_threads=5):
    if isinstance(start_url, list):
        crawl_queue = start_url
    else:
        crawl_queue = [start_url]

    def process_queue():
        while crawl_queue:
            try:
                url = crawl_queue.pop()
            except IndexError:
                # another thread emptied the queue between the check and the pop
                break
            headers = {'User-Agent': "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"}
            try:
                html = requests.get(url, headers=headers, verify=False)
                print(url, html.status_code)
            except Exception as ee:
                print(url, " ee:", ee)

    # wait for all download threads to finish
    threads = []
    print(max_threads)
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.daemon = True  # set daemon so main thread can exit w/ ctrl-c
            thread.start()
            threads.append(thread)
        print(threads)
        for thread in threads:
            thread.join()
        time.sleep(SLEEP_TIME)


if __name__ == "__main__":
    urls = []  # top 1 million URL's will be stored in this list
    with ZipFile("top-1m.csv.zip", "r") as zf:  # read the URLs from the local file
        csv_filename = zf.namelist()[0]
        with zf.open(csv_filename) as csv_file:
            for _, website in csv.reader(TextIOWrapper(csv_file)):
                urls.append('http://' + website)
    # take 10 URLs as a multi-threaded crawling example
    url_list = urls[:10]
    threaded_crawler(url_list)
The output prints each URL with its HTTP status code as the threads work through the queue.
Multi-process crawler
Because of the GIL, multiprocessing can give a speedup over multithreading in Python, but separate processes cannot share an in-process download queue, so we use Redis as the shared medium.
#!/usr/bin/env python
# encoding: utf-8
import csv
from zipfile import ZipFile
from io import TextIOWrapper
import requests
import threading
import multiprocessing
import socket
import time
from redis_queue import RedisQueue

SLEEP_TIME = 1
socket.setdefaulttimeout(60)
requests.packages.urllib3.disable_warnings()  # we are not interested in the SSL warnings


def threaded_crawler_rq(start_url, max_threads=5):
    # the queue lives in Redis, so every process sees the same URLs
    crawl_queue = RedisQueue()

    def process_queue():
        while crawl_queue:
            url = crawl_queue.pop()
            if url is None:
                # another thread or process emptied the queue first
                break
            headers = {'User-Agent': "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"}
            try:
                html = requests.get(url, headers=headers, verify=False)
                print(url, html.status_code)
            except Exception as ee:
                print(url, " ee:", ee)

    # wait for all download threads to finish
    threads = []
    print(max_threads)
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.daemon = True  # set daemon so main thread can exit w/ ctrl-c
            thread.start()
            threads.append(thread)
        print(threads)
        for thread in threads:
            thread.join()
        time.sleep(SLEEP_TIME)


def mp_threaded_crawler(*args, **kwargs):
    """ create a multiprocessing threaded crawler """
    processes = []
    num_procs = kwargs.pop('num_procs', None)  # default avoids a KeyError when not passed
    if not num_procs:
        num_procs = multiprocessing.cpu_count()
    for _ in range(num_procs):
        proc = multiprocessing.Process(target=threaded_crawler_rq,
                                       args=args, kwargs=kwargs)
        proc.start()
        processes.append(proc)
    # wait for processes to complete
    for proc in processes:
        proc.join()


if __name__ == "__main__":
    urls = []  # top 1 million URL's will be stored in this list
    crawl_queue = RedisQueue()
    with ZipFile("top-1m.csv.zip", "r") as zf:  # read the URLs from the local file
        csv_filename = zf.namelist()[0]
        with zf.open(csv_filename) as csv_file:
            for _, website in csv.reader(TextIOWrapper(csv_file)):
                urls.append('http://' + website)
    # take 20 URLs as a multi-process crawling example
    url_list = urls[:20]
    crawl_queue.push(url_list)  # push the 20 links into the Redis queue
    start_time = time.time()
    mp_threaded_crawler(url_list, num_procs=4)
    print('Total time: %ss' % (time.time() - start_time))
The companion Redis queue code; note where you save it, since it is imported above as redis_queue (i.e. redis_queue.py in the same directory).
# Based loosely on the Redis Cookbook FIFO Queue: http://www.rediscookbook.org/implement_a_fifo_queue.html
from redis import StrictRedis


class RedisQueue:
    def __init__(self, client=None, db=0, queue_name='wswp'):
        self.client = (StrictRedis(host='localhost', port=6379, db=db)
                       if client is None else client)
        self.name = "queue:%s" % queue_name
        self.seen_set = "seen:%s" % queue_name
        self.depth = "depth:%s" % queue_name

    def __len__(self):
        return self.client.llen(self.name)

    def push(self, element):
        """Push an element to the tail of the queue"""
        if isinstance(element, list):
            element = [e for e in element if not self.already_seen(e)]
            self.client.lpush(self.name, *element)
            self.client.sadd(self.seen_set, *element)
        elif not self.already_seen(element):
            self.client.lpush(self.name, element)
            self.client.sadd(self.seen_set, element)

    def already_seen(self, element):
        """ determine if an element has already been seen """
        return self.client.sismember(self.seen_set, element)

    def set_depth(self, element, depth):
        """ Set the seen hash and depth """
        self.client.hset(self.depth, element, depth)

    def get_depth(self, element):
        """ Get the seen hash and depth """
        dep = self.client.hget(self.depth, element)
        return int(dep) if dep else 0

    def pop(self):
        """Pop an element from the head of the queue"""
        element = self.client.rpop(self.name)
        # rpop returns None on an empty queue; guard before decoding
        return element.decode('utf-8') if element is not None else None
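A quick smoke test of the queue, assuming a local Redis server and a fresh seen-set (previously pushed URLs are filtered out):

q = RedisQueue()
q.push(['http://example.com', 'http://example.org'])
print(len(q))   # 2
print(q.pop())  # 'http://example.com' -- lpush + rpop gives FIFO order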
The output of the multi-process run prints URLs and status codes from all four processes interleaved, followed by the total time.
References
Python 3's __call__ method
Two ways to measure time in Python: time vs. datetime
Working with Python's zipfile module
Handling the InsecureRequestWarning in Python requests