Python crawler --> concurrent downloads

To improve crawler efficiency we can fetch data with multiple threads or multiple processes. This post explains how to use a crawler to download concurrently.

We can use Alexa to obtain a website list file containing the 1,000,000 most popular websites. The compressed list can be downloaded directly from http://s3.amazonaws.com/alexa-static/top-1m.csv.zip.
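
If that URL is reachable from your network, a minimal sketch like the one below (standard library only) downloads the archive and prints the first few entries; note that, as explained later in this post, the in-program download failed in my case and I saved the file through a browser instead.

# -*- coding: utf-8 -*-
# Quick sketch: fetch the Alexa top-1m list and print the first few sites.
# Assumes http://s3.amazonaws.com/alexa-static/top-1m.csv.zip is reachable from your network.
import csv
import urllib2
from StringIO import StringIO
from zipfile import ZipFile

url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'
zipped = urllib2.urlopen(url).read()      # raw bytes of the zip archive
with ZipFile(StringIO(zipped)) as zf:     # open the archive in memory
    csv_filename = zf.namelist()[0]       # the archive contains a single CSV file
    for rank, website in csv.reader(zf.open(csv_filename)):
        print rank, website
        if int(rank) >= 5:                # only show the top five entries
            break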

Below we download the pages of the URLs in that file using a single thread, multiple threads, and multiple processes, and compare how long each approach takes.

Comparison method: each crawler function invokes the AlexaCallback callback class to parse the zip file and extract all of the URLs, which are then pushed onto the crawl queue. The URLs are then downloaded with a single thread, multiple threads, and multiple processes respectively, and the elapsed times are compared.

Single-threaded download
First we need to modify the scrape_callback callback from the previous posts; alexa_cb.py:

# -*- coding: utf-8 -*-

import csv
from zipfile import ZipFile
from StringIO import StringIO
#from mongo_cache import MongoCache
from new_chapter3.mongo_cache import MongoCache


class AlexaCallback:
    def __init__(self, max_urls=1000):
        self.max_urls = max_urls
        self.seed_url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'

    def __call__(self, url, html):
        '''
        When this callback is invoked, the passed-in url is first compared with seed_url.
        If they match, the zip file is parsed, every URL inside it is collected into a
        list, and that list is returned.
        '''
        if url == self.seed_url:  # only true on the first call
            urls = []
            cache = MongoCache()
            ##with ZipFile(StringIO(html)) as zf:  # original approach: parse the downloaded bytes
            with ZipFile('top-1m.csv.zip', 'r') as zf:  # here we open the locally saved copy instead
                csv_filename = zf.namelist()[0]
                for _, website in csv.reader(zf.open(csv_filename)):
                    if 'http://' + website not in cache:
                        urls.append('http://' + website)
                        if len(urls) == self.max_urls:
                            break
            return urls

I originally intended to download the zip file from url='http://s3.amazonaws.com/alexa-static/top-1m.csv.zip' directly with the Downloader class from the earlier posts and then parse it, but that URL can only be reached from China through a proxy and the in-program download kept failing, so I downloaded the file in a browser and open and parse the local copy in the program instead.

The URL list returned by the callback is then pushed onto the crawl queue and crawled; link_crawler.py:

#coding:utf-8
import re
import urlparse
import urllib2
import time
import datetime
import robotparser
from new_chapter3.downloader import Downloader



def link_crawler(seed_url, delay=5, user_agent='wswp', proxies=None,
                 num_retries=1, scrape_callback=None, cache=None,ignore_robots=False):
    """Crawl from the given seed URL following links matched by link_regex
    """
    # the queue of URL's that still need to be crawled
    crawl_queue = [seed_url]
    # the URL's that have been seen and at what depth
    seen = set([seed_url])
    # parse robots.txt for this domain
    rp = get_robots(seed_url)
    # create a single Downloader instance here, passing all the settings once,
    # so they do not have to be passed on every call below
    D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, cache=cache)

    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url) or ignore_robots:
            try:
                html = D(url)  # download the HTML for this url
            except Exception:
                html = None  # download failed; let the callback handle a missing page
            if scrape_callback:
                try:
                    # links receives the URL list returned by the callback;
                    # only the seed URL (the zip file) yields any links
                    links = scrape_callback(url, html) or []
                except Exception as e:
                    print 'Error in callback for: {}: {}'.format(url, e)
                else:
                    for link in links:
                        link = normalize(seed_url, link)
                        # check whether this link has already been seen
                        if link not in seen:
                            seen.add(link)
                            # if not, add it to the crawl queue
                            crawl_queue.append(link)


def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urlparse.urldefrag(link)  # remove hash to avoid duplicates
    return urlparse.urljoin(seed_url, link)


def same_domain(url1, url2):
    """Return True if both URL's belong to same domain
    """
    return urlparse.urlparse(url1).netloc == urlparse.urlparse(url2).netloc


def get_robots(url):
    """Initialize robots parser for this domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp

The crawler above simply puts every URL from the zip file onto the crawl queue and crawls them serially. Let's run a test and see how long it takes; sequential_test.py:

# -*- coding: utf-8 -*-

from link_crawler import link_crawler
from new_chapter3.mongo_cache import MongoCache
from alexa_cb import AlexaCallback
import time

def main():
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    cache.clear()
    link_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback, cache=cache,user_agent='GoodCrawler',ignore_robots=True)


if __name__ == '__main__':
    start=time.time()
    main()
    end=time.time()
    print 'Sequential crawl took', (end - start)

(Screenshot: elapsed time of the sequential crawl)

Multithreaded crawler

Crawling too fast with multiple threads risks overloading the server or getting our IP banned. To avoid this, the crawler uses a delay flag that sets the minimum interval between two requests to the same domain.

In this post's example, however, the zip file contains over 1,000,000 websites whose domains are almost all different, so this problem does not really arise.
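
For reference, the per-domain delay mentioned above follows roughly this pattern inside the Downloader class from the earlier posts (a simplified, illustrative sketch, not the exact implementation):

# Simplified sketch of the per-domain delay (throttling) idea; illustrative only.
import time
import urlparse
from datetime import datetime

class Throttle:
    """Sleep if a request to the same domain would come too soon after the last one."""
    def __init__(self, delay):
        self.delay = delay      # minimum number of seconds between requests to one domain
        self.domains = {}       # domain -> timestamp of the most recent request

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).total_seconds()
            if sleep_secs > 0:
                time.sleep(sleep_secs)   # the previous request to this domain was too recent
        self.domains[domain] = datetime.now()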

To implement a multithreaded crawler, the link_crawler module needs to be modified to support multiple threads. The modified file, threaded_crawler.py:

#coding:utf-8

import time
import threading
import urlparse
from new_chapter3.downloader import Downloader

SLEEP_TIME = 1



def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wswp', proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl this website in multiple threads
    """
    # the queue of URL's that still need to be crawled
    #crawl_queue = Queue.deque([seed_url])
    crawl_queue = [seed_url]
    # the URL's that have been seen
    seen = set([seed_url])
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                # crawl queue is empty
                break
            else:
                try:
                    html = D(url)
                except Exception:
                    html = None  # download failed; let the callback handle a missing page
                if scrape_callback:
                    try:
                        # links receives the URL list returned by the callback;
                        # only the seed URL (the zip file) yields any links
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print 'Error in callback for: {}: {}'.format(url, e)
                    else:
                        for link in links:
                            link = normalize(seed_url, link)
                            # check whether already crawled this link
                            if link not in seen:
                                seen.add(link)
                                # add this new link to queue
                                crawl_queue.append(link)


    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue: ## keep looping while any threads are still alive or the crawl queue still has URLs
        # the crawl is still active
        for thread in threads:
            if not thread.is_alive():
                # remove the stopped threads
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True) # set daemon so main thread can exit when receives ctrl-c
            thread.start()
            threads.append(thread)
        # all threads have been processed
        # sleep temporarily so CPU can focus execution on other threads
        time.sleep(SLEEP_TIME)


def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urlparse.urldefrag(link) # remove hash to avoid duplicates
    return urlparse.urljoin(seed_url, link)

Let's test the multithreaded crawler and see how it performs.
threaded_test.py:

#coding:utf-8

import sys
from threaded_crawler import threaded_crawler
from new_chapter3.mongo_cache import MongoCache
from alexa_cb import AlexaCallback
import time

def main(max_threads):
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    cache.clear()
    threaded_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback, cache=cache, max_threads=max_threads, timeout=10)


if __name__ == '__main__':
    start=time.time()
    max_threads = 5 ## crawl with five threads in parallel
    main(max_threads)
    end=time.time()
    print 'Threaded crawl took', (end - start)

(Screenshot: elapsed time of the multithreaded crawl)

Five concurrent threads take less than one fifth of the single-threaded time!

Multiprocess crawler

In the multithreaded crawler above, the crawl queue crawl_queue is defined directly inside the threaded_crawler function, i.e. it is stored in that process's own memory. Since every process has its own memory, if several processes call threaded_crawler at the same time they each build their own crawl queue, so the processes cannot work on a single shared queue. To solve this, we can move the crawl queue into a database.
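
A toy illustration of the issue (hypothetical example, not part of the crawler): each child process works on its own copy of an in-memory list, so nothing it appends ever reaches the parent or the other processes.

# Toy example: an in-memory queue is copied into each child process,
# so appends made inside a child are invisible to the parent.
import multiprocessing

crawl_queue = ['http://example.com']

def worker():
    crawl_queue.append('http://example.com/child')   # only modifies this child's copy
    print 'inside child:', len(crawl_queue)          # prints 2

if __name__ == '__main__':
    p = multiprocessing.Process(target=worker)
    p.start()
    p.join()
    print 'inside parent:', len(crawl_queue)         # still prints 1 - the append was lost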

With this database approach the crawl queue is stored separately in the database, so every crawl process fetches and adds URLs through database operations, and all of them operate on one and the same queue. That is how they cooperate.

The database approach has several advantages: it is easy to develop, it gives us read/write protection and inter-process communication (IPC) essentially for free, and we only need to write one crawler program.

To do this we need a database-backed crawl queue. In the threaded_crawler function above and the earlier link_crawler function, the crawl queue crawl_queue supports operations such as push, pop, append, update and an emptiness check; a crawl queue needs these operations, so the database-backed queue must provide them as well.
mongo_queue.py:

#coding:utf-8

from datetime import datetime, timedelta
from pymongo import MongoClient, errors


class MongoQueue:
    # the three possible states of a URL in the queue
    '''
    OUTSTANDING: the URL has been added to the crawl queue
    PROCESSING: the URL has been popped from the queue and is about to be downloaded
    COMPLETE: the download has finished
    '''
    OUTSTANDING, PROCESSING, COMPLETE = range(3)

    def __init__(self, client=None, timeout=300):
        """
        host: the host to connect to MongoDB
        port: the port to connect to MongoDB
        timeout: the number of seconds to allow for a timeout
        """
        self.client = MongoClient() if client is None else client
        self.db = self.client.cache
        self.timeout = timeout

    def __nonzero__(self):
        """
        当下载完成以后status才会变为COMPLETE,这时爬虫队列也会停止,这里寻找的是不等于COMPLETE状态的队列,即是
        确定是否有线程在运行。
        '$ne':表示不等于
        """
        record = self.db.crawl_queue.find_one(
            {'status': {'$ne': self.COMPLETE}}
        )
        return True if record else False

    def push(self, url):
        """
        将该URL添入爬虫队列,_id属性值为该url,对应的status为OUTSTANDING
        """
        try:
            self.db.crawl_queue.insert({'_id': url, 'status': self.OUTSTANDING})
        except errors.DuplicateKeyError as e:
            pass # this is already in the queue

    def pop(self):
        """
        从爬虫队列中取出一个url,首先查找出一个状态为OUTSTANDING即刚刚插入队列的url
        并更新其状态为PROCESSING,timestamp为当前时间
        """
        record = self.db.crawl_queue.find_and_modify(
            query={'status': self.OUTSTANDING},
            update={'$set': {'status': self.PROCESSING, 'timestamp': datetime.now()}}
        )
        if record:
            return record['_id']
        else:
            self.repair()
            raise KeyError()

    def peek(self):
        ## return one OUTSTANDING url from the queue without modifying it
        record = self.db.crawl_queue.find_one({'status': self.OUTSTANDING})
        if record:
            return record['_id']

    def complete(self, url):
        ## called once the download has finished; sets the url's status to COMPLETE
        self.db.crawl_queue.update({'_id': url}, {'$set': {'status': self.COMPLETE}})

    def repair(self):
        """
        '$lt':表示小于
        首先在队列中寻找状态不为COMPLETE的url,即该url正在被添加或者正在下载过程中。
        如果该url的timestamp<当前时间点-timeout,表示处理该url的进程超时,次数需要将url状态再次设置为OUTSTANDING,
        以便再次进行处理
        """
        record = self.db.crawl_queue.find_and_modify(
            query={
                'timestamp': {'$lt': datetime.now() - timedelta(seconds=self.timeout)},
                'status': {'$ne': self.COMPLETE}
            },
            update={'$set': {'status': self.OUTSTANDING}}
        )
        if record:
            print 'Released:', record['_id']

    def clear(self):
        self.db.crawl_queue.drop()
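
A quick usage sketch showing how the queue behaves (assuming a MongoDB server is running locally):

# Quick usage sketch for MongoQueue (assumes a local MongoDB instance is running).
from mongo_queue import MongoQueue

queue = MongoQueue()
queue.clear()
queue.push('http://example.com')   # inserted with status OUTSTANDING
url = queue.pop()                  # status becomes PROCESSING, timestamp is recorded
print url                          # -> http://example.com
queue.complete(url)                # status becomes COMPLETE
print bool(queue)                  # False: no records left that are not COMPLETE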

To support this database-backed queue, the threaded_crawler function needs to be modified; process_crawler.py:

#coding:utf-8

import time
import urlparse
import threading
import multiprocessing
from new_chapter3.mongo_cache import MongoCache
from mongo_queue import MongoQueue
from new_chapter3.downloader import Downloader

SLEEP_TIME = 1


def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wswp', proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl using multiple threads
    """
    # the queue of URL's that still need to be crawled
    ## every process that calls this function uses the database-backed queue; note that this
    ## does not create a separate queue per process: the crawl queue lives in MongoDB, so all
    ## processes share the same queue instead of keeping their own copy in local memory
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url) ## push the seed URL onto the queue with status OUTSTANDING
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            # keep track of which url is being processed
            try:
                """
                Pop a URL from the crawl queue: find a record whose status is OUTSTANDING
                (i.e. just inserted) and update its status to PROCESSING.
                """
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            else:
                try:
                    html = D(url)
                except Exception:
                    html = None  # download failed; let the callback handle a missing page
                if scrape_callback:
                    try:
                        # links receives the URL list returned by the callback;
                        # only the seed URL (the zip file) yields any links
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print 'Error in callback for: {}: {}'.format(url, e)
                    else:
                        for link in links:
                            # add this new link to the queue with status OUTSTANDING
                            crawl_queue.push(normalize(seed_url, link))
                # the download has finished, so mark this url as COMPLETE
                crawl_queue.complete(url)


    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True) # set daemon so main thread can exit when receives ctrl-c
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)


def process_crawler(args, **kwargs):
    num_cpus = multiprocessing.cpu_count() ## number of CPUs on this machine
    #pool = multiprocessing.Pool(processes=num_cpus)
    print 'Starting {} processes'.format(num_cpus)
    processes = []
    for i in range(num_cpus):
        p = multiprocessing.Process(target=threaded_crawler, args=[args], kwargs=kwargs)
        #parsed = pool.apply_async(threaded_link_crawler, args, kwargs)
        ## start this crawler process (each one runs threaded_crawler with its own threads)
        p.start()
        processes.append(p)
    # wait for processes to complete
    for p in processes:
        p.join()


def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urlparse.urldefrag(link) # remove hash to avoid duplicates
    return urlparse.urljoin(seed_url, link)

That's enough explanation; let's run the multiprocess test and see how it performs.
process_test.py:

# -*- coding: utf-8 -*-

import sys
from process_crawler import process_crawler
from new_chapter3.mongo_cache import MongoCache
from alexa_cb import AlexaCallback
import time

def main(max_threads):
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    cache.clear()
    process_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback, cache=cache, max_threads=max_threads, timeout=10)


if __name__ == '__main__':
    start=time.time()
    max_threads = 4
    main(max_threads)
    end=time.time()
    print "多进程耗时",(end-start)

(Screenshot: elapsed time of the multiprocess crawl)
