Python 3 Crawler Learning Series 04 - Concurrent Downloading


Previous posts in this series:
Crawler Learning Series 02 - Common ways to download and extract web pages
Crawler Learning Series 03 - Download caching
So far everything has been executed serially, which is very inefficient: once there are many URLs to crawl, the throughput is simply unacceptable, so it is well worth making the crawler concurrent.

Concurrency can be achieved with multiple threads or multiple processes. A process is the basic unit of resource allocation that can run independently in the system; it consists of machine instructions, data, a stack, and so on, and is an independently running entity. A thread is an entity inside a process and the basic unit of scheduling and dispatching; under Linux a thread can be viewed as a lightweight process. A thread is a relatively independent executable unit within a process: if the process is the task, a thread is the execution of one subtask of that application. For more on the relationship and differences between threads and processes, see
The differences and relationships between programs, processes, and threads
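As a quick illustration (not part of the crawler code itself), the sketch below runs the same toy worker once as a thread and once as a process; the worker and its workload are made up for this example.

# thread_vs_process_demo.py - illustrative only
import threading
import multiprocessing

def count_down(n):
    """Toy CPU-bound worker used only for this demo."""
    while n > 0:
        n -= 1

if __name__ == '__main__':
    # a thread shares the parent's memory and runs inside the same Python process
    t = threading.Thread(target=count_down, args=(1_000_000,))
    # a process gets its own interpreter and memory space, so it can use another CPU core
    p = multiprocessing.Process(target=count_down, args=(1_000_000,))
    t.start(); p.start()
    t.join(); p.join()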

1. Multithreaded Crawler

A multithreaded crawler issues requests very quickly, which can overload the server or get the crawler's IP address banned. The crawler therefore needs a delay setting that defines the minimum interval between two requests to the same domain; an interval of about 1 second is usually a reasonable choice.
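The delay itself is handled inside the Downloader used below (from the earlier posts in this series). A minimal sketch of such a per-domain throttle could look like the following; the class and method names here are illustrative, not necessarily the ones Downloader actually uses.

# throttle sketch (illustrative): enforce a minimum delay between requests to the same domain
import time
import urllib.parse
from datetime import datetime

class Throttle:
    """Sleep so that requests to the same domain are at least `delay` seconds apart."""
    def __init__(self, delay):
        self.delay = delay    # minimum seconds between requests to one domain
        self.domains = {}     # domain -> time of the last request

    def wait(self, url):
        domain = urllib.parse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).total_seconds()
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()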

# threaded_crawler.py
import time
import threading
import urllib.parse
from downloader import Downloader

SLEEP_TIME = 1



def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wswp', proxies=None, num_retries=1, max_threads=10, timeout=60):
    """多线程爬取网站
    :param seed_url: 最开始爬取的那个url
    :param delay: 延时时间,单位秒
    :param cache: 缓存
    :param scrape_callback: 爬虫的回调
    :param user_agent: 用户代理
    :param proxies: 代理
    :param num_retries: 请求次数
    :param max_threads: 最大线程数
    :param timeout: 超时时间,单位秒
    """
    # 需要爬取的url的队列,list实现
    #crawl_queue = Queue.deque([seed_url])
    crawl_queue = [seed_url]
    # 已经加入过队列的url 
    seen = set([seed_url])
    # 创建一个Downloader的实例
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        """
        Worker function run by every thread.
        """
        while True:
            try:
                # take one URL from the queue
                url = crawl_queue.pop()
            except IndexError:
                # the queue is empty, exit the loop
                break
            else:
                # download the HTML for this URL
                html = D(url)
                # if a scrape callback was supplied, invoke it
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for {}: {}'.format(url, e))
                    else:
                        for link in links:
                            link = normalize(seed_url, link)
                            # check whether this URL has already been queued
                            if link not in seen:
                                seen.add(link)
                                # add the new URL to the queue
                                crawl_queue.append(link)


    # wait until all threads have finished
    threads = []
    while threads or crawl_queue:
        # remove threads that are no longer alive
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # room for more threads
            thread = threading.Thread(target=process_queue)
            thread.daemon = True    # daemon thread, so Ctrl-C can still terminate the main thread
            thread.start()          # start the thread
            threads.append(thread)  # keep track of the running threads
        # all threads are busy processing;
        # sleep for a while so the CPU can run the other threads
        time.sleep(SLEEP_TIME)


def normalize(seed_url, link):
    """Normalize this URL by removing the hash fragment and resolving it against the domain.
    """
    link, _ = urllib.parse.urldefrag(link)  # remove the hash to avoid duplicates
    return urllib.parse.urljoin(seed_url, link)
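With a Downloader from the earlier posts on the import path, a crawl could be started as in the sketch below; extract_links is a made-up callback used only for illustration (any callable that takes (url, html) and returns a list of links will do, and it assumes Downloader returns the page as a str).

# usage sketch (illustrative only; extract_links is a made-up callback)
import re
from threaded_crawler import threaded_crawler

def extract_links(url, html):
    """Return all href values found in the page, or [] when html is None."""
    if html is None:
        return []
    return re.findall(r'href=["\'](.*?)["\']', html)

if __name__ == '__main__':
    threaded_crawler('http://example.webscraping.com', delay=1,
                     scrape_callback=extract_links, max_threads=5)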

2. Multiprocess Crawler

To spread the crawl across several processes (potentially on different machines), the in-memory queue is replaced with a queue stored in MongoDB, so that every process sees the same set of URLs and their states.

# mongo_queue.py
from datetime import datetime, timedelta
from pymongo import MongoClient, errors


class MongoQueue:
    """
    >>> timeout = 1
    >>> url = 'http://example.webscraping.com'
    >>> q = MongoQueue(timeout=timeout)
    >>> q.clear() # empty the queue collection (a "collection" in MongoDB terms) so we start fresh
    >>> q.push(url) # add a test URL
    >>> q.peek() == q.pop() == url # pop that URL back (its status becomes PROCESSING)
    True
    >>> q.repair() # nothing has timed out yet, so nothing is released
    >>> q.pop() # another pop raises KeyError: the only URL is now PROCESSING
    Traceback (most recent call last):
        ...
    KeyError
    >>> q.peek()
    >>> import time; time.sleep(timeout) # wait longer than the timeout to simulate a stalled crawl
    >>> q.repair() # now repair resets the URL's status to OUTSTANDING
    Released: http://example.webscraping.com
    >>> q.pop() == url # the URL can be popped again
    True
    >>> bool(q) # the queue is still active while a URL is outstanding
    True
    >>> q.complete(url) # mark this URL as crawled
    >>> bool(q) # everything is COMPLETE, so the queue reports itself as done
    False
    """

    # the three possible download states:
    # OUTSTANDING - just added to the queue
    # PROCESSING  - popped from the queue and currently being downloaded
    # COMPLETE    - download finished
    OUTSTANDING, PROCESSING, COMPLETE = range(3)

    def __init__(self, client=None, timeout=300):
        """
        client = MongoClient('mongodb://user:password@host:port') mongodb连接
        :param client: mongodb的连接
        :param timeout: 超时时间,单位秒
        """
        # 现在默认的MongoClient()是本地的,若是要做成分布式,那么需要指明host和port
        self.client = MongoClient() if client is None else client
        self.db = self.client.cache
        self.timeout = timeout

    def __bool__(self):
        """
        Return True while there are still URLs queued or being processed.
        """
        record = self.db.crawl_queue.find_one(
            {'status': {'$ne': self.COMPLETE}}
        )
        return True if record else False

    def push(self, url):
        """Add a new URL to the queue if it is not already there.
        """
        try:
            self.db.crawl_queue.insert_one({'_id': url, 'status': self.OUTSTANDING})
        except errors.DuplicateKeyError:
            pass  # the URL is already in the queue

    def pop(self):
        """Get an OUTSTANDING URL from the queue and set its status to PROCESSING.
        If the queue is empty, a KeyError exception is raised.
        """
        # atomically fetch one OUTSTANDING record and mark it as PROCESSING
        record = self.db.crawl_queue.find_one_and_update(
            {'status': self.OUTSTANDING},
            {'$set': {'status': self.PROCESSING, 'timestamp': datetime.now()}}
        )
        if record:
            return record['_id']
        else:
            self.repair()
            raise KeyError()

    def peek(self):
        """
        Return an OUTSTANDING URL from the queue, or None if there is none.
        """
        record = self.db.crawl_queue.find_one({'status': self.OUTSTANDING})
        if record:
            return record['_id']

    def complete(self, url):
        """
        Mark the URL as COMPLETE.
        """
        self.db.crawl_queue.update_one({'_id': url}, {'$set': {'status': self.COMPLETE}})

    def repair(self):
        """Release stalled jobs:
        if a URL has been PROCESSING for longer than the timeout without completing,
        reset its status to OUTSTANDING so it can be crawled again.
        """
        record = self.db.crawl_queue.find_one_and_update(
            {
                'timestamp': {'$lt': datetime.now() - timedelta(seconds=self.timeout)},
                'status': {'$ne': self.COMPLETE}
            },
            {'$set': {'status': self.OUTSTANDING}}
        )
        if record:
            print('Released:', record['_id'])

    def clear(self):
        """
        Drop the whole queue collection.
        """
        self.db.crawl_queue.drop()
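Assuming a MongoDB server is reachable on localhost, the queue can be exercised on its own with a sketch like this (the flow mirrors the doctest above):

# mongo_queue_demo.py - minimal usage sketch (assumes MongoDB on localhost:27017)
from mongo_queue import MongoQueue

queue = MongoQueue(timeout=60)
queue.clear()                                 # start from an empty collection
queue.push('http://example.webscraping.com')  # enqueue a URL (status OUTSTANDING)
url = queue.pop()                             # fetch it back and mark it PROCESSING
# ... download and scrape url here ...
queue.complete(url)                           # mark it COMPLETE
print(bool(queue))                            # False: nothing left to crawl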

# process_crawler.py
import time
import urllib.parse
import threading
import multiprocessing
# from mongo_cache import MongoCache
from mongo_queue import MongoQueue
from downloader import Downloader

SLEEP_TIME = 1


def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wswp', proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Multithreaded crawler that pulls its URLs from the MongoDB-backed queue.
    """
    # queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    crawl_queue.clear()          # empty the queue of pending URLs
    crawl_queue.push(seed_url)   # add the seed URL to the queue
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        """
        Worker function run by every thread.
        """
        while True:
            try:
                # pop a URL, which marks it as PROCESSING in MongoDB
                url = crawl_queue.pop()
            except KeyError:
                # currently no URLs to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for {}: {}'.format(url, e))
                    else:
                        for link in links:
                            # push new URLs onto the queue; duplicates are ignored because
                            # the URL is the MongoDB _id, which must be unique
                            crawl_queue.push(normalize(seed_url, link))
                crawl_queue.complete(url)  # mark this URL as COMPLETE


    # wait until all the download threads have finished
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            # remove threads that are no longer alive
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            # room for more threads
            thread = threading.Thread(target=process_queue)
            thread.daemon = True    # daemon thread, so Ctrl-C can still terminate the main thread
            thread.start()          # start the thread
            threads.append(thread)  # keep track of the running threads
        time.sleep(SLEEP_TIME)


def process_crawler(args, **kwargs):
    """
    Multiprocess crawler: run one threaded_crawler per CPU core.
    """
    # number of CPU cores on this machine
    num_cpus = multiprocessing.cpu_count()
    print('Starting {} processes'.format(num_cpus))
    processes = []
    for i in range(num_cpus):
        p = multiprocessing.Process(target=threaded_crawler, args=[args], kwargs=kwargs)
        p.start()  # start the child process
        processes.append(p)
    # wait for the child processes to finish
    for p in processes:
        p.join()  # block the parent until this child process exits


def normalize(seed_url, link):
    """Normalize this URL by removing the hash fragment and adding the domain.
    """
    link, _ = urllib.parse.urldefrag(link)  # remove the hash to avoid duplicates
    return urllib.parse.urljoin(seed_url, link)
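Putting the pieces together, a crawl could be launched as in the hedged sketch below; extract_links is the same stand-in callback sketched in section 1, and the seed URL is only an example.

# launch sketch (illustrative): one threaded_crawler per CPU core
import re
from process_crawler import process_crawler

def extract_links(url, html):
    """Stand-in callback: return all href values found in the page."""
    if html is None:
        return []
    return re.findall(r'href=["\'](.*?)["\']', html)

if __name__ == '__main__':
    process_crawler('http://example.webscraping.com', delay=1,
                    scrape_callback=extract_links, max_threads=5, timeout=30)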

3. References

[1] Web Scraping with Python (Chinese edition:《用python写web爬虫》)
