Multiprocess Crawler

This article presents a crawler queue management system built on MongoDB. Written in Python, it uses MongoDB as the data store to track the state of crawl tasks. It explains how status flags distinguish outstanding, in-progress, and completed tasks, and provides concrete multithreaded and multiprocess implementations for working through the queue.

1. mongoqueue.py

from datetime import datetime, timedelta
from pymongo import MongoClient, errors

class MongoQueue:
    # possible states of a download
    OUTSTANDING, PROCESSING, COMPLETE = range(3)

    def __init__(self, client=None, timeout=300):
        self.client = MongoClient() if client is None else client
        self.db = self.client.cache
        self.timeout = timeout

    def __nonzero__(self):
        """Return True while any URL is still outstanding or being processed"""
        record = self.db.crawl_queue.find_one(
            {'status': {'$ne': self.COMPLETE}}
        )
        return True if record else False

    def push(self, url):
        """Add a new URL to the queue if it is not already present"""
        try:
            # the URL itself is the _id, so MongoDB enforces uniqueness
            self.db.crawl_queue.insert({'_id': url, 'status': self.OUTSTANDING})
        except errors.DuplicateKeyError:
            pass  # this URL is already in the queue

    def pop(self):
        """Get an outstanding URL from the queue and mark it as PROCESSING.
        Raises KeyError when no outstanding URL is available."""
        record = self.db.crawl_queue.find_and_modify(
            query={'status': self.OUTSTANDING},
            update={'$set': {'status': self.PROCESSING,
                             'timestamp': datetime.now()}}
        )
        if record:
            return record['_id']
        else:
            self.repair()  # release any stalled jobs before reporting empty
            raise KeyError()

    def complete(self, url):
        """Mark this URL as successfully downloaded"""
        self.db.crawl_queue.update(
            {'_id': url}, {'$set': {'status': self.COMPLETE}})

    def repair(self):
        """Release stalled jobs whose timestamp has exceeded the timeout"""
        record = self.db.crawl_queue.find_and_modify(
            query={
                'timestamp': {'$lt': datetime.now() -
                    timedelta(seconds=self.timeout)},
                'status': {'$ne': self.COMPLETE}
            },
            update={'$set': {'status': self.OUTSTANDING}}
        )
        if record:
            print 'Released:', record['_id']
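
Because each URL is stored as the document's _id, MongoDB's built-in unique index on _id is what makes the DuplicateKeyError handling in push() work. Below is a minimal usage sketch of the queue, assuming a MongoDB server running locally on the default port; the URL is only an illustrative placeholder.

from mongoqueue import MongoQueue

queue = MongoQueue(timeout=300)
queue.push('http://example.webscraping.com')  # inserted as OUTSTANDING
queue.push('http://example.webscraping.com')  # duplicate _id, silently ignored

url = queue.pop()        # OUTSTANDING -> PROCESSING, timestamp recorded
# ... download and scrape the page here ...
queue.complete(url)      # PROCESSING -> COMPLETE

print bool(queue)        # False once every URL has reached COMPLETE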

2. process_crawler.py
# -*- coding: utf-8 -*-
import time
import threading
from downloader import Downloader
import urlparse
import robotparser
import csv
import re
import lxml.html
from mongoqueue import MongoQueue
SLEEP_TIME = 1
DEFAULT_AGENT = 'wswp'
DEFAULT_DELAY = 5
DEFAULT_RETRIES = 1
DEFAULT_TIMEOUT = 60

def process_crawler(seed_url, cache=None, delay=DEFAULT_DELAY,
                    user_agent=DEFAULT_AGENT, proxies=None,
                    num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT,
                    sleep_time=SLEEP_TIME, max_threads=10, scrape_callback=None):
    # multithreaded version:
    # crawl_queue = [seed_url]
    # seen = set([seed_url])
    # multiprocess version: share work through a MongoDB-backed queue
    crawl_queue = MongoQueue()
    crawl_queue.push(seed_url)

    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except KeyError:  # MongoQueue.pop raises KeyError when the queue is empty
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print 'Error in callback for: {}: {}'.format(url, e)
                    else:
                        for link in links:
                            # multithreaded version:
                            # link = normalize(seed_url, link)
                            # # check whether already crawled this link
                            # if link not in seen:
                            #     seen.add(link)
                            #     # add this new link to queue
                            #     crawl_queue.append(link)
                            # multiprocess version: push the normalized link into the shared queue
                            crawl_queue.push(normalize(seed_url, link))
                crawl_queue.complete(url)

    # spawn and supervise worker threads until the queue is exhausted
    threads = []
    while threads or crawl_queue:
        # remove finished threads
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # start another worker running process_queue
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # daemon threads let the main thread exit on Ctrl-C
            thread.start()
            threads.append(thread)
        time.sleep(sleep_time)  # yield the CPU to the worker threads

def get_robots(url):
    """Initialize robots parser for this domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))  # absolute URL of robots.txt
    rp.read()
    return rp

def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urlparse.urldefrag(link)  # urldefrag splits the URL into (url without fragment, fragment); drop the hash to avoid duplicates
    return urlparse.urljoin(seed_url, link)  # join with the seed URL to form an absolute link

def same_domain(url1, url2):
    """Return True if both URL's belong to same domain
    """
    # urlparse splits a URL into a 6-tuple of components; compare the netloc (domain) parts
    return urlparse.urlparse(url1).netloc == urlparse.urlparse(url2).netloc

def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    # re.compile() compiles the regex into a Pattern object; findall() then returns every href value matched in the page
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)

class ScrapeCallback:
    def __init__(self):
        self.writer = csv.writer(open('countries.csv', 'w'))
        self.fields = ('area', 'population', 'iso', 'country', 'capital',
          'continent', 'tld', 'currency_code', 'currency_name',
          'phone', 'postal_code_format', 'postal_code_regex', 'languages')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('view', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                row.append(tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content())
            # write the row once all fields have been collected
            self.writer.writerow(row)
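
A quick sanity check of the link helpers above; this is only a sketch, and the HTML snippet and URLs are made-up examples:

html = '<a href="/places/default/view/Afghanistan-1">Afghanistan</a>'
links = get_links(html)
print links   # ['/places/default/view/Afghanistan-1']
print normalize('http://example.webscraping.com/', links[0] + '#anchor')
# fragment stripped and the link made absolute:
# http://example.webscraping.com/places/default/view/Afghanistan-1
print same_domain('http://example.webscraping.com/index',
                  'http://example.webscraping.com/places/default/view/Afghanistan-1')   # True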

3. multiprocessing.py
# note: in practice this script should not literally be named multiprocessing.py,
# or it will shadow the standard library module imported below
import multiprocessing
from threaded_crawler import threaded_crawler

def process_link_crawler(args, **kwargs):
    # launch one threaded crawler process per CPU core
    num_cpus = multiprocessing.cpu_count()
    print 'Starting {} processes'.format(num_cpus)
    processes = []
    for i in range(num_cpus):
        p = multiprocessing.Process(target=threaded_crawler, args=[args], kwargs=kwargs)
        p.start()
        processes.append(p)

    # wait for all crawler processes to finish
    for p in processes:
        p.join()
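
As a rough sketch of how this wrapper could be invoked. It assumes a threaded_crawler module whose threaded_crawler function takes the same arguments as process_crawler above; the seed URL and callback are illustrative only. Because every process pops from and pushes to the same MongoDB crawl_queue collection, the work is shared rather than duplicated.

from mongo_cache import MongoCache
from process_crawler import ScrapeCallback

if __name__ == '__main__':   # guard needed when multiprocessing spawns new interpreters
    process_link_crawler('http://example.webscraping.com',
                         scrape_callback=ScrapeCallback(),
                         cache=MongoCache(),
                         max_threads=5)
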
4. process_test.py
import sys
from process_crawler import process_crawler
from mongo_cache import MongoCache
from alexa_cb import AlexaCallback


def main(max_threads):
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    cache.clear()
    process_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback, cache=cache, max_threads=max_threads, timeout=10)


if __name__ == '__main__':
    sys.argv = ['process_test.py', '5']  # simulate: $ time python process_test.py 5
    max_threads = int(sys.argv[1])
    main(max_threads)

