1. mongoqueue.py
from datetime import datetime, timedelta
from pymongo import MongoClient, errors


class MongoQueue:
    # possible states of a download
    OUTSTANDING, PROCESSING, COMPLETE = range(3)

    def __init__(self, client=None, timeout=300):
        self.client = MongoClient() if client is None else client
        self.db = self.client.cache
        self.timeout = timeout

    def __nonzero__(self):
        """True while there are still jobs that are not yet complete"""
        record = self.db.crawl_queue.find_one(
            {'status': {'$ne': self.COMPLETE}}
        )
        return True if record else False

    def push(self, url):
        """Add a new URL to the queue if it does not already exist"""
        try:
            self.db.crawl_queue.insert({'_id': url, 'status': self.OUTSTANDING})
        except errors.DuplicateKeyError:
            pass  # this URL is already in the queue

    def pop(self):
        """Get an outstanding URL from the queue and mark it as processing"""
        record = self.db.crawl_queue.find_and_modify(
            query={'status': self.OUTSTANDING},
            update={'$set': {'status': self.PROCESSING,
                             'timestamp': datetime.now()}}
        )
        if record:
            return record['_id']
        else:
            self.repair()
            raise KeyError()

    def complete(self, url):
        """Mark this URL as successfully downloaded"""
        self.db.crawl_queue.update(
            {'_id': url},
            {'$set': {'status': self.COMPLETE}}
        )

    def repair(self):
        """Release stalled jobs back to the outstanding state"""
        record = self.db.crawl_queue.find_and_modify(
            query={
                'timestamp': {'$lt': datetime.now() - timedelta(seconds=self.timeout)},
                'status': {'$ne': self.COMPLETE}
            },
            update={'$set': {'status': self.OUTSTANDING}}
        )
        if record:
            print 'Released', record['_id']
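As a quick sanity check, the queue can be driven on its own before wiring it into the crawler. This is a minimal sketch, assuming a MongoDB server is running on the default localhost:27017 and using placeholder URLs:

# exercise MongoQueue directly (assumes a local mongod on the default port)
from mongoqueue import MongoQueue

queue = MongoQueue(timeout=300)
queue.push('http://example.webscraping.com/view/1')   # placeholder URL
queue.push('http://example.webscraping.com/view/1')   # duplicate is silently ignored
queue.push('http://example.webscraping.com/view/2')

url = queue.pop()       # marks the record as PROCESSING and returns its _id
print 'Processing', url
queue.complete(url)     # marks it COMPLETE so it is never handed out again

print bool(queue)       # True while any record is not yet COMPLETE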
2. process_crawler.py
# -*- coding: utf-8 -*-
import time
import threading
import urlparse
import robotparser
import csv
import re

import lxml.html

from downloader import Downloader
from mongoqueue import MongoQueue

SLEEP_TIME = 1
DEFAULT_AGENT = 'wswp'
DEFAULT_DELAY = 5
DEFAULT_RETRIES = 1
DEFAULT_TIMEOUT = 60


def process_crawler(seed_url, cache=None, delay=DEFAULT_DELAY, user_agent='wswp',
                    proxies=None, num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT,
                    sleep_time=SLEEP_TIME, max_threads=10, scrape_callback=None):
    # multithreaded version (in-memory queue):
    # crawl_queue = [seed_url]
    # seen = set([seed_url])
    # multiprocess version: share the queue between processes through MongoDB
    crawl_queue = MongoQueue()
    crawl_queue.push(seed_url)
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies,
                   num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except KeyError:
                # the queue is empty (MongoQueue.pop raises KeyError)
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print 'Error in callback for: {}: {}'.format(url, e)
                    else:
                        for link in links:
                            # multithreaded version (in-memory queue):
                            # link = normalize(seed_url, link)
                            # # check whether already crawled this link
                            # if link not in seen:
                            #     seen.add(link)
                            #     # add this new link to queue
                            #     crawl_queue.append(link)
                            # multiprocess version: push to the MongoDB queue
                            crawl_queue.push(normalize(seed_url, link))
                crawl_queue.complete(url)

    threads = []
    while threads or crawl_queue:
        # remove threads that have finished
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # can start some more download threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # so the main thread can exit on Ctrl-C
            thread.start()
            threads.append(thread)
        time.sleep(sleep_time)


def get_robots(url):
    """Initialize the robots parser for this domain"""
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))  # absolute link to robots.txt
    rp.read()
    return rp


def normalize(seed_url, link):
    """Normalize this URL by removing the hash fragment and adding the domain"""
    # urldefrag(url) splits the URL into a (URL without fragment, fragment) pair;
    # dropping the fragment avoids crawling duplicates
    link, _ = urlparse.urldefrag(link)
    return urlparse.urljoin(seed_url, link)  # absolute link


def same_domain(url1, url2):
    """Return True if both URLs belong to the same domain"""
    # urlparse() splits a URL into a 6-tuple of components; netloc is the domain
    return urlparse.urlparse(url1).netloc == urlparse.urlparse(url2).netloc


def get_links(html):
    """Return a list of links found in html"""
    # re.compile() turns the regular-expression string into a pattern object,
    # which is then used to extract every <a href="..."> value from the page
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


class ScrapeCallback:
    def __init__(self):
        self.writer = csv.writer(open('countries.csv', 'w'))
        self.fields = ('area', 'population', 'iso', 'country', 'capital',
                       'continent', 'tld', 'currency_code', 'currency_name',
                       'phone', 'postal_code_format', 'postal_code_regex',
                       'languages')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('view', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                row.append(tree.cssselect(
                    'table > tr#places_{}__row > td.w2p_fw'.format(field)
                )[0].text_content())
            self.writer.writerow(row)
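The threaded crawler can also be run directly with the ScrapeCallback defined above. A minimal sketch, assuming MongoDB is running locally and using the book's demo site as a placeholder seed URL (substitute whatever site you are actually crawling):

# hedged example invocation of the threaded crawler with the CSV callback above;
# the seed URL is an assumption, not part of the original listing
if __name__ == '__main__':
    process_crawler('http://example.webscraping.com/',
                    scrape_callback=ScrapeCallback(),
                    max_threads=5,
                    timeout=10)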
3. process_link_crawler.py
# note: do not save this module as multiprocessing.py, or it will shadow the
# standard-library multiprocessing package imported below
import multiprocessing

from threaded_crawler import threaded_crawler


def process_link_crawler(args, **kwargs):
    # start one threaded crawler per CPU core
    num_cpus = multiprocessing.cpu_count()
    print 'Starting {} processes'.format(num_cpus)
    processes = []
    for i in range(num_cpus):
        p = multiprocessing.Process(target=threaded_crawler, args=[args], kwargs=kwargs)
        p.start()
        processes.append(p)
    # wait for all crawler processes to finish
    for p in processes:
        p.join()

4. process_test.py
import sys

from process_crawler import process_crawler
from mongo_cache import MongoCache
from alexa_cb import AlexaCallback


def main(max_threads):
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    cache.clear()
    process_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback,
                    cache=cache, max_threads=max_threads, timeout=10)


if __name__ == '__main__':
    # hard-code the arguments that would normally come from `time python process_test.py 5`
    sys.argv = ['$ time python process_test.py', 5]
    max_threads = int(sys.argv[1])
    main(max_threads)
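Because the script overwrites sys.argv, the thread count is effectively fixed at 5. If you would rather pass it on the real command line (e.g. `time python process_test.py 5`), a minimal variant of the entry point could look like this, falling back to 5 threads when no argument is given (an assumption, not part of the original listing):

# alternative entry point that reads max_threads from the command line
if __name__ == '__main__':
    max_threads = int(sys.argv[1]) if len(sys.argv) > 1 else 5
    main(max_threads)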