To improve a crawler's efficiency we can fetch data with multiple threads or multiple processes. This post explains how to make a crawler download pages concurrently.
Alexa publishes a list of the one million most popular websites. The compressed list file can be downloaded directly from http://s3.amazonaws.com/alexa-static/top-1m.csv.zip.
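Each row of the CSV inside the archive is just a rank and a domain (for example "1,google.com"). As a quick, hedged sketch (it assumes the archive has already been saved locally as top-1m.csv.zip), the first few entries can be inspected like this:
# Sketch: peek at the first rows of the Alexa list
# (assumes top-1m.csv.zip has already been downloaded to the current directory).
import csv
from zipfile import ZipFile

with ZipFile('top-1m.csv.zip', 'r') as zf:
    csv_filename = zf.namelist()[0]  # the archive contains a single CSV file
    for rank, website in csv.reader(zf.open(csv_filename)):
        print rank, website          # e.g. "1 google.com"
        if int(rank) >= 5:
            break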
Below we download the pages of the websites listed in that file with a single thread, with multiple threads and with multiple processes, and compare how long each approach takes.
Method of comparison: each crawler function calls the callback class AlexaCallback, which parses the compressed file, extracts all the URLs it contains and puts them into the crawl queue. The same set of URLs is then downloaded with a single thread, with multiple threads and with multiple processes, and the elapsed times are compared.
Single-threaded download
First, the scrape_callback callback used in the previous posts needs to be modified. alexa_cb.py:
# -*- coding: utf-8 -*-
import csv
from zipfile import ZipFile
from StringIO import StringIO
#from mongo_cache import MongoCache
from new_chapter3.mongo_cache import MongoCache


class AlexaCallback:
    def __init__(self, max_urls=1000):
        self.max_urls = max_urls
        self.seed_url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'

    def __call__(self, url, html):
        """
        When the callback is invoked, the incoming url is compared with seed_url.
        If they match, the compressed file is parsed, all the URLs inside it are
        collected into a list, and that list is returned.
        """
        if url == self.seed_url:  # only true for the first (seed) download
            urls = []
            cache = MongoCache()
            #with ZipFile(StringIO(html)) as zf:
            with ZipFile('top-1m.csv.zip', 'r') as zf:
                csv_filename = zf.namelist()[0]
                for _, website in csv.reader(zf.open(csv_filename)):
                    if 'http://' + website not in cache:
                        urls.append('http://' + website)
                        if len(urls) == self.max_urls:
                            break
            return urls
Originally I wanted to download the compressed file directly through the earlier Downloader class using url='http://s3.amazonaws.com/alexa-static/top-1m.csv.zip' and then parse it, but the address is not reachable from China without a proxy and the download kept failing in the program, so I downloaded the file in a browser and open and parse the local copy in the code instead.
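If a working proxy is available, the archive could also be fetched and saved in code instead of via the browser. A minimal sketch with urllib2 (the proxy address below is only a placeholder assumption, not a real service):
# Sketch: download the Alexa archive through a proxy and save it locally.
# The proxy address is only a placeholder; replace it with one that works for you.
import urllib2

url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'
opener = urllib2.build_opener(urllib2.ProxyHandler({'http': 'http://127.0.0.1:8080'}))
data = opener.open(url, timeout=60).read()
with open('top-1m.csv.zip', 'wb') as f:
    f.write(data)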
The list of URLs returned by the callback above is then put into the crawl queue and crawled. link_crawler.py:
#coding:utf-8
import re
import urlparse
import urllib2
import time
import datetime
import robotparser
from new_chapter3.downloader import Downloader


def link_crawler(seed_url, delay=5, user_agent='wswp', proxies=None,
                 num_retries=1, scrape_callback=None, cache=None, ignore_robots=False):
    """Crawl from the given seed URL following links matched by link_regex
    """
    # the queue of URL's that still need to be crawled
    crawl_queue = [seed_url]
    # the URL's that have been seen
    seen = set([seed_url])
    rp = get_robots(seed_url)
    # create a single Downloader object here so that delay, user agent, proxies,
    # retries and cache are passed in once instead of on every call below
    D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies,
                   num_retries=num_retries, cache=cache)
    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url) or ignore_robots:
            html = None  # so that a failed download does not leave html undefined
            try:
                html = D(url)  # download the html for this url
            except Exception:
                pass
            if scrape_callback:
                try:
                    # the callback returns the list of urls parsed from the zip
                    # file; it only yields results for the seed url
                    links = scrape_callback(url, html) or []
                except Exception as e:
                    print 'Error in callback for: {}: {}'.format(url, e)
                else:
                    for link in links:
                        link = normalize(seed_url, link)
                        # check whether this link has already been crawled
                        if link not in seen:
                            seen.add(link)
                            # if not, add it to the crawl queue
                            crawl_queue.append(link)


def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urlparse.urldefrag(link)  # remove hash to avoid duplicates
    return urlparse.urljoin(seed_url, link)


def same_domain(url1, url2):
    """Return True if both URL's belong to same domain
    """
    return urlparse.urlparse(url1).netloc == urlparse.urlparse(url2).netloc


def get_robots(url):
    """Initialize robots parser for this domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp
The crawler above simply puts all the URLs from the compressed file into the crawl queue and crawls them sequentially. Let's run a test and see how long it takes. sequential_test.py:
# -*- coding: utf-8 -*-
import time
from link_crawler import link_crawler
from new_chapter3.mongo_cache import MongoCache
from alexa_cb import AlexaCallback


def main():
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    cache.clear()
    link_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback,
                 cache=cache, user_agent='GoodCrawler', ignore_robots=True)


if __name__ == '__main__':
    start = time.time()
    main()
    end = time.time()
    print 'Sequential crawl took', (end - start)
Multi-threaded crawler
A multi-threaded crawler can issue requests so quickly that it overloads the target server or gets its IP banned. To avoid this, the crawler takes a delay parameter that enforces a minimum interval between two requests to the same domain.
In this post's example, however, the compressed file lists more than a million websites whose domains are almost all different, so that problem does not really arise here. A minimal sketch of the per-domain throttling idea follows.
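The Downloader class used in the earlier posts applies this delay internally. As a rough, hedged sketch of the idea (not necessarily the exact implementation used there), a per-domain throttle could look like this:
# Sketch of a per-domain delay: remember when each domain was last requested
# and sleep if the next request to that domain would come too soon.
import time
import datetime
import urlparse


class Throttle:
    """Sleep between downloads to the same domain"""
    def __init__(self, delay):
        self.delay = delay    # minimum seconds between requests to one domain
        self.domains = {}     # domain -> time of the last request

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.datetime.now()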
To implement the multi-threaded crawler, link_crawler needs to be modified to support multiple threads. The modified threaded_crawler.py:
#coding:utf-8
import time
import threading
import urlparse
from new_chapter3.downloader import Downloader

SLEEP_TIME = 1


def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None,
                     user_agent='wswp', proxies=None, num_retries=1,
                     max_threads=10, timeout=60):
    """Crawl this website in multiple threads
    """
    # the queue of URL's that still need to be crawled
    #crawl_queue = Queue.deque([seed_url])
    crawl_queue = [seed_url]
    # the URL's that have been seen
    seen = set([seed_url])
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent,
                   proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                # crawl queue is empty
                break
            else:
                html = None  # so that a failed download does not leave html undefined
                try:
                    html = D(url)
                except Exception:
                    pass
                if scrape_callback:
                    try:
                        # the callback returns the list of urls parsed from the
                        # zip file; it only yields results for the seed url
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print 'Error in callback for: {}: {}'.format(url, e)
                    else:
                        for link in links:
                            link = normalize(seed_url, link)
                            # check whether already crawled this link
                            if link not in seen:
                                seen.add(link)
                                # add this new link to queue
                                crawl_queue.append(link)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:  # keep going while any thread is alive or urls remain
        # the crawl is still active
        for thread in threads:
            if not thread.is_alive():
                # remove the stopped threads
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # set daemon so main thread can exit on ctrl-c
            thread.start()
            threads.append(thread)
        # all threads have been processed
        # sleep temporarily so CPU can focus execution on other threads
        time.sleep(SLEEP_TIME)


def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urlparse.urldefrag(link)  # remove hash to avoid duplicates
    return urlparse.urljoin(seed_url, link)
Let's test the multi-threaded version and see how it performs.
threaded_test.py:
#coding:utf-8
import sys
import time
from threaded_crawler import threaded_crawler
from new_chapter3.mongo_cache import MongoCache
from alexa_cb import AlexaCallback


def main(max_threads):
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    cache.clear()
    threaded_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback,
                     cache=cache, max_threads=max_threads, timeout=10)


if __name__ == '__main__':
    start = time.time()
    max_threads = 5  # crawl with five threads in parallel
    main(max_threads)
    end = time.time()
    print 'Threaded crawl took', (end - start)
With five threads running concurrently, the crawl takes less than one fifth of the time the single-threaded version needed.
Multi-process crawler
In the multi-threaded crawler above, the crawl queue crawl_queue is defined inside the threaded_crawler function, which means it lives in the local memory of the process that calls it. Since every process has its own memory, multiple processes calling threaded_crawler at the same time would each create their own crawl queue and could not work through a single shared queue. To solve this, the crawl queue can be moved into a database.
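A quick, hedged illustration of the problem (the example.com URLs are just placeholders): appends made to an ordinary in-memory list inside a child process are not visible to the parent process.
# Sketch: an in-memory list is not shared between processes.
import multiprocessing

crawl_queue = ['http://example.com']

def worker():
    # this append only affects the child process's own copy of the list
    crawl_queue.append('http://example.org')
    print 'inside child process:', len(crawl_queue)   # 2

if __name__ == '__main__':
    p = multiprocessing.Process(target=worker)
    p.start()
    p.join()
    print 'inside parent process:', len(crawl_queue)  # still 1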
With the database approach the crawl queue is stored separately in the database: every crawling process fetches and adds URLs through database operations, and all processes operate on one and the same queue, so they can cooperate.
The database approach has several advantages: it is quick to develop, it comes with read/write protection and inter-process communication (IPC) built in, and only a single crawler program has to be written.
This requires a database-backed crawl queue. In the threaded_crawler function above (and in the earlier link_crawler), the crawl queue supports operations such as insert, pop, append and an emptiness check; the database queue must offer the same operations.
mongo_queue.py:
#coding:utf-8
from datetime import datetime, timedelta
from pymongo import MongoClient, errors


class MongoQueue:
    """
    A url in the queue can be in one of three states:
    OUTSTANDING: the url has been added to the crawl queue
    PROCESSING:  the url has been popped from the queue and is being downloaded
    COMPLETE:    the download has finished
    """
    OUTSTANDING, PROCESSING, COMPLETE = range(3)

    def __init__(self, client=None, timeout=300):
        """
        host: the host to connect to MongoDB
        port: the port to connect to MongoDB
        timeout: the number of seconds to allow for a timeout
        """
        self.client = MongoClient() if client is None else client
        self.db = self.client.cache
        self.timeout = timeout

    def __nonzero__(self):
        """
        A url's status only becomes COMPLETE once its download has finished, at
        which point the crawl can stop. This method looks for any record whose
        status is not COMPLETE, i.e. it checks whether some url is still waiting
        or being processed.
        '$ne' means "not equal".
        """
        record = self.db.crawl_queue.find_one(
            {'status': {'$ne': self.COMPLETE}}
        )
        return True if record else False

    def push(self, url):
        """
        Add the url to the crawl queue: _id is the url itself and its status
        is set to OUTSTANDING.
        """
        try:
            self.db.crawl_queue.insert({'_id': url, 'status': self.OUTSTANDING})
        except errors.DuplicateKeyError as e:
            pass  # this is already in the queue

    def pop(self):
        """
        Take a url out of the crawl queue: find one record whose status is
        OUTSTANDING (i.e. just inserted), set its status to PROCESSING and
        its timestamp to the current time.
        """
        record = self.db.crawl_queue.find_and_modify(
            query={'status': self.OUTSTANDING},
            update={'$set': {'status': self.PROCESSING, 'timestamp': datetime.now()}}
        )
        if record:
            return record['_id']
        else:
            self.repair()
            raise KeyError()

    def peek(self):
        # return one outstanding url from the queue without changing its status
        record = self.db.crawl_queue.find_one({'status': self.OUTSTANDING})
        if record:
            return record['_id']

    def complete(self, url):
        # called once the download has finished to mark the url as COMPLETE
        self.db.crawl_queue.update({'_id': url}, {'$set': {'status': self.COMPLETE}})

    def repair(self):
        """
        '$lt' means "less than".
        Look for urls that are not COMPLETE, i.e. still queued or being
        downloaded. If such a url's timestamp is older than now - timeout,
        the process handling it is assumed to have timed out, so its status
        is reset to OUTSTANDING and it can be processed again.
        """
        record = self.db.crawl_queue.find_and_modify(
            query={
                'timestamp': {'$lt': datetime.now() - timedelta(seconds=self.timeout)},
                'status': {'$ne': self.COMPLETE}
            },
            update={'$set': {'status': self.OUTSTANDING}}
        )
        if record:
            print 'Released:', record['_id']

    def clear(self):
        self.db.crawl_queue.drop()
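Before wiring this queue into the crawler, a small hand-driven example may help show how the states change (a sketch that assumes a MongoDB server is running locally on the default port):
# Sketch: exercising MongoQueue by hand (assumes MongoDB is running locally).
from mongo_queue import MongoQueue

queue = MongoQueue()
queue.clear()
queue.push('http://example.com')   # inserted with status OUTSTANDING
url = queue.pop()                  # status changes to PROCESSING
print url                          # http://example.com
print bool(queue)                  # True: the url is not COMPLETE yet
queue.complete(url)                # status changes to COMPLETE
print bool(queue)                  # False: nothing left to process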
To make the crawler use this database-backed queue, threaded_crawler needs to be modified again. process_crawler.py:
#coding:utf-8
import time
import urlparse
import threading
import multiprocessing
from new_chapter3.mongo_cache import MongoCache
from mongo_queue import MongoQueue
from new_chapter3.downloader import Downloader

SLEEP_TIME = 1


def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None,
                     user_agent='wswp', proxies=None, num_retries=1,
                     max_threads=10, timeout=60):
    """Crawl using multiple threads
    """
    # the queue of URL's that still need to be crawled
    # every process calling this function talks to the same MongoDB-backed queue;
    # no process creates its own in-memory queue, so the crawl queue is shared
    # by all processes instead of living in each process's private memory
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)  # adding the url sets its status to OUTSTANDING
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent,
                   proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            # keep track of the url currently being processed
            try:
                # pop finds an OUTSTANDING url (i.e. just inserted)
                # and updates its status to PROCESSING
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            else:
                html = None  # so that a failed download does not leave html undefined
                try:
                    html = D(url)
                except Exception:
                    pass
                if scrape_callback:
                    try:
                        # the callback returns the list of urls parsed from the
                        # zip file; it only yields results for the seed url
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print 'Error in callback for: {}: {}'.format(url, e)
                    else:
                        for link in links:
                            # add this new link to the queue;
                            # push sets its status to OUTSTANDING
                            crawl_queue.push(normalize(seed_url, link))
                # the download has finished, so mark the url as COMPLETE
                crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # set daemon so main thread can exit on ctrl-c
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)


def process_crawler(args, **kwargs):
    num_cpus = multiprocessing.cpu_count()  # number of CPU cores
    #pool = multiprocessing.Pool(processes=num_cpus)
    print 'Starting {} processes'.format(num_cpus)
    processes = []
    for i in range(num_cpus):
        # start one process per core, each running threaded_crawler
        p = multiprocessing.Process(target=threaded_crawler, args=[args], kwargs=kwargs)
        #parsed = pool.apply_async(threaded_link_crawler, args, kwargs)
        p.start()
        processes.append(p)
    # wait for processes to complete
    for p in processes:
        p.join()


def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urlparse.urldefrag(link)  # remove hash to avoid duplicates
    return urlparse.urljoin(seed_url, link)
With all that in place, let's run the multi-process test and see how it performs.
process_test.py:
# -*- coding: utf-8 -*-
import sys
import time
from process_crawler import process_crawler
from new_chapter3.mongo_cache import MongoCache
from alexa_cb import AlexaCallback


def main(max_threads):
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    cache.clear()
    process_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback,
                    cache=cache, max_threads=max_threads, timeout=10)


if __name__ == '__main__':
    start = time.time()
    max_threads = 4  # each process runs up to four threads
    main(max_threads)
    end = time.time()
    print 'Multi-process crawl took', (end - start)