Previous posts in this series:
Crawler Learning Series 02 - Common ways to download and extract web pages
Crawler Learning Series 03 - Download caching
In the previous posts all of our code ran serially, which is very inefficient; once there are many URLs to crawl, that throughput is simply unacceptable, so it is well worth building a concurrent crawler.
Concurrency can be achieved with multiple threads or multiple processes. A process is the basic unit of resource allocation that can run independently in the system; it consists of machine instructions, data, a stack and so on, and is an independently running entity. A thread is an entity inside a process and is the basic unit of scheduling and dispatch; under Linux a thread can be regarded as a lightweight process. A thread is a relatively independent executable unit within a process: if a process is called a task, then a thread is the execution of one subtask of that application. For the details of how threads and processes relate and differ, see
Differences and relationships between programs, processes, and threads
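In Python these two models correspond to the standard-library threading and multiprocessing modules; threads are well suited to I/O-bound work such as downloading pages, while extra processes are what let the crawler use more than one CPU core. As a minimal illustration (not part of the crawler itself):

import threading
import multiprocessing

def work(name):
    print('running in', name)

if __name__ == '__main__':
    t = threading.Thread(target=work, args=('a thread of this process',))
    p = multiprocessing.Process(target=work, args=('a separate process',))
    t.start(); p.start()
    t.join(); p.join()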
1. Multithreaded crawler
Requesting pages from many threads at once can overload the server or get your IP address banned. The crawler therefore needs a delay setting: the minimum interval between two requests to the same domain, for which about 1 second is usually a sensible value.
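The Downloader class imported below handles this delay internally. Its throttling logic is roughly equivalent to the sketch below, which only illustrates the idea and is not the actual Downloader code:

import time
import urllib.parse

class Throttle:
    """Illustrative sketch: enforce a minimum delay between requests to the same domain."""
    def __init__(self, delay):
        self.delay = delay    # minimum number of seconds between requests to one domain
        self.domains = {}     # domain -> time.time() of the last request

    def wait(self, url):
        domain = urllib.parse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (time.time() - last_accessed)
            if sleep_secs > 0:
                time.sleep(sleep_secs)   # requested this domain too recently, so sleep
        self.domains[domain] = time.time()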
# threaded_crawler.py
import time
import threading
import urllib.parse

from downloader import Downloader

SLEEP_TIME = 1


def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wswp', proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl a website with multiple threads.

    :param seed_url: the URL the crawl starts from
    :param delay: delay between requests to the same domain, in seconds
    :param cache: download cache
    :param scrape_callback: callback invoked with each downloaded page
    :param user_agent: user agent string
    :param proxies: proxies to download through
    :param num_retries: number of retries on download errors
    :param max_threads: maximum number of threads
    :param timeout: download timeout, in seconds
    """
    # queue of URLs that still need to be crawled, implemented as a list
    # crawl_queue = Queue.deque([seed_url])
    crawl_queue = [seed_url]
    # URLs that have already been added to the queue
    seen = set([seed_url])
    # create a Downloader instance
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        """Task function executed by each thread."""
        while True:
            try:
                # take a URL from the queue
                url = crawl_queue.pop()
            except IndexError:
                # the queue is empty, so exit the loop
                break
            else:
                # download the HTML for this URL
                html = D(url)
                # if a callback was provided, invoke it
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            link = normalize(seed_url, link)
                            # check whether this URL has been queued before
                            if link not in seen:
                                seen.add(link)
                                # add the new URL to the queue
                                crawl_queue.append(link)

    # wait for all threads to finish
    threads = []
    while threads or crawl_queue:
        # remove threads that are no longer alive (iterate over a copy while removing)
        for thread in threads[:]:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # more threads can be started
            thread = threading.Thread(target=process_queue)
            thread.daemon = True  # daemon threads let the main thread exit on ctrl-c
            thread.start()  # start the thread
            threads.append(thread)  # keep track of the running threads
        # all threads are busy downloading,
        # so sleep for a while and let the CPU run the other threads
        time.sleep(SLEEP_TIME)


def normalize(seed_url, link):
    """Normalize this URL by removing the hash fragment and adding the domain."""
    link, _ = urllib.parse.urldefrag(link)  # remove hash to avoid duplicates
    return urllib.parse.urljoin(seed_url, link)
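For reference, the threaded crawler above could be started like this; the scrape_callback here is a made-up example whose only job is to return the links to follow from each page:

# example_run.py -- illustrative usage; the callback and its link pattern are hypothetical
import re
from threaded_crawler import threaded_crawler

def scrape_callback(url, html):
    """Return the links found in the downloaded page."""
    if not html:
        return []
    if isinstance(html, bytes):                      # the Downloader may return bytes
        html = html.decode('utf-8', errors='ignore')
    return re.findall(r'href="([^"]+)"', html)       # a real crawler would filter these links

if __name__ == '__main__':
    threaded_crawler('http://example.webscraping.com', delay=1,
                     scrape_callback=scrape_callback, max_threads=5)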
2. Multiprocess crawler
A Python list that lives inside one process cannot be shared with other processes, so to spread the crawl over several processes the URL queue is moved into MongoDB: every process, and every thread inside it, pushes and pops URLs from the same crawl_queue collection. The queue is implemented first, then the crawler that uses it.
# mongo_queue.py
from datetime import datetime, timedelta

from pymongo import MongoClient, errors


class MongoQueue:
    """
    >>> timeout = 1
    >>> url = 'http://example.webscraping.com'
    >>> q = MongoQueue(timeout=timeout)
    >>> q.clear()  # drop the queue collection (Mongo's name for a table) so it starts empty
    >>> q.push(url)  # add a test URL
    >>> q.peek() == q.pop() == url  # pop that URL back
    True
    >>> q.repair()  # the timeout has not elapsed yet, so nothing is released
    >>> q.pop()  # another pop should be empty
    >>> q.peek()
    >>> import time; time.sleep(timeout)  # sleep long enough for the crawl to count as stalled
    >>> q.repair()  # now the stalled URL is reset to OUTSTANDING
    Released: http://example.webscraping.com
    >>> q.pop() == url  # pop the URL again
    True
    >>> bool(q)  # queue is still active while a URL is outstanding
    True
    >>> q.complete(url)  # mark this URL as crawled
    >>> bool(q)  # nothing is left, so the queue is finished
    False
    """
    # the three possible download states:
    # OUTSTANDING - the URL has just been added to the queue
    # PROCESSING  - the URL has been popped from the queue and is being downloaded
    # COMPLETE    - the download has finished
    OUTSTANDING, PROCESSING, COMPLETE = range(3)

    def __init__(self, client=None, timeout=300):
        """
        client = MongoClient('mongodb://user:password@host:port')
        :param client: MongoDB connection
        :param timeout: number of seconds after which an unfinished crawl counts as stalled
        """
        # the default MongoClient() connects to the local server; to make the crawler
        # distributed, pass in a client that points at a shared host and port
        self.client = MongoClient() if client is None else client
        self.db = self.client.cache
        self.timeout = timeout

    def __bool__(self):
        """
        Return True while there are URLs waiting to be processed or still being processed.
        """
        record = self.db.crawl_queue.find_one(
            {'status': {'$ne': self.COMPLETE}}
        )
        return True if record else False

    def push(self, url):
        """Add a new URL to the queue if it does not exist yet."""
        try:
            self.db.crawl_queue.insert_one({'_id': url, 'status': self.OUTSTANDING})
        except errors.DuplicateKeyError:
            pass  # this URL is already in the queue

    def pop(self):
        """Get an OUTSTANDING URL from the queue and set its status to PROCESSING.
        If the queue is empty a KeyError exception is raised.
        """
        # atomically fetch a record and update its status
        record = self.db.crawl_queue.find_one_and_update(
            {'status': self.OUTSTANDING},
            {'$set': {'status': self.PROCESSING, 'timestamp': datetime.now()}}
        )
        if record:
            return record['_id']
        else:
            self.repair()
            raise KeyError()

    def peek(self):
        """
        Return an OUTSTANDING URL from the queue, or None if there is none.
        """
        record = self.db.crawl_queue.find_one({'status': self.OUTSTANDING})
        if record:
            return record['_id']

    def complete(self, url):
        """
        Set this URL's status to COMPLETE.
        """
        self.db.crawl_queue.update_one({'_id': url}, {'$set': {'status': self.COMPLETE}})

    def repair(self):
        """Release stalled jobs:
        if a URL was popped more than `timeout` seconds ago and is still not complete,
        reset its status to OUTSTANDING so it can be crawled again.
        """
        record = self.db.crawl_queue.find_one_and_update(
            {
                'timestamp': {'$lt': datetime.now() - timedelta(seconds=self.timeout)},
                'status': {'$ne': self.COMPLETE}
            },
            {'$set': {'status': self.OUTSTANDING}}
        )
        if record:
            print('Released:', record['_id'])

    def clear(self):
        """
        Drop the whole queue collection.
        """
        self.db.crawl_queue.drop()
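As the comment in __init__ points out, the queue becomes distributed simply by pointing every machine at the same MongoDB server; for example (the host name here is made up):

from pymongo import MongoClient
from mongo_queue import MongoQueue

# every crawling machine connects to the same (hypothetical) MongoDB host,
# so they all push to and pop from one shared crawl queue
client = MongoClient('mongodb://crawl-db.example.com:27017')
queue = MongoQueue(client=client, timeout=300)
queue.push('http://example.webscraping.com')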
# process_crawler.py
import time
import urllib.parse
import threading
import multiprocessing

# from mongo_cache import MongoCache
from mongo_queue import MongoQueue
from downloader import Downloader

SLEEP_TIME = 1


def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wswp', proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Multithreaded crawler that works against the shared MongoDB queue."""
    # queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    crawl_queue.clear()  # empty the queue of URLs waiting to be crawled
    crawl_queue.push(seed_url)  # add the seed URL to the queue
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        """Task function executed by each thread."""
        while True:
            try:
                # pop a URL; MongoQueue keeps track of which URLs are being processed
                url = crawl_queue.pop()
            except KeyError:
                # currently no URLs to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            # push new URLs onto the queue; duplicates are not a problem here,
                            # because the URL is the primary key (_id) in MongoDB
                            crawl_queue.push(normalize(seed_url, link))
                crawl_queue.complete(url)  # mark this URL as COMPLETE

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        # remove threads that are no longer alive (iterate over a copy while removing)
        for thread in threads[:]:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            # more threads can be started
            thread = threading.Thread(target=process_queue)
            thread.daemon = True  # daemon threads let the main thread exit on ctrl-c
            thread.start()  # start the thread
            threads.append(thread)  # keep track of the running threads
        time.sleep(SLEEP_TIME)


def process_crawler(args, **kwargs):
    """
    Multiprocess crawler: start one threaded crawler per CPU core.
    """
    # number of CPU cores on this machine
    num_cpus = multiprocessing.cpu_count()
    # pool = multiprocessing.Pool(processes=num_cpus)
    print('Starting {} processes'.format(num_cpus))
    processes = []
    for i in range(num_cpus):
        p = multiprocessing.Process(target=threaded_crawler, args=[args], kwargs=kwargs)
        # parsed = pool.apply_async(threaded_link_crawler, args, kwargs)
        p.start()  # start the child process
        processes.append(p)
    # wait for the child processes to finish
    for p in processes:
        p.join()  # block the parent process until this child has finished


def normalize(seed_url, link):
    """Normalize this URL by removing the hash fragment and adding the domain."""
    link, _ = urllib.parse.urldefrag(link)  # remove hash to avoid duplicates
    return urllib.parse.urljoin(seed_url, link)
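Putting it together: process_crawler starts one threaded_crawler per CPU core, and every process (and every thread inside it) works against the same MongoDB queue. A minimal launch script could look like the following, where the callback is again a made-up example:

# run_process_crawler.py -- illustrative usage; the callback is hypothetical
import re
from process_crawler import process_crawler

def scrape_callback(url, html):
    if not html:
        return []
    if isinstance(html, bytes):
        html = html.decode('utf-8', errors='ignore')
    return re.findall(r'href="([^"]+)"', html)

if __name__ == '__main__':
    # the __main__ guard is required so spawned child processes do not re-run this block
    process_crawler('http://example.webscraping.com', delay=1,
                    scrape_callback=scrape_callback, max_threads=5)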