A distributed crawler built on Redis
Redis is an excellent in-memory key-value database. It supports sets, lists and other data structures, and it is very pleasant to use from Python: the interface the redis module exposes is basically the same as Python's built-in containers. Moreover, each Redis command is atomic, which makes it very handy as a shared cache, arguably even handier than writing the multithreaded version directly, since there is no data synchronization to worry about. To get a feel for it in practice, I took a crawler I had written before, tweaked it a bit, and turned it into this distributed crawler.
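As a quick illustration of what that atomicity buys, here is a minimal sketch of the pattern the scripts below rely on (the key names match the ones used later): several processes can share the same queue through SADD/SPOP/INCR without any extra locking, because each command runs atomically on the Redis server.

import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=2)

# each command executes atomically on the server, so multiple workers
# can share these keys without any locking on the client side
r.sadd('urls', 'http://baike.baidu.com/view/21087.htm')  # enqueue a url (sets ignore duplicates)
url = r.spop('urls')      # take one url; no two workers can pop the same one
r.sadd('old_urls', url)   # remember that it has been crawled
count = r.incr('times')   # shared counter of crawled pages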
The crawler consists of master.py, slaver.py, writer.py and run.sh; to run it, just execute run.sh.
#run.sh
root_url="http://baike.baidu.com/view/21087.htm"
times=100
thread=5
fileplace="test.txt"
# seed the url queue and reset the shared counter
python master.py $root_url
sleep 2
# start $thread slavers in the background, plus one in the foreground to wait on
for((j=1;j<=$thread;++j));do
    python slaver.py $times $root_url &
done
python slaver.py $times $root_url
# dump the collected titles and summaries to $fileplace
python writer.py $fileplace $times
#master.py
import redis
import sys

r = redis.Redis(host='127.0.0.1', port=6379, db=2)
r.flushdb()                  # start from a clean db
r.sadd('urls', sys.argv[1])  # seed the queue with the root url
r.set('times', 0)            # shared counter of crawled pages
#slaver.py
import urllib2
import bs4
import re
import redis
import sys

r = redis.Redis(host='127.0.0.1', port=6379, db=2)
root_url = sys.argv[2]
times = int(sys.argv[1])


class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = BsParser()
        self.outputer = HtmlOutputer('test.txt')

    def craw(self, root_urls):
        self.urls.add_new_url(root_urls)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print self.urls.times, new_url
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if int(r.get('times')) >= times:
                    break
            except Exception as e:
                print e.message
        print 'task over'


class HtmlOutputer(object):
    def __init__(self, file_path):
        self.file_path = file_path
        self.titles = []
        self.summary = []

    def collect_data(self, data):
        # push results onto shared redis lists; writer.py dumps them at the end
        if data is None:
            return
        r.rpush('titles', data['title'])
        r.rpush('summary', data['summary'])

    def output_html(self):
        output = open(self.file_path, 'w')
        # dictionary
        for num, title in enumerate(self.titles):
            output.write("{0}\t{1}\n".format(title, num + 1))
        output.write('\n')
        # summary
        i = 1
        for title, summary in zip(self.titles, self.summary):
            output.write("{0}\t{1}{2}\n".format(i, title, summary))
            i += 1
        output.close()


class UrlManager(object):
    # 'urls' holds URLs waiting to be crawled, 'old_urls' the ones already seen;
    # both sets live in redis so every slaver shares them
    def __init__(self):
        self.old_url = set()
        self.new_url = set()

    def add_new_url(self, url):
        if not r.sismember('old_urls', url):
            r.sadd('urls', url)

    def get_new_url(self):
        url = None
        if r.scard('urls') > 0:
            url = r.spop('urls')
            r.sadd('old_urls', url)
            r.incr('times')
            self.times = r.get('times')
        return url

    def has_new_url(self):
        # keep going until the shared counter reaches the target page count
        if int(r.get('times')) <= times:
            return True
        return False

    def add_new_urls(self, urls):
        for url in urls:
            self.add_new_url(url)


class HtmlDownloader(object):
    def __init__(self):
        pass

    @staticmethod
    def download(url):
        html_file = urllib2.urlopen(url)
        if html_file.code == 200:
            s = html_file.read()
            html_file.close()
            return s
        return None


class BsParser(object):
    def __init__(self):
        self.common_head = 'http://baike.baidu.com'

    def parse(self, url, html_code):
        if url is None or html_code is None:
            return None, None
        soup = bs4.BeautifulSoup(html_code, 'html.parser', from_encoding='utf-8')
        return self._find_url(soup), self._find_data(soup)

    def _find_url(self, soup):
        # collect in-site links of the form /view/...
        tmp = soup.find_all(name='a', href=re.compile(r'/view'))
        new_urls = [self.common_head + tp.get('href') for tp in tmp]
        return new_urls

    def _find_data(self, soup):
        # grab the entry title and its summary paragraph
        data = dict()
        data['title'] = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').h1.string.encode('utf-8')
        data['summary'] = soup.find('div', class_='lemma-summary').get_text().encode('utf-8')
        return data


if __name__ == '__main__':
    instance = SpiderMain()
    instance.craw(root_url)
#writer.py
import redis
import sys
import time

fileplace = sys.argv[1]
times = int(sys.argv[2])
r = redis.Redis(host='127.0.0.1', port=6379, db=2)

# wait until the slavers have crawled enough pages
while int(r.get('times')) < times:
    time.sleep(1)

titles = r.lrange('titles', 0, -1)
summarys = r.lrange('summary', 0, -1)
output = open(fileplace, 'w')
# dictionary: one title per line
for num, title in enumerate(titles):
    output.write("{0}\t{1}\n".format(title, num + 1))
output.write('\n')
# summary: numbered title plus its summary text
i = 1
for title, summary in zip(titles, summarys):
    output.write("{0}\t{1}{2}\n".format(i, title, summary))
    i += 1
output.close()
The end result: starting from the Baidu Baike page for Python, the crawler fetches the $times pages configured in run.sh and finally writes the summaries to test.txt.
To run one crawl job across several machines, just execute slaver.py on the other machines and change the Redis address to the IP of the machine running run.sh, so that every worker connects to the same database.
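Concretely, the only line that needs to change in slaver.py on a worker machine is the connection; 192.168.1.100 below is just a placeholder for the IP of the run.sh machine (and the Redis server there has to be configured to accept remote connections, via the bind setting in redis.conf):

# slaver.py on a worker machine: point at the master's Redis instead of localhost
r = redis.Redis(host='192.168.1.100', port=6379, db=2)  # placeholder ip of the run.sh machine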
This crawler is only a baseline, and a few problems may come up:
1. The crawled data is written straight into Redis and only dumped to a file at the end. If it all fits in memory this is fine, and fast. But memory is limited, so if there are many pages to crawl and the results no longer fit, the result store should be switched to something like MySQL to make use of the disk (a sketch of this swap follows the list).
2. If there are so many pages that even the sets of crawled and uncrawled URLs no longer fit in memory, they can be moved into MongoDB, with a Redis cache layered on top if speed matters. That said, downloading web pages is an I/O-bound operation that is even slower than disk I/O, so the extra cache may not actually improve performance.
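As a sketch of the first point, collect_data could insert into a MySQL table instead of pushing onto the Redis lists. The table name, schema and connection parameters below are made up for the illustration, using the MySQLdb driver:

import MySQLdb

# hypothetical table:
#   CREATE TABLE pages (id INT AUTO_INCREMENT PRIMARY KEY, title VARCHAR(255), summary TEXT)
conn = MySQLdb.connect(host='127.0.0.1', user='crawler', passwd='secret', db='baike', charset='utf8')

def collect_data(data):
    # drop-in replacement for HtmlOutputer.collect_data: results go to disk, not memory
    if data is None:
        return
    cur = conn.cursor()
    cur.execute("INSERT INTO pages (title, summary) VALUES (%s, %s)",
                (data['title'], data['summary']))
    conn.commit()
    cur.close()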
In the end, though, the crawl speed is still fundamentally determined by the network bandwidth...