The project is made up of five main parts that carry out the crawling task: the URL manager, the HTML downloader, the HTML parser, the data storage module, and the crawler scheduler.
The code for each part is given below.
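Before the individual modules, here is a minimal sketch of how a scheduler could wire the five parts together. The module name HTMLDownloader and the classes HtmlParser and DataOutput are hypothetical placeholders for code not shown in this section; the real orchestration belongs to the crawler scheduler listed above.

# Rough sketch only: HTMLDownloader is an assumed module name, and HtmlParser /
# DataOutput stand in for the parser and storage modules not shown here.
from URLManager import UrlManager
from HTMLDownloader import HtmlDownloader

def crawl(start_url, max_pages=100):
    url_manager = UrlManager()
    downloader = HtmlDownloader()
    url_manager.add_new_url(start_url)
    while url_manager.new_urls_size() > 0 and url_manager.old_urls_size() < max_pages:
        url = url_manager.get_new_url()
        html = downloader.downloader(url)                 # assumed to return the page text
        # new_urls, data = HtmlParser().parser(url, html) # extract new links and data
        # url_manager.add_new_urls(new_urls)
        # DataOutput().store_data(data)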
URL manager:
import hashlib
import pickle
import time

class UrlManager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()
        self.error_urls = set()

    def get_new_url(self):
        """
        Get a new url from the container, convert it to an md5 digest
        (to reduce memory consumption) and add the digest to old_urls.
        :return:
        """
        new_url = self.new_urls.pop()
        m = hashlib.md5()
        m.update(new_url.encode('utf-8'))
        md5_url = m.hexdigest()
        self.old_urls.add(md5_url)
        return new_url

    def old_urls_size(self):
        return len(self.old_urls)

    def new_urls_size(self):
        return len(self.new_urls)

    def add_new_url(self, url):
        """
        Add a single url.
        :param url:
        :return:
        """
        if url is None:
            print('url is None!')
            return
        m = hashlib.md5()
        m.update(url.encode('utf-8'))
        md5_url = m.hexdigest()
        if md5_url not in self.old_urls and url not in self.new_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        """
        Add multiple urls; urls is an iterable.
        :param urls:
        :return:
        """
        if urls is None:
            print('urls is None!')
            return
        for url in urls:
            self.add_new_url(url)

    def add_error_urls(self, url):
        """
        Record a url whose response returned an error.
        :param url:
        :return:
        """
        return self.error_urls.add(url)

    def save_progress(self, path, data):
        """
        Save progress to a local file.
        :return:
        """
        with open(path, 'wb') as f:
            pickle.dump(data, f)

    def load_progress(self, path):
        """
        Load progress from a local file.
        :return: a set()
        """
        try:
            with open(path, 'rb') as f:
                tmp = pickle.load(f)
                print('Resuming from %s' % path)
                return tmp
        except FileNotFoundError as e:
            print(e, 'No progress file found, creating: %s' % path)
            return set()
This URL manager de-duplicates: a url that has already been crawled is never crawled again, and storing md5 digests instead of full urls keeps memory consumption down.
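To see the de-duplication in action, here is a minimal usage sketch (assuming the class above is saved as URLManager.py, as the downloader's import below suggests):

from URLManager import UrlManager

manager = UrlManager()
manager.add_new_url('http://example.com/page1')
manager.add_new_url('http://example.com/page1')  # already queued, ignored
print(manager.new_urls_size())                   # 1

url = manager.get_new_url()                      # md5 of the url moves into old_urls
manager.add_new_url(url)                         # already crawled, not queued again
print(manager.new_urls_size())                   # 0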
HTML downloader:
import requests
import random
from URLManager import UrlManager

class HtmlDownloader(object):
    def __init__(self):
        self.url_manager = UrlManager()
        USER_AGENT = random.choice([
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1"
        ])
        self.headers = {'User-Agent': USER_AGENT}

    def downloader(self, url):
        response = requests.get(url, headers=self.headers)
        response.encoding = 'utf-8'
        if response.status_code in [int('20