When crawling, regular expressions let us pull out exactly the content we want from a page. In this post I'm sharing the source code of a multi-threaded crawler that extracts page URLs and content and saves them to MongoDB.
# encoding=utf-8
# The code runs top to bottom, so the functions and classes being called are defined first
import os
import requests
from fake_useragent import UserAgent
from retrying import retry
import hashlib  # message digest (MD5)
import queue  # queue
import re  # regular expressions
from urllib import robotparser  # parse the site's robots.txt file
from urllib.parse import urlparse, urljoin, urldefrag  # parse URLs
from threading import Thread  # multi-threading
from datetime import datetime
import time
import mongo_cache
MAX_URL = 2  # maximum crawl depth
def get_robots(url):
    """
    Parse the site's robots.txt file
    :param url:
    :return:
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urljoin(url, 'robots.txt'))
    rp.read()
    return rp
def get_html(url_str):
    """
    Take the last segment of the URL path, used as the local file name
    :param url_str:
    :return:
    """
    path = urlparse(url_str).path
    path_list = path.split('/')  # split on '/' into a list
    return path_list[-1]
def save_url(html_content, url_str):
    """
    Save the downloaded content to disk
    :param html_content:
    :param url_str:
    :return:
    """
    md5 = hashlib.md5()
    md5.update(html_content)
    download_dir = os.getcwd() + r'/downloads/'  # directory for downloaded pages
    if not os.path.exists(download_dir):
        os.mkdir(download_dir)  # create the directory if it does not exist
    file_path = "./downloads/" + get_html(url_str) + ".html"
    with open(file_path, 'wb') as f:
        f.write(html_content)
def extracting_html_url(html_content):
    """
    Extract the other URLs contained in a page
    :param html_content:
    :return:
    """
    url_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return url_regex.findall(html_content)
class Throttle(object):
    """
    Throttle downloads so the same domain is not hit too frequently
    """
    def __init__(self, delay):
        self.domains = {}
        self.delay = delay
    def wait_url(self, url_str):
        # throttle per domain, using the netloc part of the URL as the key
        domain_url = urlparse(url_str).netloc
        last_accessed = self.domains.get(domain_url)  # look up the last access time for this domain
        if self.delay > 0 and last_accessed is not None:
            # subtract the time elapsed since the last access from the required delay;
            # if the result is positive the interval is too short, so sleep, otherwise download right away
            sleep_interval = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_interval > 0:
                time.sleep(sleep_interval)
        # record the current time for this domain
        self.domains[domain_url] = datetime.now()
class CrawlerCommon(Thread):
    def __init__(self, init_url, home):
        """
        Generic crawler
        :param init_url:
        """
        super().__init__()
        __ua = UserAgent()  # random User-Agent generator
        self.seed_url = init_url  # seed URL to start crawling from
        self.crawler_queue = queue.Queue()  # the choice of queue decides between BFS and DFS behaviour
        self.crawler_queue.put(init_url)  # put the seed URL into the queue
        self.visited = {init_url: 0}  # the seed URL starts at crawl depth 0
        self.rp = get_robots(init_url)  # initialise the robots.txt parser
        self.headers = {'User-Agent': __ua.random}  # pick a random User-Agent
        self.home = '/' + home
        self.throttle = Throttle(5.0)  # throttle downloads with a 5-second interval
        self.mgcache = mongo_cache.MongoCache()  # initialise the mongo_cache
    @retry(stop_max_attempt_number=3)
    def retry_download(self, url_str, method, proxies):
        """
        Download with retries, via the retrying decorator
        :param url_str:
        :param method:
        :param proxies:
        :return:
        """
        if method == 'POST':
            result = requests.post(url_str, headers=self.headers, proxies=proxies)
        else:
            result = requests.get(url_str, headers=self.headers, proxies=proxies)
        assert result.status_code == 200  # assert that the status code is 200, otherwise retry
        return result.content
    def download(self, url_str, method='GET', proxies={"http": '118.187.58.34:53281'}):
        """
        The actual download method
        :param url_str:
        :param method:
        :param proxies:
        :return:
        """
        print("Downloading:", url_str)
        try:
            result = self.retry_download(url_str, method, proxies)
        except Exception as e:
            print(e.args)
            result = None
        return result
    def nomalize(self, url_str):
        """
        Turn a relative link into an absolute download URL
        :param url_str:
        :return:
        """
        real_url, _ = urldefrag(url_str)  # strip the #fragment part
        return urljoin(self.seed_url, real_url)
    def run(self):
        """
        Main crawl loop
        :return:
        """
        while not self.crawler_queue.empty():
            url_str = self.crawler_queue.get()
            # check the robots.txt rules
            if self.rp.can_fetch(self.headers['User-Agent'], url_str):
                self.throttle.wait_url(url_str)
                depth = self.visited[url_str]
                if depth < MAX_URL:
                    html_content = self.download(url_str)
                    if html_content is not None:
                        self.mgcache[url_str] = html_content  # cache the page in MongoDB
                        save_url(html_content, url_str)  # also save it to disk
                        url_list = extracting_html_url(html_content.decode('utf-8'))
                        # only follow links that stay under the chosen section of the site
                        filter_urls = [link for link in url_list if re.search(self.home, link)]
                        for url in filter_urls:
                            real_url = self.nomalize(url)
                            if real_url not in self.visited:
                                self.visited[real_url] = depth + 1
                                self.crawler_queue.put(real_url)
            else:
                print('Blocked by robots.txt, skipping', url_str)
if __name__ == '__main__':
    # url = input('Enter the site to crawl: ')
    # url_start = input('Enter the start path: ')
    # e.g. 'http://www.runoob.com/django/django-tutorial.html', 'django'
    crawler = CrawlerCommon('http://www.runoob.com/django/django-tutorial.html', 'django')
    crawler.run()
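The main block above calls crawler.run() directly, so the crawl actually runs in the main thread. Since CrawlerCommon subclasses Thread, several crawls can run concurrently with start()/join(). Below is a minimal sketch of that; the second seed URL is only a hypothetical placeholder, not part of the original code.
# Minimal sketch: run several crawlers concurrently (the second seed is hypothetical)
seeds = [
    ('http://www.runoob.com/django/django-tutorial.html', 'django'),
    ('http://www.runoob.com/python3/python3-tutorial.html', 'python3'),  # hypothetical second seed
]
crawlers = [CrawlerCommon(url, home) for url, home in seeds]
for c in crawlers:
    c.start()  # start() runs CrawlerCommon.run() in its own thread
for c in crawlers:
    c.join()   # wait for every crawler thread to finish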
Source code of the mongo_cache module:
# coding=utf-8
import pickle
import zlib
from datetime import datetime, timedelta
from pymongo import MongoClient
from bson.binary import Binary
class MongoCache(object):
    """
    MongoDB-backed cache for downloaded pages
    """
    def __init__(self, client=None, expires=timedelta(days=30)):
        self.client = MongoClient("localhost", 27017) if client is None else client
        self.db = self.client.cache
        # index timestamp to speed up lookups and set a TTL: once a document is older than
        # expireAfterSeconds, MongoDB deletes it automatically
        self.db.webpage.create_index('timestamp', expireAfterSeconds=expires.total_seconds())
    def __setitem__(self, key, value):
        # pickle and compress the page, wrap it as BSON Binary and attach a timestamp
        record = {"result": Binary(zlib.compress(pickle.dumps(value))), "timestamp": datetime.utcnow()}
        # upsert: insert the record if the _id does not exist, otherwise update it; $set overwrites the stored fields
        self.db.webpage.update_one({"_id": key}, {'$set': record}, upsert=True)
    def __getitem__(self, item):
        # look up the page by _id, using item (e.g. the URL http://baidu.com) as the key
        record = self.db.webpage.find_one({"_id": item})
        if record:
            return pickle.loads(zlib.decompress(record["result"]))  # decompress and unpickle
        else:
            raise KeyError(item + " does not exist")  # raise if the key is not found
    def __contains__(self, item):
        try:
            # this calls __getitem__
            self[item]
        except KeyError:
            return False  # a KeyError means the page is not cached (see the raise in __getitem__)
        else:
            return True  # the lookup succeeded, so the page has already been downloaded and cached
    def clear(self):
        self.db.webpage.drop()
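To wrap up, here is a short usage sketch of MongoCache that is not part of the original post; it assumes a MongoDB instance is reachable on localhost:27017, and the URL is only a placeholder.
# Hedged usage sketch for MongoCache (assumes MongoDB on localhost:27017; the URL is a placeholder)
cache = MongoCache()
cache['http://www.runoob.com/'] = b'<html>...</html>'  # __setitem__: pickle, compress and upsert
if 'http://www.runoob.com/' in cache:                   # __contains__ delegates to __getitem__
    print(cache['http://www.runoob.com/'][:20])         # __getitem__ returns the original bytes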
I am still polishing the overall structure, but the basics are in place; more to come...