Crawler: crawling routes and page content

When crawling, we can use regular expressions to pull out exactly the content we want. Today I'm sharing the source code of a multi-threaded crawler that extracts page routes and content and saves them to MongoDB.

# encoding=utf-8
#  The code runs top to bottom, so functions and classes that get called are defined first
import os

import requests
from fake_useragent import UserAgent
from retrying import retry
import hashlib     # message digest (md5)
import queue     # queue
import re                # regular expressions
from urllib import robotparser      # parse the site's robots.txt file
from urllib.parse import urlparse, urljoin, urldefrag  # parse URLs
from threading import Thread   # multi-threading
from datetime import datetime
import time
import mongo_cache

MAX_URL = 2  # maximum crawl depth


def get_robots(url):
    """
    解析robots文件
    :param url:
    :return:
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urljoin(url, 'robots.text'))
    rp.read()
    return rp


def get_html(url_str):
    """
    Return the last path segment of the URL (used as the local file name)
    :param url_str:
    :return:
    """
    path = urlparse(url_str).path
    path_list = path.split('/')     # split on '/' into a list of segments
    return path_list[-1]


def save_url(html_content, url_str):
    """
    存储下载内容
    :param html_content:
    :param url_str:
    :return:
    """
    md5 = hashlib.md5()
    md5.update(html_content)
    ul = os.getcwd() + r'/downloads/'    # 查看路径
    if not os.path.exists(ul):
        os.mkdir(ul)  # 没有创建文件
    file_path = "./downloads/" + get_html(url_str) + ".html"
    with open(file_path, 'wb') as f:
        f.write(html_content)


def extracting_html_url(html_content):
    """
    抽取网页中的其他url
    :param html_content:
    :return:
    """
    url_regex = re.compile('<a[^>]\s*href=["\'](.*?)["\']', re.IGNORECASE)
    return url_regex.findall(html_content)


class Throttle(object):
    """
    限流器
    """
    def __init__(self, delay):
        self.domains = {}
        self.delay = delay

    def wait_url(self, url_str):
        # sleep on a per-domain basis, keyed by the netloc (domain) part of the URL
        domain_url = urlparse(url_str).netloc
        last_accessed = self.domains.get(domain_url)  # last access time for this domain

        if self.delay > 0 and last_accessed is not None:
            # remaining wait = required delay minus the time elapsed since the last access;
            # if positive we have not waited long enough, so sleep, otherwise download right away
            sleep_interval = self.delay - (datetime.now() - last_accessed).total_seconds()
            if sleep_interval > 0:
                time.sleep(sleep_interval)
        # record the current access time for this domain
        self.domains[domain_url] = datetime.now()


class CrawlerCommon(Thread):
    def __init__(self, init_url, home):
        """
        Generic crawler
        :param init_url:
        :param home:
        """
        super().__init__()
        __ua = UserAgent()  # random User-Agent generator
        self.seed_url = init_url     # seed URL the crawl starts from
        self.crawler_queue = queue.Queue()     # a FIFO queue gives BFS; a LIFO queue would give DFS
        self.crawler_queue.put(init_url)    # put the seed URL on the queue
        self.visited = {init_url: 0}     # crawl depth of the seed URL is 0
        self.rp = get_robots(init_url)  # initialize the robots.txt parser
        self.headers = {'User-Agent': __ua.random}  # pick a random User-Agent
        self.home = '/' + home  # only links whose href contains '/<home>' will be followed
        self.throttle = Throttle(5.0)   # throttle downloads to a 5-second interval per domain
        self.mgcache = mongo_cache.MongoCache()  # initialize the MongoDB cache

    @retry(stop_max_attempt_number=3)
    def retry_download(self, url_str, method, proxies):
        """
        使用装饰器的充实下载类
        :param url_str:
        :param data:
        :param method:
        :param proxies:
        :return:
        """
        if method == 'POST':
            result = requests.post(url_str, headers=self.headers, proxies=proxies)
        else:
            result = requests.get(url_str, headers=self.headers, proxies=proxies)
        assert result.status_code == 200   # 此处为断言,判断状态码是否为200
        return result.content

    def download(self, url_str, method='GET', proxies={"http": '118.187.58.34:53281'}):
        """
        真正的下载类
        :param url_str:
        :param method:
        :param proxies:
        :return:
        """
        print("下载的域名有:", url_str)
        try:
            result = self.retry_download(url_str, method, proxies)
        except Exception as e:
            print(e.args)
            result = None
        return result

    def nomalize(self, url_str):
        """
        补全下载链接
        :param url_str:
        :return:
        """
        real_url,_ = urldefrag(url_str)
        return urljoin(self.seed_url, real_url)

    def run(self):
        """
        进行网页爬取主要方法
        :return:
        """
        while not self.crawler_queue.empty():
            url_str = self.crawler_queue.get()
            # 检测robots.text文件规则
            if self.rp.can_fetch(self.headers['User-Agent'], url_str):
                self.throttle.wait_url(url_str)
                depth = self.visited[url_str]
                if depth < MAX_URL:
                    html_content = self.download(url_str)
                    if html_content is not None:
                        self.mgcache[url_str] = html_content
                        save_url(html_content, url_str)
                        url_list = extracting_html_url(html_content.decode('utf-8'))
                        filter_urls = [link for link in url_list if re.search(self.home, link)]
                        for url in filter_urls:
                            real_url = self.nomalize(url)
                            if real_url not in self.visited:
                                self.visited[real_url] = depth + 1
                                self.crawler_queue.put(real_url)
            else:
                print('Blocked by robots.txt:', url_str)


if __name__ == '__main__':
    # url = input('Enter the site to crawl: ')
    # url_start = input('Enter the start path: ')
    # e.g. 'http://www.runoob.com/django/django-tutorial.html', 'django'
    crawler = CrawlerCommon('http://www.runoob.com/django/django-tutorial.html', 'django')
    crawler.run()   # run() executes in the current thread; call start() to run it as a separate thread
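
Because CrawlerCommon subclasses Thread, several crawlers can also be launched concurrently with start() and joined afterwards. A minimal sketch, assuming you have a list of seeds you actually want to crawl (the second seed below is just a placeholder):

if __name__ == '__main__':
    seeds = [
        ('http://www.runoob.com/django/django-tutorial.html', 'django'),
        ('http://www.runoob.com/python3/python3-tutorial.html', 'python3'),  # placeholder seed
    ]
    crawlers = [CrawlerCommon(url, home) for url, home in seeds]
    for c in crawlers:
        c.start()    # start() runs each crawler's run() in its own thread
    for c in crawlers:
        c.join()     # wait for every crawler thread to finish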

Source code of the mongo_cache module

# coding=utf-8
import pickle
import zlib
from datetime import datetime, timedelta
from pymongo import MongoClient
from bson.binary import Binary


class MongoCache(object):
    """
    数据库缓存
    """
    def __init__(self, client=None, expires=timedelta(days=30)):
        self.client = MongoClient("localhost", 27017)
        self.db = self.client.cache
        # 加速查找设置索引,设置超时时间,如果达到expireAfterSeconds设置的超时时间,mongodb会把超时数据自动删除
        self.db.webpage.create_index('timestamp', expireAfterSeconds=expires.total_seconds())

    def __setitem__(self, key, value):
        # pickle and zlib-compress the value, and attach a UTC timestamp
        record = {"result": Binary(zlib.compress(pickle.dumps(value))), "timestamp": datetime.utcnow()}
        # upsert=True inserts the document if it does not exist and updates it otherwise;
        # the $set operator overwrites the stored fields
        self.db.webpage.update_one({"_id": key}, {'$set': record}, upsert=True)

    def __getitem__(self, item):
        # look the page up by _id, with item as the key (e.g. the URL http://baidu.com)
        record = self.db.webpage.find_one({"_id": item})
        if record:
            return pickle.loads(zlib.decompress(record["result"]))  # decompress and unpickle
        else:
            raise KeyError(item + " does not exist")   # raise if the key is not found

    def __contains__(self, item):
        try:
            # this calls __getitem__
            self[item]
        except KeyError:
            return False     # a KeyError means no data was found (see the raise in __getitem__)
        else:
            return True     # the lookup succeeded, so the cache contains this page

    def clear(self):
        self.db.webpage.drop()   # drop the whole webpage collection
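
A quick sketch of how the cache is used on its own, assuming a MongoDB instance is running locally on the default port; the URL and page bytes are just illustrative values:

if __name__ == '__main__':
    cache = MongoCache()
    url = 'http://www.runoob.com/django/django-tutorial.html'
    cache[url] = b'<html>demo page content</html>'   # __setitem__: pickle, compress and upsert
    if url in cache:                                  # __contains__: True, the key now exists
        print(cache[url])                             # __getitem__: decompress and unpickle
    # cache.clear()                                   # drops the whole webpage collection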

I am still polishing the overall structure, but the basics are implemented; more to come...
