Concurrent downloading in a crawler
Storing the results in memory
from enum import Enum, unique
from queue import Queue
from random import random
from time import sleep
from threading import Thread, current_thread
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup


def decode_page(page_bytes, charsets=('utf-8',)):
    page_html = None
    for charset in charsets:
        try:
            page_html = page_bytes.decode(charset)
            break
        except UnicodeDecodeError:
            pass
    return page_html


class Retry(object):

    def __init__(self, *, retry_times=3, wait_secs=1, errors=(Exception, )):
        """
        :param retry_times: number of retries
        :param wait_secs: minimum wait between retries
        :param errors: exception types that trigger a retry
        """
        self.retry_times = retry_times
        self.wait_secs = wait_secs
        self.errors = errors

    def __call__(self, fn):
        def wrapper(*args, **kwargs):
            for _ in range(self.retry_times):
                try:
                    return fn(*args, **kwargs)
                except self.errors as e:
                    print(e)
                    sleep((random() + 1) * self.wait_secs)
            return None
        return wrapper


@unique
class SpiderStatus(Enum):
    IDLE = 0
    WORKING = 1


class Spider(object):

    def __init__(self):
        self.status = SpiderStatus.IDLE

    @Retry()
    def fetch(self, current_url, *, charsets=('utf-8', ),
              user_agent=None, proxies=None):
        """
        :param current_url: the URL to fetch
        :return: the decoded HTML page, or None on failure
        """
        thread_name = current_thread().name
        print(f'[{thread_name} Fetch]: {current_url}')
        headers = {'user-agent': user_agent} if user_agent else {}
        resp = requests.get(current_url,
                            headers=headers, proxies=proxies)
        return decode_page(resp.content, charsets) \
            if resp.status_code == 200 else None

    def parse(self, html_page, *, domain='m.sohu.com'):
        """
        :param html_page: an HTML page
        :return: a list of newly discovered URLs
        """
        soup = BeautifulSoup(html_page, 'lxml')
        url_links = []
        for a_tag in soup.body.select('a[href]'):
            parser = urlparse(a_tag.attrs['href'])
            netloc = parser.netloc or domain
            scheme = parser.scheme or 'http'
            if scheme != 'javascript' and netloc == domain:
                path = parser.path
                query = '?' + parser.query if parser.query else ''
                full_url = f'{scheme}://{netloc}{path}{query}'
                if full_url not in visited_urls:
                    url_links.append(full_url)
        return url_links

    def extract(self, html_page):
        """
        :param html_page: an HTML page
        :return: the data extracted from it
        """
        pass

    def store(self, data_dict):
        """
        :param data_dict: the data to persist
        :return:
        """
        pass


class SpiderThread(Thread):

    def __init__(self, name, spider, tasks_queue):
        """
        :param spider: the spider that does the actual work
        :param tasks_queue: the queue of URLs waiting to be crawled
        """
        super().__init__(name=name, daemon=True)
        self.spider = spider
        self.tasks_queue = tasks_queue

    def run(self):
        while True:
            current_url = self.tasks_queue.get()
            visited_urls.add(current_url)
            self.spider.status = SpiderStatus.WORKING
            html_page = self.spider.fetch(current_url)
            if html_page not in [None, '']:
                url_links = self.spider.parse(html_page)
                for url_link in url_links:
                    self.tasks_queue.put(url_link)
            self.spider.status = SpiderStatus.IDLE


def is_any_alive(spider_threads):
    """
    :param spider_threads: all spider threads
    :return: True if at least one thread is still working, False once all are done
    """
    return any([spider_thread.spider.status == SpiderStatus.WORKING
                for spider_thread in spider_threads])


visited_urls = set()


def main():
    task_queue = Queue()
    task_queue.put('http://m.sohu.com/')
    spider_threads = [SpiderThread('t%d' % i, Spider(), task_queue)
                      for i in range(10)]
    for spider_thread in spider_threads:
        spider_thread.start()
    while not task_queue.empty() or is_any_alive(spider_threads):
        sleep(1)  # poll instead of spinning at 100% CPU
    print('Over!')


if __name__ == '__main__':
    main()
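
The Retry decorator is easy to exercise on its own. Here is a minimal sketch; the flaky function is hypothetical, and the Retry class from the listing above is assumed to be in scope:

from random import random

@Retry(retry_times=3, wait_secs=1)
def flaky():
    # Hypothetical function that fails about two times out of three.
    if random() < 2 / 3:
        raise ConnectionError('simulated network hiccup')
    return 'ok'

# Prints 'ok' if any of the three attempts succeeds, otherwise None.
print(flaky())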
Storing the results in Redis and MongoDB
from enum import Enum, unique
from hashlib import sha1
from random import random
from time import sleep
from threading import Thread, current_thread
from urllib.parse import urlparse

import pymongo
import requests
import redis
from bs4 import BeautifulSoup


def decode_page(page_bytes, charsets=('utf-8',)):
    page_html = None
    for charset in charsets:
        try:
            page_html = page_bytes.decode(charset)
            break
        except UnicodeDecodeError:
            pass
    return page_html


class Retry(object):

    def __init__(self, *, retry_times=3, wait_secs=1, errors=(Exception, )):
        """
        :param retry_times: number of retries
        :param wait_secs: minimum wait between retries
        :param errors: exception types that trigger a retry
        """
        self.retry_times = retry_times
        self.wait_secs = wait_secs
        self.errors = errors

    def __call__(self, fn):
        def wrapper(*args, **kwargs):
            for _ in range(self.retry_times):
                try:
                    return fn(*args, **kwargs)
                except self.errors as e:
                    print(e)
                    sleep((random() + 1) * self.wait_secs)
            return None
        return wrapper


@unique
class SpiderStatus(Enum):
    IDLE = 0
    WORKING = 1


class Spider(object):

    def __init__(self):
        self.status = SpiderStatus.IDLE

    @Retry()
    def fetch(self, current_url, *, charsets=('utf-8', ),
              user_agent=None, proxies=None):
        """
        :param current_url: the URL to fetch
        :return: the decoded HTML page, or None on failure
        """
        thread_name = current_thread().name
        print(f'[{thread_name} Fetch]: {current_url}')
        headers = {'user-agent': user_agent} if user_agent else {}
        resp = requests.get(current_url,
                            headers=headers, proxies=proxies)
        return decode_page(resp.content, charsets) \
            if resp.status_code == 200 else None

    def parse(self, html_page, *, domain='m.sohu.com'):
        """
        :param html_page: an HTML page
        :return: None (newly found URLs go straight onto the Redis task list)
        """
        soup = BeautifulSoup(html_page, 'lxml')
        for a_tag in soup.body.select('a[href]'):
            parser = urlparse(a_tag.attrs['href'])
            netloc = parser.netloc or domain
            scheme = parser.scheme or 'http'
            if scheme != 'javascript' and netloc == domain:
                path = parser.path
                query = '?' + parser.query if parser.query else ''
                full_url = f'{scheme}://{netloc}{path}{query}'
                if not redis_client.sismember('visited_urls', full_url):
                    redis_client.rpush('m_sohu_task', full_url)

    def extract(self, html_page):
        """
        :param html_page: an HTML page
        :return: the data extracted from it
        """
        pass

    def store(self, data_dict):
        """
        :param data_dict: the data to persist
        :return:
        """
        pass


class SpiderThread(Thread):

    def __init__(self, name, spider):
        """
        :param spider: the spider that does the actual work
        """
        super().__init__(name=name, daemon=True)
        self.spider = spider

    def run(self):
        while True:
            current_url = redis_client.lpop('m_sohu_task')
            while not current_url:
                current_url = redis_client.lpop('m_sohu_task')
            self.spider.status = SpiderStatus.WORKING
            current_url = current_url.decode('utf-8')
            if not redis_client.sismember('visited_urls', current_url):
                redis_client.sadd('visited_urls', current_url)
                html_page = self.spider.fetch(current_url)
                if html_page not in [None, '']:
                    hasher = hasher_proto.copy()
                    hasher.update(current_url.encode('utf-8'))
                    doc_id = hasher.hexdigest()
                    if not sohu_data_coll.find_one({'_id': doc_id}):
                        sohu_data_coll.insert_one({
                            '_id': doc_id,
                            'url': current_url,
                            'page': html_page
                        })
                    self.spider.parse(html_page)
            self.spider.status = SpiderStatus.IDLE


def is_any_alive(spider_threads):
    """
    :param spider_threads: all spider threads
    :return: True if at least one thread is still working, False once all are done
    """
    return any([spider_thread.spider.status == SpiderStatus.WORKING
                for spider_thread in spider_threads])


# Fill in your own connection details (6379 and 27017 are the default ports).
redis_client = redis.Redis(host='<redis-host>', port=6379,
                           password='<password>')
mongo_client = pymongo.MongoClient(host='<mongo-host>', port=27017)
db = mongo_client.msohu
sohu_data_coll = db.webpages
hasher_proto = sha1()


def main():
    if not redis_client.exists('m_sohu_task'):
        redis_client.rpush('m_sohu_task', 'http://m.sohu.com/')
    spider_threads = [SpiderThread('t%d' % i, Spider()) for i in range(10)]
    for spider_thread in spider_threads:
        spider_thread.start()
    while redis_client.exists('m_sohu_task') or is_any_alive(spider_threads):
        sleep(1)  # poll instead of spinning at 100% CPU
    print('Over!')


if __name__ == '__main__':
    main()
- The code above has a problem: all the threads share one Redis client and one MongoDB client, so they contend for the same connection objects. To fix this, threading.local is imported so that each thread can hold its own clients.
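
A quick sketch of why threading.local helps, independent of the crawler: every thread that assigns to an attribute of a local() object sees only its own value, so per-thread clients are never shared.

from threading import Thread, local

ctx = local()

def worker(tag):
    ctx.client = tag      # each thread stores its own "client" here
    print(ctx.client)     # always prints this thread's own value, never another's

threads = [Thread(target=worker, args=(f'client-{i}',)) for i in range(3)]
for t in threads:
    t.start()
for t in threads:
    t.join()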
import pickle
import zlib
from enum import Enum, unique
from hashlib import sha1
from random import random
from threading import Thread, current_thread, local
from time import sleep
from urllib.parse import urlparse

import pymongo
import redis
import requests
from bs4 import BeautifulSoup
from bson import Binary


@unique
class SpiderStatus(Enum):
    IDLE = 0
    WORKING = 1


def decode_page(page_bytes, charsets=('utf-8',)):
    page_html = None
    for charset in charsets:
        try:
            page_html = page_bytes.decode(charset)
            break
        except UnicodeDecodeError:
            pass
    return page_html


class Retry(object):

    def __init__(self, *, retry_times=3,
                 wait_secs=5, errors=(Exception, )):
        self.retry_times = retry_times
        self.wait_secs = wait_secs
        self.errors = errors

    def __call__(self, fn):
        def wrapper(*args, **kwargs):
            for _ in range(self.retry_times):
                try:
                    return fn(*args, **kwargs)
                except self.errors as e:
                    print(e)
                    sleep((random() + 1) * self.wait_secs)
            return None
        return wrapper


class Spider(object):

    def __init__(self):
        self.status = SpiderStatus.IDLE

    @Retry()
    def fetch(self, current_url, *, charsets=('utf-8', ),
              user_agent=None, proxies=None):
        thread_name = current_thread().name
        print(f'[{thread_name}]: {current_url}')
        headers = {'user-agent': user_agent} if user_agent else {}
        resp = requests.get(current_url,
                            headers=headers, proxies=proxies)
        return decode_page(resp.content, charsets) \
            if resp.status_code == 200 else None

    def parse(self, html_page, *, domain='m.sohu.com'):
        soup = BeautifulSoup(html_page, 'lxml')
        for a_tag in soup.body.select('a[href]'):
            parser = urlparse(a_tag.attrs['href'])
            scheme = parser.scheme or 'http'
            netloc = parser.netloc or domain
            if scheme != 'javascript' and netloc == domain:
                path = parser.path
                query = '?' + parser.query if parser.query else ''
                full_url = f'{scheme}://{netloc}{path}{query}'
                # Use this thread's own Redis client, never a shared one.
                redis_client = thread_local.redis_client
                if not redis_client.sismember('visited_urls', full_url):
                    redis_client.rpush('m_sohu_task', full_url)

    def extract(self, html_page):
        pass

    def store(self, data_dict):
        pass


class SpiderThread(Thread):

    def __init__(self, name, spider):
        super().__init__(name=name, daemon=True)
        self.spider = spider

    def run(self):
        # Each thread creates its own clients and parks them on thread_local,
        # so no connection object is ever shared between threads. Redis must
        # be the same server as in main() below; fill in your own MongoDB host.
        redis_client = redis.Redis(host='120.77.222.217', port=6379,
                                   password='1qaz2wsx')
        mongo_client = pymongo.MongoClient(host='<mongo-host>', port=27017)
        thread_local.redis_client = redis_client
        thread_local.mongo_db = mongo_client.msohu
        while True:
            current_url = redis_client.lpop('m_sohu_task')
            while not current_url:
                current_url = redis_client.lpop('m_sohu_task')
            self.spider.status = SpiderStatus.WORKING
            current_url = current_url.decode('utf-8')
            if not redis_client.sismember('visited_urls', current_url):
                redis_client.sadd('visited_urls', current_url)
                html_page = self.spider.fetch(current_url)
                if html_page not in [None, '']:
                    hasher = hasher_proto.copy()
                    hasher.update(current_url.encode('utf-8'))
                    doc_id = hasher.hexdigest()
                    sohu_data_coll = mongo_client.msohu.webpages
                    if not sohu_data_coll.find_one({'_id': doc_id}):
                        sohu_data_coll.insert_one({
                            '_id': doc_id,
                            'url': current_url,
                            'page': Binary(zlib.compress(pickle.dumps(html_page)))
                        })
                    self.spider.parse(html_page)
            self.spider.status = SpiderStatus.IDLE


def is_any_alive(spider_threads):
    return any([spider_thread.spider.status == SpiderStatus.WORKING
                for spider_thread in spider_threads])


thread_local = local()
hasher_proto = sha1()


def main():
    redis_client = redis.Redis(host='120.77.222.217', port=6379,
                               password='1qaz2wsx')
    if not redis_client.exists('m_sohu_task'):
        redis_client.rpush('m_sohu_task', 'http://m.sohu.com/')
    spider_threads = [SpiderThread('thread-%d' % i, Spider())
                      for i in range(10)]
    for spider_thread in spider_threads:
        spider_thread.start()
    while redis_client.exists('m_sohu_task') or is_any_alive(spider_threads):
        sleep(1)  # poll instead of spinning at 100% CPU
    print('Over!')


if __name__ == '__main__':
    main()
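
Because each page is stored as Binary(zlib.compress(pickle.dumps(html_page))), reading one back just reverses that pipeline. A minimal sketch, with placeholder connection details:

import pickle
import zlib

import pymongo

client = pymongo.MongoClient(host='<mongo-host>', port=27017)
doc = client.msohu.webpages.find_one()  # grab any stored document
if doc:
    html_page = pickle.loads(zlib.decompress(doc['page']))
    print(doc['url'], len(html_page))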
Scraping data from geyanw.com (a Chinese quotations site)
import pickle
import re
import zlib
from enum import Enum, unique
from threading import Thread, current_thread
from time import sleep
from urllib.parse import urlparse

import pymongo
import requests
import redis
from bs4 import BeautifulSoup
from bson import Binary


@unique
class SpiderStatus(Enum):
    IDLE = 0
    WORKING = 1


def decode_page(page_bytes, charsets):
    page_html = None
    for charset in charsets:
        try:
            page_html = page_bytes.decode(charset)
            break
        except UnicodeDecodeError:
            pass
    return page_html


def data_classify(soup, url, title):
    # Build a data dict for a page, including the content section when present.
    contents = soup.body.select_one('div[class="content"]')
    if contents:
        title = soup.body.select_one('div[class="title"] h2').text
        data = {
            'url': url,
            'title': title,
            'content': contents.text
        }
    else:
        data = {
            'url': url,
            'title': title
        }
    return data


class Retry(object):

    def __init__(self, retry_times=3, wait_time=3, errors=(Exception, )):
        self.retry_times = retry_times
        self.wait_time = wait_time
        self.errors = errors

    def __call__(self, fn):
        def wrapper(*args, **kwargs):
            for _ in range(self.retry_times):
                try:
                    return fn(*args, **kwargs)
                except self.errors as e:
                    print(e)
                    sleep(self.wait_time)
            return None
        return wrapper


class Spider(object):

    def __init__(self):
        self.status = SpiderStatus.IDLE

    @Retry()
    def fetch(self, url, *, charsets=('gb2312', 'utf-8'),
              user_agent=None, proxies=None):
        print(f'[{current_thread().name}]: {url}')
        headers = {'user-agent': user_agent} if user_agent else {}
        resp = requests.get(url, headers=headers, proxies=proxies)
        return decode_page(resp.content, charsets) \
            if resp.status_code == 200 else None

    def parse(self, html, *, domain='geyanw.com'):
        """
        Topic slugs used by the site:
        lizhimingyan    - inspirational quotes
        renshenggeyan   - mottos on life
        mingyanjingju   - famous aphorisms
        mingrenmingyan  - quotes from famous people
        dushumingyan    - quotes on reading
        jingdianmingyan - classic quotes
        aiqingmingyan   - quotes on love
        """
        soup = BeautifulSoup(html, 'lxml')
        for a_tag in soup.body.select('a[href]'):
            parser = urlparse(a_tag.attrs['href'])
            netloc = parser.netloc or domain
            scheme = parser.scheme or 'http'
            if scheme != 'javascript' and netloc == domain:
                path = parser.path
                query = '?' + parser.query if parser.query else ''
                full_url = f'{scheme}://{netloc}{path}{query}'
                if not redis_client.sismember('visited_urls', full_url) and \
                        not redis_client.sismember('url_set', full_url):
                    redis_client.sadd('url_set', full_url)

    def get_data(self, html, current_url):
        soup = BeautifulSoup(html, 'lxml')
        content = soup.body.select_one('div[class="content"]')
        if content:
            href_re = re.compile('index.html')
            topic = soup.body.find('a', {'href': href_re}).text
            topic_url = soup.body.find('a', {'href': href_re}).attrs['href']
            title = soup.body.select_one('div[class="title"] h2').text
            data = {
                'topic': topic,
                'topic_url': topic_url,
                'title': title,
                'url': current_url,
                'content': Binary(zlib.compress(pickle.dumps(content.text)))
            }
            # The collection name is the topic part of the URL,
            # e.g. '/lizhimingyan/index.html' -> 'lizhimingyan'.
            coll = topic_url.split('/')[1]
            if coll == 'html':
                coll = topic_url.split('/')[2]
            return data, coll
        return None

    def store(self, data):
        if data:
            collection = db[data[1]]
            collection.insert_one(data[0])
            return True
        return False


class SpiderThread(Thread):

    def __init__(self, name, spider):
        super().__init__(name=name, daemon=True)
        self.spider = spider

    def run(self):
        while True:
            current_url = redis_client.spop('url_set')
            while not current_url:
                current_url = redis_client.spop('url_set')
            self.spider.status = SpiderStatus.WORKING
            current_url = current_url.decode('utf-8')
            if not redis_client.sismember('visited_urls', current_url):
                redis_client.sadd('visited_urls', current_url)
                html = self.spider.fetch(current_url)
                if html:
                    self.spider.parse(html)
                    data = self.spider.get_data(html, current_url)
                    if self.spider.store(data):
                        print('Saved')
                    else:
                        print('Save failed')
            self.spider.status = SpiderStatus.IDLE


redis_client = redis.Redis(host='47.98.172.171', port=11223,
                           password='544619417wxz')
mongo_client = pymongo.MongoClient(host='47.98.172.171', port=27017)
db = mongo_client.geyanw
SEEDURL = 'https://geyanw.com/'


def any_is_working(spider_threads):
    return any([spider_thread.spider.status == SpiderStatus.WORKING
                for spider_thread in spider_threads])


def main():
    """Seed URL: https://geyanw.com/"""
    if not redis_client.exists('url_set'):
        redis_client.sadd('url_set', SEEDURL)
    spider_threads = [SpiderThread(f'thread[{i}]', Spider())
                      for i in range(10)]
    for spider_thread in spider_threads:
        spider_thread.start()
    sleep(20)  # give the worker threads a head start
    while redis_client.exists('url_set') or any_is_working(spider_threads):
        sleep(1)  # keep polling while tasks remain or any thread is busy
    print('OVER!')


if __name__ == '__main__':
    main()
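
The decode_page helper is what lets this spider cope with a site that mixes gb2312 and utf-8 pages: it simply tries each charset in order until one decodes cleanly. A self-contained check of that behavior:

def decode_page(page_bytes, charsets):
    page_html = None
    for charset in charsets:
        try:
            page_html = page_bytes.decode(charset)
            break
        except UnicodeDecodeError:
            pass
    return page_html

gb_bytes = '名言警句'.encode('gb2312')
# utf-8 raises UnicodeDecodeError on these bytes, so the gb2312 fallback is used.
print(decode_page(gb_bytes, ('utf-8', 'gb2312')))  # -> 名言警句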