**Multiprocess, multithread, and async crawlers.
Concrete crawling rules, anti-bot strategy, cookie/login handling, and so on are ignored here; the focus is on high performance and high concurrency.**
First pass
Crawl the jandan image pages and store the image links in MongoDB.
#!/usr/bin/python
#-*- coding: utf-8 -*-
import os
import json
import functools
import requests
import urllib.request
from urllib.request import FancyURLopener
import urllib.parse
import urllib.error
from lxml import etree
import time
from pymongo import MongoClient
from concurrent.futures import ProcessPoolExecutor, as_completed, ThreadPoolExecutor
import asyncio
import aiohttp
TsHeader = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
PATH = r'..\Spider_PoolExecutor'
CHUNK_SIZE = 1024
P_MAX_WORKERS_PAGE = 3
P_MAX_WORKERS_IMGSAVE = 10
T_MAX_WORKERS_PAGE = 3
T_MAX_WORKERS_IMGSAVE = 10
LOCK_TIME = 0.2
EVENT_TIME = 0.2
SEMA_NUM = 3
class DataBase():
def __init__(self):
client = MongoClient('127.0.0.1', 27017)
self._db = client['jandan']
class JDBase():
def __init__(self):
self._chunk_size = CHUNK_SIZE
try:
db = DataBase()
self._db = db._db
        except Exception:
            print('database connection failed')
            self._db = None
    def insert_header(self, header, img_url, call_url):
        # the Referer should be the page URL without its fragment part
        referer = call_url
        if call_url.find('#') > 0:
            referer = call_url[:call_url.find('#')]
        # derive the Host header from the image URL instead of a fixed slice
        host = urllib.parse.urlsplit(img_url).netloc
header.addheader("Host", host)
header.addheader("User-Agent",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36")
if img_url.find('.gif') > 0:
header.addheader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
else:
header.addheader("Accept", "*/*")
header.addheader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3")
header.addheader("Accept-Encoding", "gzip,deflate")
header.addheader("Referer", referer)
header.addheader("Connection", "keep-alive")
header.addheader("Upgrade-Insecure-Requests", "1")
return header
    def GetUrl(self, url, data=None):
f = ""
if data: f = "?" + urllib.parse.urlencode(data)
curl = url + f
try:
req = urllib.request.Request(curl, headers=TsHeader)
response = urllib.request.urlopen(req)
html = response.read()
except Exception as e:
print(e)
print(curl)
return False
return html
    def PostUrl(self, url, data):
data = urllib.parse.urlencode(data).encode('utf-8')
try:
request = urllib.request.Request(url, headers=TsHeader)
request.add_header("Content-Type", "application/x-www-form-urlencoded;charset=utf-8")
f = urllib.request.urlopen(request, data)
a = f.read().decode('utf-8')
except Exception as e:
print(e)
print(url)
return False
return a
    def page_list(self):
        # build (page number, page URL) tuples for pages 72 down through 63
        p = ('72', 'http://jandan.net/ooxx/page-')
pagelist = []
for i in range(10):
t = (str(int(p[0]) - i), p[1] + str(int(p[0]) - i))
pagelist.append(t)
return pagelist
    def url_dict_onepage(self, args):
        # args is a (page number, page URL) tuple; collect both the src and
        # org_src image attributes from the page
        num = args[0]
        page = args[1]
pagedict = {}
r = self.GetUrl(page)
html = etree.HTML(r)
pic_lis = html.xpath('//ol//li//img/@src')
git_lis = html.xpath('//ol//li//img/@org_src')
git_lis.extend(pic_lis)
pagedict['page'] = num
pagedict['urls'] = git_lis
return pagedict
def url_list_db(self):
urls = []
try:
db_urls = self._db.jd_url.find({}, {'page': 1, 'urls': 1})
        except Exception:
print('db find error')
return False
for url in db_urls:
urls.extend(url['urls'])
return urls
    def file_download(self, url):
        urlx = 'http:' + url  # the stored links are protocol-relative ('//...')
        imgname = url.split('/')[-1]
        if imgname.split('.')[-1] == 'gif':
            imgPath = os.path.join(PATH, 'gif', imgname)
        else:
            imgPath = os.path.join(PATH, 'pic', imgname)
        if not os.path.lexists(imgPath):
            # urllib.request.urlretrieve(urlx, imgPath)
            opener = FancyURLopener()
            opener.addheaders.clear()
            # use the listing page as the Referer (a filesystem path is not a valid one)
            opener = self.insert_header(opener, urlx, 'http://jandan.net/ooxx')
            # open the URL once, then stream the body in CHUNK_SIZE pieces;
            # open()'s second argument is POST data, not a read size
            response = opener.open(urlx)
            with open(imgPath, 'wb+') as f:
                while True:
                    chunk = response.read(self._chunk_size)
                    if not chunk:
                        break
                    f.write(chunk)
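For reference, each document that url_dict_onepage produces (and that the executor classes below store via insert_one) has the following shape; a hedged sketch with illustrative values, not real crawl data:

# shape of one document in the jd_url collection
example_doc = {
    'page': '72',                          # page number, kept as a string
    'urls': ['//wx1.example.com/a.jpg'],   # protocol-relative image links (illustrative)
}
# url_list_db() then flattens the 'urls' arrays of every document into one list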
ProcessPoolExecutor (multiprocessing)
class ProcessPE(JDBase):
def __init__(self):
super(ProcessPE, self).__init__()
self._max_workers_page = P_MAX_WORKERS_PAGE
self._max_workers_imgsave = P_MAX_WORKERS_IMGSAVE
    def page_ProcessPE(self, urls):
with ProcessPoolExecutor(max_workers=self._max_workers_page) as executor:
future_to_url = {executor.submit(self.url_dict_onepage, url): url for url in urls}
for future in as_completed(future_to_url):
try:
result = future.result()
self._db.jd_url.insert_one(result)
except Exception as e:
print('raise an exception: {}'.format(e))
else:
print('[db]{}'.format(result))
    def img_download_ProcessPE(self, urls):
with ProcessPoolExecutor(max_workers=self._max_workers_imgsave) as executor:
future_to_url = {executor.submit(self.file_download, url): url for url in urls}
for future in as_completed(future_to_url):
url = future_to_url[future]
try:
result = future.result()
except Exception as e:
print('raise an exception: {}'.format(e))
else:
print('[save]OK')
def Execute(self):
page_list = self.page_list()
self.page_ProcessPE(page_list)
        urls = self.url_list_db()
        if urls:
            self.img_download_ProcessPE(urls)
        else:
            print('no data in the database')
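A caveat with the process-pool variant: executor.submit(self.url_dict_onepage, ...) has to pickle self, and the pymongo objects held in self._db are generally not picklable, so this class can fail where the thread-pool version below works. A minimal sketch of the usual workaround, assuming a hypothetical module-level worker function so that only plain tuples cross the process boundary:

def url_dict_onepage_worker(args):
    # a module-level function pickles cleanly; only the (num, url) tuple
    # has to be sent to the worker process
    num, page = args
    r = JDBase().GetUrl(page)  # each worker builds its own connections
    if not r:
        return {'page': num, 'urls': []}
    html = etree.HTML(r)
    urls = html.xpath('//ol//li//img/@org_src') + html.xpath('//ol//li//img/@src')
    return {'page': num, 'urls': urls}

The parent then inserts the returned dicts into MongoDB itself, exactly as page_ProcessPE already does with future.result().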
ThreadPoolExecutor (multithreading)
class ThreadPE(JDBase):
def __init__(self):
super(ThreadPE, self).__init__()
self._max_workers_page = T_MAX_WORKERS_PAGE
self._max_workers_imgsave = T_MAX_WORKERS_IMGSAVE
    def page_ThreadPE(self, urls):
with ThreadPoolExecutor(max_workers=self._max_workers_page) as executor:
future_to_url = {executor.submit(self.url_dict_onepage, url): url for url in urls}
for future in as_completed(future_to_url):
try:
result = future.result()
self._db.jd_url.insert_one(result)
except Exception as e:
print('raise an exception: {}'.format(e))
else:
print('[db]{}'.format(result))
    def img_download_ThreadPE(self, urls):
with ThreadPoolExecutor(max_workers=self._max_workers_imgsave) as executor:
future_to_url = {executor.submit(self.file_download, url): url for url in urls}
for future in as_completed(future_to_url):
url = future_to_url[future]
try:
result = future.result()
except Exception as e:
print('raise an exception: {}'.format(e))
else:
print('[save]OK')
def Execute(self):
page_list = self.page_list()
self.page_ThreadPE(page_list)
urls = self.url_list_db()
# print(urls)
if urls:
self.img_download_ThreadPE(urls)
else:
            print('no data in the database')
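Where per-future bookkeeping is not needed, executor.map is a more compact way to express the same fan-out. A sketch of a hypothetical alternative method (results arrive in input order, and a raised exception aborts the loop when its result is consumed, unlike the as_completed version above):

    def img_download_ThreadPE_map(self, urls):
        # map() applies file_download to every url on the pool and yields
        # results in input order; exceptions re-raise at consumption time
        with ThreadPoolExecutor(max_workers=self._max_workers_imgsave) as executor:
            for url, _ in zip(urls, executor.map(self.file_download, urls)):
                print('[save]OK', url)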
asyncio (fetching the image links on each page)
class Async_only(JDBase):
def __init__(self):
super(Async_only, self).__init__()
self._pagedict = {}
    async def url_dict_onepage_Async(self, args):
        print('[A]', args)
        num = args[0]
        page = args[1]
        pagedict = {}
        # Async call.
        # This is only an illustration; the proper approach is an async network
        # library such as aiohttp, not a blocking library like urllib.
        # r = await self.GetUrl(page)
        # html = etree.HTML(r)
        # pic_lis = html.xpath('//ol//li//img/@src')
        # git_lis = html.xpath('//ol//li//img/@org_src')
        # git_lis.extend(pic_lis)
        # pagedict['page'] = num
        # pagedict['urls'] = git_lis
        # print(pagedict)
        # Simulate asynchronous I/O with an awaitable sleep:
        # r = await asyncio.sleep(2)
        # Simulate blocking I/O with time.sleep(); execution then proceeds just
        # like synchronous code (time.sleep() returns None, so it cannot be awaited)
        time.sleep(2)
        pagedict['page'] = num
        pagedict['urls'] = page
        print('[B]', pagedict)
    def page_ELoop(self, urls):
        # get the event loop
        loop = asyncio.get_event_loop()
        tasks = [self.url_dict_onepage_Async(args) for args in urls]
        # run the coroutines; gather() accepts bare coroutines, which
        # asyncio.wait() no longer does on Python 3.11+
        loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()
def Execute(self):
page_list = self.page_list()
self.page_ELoop(page_list)
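The effect is easy to verify with a timing check: with time.sleep the coroutines run one after another, while with asyncio.sleep they overlap and the whole batch takes roughly one sleep interval in total. A self-contained sketch:

import asyncio
import time

async def fake_fetch(num):
    await asyncio.sleep(2)   # non-blocking: yields control to the event loop
    # time.sleep(2)          # blocking: would serialize the coroutines
    return num

async def crawl_all():
    # gather() schedules all three coroutines concurrently
    return await asyncio.gather(*(fake_fetch(i) for i in range(3)))

start = time.time()
loop = asyncio.new_event_loop()
results = loop.run_until_complete(crawl_all())
loop.close()
print(results, 'took {:.1f}s'.format(time.time() - start))  # ~2s total, not ~6s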
asyncio + aiohttp
Replace blocking I/O (HTTP requests, database operations, and so on) with asynchronous equivalents.
class AsyncAiohttp(JDBase):
def __init__(self):
        super(AsyncAiohttp, self).__init__()
self._chunk_size = CHUNK_SIZE
def page_list(self):
p = ('74', 'http://jandan.net/ooxx/page-')
pagelist = []
for i in range(2):
t = (str(int(p[0]) - i), p[1] + str(int(p[0]) - i))
pagelist.append(t)
return pagelist
    # concurrency could be capped here with an asyncio.Semaphore (see the sketch after this class)
    async def url_dict_onepage_async(self, args):
        pagedict = {}
        num = args[0]
        page = args[1]
        async with aiohttp.request('GET', page) as r:
data = await r.read()
data = data.decode()
html = etree.HTML(data)
pic_lis = html.xpath('//ol//li//img/@src')
git_lis = html.xpath('//ol//li//img/@org_src')
git_lis.extend(pic_lis)
pagedict['page'] = num
pagedict['urls'] = git_lis
        self._db.jd_url.insert_one(pagedict)  # TODO: batch into one insert_many(); this pymongo call also blocks the event loop (see the sketch at the end)
async def file_download_async(self, url):
urlx = 'http:' + url
imgname = url.split('/')[-1]
if imgname.split('.')[-1] == 'gif':
imgPath = os.path.join(PATH, 'gif', imgname)
else:
imgPath = os.path.join(PATH, 'pic', imgname)
if not os.path.lexists(imgPath):
# urllib.request.urlretrieve(urlx, imgPath)
            # async with aiohttp.request('GET', urlx) as r:
            async with aiohttp.ClientSession() as session:
                async with session.get(urlx) as r:
                    # the file write itself is still ordinary blocking I/O;
                    # for small chunks that is usually acceptable
                    with open(imgPath, 'wb+') as f:
                        while True:
                            chunk = await r.content.read(self._chunk_size)
                            if not chunk:
                                break
                            f.write(chunk)
    def page_ELoop(self, urls):
        # get the event loop
        loop = asyncio.get_event_loop()
        tasks = [self.url_dict_onepage_async(url) for url in urls]
        # run the coroutines
        loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()
    def img_download_ELoop(self, urls):
        # page_ELoop() closed its loop, so create and install a fresh one
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        tasks = [self.file_download_async(url) for url in urls]
        # run the coroutines
        loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()
def Execute(self):
page_list = self.page_list()
self.page_ELoop(page_list)
urls = self.url_list_db()
if urls:
self.img_download_ELoop(urls)
else:
            print('no data in the database')
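The commented-out decorator above hints at capping concurrency, and SEMA_NUM is defined at the top of the file but never used. A sketch of how the cap could look with asyncio.Semaphore, as a hypothetical extra method on AsyncAiohttp (the semaphore is created inside the running coroutine so it binds to the active loop):

    async def crawl_pages_limited(self, urls):
        # at most SEMA_NUM page requests are in flight at any moment
        sema = asyncio.Semaphore(SEMA_NUM)
        async def limited(args):
            async with sema:
                await self.url_dict_onepage_async(args)
        await asyncio.gather(*(limited(args) for args in urls))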
if __name__ == '__main__':
start = time.time()
# AA = ProcessPE()
# AA = ThreadPE()
# AA = Async_only()
AA = AsyncAiohttp()
AA.Execute()
print('[endtime][{}]'.format(time.time() - start))
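One loose end from the TODO above: insert_one is a blocking pymongo call, so it stalls the event loop while it runs. Short of switching to an async driver such as motor, the insert can be pushed onto a thread with run_in_executor; a sketch with a hypothetical helper that url_dict_onepage_async could await instead of calling insert_one directly:

async def insert_page_async(db, pagedict):
    # run the blocking pymongo insert in the default thread pool so the
    # event loop stays free to service other coroutines
    loop = asyncio.get_event_loop()
    await loop.run_in_executor(None, db.jd_url.insert_one, pagedict)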