Multi-process, Multi-threaded, and Asynchronous Crawlers (1)

**Multi-process, multi-threaded, and asynchronous crawlers.
Crawler-specific details such as parsing rules, crawl policy, cookies, and login are ignored here; the focus is on high performance and high concurrency.**

Getting started

Crawl the Jandan (jandan.net) image board and store the image links in MongoDB.
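Each crawled page is stored as one MongoDB document holding the page number and its list of image links. A minimal sketch of reading them back, using the same `jandan` database and `jd_url` collection as the code below:

from pymongo import MongoClient

client = MongoClient('127.0.0.1', 27017)
coll = client['jandan']['jd_url']
# Documents have the shape {'page': '72', 'urls': ['//host/path/img.jpg', ...]}
# (the urls are protocol-relative src attributes scraped from the page).
for doc in coll.find({}, {'page': 1, 'urls': 1}):
    print(doc['page'], len(doc['urls']))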

#!/usr/bin/python
#-*- coding: utf-8 -*-
import os
import json
import functools
import requests
import urllib.request
from urllib.request import FancyURLopener
import urllib.parse
import urllib.error
from lxml import etree
import time
from pymongo import MongoClient

from concurrent.futures import ProcessPoolExecutor, as_completed,ThreadPoolExecutor
import asyncio
import aiohttp

TsHeader = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}

PATH = r'..\Spider_PoolExecutor'
CHUNK_SIZE = 1024

P_MAX_WORKERS_PAGE = 3
P_MAX_WORKERS_IMGSAVE = 10

T_MAX_WORKERS_PAGE = 3
T_MAX_WORKERS_IMGSAVE = 10

LOCK_TIME = 0.2
EVENT_TIME = 0.2

SEMA_NUM = 3

class DataBase():
    def __init__(self):
        client = MongoClient('127.0.0.1', 27017)
        self._db = client['jandan']

class JDBase():
    def __init__(self):
        self._chunk_size = CHUNK_SIZE
        try:
            db = DataBase()
            self._db = db._db
        except Exception:
            print('Failed to connect to the database')
            self._db = None

    def insert_header(self, header, img_url, call_url):
        # Build the request headers for an image download; call_url is used as the Referer.
        referer = call_url
        if call_url.find('#') > 0:
            # Strip the trailing '#comments' style fragment from the referer URL.
            referer = call_url[:len(call_url) - 9]
        # Crude host extraction: skip the leading 'http://' and take a fixed-length host.
        host = img_url[7:21]
        header.addheader("Host", host)
        header.addheader("User-Agent",
                         "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36")
        if img_url.find('.gif') > 0:
            header.addheader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
        else:
            header.addheader("Accept", "*/*")
        header.addheader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3")
        header.addheader("Accept-Encoding", "gzip,deflate")
        header.addheader("Referer", referer)
        header.addheader("Connection", "keep-alive")
        header.addheader("Upgrade-Insecure-Requests", "1")
        return header

    def GetUrl(self,url, data=None):
        f = ""
        if data: f = "?" + urllib.parse.urlencode(data)
        curl = url + f
        try:
            req = urllib.request.Request(curl, headers=TsHeader)
            response = urllib.request.urlopen(req)
            html = response.read()
        except Exception as e:
            print(e)
            print(curl)
            return False
        return html

    def PostUrl(self,url, data):
        data = urllib.parse.urlencode(data).encode('utf-8')
        try:
            request = urllib.request.Request(url, headers=TsHeader)
            request.add_header("Content-Type", "application/x-www-form-urlencoded;charset=utf-8")
            f = urllib.request.urlopen(request, data)
            a = f.read().decode('utf-8')
        except Exception as e:
            print(e)
            print(url)
            return False
        return a

    def page_list(self):
        p = ('72', 'http://jandan.net/ooxx/page-')
        pagelist = []
        for i in range(10):
            t = (str(int(p[0]) - i), p[1] + str(int(p[0]) - i))
            pagelist.append(t)
        return pagelist

    def url_dict_onepage(self,agrs):
        num = agrs[0]
        page = agrs[1]
        pagedict = {}
        r = self.GetUrl(page)
        html = etree.HTML(r)
        pic_lis = html.xpath('//ol//li//img/@src')
        git_lis = html.xpath('//ol//li//img/@org_src')
        git_lis.extend(pic_lis)
        pagedict['page'] = num
        pagedict['urls'] = git_lis
        return pagedict

    def url_list_db(self):
        urls = []
        try:
            db_urls = self._db.jd_url.find({}, {'page': 1, 'urls': 1})
        except:
            print('db find error')
            return False
        for url in db_urls:
            urls.extend(url['urls'])
        return urls

    def file_download(self,url):
        urlx = 'http:' + url
        imgname = url.split('/')[-1]
        if imgname.split('.')[-1] == 'gif':
            imgPath = os.path.join(PATH, 'gif', imgname)
        else:
            imgPath = os.path.join(PATH, 'pic', imgname)
        if not os.path.lexists(imgPath):
            # Make sure the target directory exists before writing.
            os.makedirs(os.path.dirname(imgPath), exist_ok=True)
            # urllib.request.urlretrieve(urlx, imgPath)
            opener = FancyURLopener()
            opener.addheaders.clear()
            # The image URL itself is used as the referer here.
            opener = self.insert_header(opener, urlx, urlx)
            # Open the URL once and stream the body to disk in chunks.
            response = opener.open(urlx)
            with open(imgPath, 'wb+') as f:
                while True:
                    chunk = response.read(self._chunk_size)
                    if not chunk:
                        break
                    f.write(chunk)
            response.close()
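Before adding any concurrency, the base class can already be exercised synchronously. A small usage sketch (it assumes the network is reachable and the Jandan pages still exist):

jd = JDBase()
pages = jd.page_list()              # e.g. [('72', 'http://jandan.net/ooxx/page-72'), ...]
first = jd.url_dict_onepage(pages[0])
print(first['page'], len(first['urls']))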

ProcessPoolExecutor (multi-process)

Page parsing and image downloads are submitted to a pool of worker processes, and the results are picked up with as_completed as they finish.

class ProcessPE(JDBase):
    def __init__(self):
        super(ProcessPE, self).__init__()
        self._max_workers_page = P_MAX_WORKERS_PAGE
        self._max_workers_imgsave = P_MAX_WORKERS_IMGSAVE

    def page_ProcessPE(self,urls):
        with ProcessPoolExecutor(max_workers=self._max_workers_page) as executor:
            future_to_url = {executor.submit(self.url_dict_onepage, url): url for url in urls}
            for future in as_completed(future_to_url):
                try:
                    result = future.result()
                    self._db.jd_url.insert_one(result)
                except Exception as e:
                    print('raise an exception: {}'.format(e))
                else:
                    print('[db]{}'.format(result))

    def img_download_ProcessPE(self,urls):
        with ProcessPoolExecutor(max_workers=self._max_workers_imgsave) as executor:
            future_to_url = {executor.submit(self.file_download, url): url for url in urls}
            for future in as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    result = future.result()
                except Exception as e:
                    print('raise an exception: {}'.format(e))
                else:
                    print('[save]OK')

    def Execute(self):
        page_list = self.page_list()
        self.page_ProcessPE(page_list)
        urls = self.url_list_db()
        if urls:
            self.img_download_ProcessPE(urls)
        else:
            print('No data in the database')

ThreadPoolExecutor (multi-threaded)

Same structure as the process-pool version, but with a pool of threads; for IO-bound work like these HTTP requests and disk writes, threads are usually sufficient despite the GIL.

class ThreadPE(JDBase):
    def __init__(self):
        super(ThreadPE, self).__init__()
        self._max_workers_page = T_MAX_WORKERS_PAGE
        self._max_workers_imgsave = T_MAX_WORKERS_IMGSAVE

    def page_ThreadPE(self,urls):
        with ThreadPoolExecutor(max_workers=self._max_workers_page) as executor:
            future_to_url = {executor.submit(self.url_dict_onepage, url): url for url in urls}
            for future in as_completed(future_to_url):
                try:
                    result = future.result()
                    self._db.jd_url.insert_one(result)
                except Exception as e:
                    print('raise an exception: {}'.format(e))
                else:
                    print('[db]{}'.format(result))

    def img_download_ThreadPE(self,urls):
        with ThreadPoolExecutor(max_workers=self._max_workers_imgsave) as executor:
            future_to_url = {executor.submit(self.file_download, url): url for url in urls}
            for future in as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    result = future.result()
                except Exception as e:
                    print('raise an exception: {}'.format(e))
                else:
                    print('[save]OK')

    def Execute(self):
        page_list = self.page_list()
        self.page_ThreadPE(page_list)
        urls = self.url_list_db()
        # print(urls)
        if urls:
            self.img_download_ThreadPE(urls)
        else:
            print('No data in the database')

asyncio (async only)

(fetching the image links from each page)

class Async_only(JDBase):
    def __init__(self):
        super(Async_only, self).__init__()
        self._pagedict = {}

    async def url_dict_onepage_Async(self, agrs):
        print('[A]',agrs)
        num = agrs[0]
        page = agrs[1]
        pagedict = {}
        # Async call
        # This is only for illustration: the right way is to use an async network library
        # such as aiohttp, not a blocking library like urllib.
        #         r = await self.GetUrl(page)
        #         html = etree.HTML(r)
        #         pic_lis = html.xpath('//ol//li//img/@src')
        #         git_lis = html.xpath('//ol//li//img/@org_src')
        #         git_lis.extend(pic_lis)
        #         pagedict['page'] = num
        #         pagedict['urls'] = git_lis
        #         print(pagedict)
        # Use the async sleep to simulate asynchronous IO:
        # r = await asyncio.sleep(2)
        # Use time.sleep(2) to simulate blocking IO; this behaves just like synchronous code.
        # (time.sleep() returns None and must not be awaited.)
        r = time.sleep(2)
        pagedict['page'] = num
        pagedict['urls'] = page
        print('[B]', pagedict)

    def page_ELoop(self, urls):
        # Get the event loop
        loop = asyncio.get_event_loop()
        # Wrap the coroutines in tasks (newer Python versions require this for asyncio.wait)
        tasks = [asyncio.ensure_future(self.url_dict_onepage_Async(agrs)) for agrs in urls]
        # print(tasks)
        # Run the coroutines until they all complete
        loop.run_until_complete(asyncio.wait(tasks))
        loop.close()

    def Execute(self):
        page_list = self.page_list()
        self.page_ELoop(page_list)
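The point of the two alternatives in the comments above: with `await asyncio.sleep(2)` the coroutines yield to the event loop and run concurrently, while a blocking `time.sleep(2)` forces them to run one after another. A small self-contained sketch of that difference (the helper names here are illustrative, not part of the crawler):

import asyncio
import time

async def fake_io(n, blocking):
    # Hypothetical stand-in for a network request.
    if blocking:
        time.sleep(1)           # blocks the whole event loop
    else:
        await asyncio.sleep(1)  # yields so other coroutines can run
    print('done', n)

def run(blocking):
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    start = time.time()
    tasks = [asyncio.ensure_future(fake_io(i, blocking)) for i in range(3)]
    loop.run_until_complete(asyncio.wait(tasks))
    loop.close()
    print('blocking={} took {:.1f}s'.format(blocking, time.time() - start))

run(False)  # about 1 second: the three sleeps overlap
run(True)   # about 3 seconds: each sleep blocks the loop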

asyncio + aiohttp

Replace blocking IO with asynchronous equivalents, such as the HTTP requests, database operations, and so on.
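Note that in the class below the MongoDB insert is still a blocking pymongo call made inside a coroutine. If the database write should be non-blocking too, an async driver can be used; a minimal sketch with the motor driver (an assumption, not used elsewhere in this post):

import asyncio
from motor.motor_asyncio import AsyncIOMotorClient

async def save_page(pagedict):
    # AsyncIOMotorClient mirrors the pymongo API but returns awaitables.
    client = AsyncIOMotorClient('127.0.0.1', 27017)
    db = client['jandan']
    # The insert no longer blocks the event loop.
    await db.jd_url.insert_one(pagedict)

asyncio.get_event_loop().run_until_complete(
    save_page({'page': '74', 'urls': []}))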

class AsyncAiohttp(JDBase):
    def __init__(self):
        super(AsyncAiohttp,self).__init__()
        self._chunk_size = CHUNK_SIZE

    def page_list(self):
        p = ('74', 'http://jandan.net/ooxx/page-')
        pagelist = []
        for i in range(2):
            t = (str(int(p[0]) - i), p[1] + str(int(p[0]) - i))
            pagelist.append(t)
            # print(a)
        return pagelist

    # @asyncio_Semaphore(3)
    async def url_dict_onepage_async(self,agrs):
        pagedict = {}
        num = agrs[0]
        page = agrs[1]
        async with aiohttp.request('GET', page) as r:
            data = await r.read()
        data = data.decode()
        html = etree.HTML(data)
        pic_lis = html.xpath('//ol//li//img/@src')
        git_lis = html.xpath('//ol//li//img/@org_src')
        git_lis.extend(pic_lis)
        pagedict['page'] = num
        pagedict['urls'] = git_lis
        # TODO: replace with a single insert of many documents at once.
        # Note: this pymongo call is blocking; see the motor sketch above for an async alternative.
        self._db.jd_url.insert_one(pagedict)

    async def file_download_async(self, url):
        urlx = 'http:' + url
        imgname = url.split('/')[-1]
        if imgname.split('.')[-1] == 'gif':
            imgPath = os.path.join(PATH, 'gif', imgname)
        else:
            imgPath = os.path.join(PATH, 'pic', imgname)
        if not os.path.lexists(imgPath):
            # Make sure the target directory exists before writing.
            os.makedirs(os.path.dirname(imgPath), exist_ok=True)
            # urllib.request.urlretrieve(urlx, imgPath)
            # async with aiohttp.request('GET', urlx) as r:
            async with aiohttp.ClientSession() as session:
                async with session.get(urlx) as r:
                    with open(imgPath, 'wb+') as f:
                        while True:
                            chunk = await r.content.read(self._chunk_size)
                            if not chunk:
                                break
                            f.write(chunk)

    def page_ELoop(self, urls):
        # Create and install an event loop for this batch of coroutines.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        tasks = [asyncio.ensure_future(self.url_dict_onepage_async(url)) for url in urls]
        # Run the coroutines until they all complete
        loop.run_until_complete(asyncio.wait(tasks))
        loop.close()

    def img_download_ELoop(self, urls):
        # Use a fresh event loop, since page_ELoop has already closed its own.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        tasks = [asyncio.ensure_future(self.file_download_async(url)) for url in urls]
        # Run the coroutines until they all complete
        loop.run_until_complete(asyncio.wait(tasks))
        loop.close()

    def Execute(self):
        page_list = self.page_list()
        self.page_ELoop(page_list)
        urls = self.url_list_db()
        if urls:
            self.img_download_ELoop(urls)
        else:
            print('No data in the database')

if __name__ == '__main__':
    start = time.time()
    # AA = ProcessPE()

    # AA = ThreadPE()

    # AA = Async_only()

    AA = AsyncAiohttp()
    AA.Execute()
    print('[endtime][{}]'.format(time.time() - start))