Python---爬虫---速---aiomultiprocess


# -*- coding: utf-8 -*-

#
# Imports
#

import asyncio
import hashlib
import time
import datetime
import json

from aiomultiprocess import Pool
from redis import *
from pybloom_live import BloomFilter
import aiohttp

#
# Public variable
#

# In-memory Bloom filter used to de-duplicate scraped answers
# (sized for 1e9 entries at a 1% false-positive rate).
Bloom_data = BloomFilter(capacity=1000000000, error_rate=0.01)

# Redis connection (host and password are blank in this file and must be
# filled in) plus a pipeline for batching commands against it.
DB_get_question = StrictRedis(
    host='',
    port=6480,
    password='',
    db=4,
)
pipeline_redis = DB_get_question.pipeline()

#
# Public functions
#

def md5(data):
    """
    Return the MD5 digest of a string as UTF-8 encoded hex bytes.

    :param data: text to hash; it is encoded as UTF-8 before hashing.
    :return: the 32-character lowercase hex digest, as ``bytes``.
    """
    hex_digest = hashlib.md5(data.encode('utf8')).hexdigest()
    return hex_digest.encode('utf8')

async def get(data):
    """
    POST one JSON payload through a proxy taken from Redis and return the
    text of the reply.

    :param data: JSON-serialisable payload sent as the request body.
    :return: the value found at ``results[0]['values']['text']`` in the
             decoded JSON response.
    :raises RuntimeError: if the ``IP_PROXY`` Redis set is empty.
    """
    # NOTE(review): the target URL is blank in this file and must be filled
    # in before the request can succeed.
    url = ''

    # NOTE(review): this is a synchronous Redis call inside a coroutine, so
    # it blocks the event loop for the duration of the round trip.
    get_proxy = DB_get_question.spop('IP_PROXY')
    if get_proxy is None:
        raise RuntimeError("Redis set 'IP_PROXY' is empty; no proxy available")
    # The Redis client returns bytes; decode so the URL does not end up
    # containing a literal b'...' repr.
    if isinstance(get_proxy, bytes):
        get_proxy = get_proxy.decode('utf8')
    # aiohttp takes a single proxy URL string, not a requests-style mapping.
    proxy_url = "http://{}".format(get_proxy)

    # verify_ssl=False turns off TLS certificate verification.
    connector = aiohttp.TCPConnector(verify_ssl=False)
    async with aiohttp.ClientSession(connector=connector) as session:
        # The context manager releases the connection once the body is read.
        async with session.post(url, json=data, timeout=7,
                                proxy=proxy_url) as response:
            result = await response.text()

    hjson = json.loads(result)
    content = hjson['results'][0]['values']['text']
    print('\033[32;1mget_question\033[0m:', content)
    # Short pause after each request.
    await asyncio.sleep(0.1)
    return content

def _pop_key_and_phone():
    """
    Pop one credential entry from the ``key2_set`` Redis set and split it.

    The entry is decoded as UTF-8 and split on ``': '``; quotes and braces
    are stripped from both halves.

    :return: a ``(key, phone)`` tuple of strings.
    :raises RuntimeError: if ``key2_set`` is empty.
    """
    raw = DB_get_question.spop('key2_set')
    if raw is None:
        raise RuntimeError("Redis set 'key2_set' is empty; no API key available")
    parts = raw.decode('utf8').split(': ')
    key = parts[-1].replace('\'', '').replace('}', '')
    # The first parse in the original also removed 'b' characters from the
    # phone part; that is applied uniformly here.
    # NOTE(review): presumably this drops a stray bytes-repr prefix -- confirm.
    phone = parts[0].replace('\'', '').replace('{', '').replace('b', '')
    return key, phone


async def request():
    """
    Drive the crawl: send batches of payloads through a process pool of
    coroutines, de-duplicate the replies and store them in Redis.

    :return: None
    :raises RuntimeError: if ``key2_set`` runs out of credentials.
    """
    key_number = 0
    datas = ['']
    # NOTE(review): key and phone are not read anywhere in the visible code;
    # they are presumably inputs to the payload construction that is missing
    # below.
    key, phone = _pop_key_and_phone()
    while datas:
        key_number += 1
        if len(datas) > 1:
            async with Pool() as pool:
                result_list = await pool.map(get, datas)
                for result in result_list:
                    if not result:
                        continue
                    # These replies trigger a switch to a fresh credential;
                    # the rest of the batch is abandoned.
                    if '请求次数' in result or 'key不对' in result or '请求内容为空' in result:
                        key, phone = _pop_key_and_phone()
                        break
                    md5_qa = md5(result)
                    if md5_qa not in Bloom_data:
                        Bloom_data.add(md5_qa)
                        pipeline_redis.sadd('get_question', result)
                # Flush the queued SADD commands in one round trip.
                pipeline_redis.execute()
        datas.clear()

        # Pop the next 100 source questions in a single pipelined round trip.
        for _ in range(100):
            pipeline_redis.spop('original_question_set')
        question_list = pipeline_redis.execute()

        # NOTE(review): assigning an empty dict here makes the outer loop stop
        # after this pass. The code that builds payloads from question_list,
        # key and phone appears to be missing -- TODO restore it.
        datas = {}
        print('datas',datas)
        print(datas)

        # Rotate the credential every 500 passes and top the key set back up
        # from disk when it is running low.
        if key_number == 500:
            key, phone = _pop_key_and_phone()
            if DB_get_question.scard('key2_set') < 5:
                with open('key2_total.txt', 'r') as f_key:
                    # A distinct loop variable keeps the active `key` intact.
                    for line in f_key:
                        entry = line.strip()
                        if entry:  # skip blank lines
                            pipeline_redis.sadd('key2_set', entry)
                pipeline_redis.execute()
            key_number = 0

if __name__ == '__main__':
    # Guard the entry point: when worker processes are started with the
    # "spawn" method (aiomultiprocess's documented default) they re-import
    # this module, and unguarded top-level code would start the crawl again
    # in every worker.
    loop = asyncio.get_event_loop()
    # run_until_complete wraps the coroutine in a task itself.
    loop.run_until_complete(request())

 

  • 1
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值