aiohttp Asynchronous Crawling: Sending Requests (A First Try)

import aiohttp
import asyncio
import time
from bs4 import BeautifulSoup
import re
import requests

#Cap the number of coroutines that may hold a connection at once
sema=asyncio.Semaphore(100)

#Check whether a link opens normally (HTTP 200)
async def get_url(url):
    # conn=aiohttp.TCPConnector(limit_per_host=10)
    async with sema:
        async with aiohttp.ClientSession() as session:
            async with session.get(url,timeout=None) as rep:
                if rep.status==200:
                    print('%s success' % url)
                else:
                    print('%s fail' % url)

#Async version of the max-page lookup (commented out, kept for reference)
# async def get_page_max(url):
#     async with aiohttp.ClientSession() as session:
#         async with session.get(url) as rep:
#             if rep.status==200:
#                 page_soup=BeautifulSoup(await rep.text(),'html.parser')
#                 page_max=page_soup.find('div',class_='thPages').find_all('a')[-3].text
#                 return page_max
#             else:
#                 print('failed: %s' % url)

#Get the maximum page number from the pager (synchronous)
def get_page_max(url):
    rep=requests.get(url)
    page_soup=BeautifulSoup(rep.text,'html.parser')
    page_max=page_soup.find('div',class_='thPages').find_all('a')[-3].text
    return page_max

#Get the banner ad and logo image URLs from the current main page (synchronous)
def get_main_html_pageurl(url):
    rep_pictureurl=[]
    rep=requests.get(url)
    rep_page=BeautifulSoup(rep.text,'html.parser')
    rep_page_url=rep_page.find('div',class_='topbanner').find('img').get('src')
    rep_pictureurl.append(rep_page_url)

    rep_logo=rep_page.find('div',class_='logo').find('img').get('src')
    rep_pictureurl.append('http://www.tianhong.cn'+rep_logo)

    return rep_pictureurl

#Get the product image URLs on the current page
def get_main_pictureurl(url):
    rep_pictureurl=[]
    rep=requests.get(url)
    rep_page=BeautifulSoup(rep.text,'html.parser')
    rep_page_url=rep_page.find('ul',class_='spList').find_all('img')
    for line in rep_page_url:
        line=re.findall(r'.*src="(.*)" .*',str(line))[0]
        rep_pictureurl.append(line)

    return rep_pictureurl

#Get the product detail-page links on the current page
def get_commodity_url(url):
    rep_url=[]
    rep=requests.get(url)
    page_soup=BeautifulSoup(rep.text,'html.parser')
    page_url=page_soup.find('ul',class_='spList').find_all('a')
    for line in page_url:
        line=re.findall(r'.*a href="(.*)" tag=.*',str(line))
        rep_url.extend(line)

    return rep_url

#Get the image URLs from a product detail page
def get_Details_url(url):
    rep_url=[]
    rep=requests.get(url)
    page_soup=BeautifulSoup(rep.text,'html.parser')
    page_url=page_soup.find('div',class_='m1l').find_all('a')
    for line in page_url:
        line1=re.findall('"(http.*?)"',str(line))
        line2=re.findall(r'\'(http.*?)\'',str(line))
        rep_url.extend(line1)
        rep_url.extend(line2)

    details_url=page_soup.find('div',class_='box').find_all('img')
    for lines in details_url:
        rep_url.append(lines.get('src'))

    return rep_url

#Collect every URL to be checked (the async requests are launched in __main__)
def get_html():
    # page_max=asyncio.get_event_loop().run_until_complete(asyncio.wait([asyncio.ensure_future(get_page_max('http://www.tianhong.cn/list-5835.html'))]))
    # page=re.findall(r'.*result=\'(.*)\'.*',str(page_max[0]))[0]
    page=get_page_max('http://www.tianhong.cn/list-5835.html')
    tasks=[]
    tasks1=[]
    tasks2=[]
    for i in range(1,int(page)+1):
        url_l='http://www.tianhong.cn/catalog/product_list.html?categoryId=5835&districtCode=100005&orderType=1&justDisplayInventory=0&justDisplayBySelfSupport=0&minSalePrice=0&maxSalePrice=0&pager.pageNumber='+str(i)
        # tasks.append(asyncio.ensure_future(get_url(url_l)))
        for line in (get_main_html_pageurl(url_l)+get_main_pictureurl(url_l)):
            # task1=asyncio.ensure_future(get_url(line))
            # task1.add_done_callback(callable)
            # tasks1.append(task1)
            tasks1.append(line)
        # for lines in (get_commodity_url(url_l)):
        #     lines = 'http://www.tianhong.cn' + lines
        #     tasks2.append(asyncio.ensure_future(get_url(lines)))
        #     for j in (get_Details_url(lines)):
        #         tasks1.append(j)
    print(len(tasks1))
    return tasks1

if __name__=='__main__':
    start = time.time()
    loop = asyncio.get_event_loop()
    coroutine=[get_url(url) for url in get_html()]
    loop.run_until_complete(asyncio.wait(coroutine))
    loop.close()
    end = time.time()
    print(end - start)
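
A note on the structure above: get_url opens a brand-new aiohttp.ClientSession for every single URL. aiohttp recommends sharing one session across requests, because each session owns its own connection pool. Below is a minimal sketch of the same status check with a single shared session (check_url and check_all are illustrative names, not part of the original script; the Semaphore still caps concurrency as before):

import asyncio
import aiohttp

sema = asyncio.Semaphore(100)  # same concurrency cap as in the script above

async def check_url(session, url):
    # Reuse the shared session instead of creating one per request
    async with sema:
        async with session.get(url, timeout=None) as rep:
            print('%s %s' % (url, 'success' if rep.status == 200 else 'fail'))

async def check_all(urls):
    # One session (and one connection pool) for the whole crawl
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(check_url(session, u) for u in urls))

# Usage, matching the __main__ block above:
# asyncio.get_event_loop().run_until_complete(check_all(get_html()))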



1. The larger the crawl, the smaller the n in sema=asyncio.Semaphore(n) needs to be: the semaphore caps how many coroutines may hold a connection at once, and too large an n still ends in the error "too many file descriptors in select".
2. With too much data, scheduling every request eagerly via asyncio.ensure_future likewise fails with "too many file descriptors in select" (a connector-based way around this is sketched below).
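
On point 2: the commented-out aiohttp.TCPConnector(limit_per_host=10) inside get_url hints at the connector-based fix. A connector bounds how many sockets one session may keep open, so even when every request is scheduled up front via ensure_future, the excess requests wait in the pool's queue instead of each opening a file descriptor. A minimal sketch, assuming a single shared session (connector limits are per session, so they only help when the session is shared; fetch_status and check_all_limited are illustrative names):

import asyncio
import aiohttp

async def fetch_status(session, url):
    async with session.get(url) as rep:
        return url, rep.status

async def check_all_limited(urls):
    # limit caps the total number of simultaneously open connections;
    # limit_per_host additionally caps connections to any single host.
    # Open sockets stay bounded even with thousands of pending tasks.
    conn = aiohttp.TCPConnector(limit=50, limit_per_host=10)
    async with aiohttp.ClientSession(connector=conn) as session:
        return await asyncio.gather(*(fetch_status(session, u) for u in urls))

# results = asyncio.get_event_loop().run_until_complete(check_all_limited(urls))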


Conclusion: there are 4000+ coroutines in total, and the run takes roughly 10 minutes, i.e. about 400 requests per minute.
