import aiohttp
import asyncio
import time
from bs4 import BeautifulSoup
import re
import requests
# Cap the number of concurrently running fetches at 100 to avoid the
# "too many file descriptors in select" error noted at the bottom of the file.
# NOTE(review): this Semaphore is created outside any running event loop — on
# Python < 3.10 it binds the default loop at construction; confirm the target
# Python version if the loop is ever created explicitly elsewhere.
sema=asyncio.Semaphore(100)
# Check whether a link opens normally
async def get_url(url):
    """Probe *url* and print whether it opens normally.

    Prints the URL followed by 'success' on HTTP 200, otherwise the URL
    (with a trailing space) followed by 'fail'.  Concurrency is bounded by
    the module-level semaphore ``sema``.
    """
    async with sema:
        async with aiohttp.ClientSession() as session:
            # timeout=None: wait indefinitely — large batches can be slow.
            async with session.get(url, timeout=None) as rep:
                ok = rep.status == 200
                print('%s' % url if ok else '%s ' % url)
                print('success' if ok else 'fail')
# Asynchronously fetch the maximum page number (superseded by the sync version below)
# async def get_page_max(url):
# async with aiohttp.ClientSession() as session:
# async with session.get(url) as rep:
# if rep.status==200:
# page_soup=BeautifulSoup(await rep.text(),'html.parser')
# page_max=page_soup.find('div',class_='thPages').find_all('a')[-3].text
# return page_max
# else:
# print('failed: %s' % url)
# Get the maximum page number
def get_page_max(url):
    """Return the highest page number shown in the pager of *url*.

    The pager ('div.thPages') lists page links; the third-from-last anchor
    carries the maximum page number.  Returned as text (a numeric string).
    """
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    pager_links = soup.find('div', class_='thPages').find_all('a')
    return pager_links[-3].text
# Get the current main page's banner-ad and logo links
def get_main_pageurl_banner_and_logo(url):
    """This helper keeps the original public name below."""


def get_main_html_pageurl(url):
    """Return [top-banner image URL, absolute logo image URL] for *url*.

    The banner 'src' is used as-is; the logo 'src' is site-relative and is
    prefixed with the site origin.
    """
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    banner_src = soup.find('div', class_='topbanner').find('img').get('src')
    logo_src = soup.find('div', class_='logo').find('img').get('src')
    return [banner_src, 'http://www.tianhong.cn' + logo_src]
# Get the product image links on the current page
def get_main_pictureurl(url):
    """Return the product-image URLs on the listing page at *url*.

    Images live under 'ul.spList'.  Reads each tag's ``src`` attribute
    directly instead of regex-matching the tag's string form — the old
    pattern ``.*src="(.*)" .*`` required another attribute after ``src``
    and raised IndexError whenever attribute order differed.
    Tags without a ``src`` attribute are skipped.
    """
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    imgs = soup.find('ul', class_='spList').find_all('img')
    return [img.get('src') for img in imgs if img.get('src') is not None]
# Get the product links on the current page
def get_commodity_url(url):
    """Return the product-detail links on the listing page at *url*.

    Anchors live under 'ul.spList'.  The original implementation regexed
    ``a href="(...)" tag=`` against ``str(tag)``, which silently dropped
    anchors whenever BeautifulSoup serialized attributes in a different
    order or spacing.  Filter on the presence of the ``tag`` attribute
    (the marker the regex relied on) and read ``href`` directly instead.
    """
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    rep_url = []
    for anchor in soup.find('ul', class_='spList').find_all('a'):
        if anchor.has_attr('tag') and anchor.has_attr('href'):
            rep_url.append(anchor.get('href'))
    return rep_url
# Get the image links on a product detail page
def get_Details_url(url):
    """Return the image URLs found on a product-detail page at *url*.

    Collects every http(s) URL quoted (single or double quotes) inside the
    anchors of 'div.m1l', then appends the ``src`` of every img under
    'div.box'.
    """
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    picture_urls = []
    for anchor in soup.find('div', class_='m1l').find_all('a'):
        markup = str(anchor)
        # URLs can be wrapped in either quote style; scan for both.
        picture_urls.extend(re.findall('"(http.*?)"', markup))
        picture_urls.extend(re.findall(r'\'(http.*?)\'', markup))
    for img in soup.find('div', class_='box').find_all('img'):
        picture_urls.append(img.get('src'))
    return picture_urls
# Gather all URLs to be probed asynchronously
def get_html():
    """Collect banner/logo and product-image URLs from every listing page.

    Determines the page count once, then walks each listing page and
    accumulates the image URLs from both the chrome (banner + logo) and the
    product grid.  Prints the total count and returns the flat URL list.
    """
    page = get_page_max('http://www.tianhong.cn/list-5835.html')
    tasks1 = []
    for i in range(1, int(page) + 1):
        url_l = 'http://www.tianhong.cn/catalog/product_list.html?categoryId=5835&districtCode=100005&orderType=1&justDisplayInventory=0&justDisplayBySelfSupport=0&minSalePrice=0&maxSalePrice=0&pager.pageNumber=' + str(i)
        tasks1.extend(get_main_html_pageurl(url_l))
        tasks1.extend(get_main_pictureurl(url_l))
    print(len(tasks1))
    return tasks1
if __name__=='__main__':
    start = time.time()
    loop = asyncio.get_event_loop()
    # asyncio.wait() no longer accepts bare coroutines (deprecated in 3.8,
    # removed in 3.11): schedule each one as a Task on this loop first.
    tasks = [loop.create_task(get_url(url)) for url in get_html()]
    # asyncio.wait raises ValueError on an empty set — guard against a
    # scrape that yielded no URLs.
    if tasks:
        loop.run_until_complete(asyncio.wait(tasks))
    loop.close()
    end = time.time()
    # Report total wall-clock time for the whole run.
    print(end - start)
# Notes:
# 1. The larger the data volume, the smaller the concurrency cap in
#    sema=asyncio.Semaphore(n) must be, otherwise the program fails with
#    "too many file descriptors in select".
# 2. With very large data volumes, using asyncio.ensure_future also triggers
#    "too many file descriptors in select".
# Conclusion: with 4000+ coroutines in total, the run takes about 10 minutes,
# i.e. roughly 400 coroutines per minute.