# 记录一下学习使用 aiohttp + asyncio 异步抓取商城价格的过程。
import time
import aiohttp
import asyncio
from lxml import etree
import pandas as pd
def get_url(path='商城链接.xlsx'):
    """Load the spreadsheet of shop links into a DataFrame.

    Args:
        path: Location of the Excel workbook to read. Defaults to
            '商城链接.xlsx' in the current working directory, which is the
            file the script has always read.

    Returns:
        The ``pandas.DataFrame`` produced by ``pandas.read_excel``.
    """
    return pd.read_excel(path)
# Request headers sent with every page fetch; presents a desktop Chrome
# user-agent string.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
}

# Proxy URL passed to each aiohttp request.
# NOTE(review): "http://ip" looks like a placeholder -- replace it with a real
# proxy address before running.
proxy = "http://ip"

# WindowsSelectorEventLoopPolicy is only defined on Windows; guard the lookup
# so that importing this module on other platforms does not raise
# AttributeError. On Windows the behaviour is unchanged.
if hasattr(asyncio, "WindowsSelectorEventLoopPolicy"):
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
async def fetch(url):
    """Download *url* through the configured proxy and return the body text.

    Args:
        url: Address of the page to request.

    Returns:
        The decoded response body as a ``str``.

    Raises:
        aiohttp.ClientError: Propagated unchanged from aiohttp when the
            request fails (callers catch ``ClientConnectorError``).
    """
    # ``ssl=False`` is the current spelling of the deprecated
    # ``verify_ssl=False``: certificate verification stays disabled.
    # A new session and connector are built on every call, so ``limit=64``
    # only bounds connections within this single call.
    connector = aiohttp.TCPConnector(limit=64, ssl=False)
    async with aiohttp.ClientSession(connector=connector) as session:
        async with session.get(url, headers=headers, proxy=proxy) as response:
            return await response.text()
async def parse(url, semaphore):
    """Fetch one product page and extract its sale price.

    Args:
        url: Address of the product page.
        semaphore: ``asyncio.Semaphore`` (or compatible async context
            manager) held for the whole call to cap concurrent requests.

    Returns:
        The price text with surrounding whitespace and the '¥' sign removed;
        ``'-1.00'`` when the page has no ``<b id="bSalePrice">`` text node;
        ``None`` when the connection could not be established.
    """
    async with semaphore:
        try:
            html = await fetch(url)
        except aiohttp.ClientConnectorError:
            # Back off without blocking the event loop: time.sleep() here
            # would freeze every other in-flight request for five seconds.
            await asyncio.sleep(5)
            return None

        tree = etree.HTML(html)
        matches = tree.xpath('//b[@id="bSalePrice"]/text()')
        if not matches:
            # Sentinel used when the price element is missing from the page.
            return '-1.00'

        price = matches[0].strip().replace('¥', '')
        print(price)
        # The original returned the undefined name ``price_list`` (NameError).
        return price
def run(data, concurrency=300):
    """Scrape every URL in *data* concurrently and return the results.

    Args:
        data: Iterable of page URLs; each one is handed to ``parse``.
        concurrency: Maximum number of ``parse`` calls allowed inside the
            semaphore at once. Defaults to 300, the previously fixed value.

    Returns:
        A list with one ``parse`` result per URL, in the same order as
        *data* (an empty list when *data* is empty).
    """
    async def _crawl():
        # Create the semaphore inside the running loop so it is bound to the
        # loop that actually drives the tasks.
        semaphore = asyncio.Semaphore(concurrency)
        return await asyncio.gather(*(parse(url, semaphore) for url in data))

    # asyncio.run() creates, runs and closes the loop; it replaces the
    # deprecated get_event_loop()/ensure_future() calls made outside a loop.
    return asyncio.run(_crawl())
if __name__ == '__main__':
    # perf_counter() is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments while the crawl is running.
    start = time.perf_counter()
    df = get_url()
    # Blank spreadsheet cells are read as NaN; drop them so only real values
    # from the '链接' column are passed on as URLs.
    data = df['链接'].dropna().tolist()
    run(data)
    print('花费时间为:{}秒'.format(time.perf_counter() - start))
# 目前理解还很浅薄,后面会继续优化。