import aiohttp
import asyncio
from lxml import etree
import time
import json
import pandas as pd
# Collected movie entries; filled in by get_title() as pages are fetched.
result = []
# The Top-250 list is paginated 25 movies per page: start=0, 25, ..., 225.
urls = ['https://movie.douban.com/top250?start={}&filter='.format(i * 25)
        for i in range(10)]
print(urls)
async def get_title(url):
    """Fetch one Douban Top-250 page and append one dict per movie
    (keys: 'paiming' rank number, 'rank' rating score, 'title') to the
    module-level ``result`` list.

    :param url: one page URL from ``urls``.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            # Read raw bytes and let lxml handle decoding from the page's
            # declared encoding; resp.text() may guess the charset wrongly.
            html = await resp.read()
            # The xpath returns a one-element list; [0] is the single <ol>
            # that holds the 25 movie entries of this page.
            movie_list = etree.HTML(html).xpath('.//div[@id="content"]/div/div[1]/ol')[0]
            for item in movie_list:
                print(url)
                # Plain dict literal; 'entry' avoids shadowing the builtin `dict`.
                # No `global` needed: we only mutate `result`, never rebind it.
                entry = {}
                entry['paiming'] = int(item.xpath('.//div/div[1]/em/text()')[0])
                entry['rank'] = item.xpath('.//div / div[2] / div[2] / div / span[2]/text()')[0]
                entry['title'] = item.xpath('.//div[@class="hd"]/a/span[1]/text()')[0]
                result.append(entry)
def main():
    """Fetch all pages concurrently, sort the collected movies by rank,
    and dump them as pretty-printed JSON to the file 'xiecheng'.
    """
    async def _crawl():
        # gather() schedules all page fetches concurrently and waits for all.
        await asyncio.gather(*(get_title(url) for url in urls))

    # asyncio.run() owns loop creation/teardown; the old
    # get_event_loop()/run_until_complete()/close() dance is deprecated,
    # and asyncio.wait() no longer accepts bare coroutines on 3.11+.
    asyncio.run(_crawl())

    # Pages finish in arbitrary order, so restore the ranking order here.
    result.sort(key=lambda x: x["paiming"])
    s = json.dumps(result, indent=4, ensure_ascii=False)
    with open('xiecheng', 'w', encoding='utf-8') as f:
        f.write(s)
if __name__ == '__main__':
    # Time the whole crawl and report the wall-clock duration.
    start = time.time()
    main()
    elapsed = time.time() - start
    print('总耗时:%.5f秒' % elapsed)
# 速度还是很快的 ~  (Note: quite fast indeed ~)
# 不知道有没有其他的比较好用的库(gevent?,celery?),求大佬指点  (Note: are there other good libraries — gevent? celery? — advice welcome)