A simple coroutine-based crawler in Python 3.7. It was written in Jupyter, so some of the invocation details differ a little from a plain script… For technical exchange only; if anything here infringes, please contact me and it will be removed.
1. Import the required libraries

from lxml import etree  # HTML parsing
from time import time   # timestamps for timing the run
import asyncio          # coroutines and the event loop
import aiohttp          # async HTTP client
import ssl              # to build the SSL context used by the connector below
2. Prepare the request headers and URL

url = "https://movie.douban.com/top250"
headers = {
    'Host': 'movie.douban.com',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Accept': 'text/html, */*; q=0.01',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
    'DNT': '1',
    'Referer': 'http://example.com/',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,ja;q=0.6'
}
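Before building the crawler proper, it can be worth firing a single request to confirm these headers get a 200 back. A minimal sanity check for a Jupyter cell (where top-level await is available); this snippet is illustrative and not part of the original post:

# Optional sanity check: one request with the headers above (illustrative only)
async with aiohttp.ClientSession(headers=headers) as session:
    async with session.get(url) as resp:
        print(resp.status)  # expect 200 if the site accepts the headers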
3. Define a fetch function

# Fetches a single HTML page
sslcontext = ssl.create_default_context()  # default SSL context for the TCP connector

async def fetch_content(url):
    await asyncio.sleep(2)  # crude throttle: wait 2 seconds so requests are not too fast
    async with aiohttp.ClientSession(
        headers=headers, connector=aiohttp.TCPConnector(ssl=sslcontext)
    ) as session:
        async with session.get(url) as response:
            return await response.text()
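The fixed asyncio.sleep(2) delays every fetch by the same amount. An alternative way to be polite to the site is to cap how many requests are in flight at once with a semaphore. A minimal sketch, assuming a limit of 3 is acceptable; sem and bounded_fetch are illustrative names, not part of the original code:

sem = asyncio.Semaphore(3)  # at most 3 requests in flight at once (assumed limit)

async def bounded_fetch(url):
    async with sem:  # wait for a free slot before requesting
        return await fetch_content(url)

Swapping fetch_content for bounded_fetch in the task list below would apply the limit without changing anything else.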
4. Define a parse function

# Parses the HTML pages and prints every movie title
async def parse(url):
    page = await fetch_content(url)
    html = etree.HTML(page)
    xpath_movie = '//*[@id="content"]/div/div[1]/ol/li'
    xpath_title = './/span[@class="title"]'
    xpath_pages = '//*[@id="content"]/div/div[1]/div[2]/a'
    pages = html.xpath(xpath_pages)  # the pager at the bottom links to all the other pages
    fetch_list = []  # holds the full URLs of the remaining pages
    result = []      # holds the parsed movie <li> elements
    for element_movie in html.xpath(xpath_movie):
        result.append(element_movie)
    for p in pages:
        # the pager hrefs are relative (e.g. "?start=25&filter="), so join them onto the base URL
        fetch_list.append(url + p.get("href"))
    tasks = [fetch_content(page_url) for page_url in fetch_list]  # one task per remaining page
    # run all the page fetches concurrently
    pages = await asyncio.gather(*tasks)
    for page in pages:
        html = etree.HTML(page)
        for element_movie in html.xpath(xpath_movie):
            result.append(element_movie)
    for i, movie in enumerate(result, 1):
        title = movie.find(xpath_title).text
        print(i, title)
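To see what movie.find(xpath_title).text yields, the same relative XPath can be run against a hand-written fragment. A standalone illustration (the sample HTML is made up; the real page has far more markup around each entry):

from lxml import etree

sample = '<ol><li><div><span class="title">肖申克的救赎</span></div></li></ol>'
li = etree.HTML(sample).xpath('//ol/li')[0]
print(li.find('.//span[@class="title"]').text)  # -> 肖申克的救赎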
5. Invocation

async def main():
    start = time()
    await parse(url)
    end = time()
    print("Cost {} seconds".format(end - start))

if __name__ == "__main__":
    await main()  # top-level await works in Jupyter, where an event loop is already running
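Outside Jupyter there is no already-running event loop, so a plain script cannot await at the top level. In a .py file, asyncio.run (available since Python 3.7) starts a loop and drives main() to completion instead:

if __name__ == "__main__":
    asyncio.run(main())  # script entry point: creates the loop and runs main()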