A simple coroutine-based crawler in Python 3.7. It was written in Jupyter, so some of the invocation details differ a little from a plain script… For technical exchange only; if anything here infringes, please contact me and it will be removed.
1. Import the required libraries

from lxml import etree  # HTML parsing
from time import time   # timestamps for timing the run
import asyncio          # coroutines and the event loop
import aiohttp          # async HTTP client
import ssl              # to build the SSL context used by the connector below
2. Prepare the request headers and URL

url = "https://movie.douban.com/top250"
headers = {
    'Host': 'movie.douban.com',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Accept': 'text/html, */*; q=0.01',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
    'DNT': '1',
    'Referer': 'http://example.com/',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,ja;q=0.6'
}
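Before building the crawler proper, it can be worth firing a single request to confirm these headers get a 200 back. A minimal sanity check for a Jupyter cell (where top-level await is available); this snippet is illustrative and not part of the original post:

# Optional sanity check: one request with the headers above (illustrative only)
async with aiohttp.ClientSession(headers=headers) as session:
    async with session.get(url) as resp:
        print(resp.status)  # expect 200 if the site accepts the headers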
3. Define a fetch function

# Fetches a single HTML page
sslcontext = ssl.create_default_context()  # default SSL context for the TCP connector

async def fetch_content(url):
    await asyncio.sleep(2)  # crude throttle: wait 2 seconds so requests are not too fast
    async with aiohttp.ClientSession(
        headers=headers, connector=aiohttp.TCPConnector(ssl=sslcontext)
    ) as session:
        async with session.get(url) as response:
            return await response.text()
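The fixed asyncio.sleep(2) delays every fetch by the same amount. An alternative way to be polite to the site is to cap how many requests are in flight at once with a semaphore. A minimal sketch, assuming a limit of 3 is acceptable; sem and bounded_fetch are illustrative names, not part of the original code:

sem = asyncio.Semaphore(3)  # at most 3 requests in flight at once (assumed limit)

async def bounded_fetch(url):
    async with sem:  # wait for a free slot before requesting
        return await fetch_content(url)

Swapping fetch_content for bounded_fetch in the task list below would apply the limit without changing anything else.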
4. Define a parse function

# Parses the HTML pages and prints every movie title
async def parse(url):
    page = await fetch_content(url)
    html = etree.HTML(page)
    xpath_movie = '//*[@id="content"]/div/div[1]/ol/li'
    xpath_title = './/span[@class="title"]'
    xpath_pages = '//*[@id="content"]/div/div[1]/div[2]/a'
    pages = html.xpath(xpath_pages)  # the pager at the bottom links to all the other pages
    fetch_list = []  # holds the full URLs of the remaining pages
    result = []      # holds the parsed movie <li> elements
    for element_movie in html.xpath(xpath_movie):
        result.append(element_movie)
    for p in pages:
        # the pager hrefs are relative (e.g. "?start=25&filter="), so join them onto the base URL
        fetch_list.append(url + p.get("href"))
    tasks = [fetch_content(page_url) for page_url in fetch_list]  # one task per remaining page
    # run all the page fetches concurrently
    pages = await asyncio.gather(*tasks)
    for page in pages:
        html = etree.HTML(page)
        for element_movie in html.xpath(xpath_movie):
            result.append(element_movie)
    for i, movie in enumerate(result, 1):
        title = movie.find(xpath_title).text
        print(i, title)
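To see what movie.find(xpath_title).text yields, the same relative XPath can be run against a hand-written fragment. A standalone illustration (the sample HTML is made up; the real page has far more markup around each entry):

from lxml import etree

sample = '<ol><li><div><span class="title">肖申克的救赎</span></div></li></ol>'
li = etree.HTML(sample).xpath('//ol/li')[0]
print(li.find('.//span[@class="title"]').text)  # -> 肖申克的救赎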
5. Invocation

async def main():
    start = time()
    await parse(url)
    end = time()
    print("Cost {} seconds".format(end - start))

if __name__ == "__main__":
    await main()  # top-level await works in Jupyter, where an event loop is already running
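Outside Jupyter there is no already-running event loop, so a plain script cannot await at the top level. In a .py file, asyncio.run (available since Python 3.7) starts a loop and drives main() to completion instead:

if __name__ == "__main__":
    asyncio.run(main())  # script entry point: creates the loop and runs main()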