python爬虫使用pyppeteer爬取非静态页面内容，使用事件循环批量爬取，提升效率

IvanWKQ

已于 2024-01-03 10:32:47 修改

阅读量473

点赞数 7

文章标签： python 爬虫 开发语言

于 2023-12-27 14:27:03 首次发布

本文链接：https://blog.csdn.net/IvanWKQ/article/details/135244900

版权

最近写的是彩票系统，需要爬取很多彩票信息，展示的代码只是我整个爬虫程序的一小部分

首先使用pyppeteer抓取非静态（由JavaScript动态渲染的）页面，并将整个抓取逻辑封装成一个异步方法；需要爬取多个页面时，为每个页面创建一个协程并存入任务列表，再并发执行任务列表中的所有协程

代码里还有一个main()函数，它并发执行两个协程任务，而每个任务内部又各自并发执行自己任务列表中的协程，算是一个拓展，有兴趣可以看看

我这个demo爬取的是竞彩网的体育资讯，爬取的都是近两天的部分体育资讯文章

至于为啥这么封装爬取数据，因为后台数据接收接口也是我写的，需要这么接收

最后

requests
pyppeteer
scrapy

以上工具包请自行安装：pip install requests pyppeteer scrapy

# _*_ coding: utf-8 _*_
# @Time: 2023/12/26
# @TODO: 体育资讯爬取
# @Author: wkq
from datetime import datetime, timedelta

import asyncio
import requests
from pyppeteer import launch
from scrapy import Selector

# HTTP request headers sent when downloading the listing pages
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.3'
    ),
}
# Site root; article links taken from the listing pages are appended to it
base_url = 'https://www.sporttery.cn/'
# Address of the service that receives the crawled sports news (placeholder)
sport_receive = '你的数据接收服务路径'

# Crawled sports news, one list of article dicts per category
all_sport_info = {category: [] for category in ('bk', 'soccer')}
# Earliest publication date to crawl. During development this is "today
# minus one day", i.e. yesterday. It is evaluated once, at import time.
crawl_date = (datetime.today() - timedelta(days=1)).date()
# Crawl targets: {category: listing-page url}
crawl_dic = dict(
    bk=base_url + 'htmlfrag/1053.html',
    soccer=base_url + 'htmlfrag/1041.html',
)
# Article titles seen during the previous crawl
old_title_cache = list()


async def fetch_page_content(urlDic):
    """
    Render a JavaScript-driven article page and store its article HTML.

    :param urlDic: article dict; on entry ``urlDic['content']`` holds the
        page url, on return it holds the concatenated HTML of the elements
        found inside the article body container
    :return: None (``urlDic`` is updated in place)
    """
    url = urlDic['content']
    # Start a headless browser
    browser = await launch(headless=True)
    try:
        # Open a new page
        page = await browser.newPage()
        # Navigate to the url and wait until the network is idle
        await page.goto(url, waitUntil='networkidle0')
        # Wait for the document body to be present
        await page.waitForSelector('body')
        # Grab the rendered HTML of the page
        html = await page.content()
    finally:
        # Close the browser on every path, so that a failed or timed-out
        # navigation does not leave the browser open
        await browser.close()

    document = Selector(text=html)
    body_div = document.xpath("//div[@class='g-bodyText style-1 height-1']")
    # ".//div[@class='artical-share']" searches downwards from the current
    # element for div elements with class=artical-share; those are removed
    # so they do not end up in the stored content
    for share_div in body_div.xpath(".//div[@class='artical-share']"):
        share_div.remove()
    # NOTE(review): `*` matches every descendant element, so a nested element
    # is serialized inside its parent and once more on its own — confirm that
    # the receiving side expects this duplication
    inner_elements = body_div.css('div.g-bodyText.style-1.height-1 *')
    # Concatenate the serialized elements (tags included) into one string
    urlDic['content'] = ''.join(element.get() for element in inner_elements)


async def crawlSportInfo():
    """
    Crawl the sports-news listing pages and fetch the body of each new article.

    For every entry of ``crawl_dic`` the listing page is downloaded and parsed.
    Articles published on or after ``crawl_date`` whose title is not in
    ``old_title_cache`` are collected, and their rendered content is fetched
    with ``fetch_page_content``. Finally ``old_title_cache`` is replaced by the
    titles seen during this run.

    :return: None
    """
    from functools import partial
    from urllib.parse import urljoin

    loop = asyncio.get_event_loop()
    # Results are kept per call instead of in shared module state, so that
    # concurrently running crawls cannot mix their items and nothing stale
    # survives a run that failed half-way
    collected = {crawl_type: [] for crawl_type in crawl_dic}
    each_titles = []
    for crawl_type, url in crawl_dic.items():
        # requests is blocking: run it in the default executor so the event
        # loop keeps serving other coroutines, and bound the wait with a timeout
        response = await loop.run_in_executor(
            None, partial(requests.get, url, headers=headers, timeout=30))
        res = Selector(text=response.text)
        # "//div[@class='m-jczx-list']" selects, from the document root, every
        # div whose class attribute equals 'm-jczx-list'
        for div in res.xpath("//div[@class='m-jczx-list']"):
            # Publication date, article link and title of one listing entry
            content_date = div.css('div.u-list-data::text').get()
            href = div.css('div.u-list-title a::attr(href)').get()
            title = div.css('div.u-list-title a::text').get()
            if content_date is None or href is None or title is None:
                # Entry without date, link or title: skip it instead of
                # failing on None.strip()
                continue
            content_date = content_date.strip()
            title = title.strip()
            # urljoin copes with root-relative and absolute hrefs alike
            content_url = urljoin(base_url, href.strip())
            each_titles.append(title)

            print(f'{crawl_type}---\t日期={content_date}\turl={content_url}\ttitle=={title}')
            try:
                published = datetime.strptime(content_date, "%Y-%m-%d").date()
            except ValueError:
                # A date in an unexpected format must not abort the whole crawl
                continue
            # Keep articles published on or after crawl_date that were not
            # already seen during the previous crawl
            if published >= crawl_date and title not in old_title_cache:
                collected[crawl_type].append(
                    {'pubTime': content_date, 'title': title, 'content': content_url, 'category': crawl_type})
    # All matching article urls are known now; fetch the article bodies,
    # one category (bk = basketball, soccer = football) at a time
    for articles in collected.values():
        if articles:
            # Run the page fetches of this category concurrently and wait
            # until all of them have finished
            await asyncio.gather(*(fetch_page_content(article) for article in articles))
    # Hand the result over to the receiving service (disabled in this demo):
    # requests.post(sport_receive, json=collected)
    print(f'有效数据-------------------{collected}')

    # Remember the titles of this run, in place, for the next run
    old_title_cache[:] = each_titles
    print(f'each_titles-------------------{each_titles}')



async def main():
    """
    Demo of nested concurrency.

    Runs two ``crawlSportInfo`` coroutines concurrently; further coroutines
    can be appended to the task list.
    """
    # Create and start two crawlSportInfo tasks
    tasks = [asyncio.create_task(crawlSportInfo()), asyncio.create_task(crawlSportInfo())]
    # Wait until every task has finished
    await asyncio.gather(*tasks)


if __name__ == '__main__':
    # Run the crawl on a fresh event loop. asyncio.run replaces the deprecated
    # asyncio.get_event_loop().run_until_complete(...) pattern.
    asyncio.run(crawlSportInfo())
    # To try the nested variant instead: asyncio.run(main())

IvanWKQ

关注

7
点赞
踩
8

收藏

觉得还不错? 一键收藏
1
评论
python爬虫使用pyppeteer爬取非静态页面内容，使用事件循环批量爬取，提升效率

首先是使用pyppeteer抓取非静态页面，将整个逻辑封装成一个异步方法，有多个页面爬取，将多个页面爬取存储到任务列表，执行任务列表。最近写的是彩票系统，需要爬取很多彩票信息，展示的代码只是我整个爬虫程序的一小部分。我这个demo爬取的是竞彩网的体育资讯，爬取的都是近两天的部分体育资讯文章。至于为啥这么封装爬取数据，因为后台数据接收接口也是我写的，需要这么接收。
复制链接

扫一扫