[Single-Threaded Async Coroutines] Batch-Downloading Qiushibaike Videos

Imports

import asyncio
import aiohttp
from lxml import etree
import requests
import os

Get the maximum page number

def get_maxpage():
    url = "https://www.qiushibaike.com/video/page"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41'
    }
    response = requests.get(url, headers=headers)
    html = etree.HTML(response.text)
    # The last <span> in the pagination bar is the "next page" arrow,
    # so the second-to-last one holds the highest page number.
    maxpage = html.xpath('//ul[@class="pagination"]//span/text()')[-2].strip()
    print(maxpage)
    return maxpage
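
The [-2] index assumes the pagination bar always ends with a "next page" control. If that markup changes, the bare index raises IndexError; a slightly more defensive lookup (my sketch, not part of the original post) falls back to a single page:

# Inside get_maxpage(), replacing the maxpage line:
spans = html.xpath('//ul[@class="pagination"]//span/text()')
maxpage = spans[-2].strip() if len(spans) >= 2 else "1"  # default to one page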

Crawl the video detail links from each listing page in turn, and fire off the download coroutines

def get_page_url(maxpage):
    for i in range(1, int(maxpage) + 1):
        page_url = f"https://www.qiushibaike.com/video/page/{i}/"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41'
        }
        response = requests.get(page_url, headers=headers)
        html = etree.HTML(response.text)
        # Deduplicate the detail-page links and make them absolute.
        hrefs = list(set(html.xpath('//div[@class="col1 old-style-col1"]/div/a/@href')))
        detail_urls = ["https://www.qiushibaike.com" + h for h in hrefs]
        # asyncio.wait() no longer accepts bare coroutines (removed in Python
        # 3.11), so wrap the batch in one awaitable and hand it to asyncio.run().
        asyncio.run(download_page(detail_urls))

async def download_page(detail_urls):
    # Run all detail-page coroutines for this listing page concurrently.
    await asyncio.gather(*(get_down_data(u) for u in detail_urls))
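
Firing every detail page on a listing at once can overwhelm the site. If that becomes a problem, a common refinement (my sketch; MAX_CONCURRENCY is an invented name, not from the original code) is to cap concurrency with an asyncio.Semaphore inside download_page:

MAX_CONCURRENCY = 5  # hypothetical cap; tune as needed

async def download_page(detail_urls):
    sem = asyncio.Semaphore(MAX_CONCURRENCY)

    async def limited(u):
        # At most MAX_CONCURRENCY download bodies run at any moment.
        async with sem:
            await get_down_data(u)

    await asyncio.gather(*(limited(u) for u in detail_urls))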

Crawl each video detail page for the video name and download link (coroutine function)

async def get_down_data(url2):
    print(f"Opening {url2}")
    async with aiohttp.ClientSession() as session:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41'
        }
        # session.get() is itself an async context manager; no extra await needed.
        async with session.get(url=url2, headers=headers) as response:
            res = await response.text()
            html = etree.HTML(res)
            name = html.xpath('//div[@class="video"]/div/text()')[0].strip()
            down_url = html.xpath('//*[@id="article-video"]/source/@src')[0]
            down_url1 = "https:" + down_url
            # Blocking call: the event loop stalls while each file downloads
            # (see the non-blocking variant sketched after download_data).
            download_data(name, down_url1)

Open the download link and save the data

def download_data(name, down_url1):
    print(name, down_url1)
    houzhui = down_url1.split('.')[-1]  # file extension
    res = requests.get(down_url1).content
    os.makedirs('e:/糗事百科视频', exist_ok=True)  # make sure the target folder exists
    with open(f'e:/糗事百科视频/{name}.{houzhui}', 'wb') as f:
        f.write(res)
        print(name, "download finished!")
        

if __name__ == "__main__":

    maxpage = get_maxpage()
    get_page_url(maxpage)

Full code

import asyncio
import aiohttp
from lxml import etree
import requests
import os

# Get the maximum page number
def get_maxpage():
    url = "https://www.qiushibaike.com/video/page"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41'
    }
    response = requests.get(url, headers=headers)
    html = etree.HTML(response.text)
    # The last <span> is the "next page" arrow; the one before it is the max page.
    maxpage = html.xpath('//ul[@class="pagination"]//span/text()')[-2].strip()
    print(maxpage)
    return maxpage

# def get_page_url(maxpage):
#     url = []
#     for i in range(1,int(maxpage)+1):
#         url1 = f"https://www.qiushibaike.com/video/page/{i}/"
#
#         url.append(url1)
#     return url

def get_page_url(maxpage):
    for i in range(1, int(maxpage) + 1):
        page_url = f"https://www.qiushibaike.com/video/page/{i}/"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41'
        }
        response = requests.get(page_url, headers=headers)
        html = etree.HTML(response.text)
        # Deduplicate the detail-page links and make them absolute.
        hrefs = list(set(html.xpath('//div[@class="col1 old-style-col1"]/div/a/@href')))
        detail_urls = ["https://www.qiushibaike.com" + h for h in hrefs]
        # asyncio.wait() no longer accepts bare coroutines (removed in Python
        # 3.11), so wrap the batch in one awaitable and hand it to asyncio.run().
        asyncio.run(download_page(detail_urls))

async def download_page(detail_urls):
    # Run all detail-page coroutines for this listing page concurrently.
    await asyncio.gather(*(get_down_data(u) for u in detail_urls))

# async def open_url(url):
#     async with aiohttp.ClientSession() as session:
#         headers = {
#         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41'
#         }
#         async with await session.get(url=url,headers=headers) as response:
#             resp = await response.text()
#             html = etree.HTML(resp)
#             url = html.xpath('//div[@class="col1 old-style-col1"]/div/a/@href')
#             url = list(set(url))
#             url = ["https://www.qiushibaike.com"+ j for j in url]
#             print(url)
#             task2 = [get_down_data(url2) for url2 in url]
#             asyncio.run(asyncio.wait(task2))

async def get_down_data(url2):
    print(f"Opening {url2}")
    async with aiohttp.ClientSession() as session:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41'
        }
        # session.get() is itself an async context manager; no extra await needed.
        async with session.get(url=url2, headers=headers) as response:
            res = await response.text()
            html = etree.HTML(res)
            name = html.xpath('//div[@class="video"]/div/text()')[0].strip()
            down_url = html.xpath('//*[@id="article-video"]/source/@src')[0]
            down_url1 = "https:" + down_url
            # Blocking call: the event loop stalls while each file downloads.
            download_data(name, down_url1)

def download_data(name, down_url1):
    print(name, down_url1)
    houzhui = down_url1.split('.')[-1]  # file extension
    res = requests.get(down_url1).content
    os.makedirs('e:/糗事百科视频', exist_ok=True)  # make sure the target folder exists
    with open(f'e:/糗事百科视频/{name}.{houzhui}', 'wb') as f:
        f.write(res)
        print(name, "download finished!")
        

if __name__ == "__main__":

    maxpage = get_maxpage()
    # url = get_page_url(maxpage)

    # task1 = [open_url(url) for url in url]
    # asyncio.run(asyncio.wait(task1))

    get_page_url(maxpage)  

Note:
The commented-out sections are an attempt to also fetch each listing-page URL with coroutines, but it raises an error. If anyone has a fix or a better approach, feel free to leave a comment.
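
The error in that attempt is almost certainly RuntimeError: asyncio.run() cannot be called from a running event loop: open_url is itself a coroutine, so by the time it reaches asyncio.run(asyncio.wait(task2)) a loop is already running. One way around it (a sketch derived from the commented code, not a tested fix) is to await the inner batch with asyncio.gather and start exactly one event loop at the top level:

async def open_url(url):
    async with aiohttp.ClientSession() as session:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41'}
        async with session.get(url=url, headers=headers) as response:
            resp = await response.text()
            hrefs = list(set(etree.HTML(resp).xpath('//div[@class="col1 old-style-col1"]/div/a/@href')))
            detail_urls = ["https://www.qiushibaike.com" + h for h in hrefs]
            # Already inside the running loop: await the batch instead of
            # calling asyncio.run() a second time.
            await asyncio.gather(*(get_down_data(u) for u in detail_urls))

async def main():
    maxpage = get_maxpage()
    page_urls = [f"https://www.qiushibaike.com/video/page/{i}/" for i in range(1, int(maxpage) + 1)]
    await asyncio.gather(*(open_url(u) for u in page_urls))

asyncio.run(main())  # one event loop for the whole crawl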
