Python 多进程爬虫实践小项目

多进程:

from selenium.webdriver import Chrome
from selenium.webdriver import ChromeOptions
import time
import requests,os
from lxml import etree
import multiprocessing


'''
项目名称:Age动漫视频爬取器  
项目使用方法:于“自行配置参数区域”输入相应参数后直接运行
项目亮点:应用了多并发的多进程模块,在进行大量爬取的情况下能够更加有效地利用CPU资源,大大提高爬取效率:
          以 "火影忍者" 爬取24集 最大进程数4 为例 效率比单进程提高了近38% 
项目不足:该网站上有小部分视频如 "schooldays" 采用了blob协议加密,对于这样的视频暂时无法爬取   
目标网站:https://www.agedm.org/
作者:wzq(QQ:3217418192)
'''


def get_mp4_link(orgin_url,n):
    """Resolve the direct video URL for episode ``n`` of a series.

    A headless Chrome instance loads the episode's play page, switches into
    the ``iframeForVideo`` frame and reads the ``src`` attribute of the
    ``<video>`` element found in that frame's page source.

    Args:
        orgin_url: Detail-page URL of the series; ``detail`` is replaced by
            ``play`` to build the play-page URL.
        n: Episode number, appended to the play URL as ``/1/{n}``.

    Returns:
        The first ``src`` value matched by the XPath.

    Raises:
        IndexError: If the XPath matches no ``<video>`` ``src`` attribute.
    """
    options = ChromeOptions()
    options.add_argument('--headless')
    # NOTE(review): '--disable-cpu' does not look like a Chromium switch;
    # '--disable-gpu' was probably intended -- confirm before changing.
    options.add_argument('--disable-cpu')
    driver = Chrome(options=options)
    try:
        url = orgin_url.replace('detail', 'play') + f'/1/{n}'
        driver.get(url)
        # Fixed pause before reading the frame, giving the player time to load.
        time.sleep(2)
        driver.switch_to.frame('iframeForVideo')
        tree = etree.HTML(driver.page_source)
        return tree.xpath('/html/body/div/div/video/@src')[0]
    finally:
        # Always shut the browser down; otherwise each call leaves a Chrome
        # process running, and this function is called once per episode.
        driver.quit()



def download_mp4(mp4_link,n,headers,father_path):
    """Download one episode and save it as ``{n}.mp4`` in the series folder.

    The response body is streamed to disk in chunks so a whole video is never
    held in memory, and a non-success HTTP status raises instead of being
    written out as a bogus ``.mp4`` file.

    Args:
        mp4_link: Direct URL of the video file.
        n: Episode number; used in progress messages and as the file name.
        headers: HTTP headers sent with the request.
        father_path: Name of the series sub-folder under the fixed base
            directory; the folder must already exist.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        OSError: If the target file cannot be opened or written.
    """
    print(f'开始下载第{n}集\n')
    start = time.time()
    # Backslashes are escaped explicitly; the resulting path is the same one
    # the unescaped literal produced, without invalid-escape warnings.
    target = f"D:\\code\\爬虫练习\\Age动漫视频捏\\{father_path}\\{n}.mp4"
    with requests.get(url=mp4_link, headers=headers, stream=True, timeout=30) as response:
        response.raise_for_status()
        with open(target, 'wb') as fp:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                fp.write(chunk)
    # One-second pause kept from the original flow.
    time.sleep(1)
    end = time.time()
    print(f"第{n}集下载完成,花费时间{format(end - start)}秒钟\n")



def mian(orgin_url,n,headers,father_path):
    """Worker task for one episode: resolve its video URL, then download it.

    Args:
        orgin_url: Detail-page URL of the series.
        n: Episode number.
        headers: HTTP headers passed through to the download request.
        father_path: Name of the series sub-folder the file is saved in.
    """
    download_mp4(get_mp4_link(orgin_url, n), n, headers, father_path)

    
def main():
    """Scrape ``limit`` episodes in parallel with a process pool.

    Creates the output folder, submits one ``mian`` task per episode to a
    ``multiprocessing.Pool``, waits for all of them, reports any episode whose
    task raised, and prints the total elapsed time.
    """
    print("开始计时捏")
    StartTime = time.time()

    ## ---------- user-configurable parameters ----------
    orgin_url = "https://www.agedm.org/detail/20020006"  # start URL, e.g. https://www.agedm.org/detail/20240086
    father_path = "火隐忍者1"  # series name, used as the output folder name
    limit = 24  # number of episodes to scrape
    max_processes = 4  # maximum number of worker processes
    ## ---------- end of user-configurable parameters ----------

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
    }

    # Only "already exists" is treated as benign; any other failure (bad
    # drive, permissions) now propagates instead of being misreported.
    # makedirs also creates missing parent folders.
    try:
        os.makedirs(f"D:\\code\\爬虫练习\\Age动漫视频捏\\{father_path}")
    except FileExistsError:
        print("文件夹已存在")

    with multiprocessing.Pool(processes=max_processes) as pool:
        pending = []
        for n in range(1, limit + 1):
            result = pool.apply_async(mian, args=(orgin_url, n, headers, father_path))
            pending.append((n, result))
        pool.close()
        pool.join()

        # apply_async stores a worker's exception in its AsyncResult; without
        # get() a failed episode would go completely unnoticed.
        for n, result in pending:
            try:
                result.get()
            except Exception as exc:
                print(f"第{n}集下载失败: {exc!r}")
    EndTime=time.time()
    print(f"多进程程序共耗时{format(EndTime-StartTime)}秒")


# Entry-point guard: main() runs only when this file is executed as a script,
# not when it is imported. multiprocessing may re-import this module in worker
# processes (e.g. with the "spawn" start method), so the guard also keeps
# workers from re-running main().
if __name__ == '__main__':
    main()

运行耗时:

单进程:

from selenium.webdriver import Chrome#实例化对象
from selenium.webdriver import ChromeOptions#实例化对象
import time
import requests,os
from lxml import etree


# Scrape settings: series detail page and number of episodes to fetch.
orgin_url = "https://www.agedm.org/detail/20020006"
limit = 24

# HTTP headers sent with every video download request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
}

# One headless Chrome instance, created once at module level.
options = ChromeOptions()
for _flag in ('--headless', '--disable-cpu'):
    options.add_argument(_flag)
driver = Chrome(options=options)

def download_mp4(mp4_link,n):
    """Download one episode and save it as ``{n}.mp4`` in the series folder.

    Reads the module-level ``headers`` and ``father_path``. The body is
    streamed to disk in chunks so a whole video is never held in memory, and
    a non-success HTTP status raises instead of being saved as a bogus file.

    Args:
        mp4_link: Direct URL of the video file.
        n: Episode number, used as the file name.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        OSError: If the target file cannot be opened or written.
    """
    # Backslashes are escaped explicitly; the resulting path is the same one
    # the unescaped literal produced, without invalid-escape warnings.
    target = f"D:\\code\\爬虫练习\\Age动漫视频捏\\{father_path}\\{n}.mp4"
    with requests.get(url=mp4_link, headers=headers, stream=True, timeout=30) as response:
        response.raise_for_status()
        with open(target, 'wb') as fp:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                fp.write(chunk)
    # One-second pause kept from the original flow.
    time.sleep(1)


# Output sub-folder for this run; also read by download_mp4.
father_path="火影忍者2"
# Only "already exists" is treated as benign; any other failure now propagates
# instead of being misreported. makedirs also creates missing parent folders.
try:
    os.makedirs(f"D:\\code\\爬虫练习\\Age动漫视频捏\\{father_path}")
except FileExistsError:
    print("文件夹已存在")

StartTime=time.time()
try:
    for n in range(1,limit+1):
        # Load the play page for episode n and read the <video> src from the
        # player iframe.
        url=orgin_url.replace('detail','play')+f'/1/{n}'
        driver.get(url)
        # Fixed pause before reading the frame, giving the player time to load.
        time.sleep(2)
        driver.switch_to.frame('iframeForVideo')
        tree = etree.HTML(driver.page_source)
        mp4_link = tree.xpath('/html/body/div/div/video/@src')[0]

        print(f'开始下载第{n}集\n')
        start = time.time()
        download_mp4(mp4_link=mp4_link,n=n)
        end=time.time()
        print(f"第{n}集下载完成,花费时间{end-start}秒钟\n")

    # Total time is taken before the browser shutdown so it covers the same
    # work as before.
    EndTime=time.time()
    print(f"单进程程序共耗时{format(EndTime-StartTime)}秒")
finally:
    # Always shut the shared browser down, even when an episode fails;
    # otherwise the Chrome process is left running after the script ends.
    driver.quit()

运行耗时:

  • 3
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值