获取网站的所有ppt模板

Talisens

已于 2022-07-01 09:43:24 修改

阅读量231

点赞数

文章标签： python 爬虫 开发语言

于 2022-07-01 09:38:10 首次发布

本文链接：https://blog.csdn.net/Talisens/article/details/125552683

版权

ppt下载

import time
from pathlib import Path
from threading import Thread
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

print("正在下载ppt")

# Fetch the home page of the PPT download site.
url = "https://www.1ppt.com/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37",
    "Referer": "https://www.1ppt.com/",
}
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url=url, headers=headers, timeout=10)
# Fail here with a clear HTTP error instead of later with an AttributeError
# when the expected markup is missing from an error page.
response.raise_for_status()
# requests decodes `.text` using the `encoding` attribute; the previous code
# assigned to a non-existent `encodings` attribute, which had no effect.
# NOTE(review): confirm the site really serves UTF-8 - if it does not,
# `response.apparent_encoding` may be the better value here.
response.encoding = 'utf-8'

# Parse the home page and pull the category links out of its navigation bar.
html = BeautifulSoup(response.text, "html.parser")
nav_bar = html.find(class_="col_nav i_nav clearfix")
# find() returns None when the markup is missing; treat that as "no links"
# rather than crashing on None.find_all().
col_nav = nav_bar.find_all("a") if nav_bar is not None else []
# Unique template-page URLs in discovery order.
list_a = list()
# Companion set so the duplicate check is O(1) instead of a list scan.
seen_links = set()

# Walk every category page linked from the navigation bar.
for nav_link in col_nav:
    href = nav_link.get("href")
    if not href:
        continue
    # urljoin copes with relative, root-relative and absolute hrefs, whereas
    # plain string concatenation produced malformed URLs (e.g. a doubled "/").
    hrefs = urljoin(url, href)
    # Pause between category requests to stay polite to the server.
    time.sleep(1)
    # Open the category page.
    response1 = requests.get(url=hrefs, headers=headers, timeout=10)
    html1 = BeautifulSoup(response1.text, "html.parser")
    template_list = html1.find(class_="tplist")
    if template_list is None:
        # Not every navigation entry has a template list; skip those.
        continue
    # Collect every hyperlink inside the template list, without duplicates.
    for template_link in template_list.find_all("a"):
        href1 = template_link.get("href")
        if not href1:
            continue
        a_href = urljoin(hrefs, href1)
        if a_href in seen_links:
            continue
        seen_links.add(a_href)
        list_a.append(a_href)
        # Report a running count; the previous code interpolated the whole
        # <a> tag into the message. The per-link sleep was dropped because
        # no request is made at this point.
        print(f"获取第{len(list_a)}个网页")
# Visit every collected template page and save its archive to disk.
for index, page_url in enumerate(list_a, start=1):
    # Report the ordinal of the page; the previous code printed the URL here.
    print(f"下载第{index}个网页")
    reponse2 = requests.get(url=page_url, headers=headers, timeout=10)
    html2 = BeautifulSoup(reponse2.text, "html.parser")

    # The template page links to a separate download page. Take the href of
    # the first anchor - the previous code passed the whole list of tags to
    # requests.get(), which can never work as a URL.
    downurllist = html2.find(class_="downurllist")
    down_link = downurllist.find("a") if downurllist is not None else None
    down_href = down_link.get("href") if down_link is not None else None
    if not down_href:
        continue
    down_page_url = urljoin(page_url, down_href)

    # Open the download page and locate the archive links.
    down_ppt = requests.get(url=down_page_url, headers=headers, timeout=10)
    html3 = BeautifulSoup(down_ppt.text, "html.parser")
    link_box = html3.find(class_="c1")
    if link_box is None:
        continue

    # Presumably the anchors are alternative mirrors of the same archive
    # (TODO confirm): stop at the first one that downloads successfully
    # instead of fetching all of them and keeping only the last response.
    down_zip_ppt = None
    for x in link_box.find_all("a"):
        href1 = x.get("href")
        if not href1:
            continue
        try:
            candidate = requests.get(
                url=urljoin(down_page_url, href1),
                headers=headers,
                timeout=30,
            )
            candidate.raise_for_status()
        except requests.RequestException:
            # This link failed; try the next one.
            continue
        down_zip_ppt = candidate
        break
    if down_zip_ppt is None:
        # Nothing could be downloaded for this template; move on.
        continue

    # Build the file name from the last path segment of the page URL. The
    # full URL contains "://" and "/" and is not usable as a file name.
    file_stem = Path(urlparse(page_url).path).stem or f"ppt_{index}"
    with open(file_stem + ".zip", mode="wb") as f:
        f.write(down_zip_ppt.content)