这篇文章写于很久以前,由于目标网站在不断更新,下面的代码可能已经失效,所以博主又更新了一个比较好的版本,地址如下:https://blog.csdn.net/weixin_42813107/article/details/118581499?spm=1001.2014.3001.5501
目标网址:https://www.ypppt.com/moban/zhongguo/
爬取内容为PPT模板,分类:中国风,改变URL可以下载其他分类,请自行修改
部分模板的下载地址为百度网盘,这类模板会被跳过。爬取得到的是 rar 或 zip 压缩文件(以编号命名),需要自己解压:全选后选择"解压到当前文件夹"即可。
下载如图:
解压后:
每个压缩文件内容:
下面上代码(仅供参考学习,有不足):
import requests
import re
import os
# HTTP headers passed to every requests.get() call in this script; the
# User-Agent is a desktop Chrome string (presumably so the site serves its
# regular HTML instead of rejecting the default client — TODO confirm).
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
def details_urls(url):
    """Return the detail-page link suffixes found on one listing page.

    The page at ``url`` is fetched with the module-level ``headers`` and
    decoded as UTF-8. Every ``<a href="/article...">`` anchor that is
    followed by a ``class`` attribute contributes the part of its href that
    comes after ``/article``.

    Returns:
        A list of strings (empty when nothing matches).
    """
    response = requests.get(url, headers=headers)
    html = response.content.decode('utf-8')
    return re.findall('<a href="/article(.*?)" class', html)
def downloads(url):
    """Return the download-page URL linked from a detail page.

    The detail page at ``url`` is fetched and decoded as UTF-8, then the
    first ``<a href="/p/...">`` link is turned into an absolute URL on
    ``https://www.ypppt.com``.

    Raises:
        IndexError: if the page contains no ``/p/`` link.
    """
    html = requests.get(url, headers=headers).content.decode('utf-8')
    matches = re.findall('<a href="/p/(.*?)"', html)
    first_match = matches[0]
    return 'https://www.ypppt.com/p/' + first_match
def downloads_urls(url):
    """Return ``(file_name, file_url)`` for the first on-site file link.

    The download page at ``url`` is fetched and decoded as UTF-8. The file
    URL is built from the first ``<a href="/uploads/...">`` link; the file
    name is the first capture of ``/uploads/soft/<segment>/<name>``.

    Raises:
        IndexError: if either pattern has no match on the page.
    """
    html = requests.get(url, headers=headers).content.decode('utf-8')
    upload_paths = re.findall('<a href="/uploads/(.*?)"', html)
    file_names = re.findall('<a href="/uploads/soft/.*?/(.*?)"', html)
    # Build the URL first, then pick the name, mirroring the original
    # lookup order.
    file_url = 'https://www.ypppt.com/uploads/' + upload_paths[0]
    return file_names[0], file_url
def download(name, url):
    """Fetch ``url`` and save the response body as ``./PPTS/<name>``.

    Args:
        name: File name to write inside the ``./PPTS/`` directory.
        url: Address of the file to download; requested with the
            module-level ``headers``.

    The target directory is created when missing. The body is downloaded
    before the output file is opened, so a request that raises does not
    leave an empty file behind.
    """
    path = r'./PPTS/'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(path, exist_ok=True)
    # Download first: opening the file before the request would create a
    # zero-byte archive if the request raised.
    payload = requests.get(url, headers=headers).content
    with open(path + name, 'wb') as fp:
        fp.write(payload)
# Script driver: ask how many listing pages to crawl, then download every
# template linked from those pages.
page = int(input("请输入下载页数:"))
for i in range(1, page + 1):
    # The first listing page has no "list-N.html" suffix; later pages do.
    if i == 1:
        url = 'https://www.ypppt.com/moban/zhongguo/'
    else:
        url = 'https://www.ypppt.com/moban/zhongguo/list-' + str(i) + '.html'
    for j in details_urls(url):
        url_ = 'https://www.ypppt.com/article' + j
        print(url_)
        try:
            # Resolve the file name and URL once; calling the helpers
            # separately for each value repeats the same HTTP requests.
            file_name, file_url = downloads_urls(downloads(url_))
        except IndexError:
            # The helpers raise IndexError when a page has no matching
            # on-site link (e.g. the template is only offered via an
            # external netdisk); skip it instead of aborting the run.
            print('跳过:未找到站内下载地址')
            continue
        download(file_name, file_url)