Python爬虫-ppt全站爬取
"""Download every PPT template listed on www.ypppt.com.

The crawl has three steps:

1. Walk the paginated listing that starts at https://www.ypppt.com/moban/
   and collect the article id and title of every template on each page.
2. Request the download page of each template, e.g.
   https://www.ypppt.com/p/d.php?aid=6157, and extract the download address.
3. Fetch the archive, e.g.
   https://down.ypppt.com/uploads/soft/200617/1-20061H34358.zip, and save it
   as ``PPT模板/<title>.zip``.
"""
import re
from pathlib import Path

import requests

# Browser-like User-Agent sent with every request.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
}

FIRST_LISTING_URL = 'https://www.ypppt.com/moban/'
DOWNLOAD_PAGE_URL = 'https://www.ypppt.com/p/d.php?aid='
OUTPUT_DIR = Path('PPT模板')
# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the crawl forever.
REQUEST_TIMEOUT = 30

# Patterns are compiled once instead of on every loop iteration.
AID_RE = re.compile('<a href="/article/.*?/(.*?).html" class="p-title"')
TITLE_RE = re.compile('class="p-title" target="_blank">(.*?)</a>')
DOWNLOAD_LINK_RE = re.compile('<li><a href="(.*?)">下载地址1</a></li>')
# Characters that are path separators or are rejected by common filesystems.
UNSAFE_FILENAME_CHARS_RE = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def _get(url: str) -> requests.Response:
    """Send a GET request with the shared headers and timeout."""
    return requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)


def _listing_url(page: int) -> str:
    """Return the URL of listing page *page* (1-based)."""
    if page == 1:
        return FIRST_LISTING_URL
    return f'https://www.ypppt.com/moban/list-{page}.html'


def _safe_filename(title: str) -> str:
    """Replace characters that must not appear in a file name with ``_``.

    Titles come from scraped HTML, so a ``/`` or ``\\`` in one must not be
    allowed to change the directory the archive is written to.
    """
    return UNSAFE_FILENAME_CHARS_RE.sub('_', title)


def _download_template(aid: str, title: str) -> bool:
    """Download the template with article id *aid* into ``OUTPUT_DIR``.

    Returns:
        True when the archive was saved, False when the template was
        skipped (no download link found, or the download request failed
        with an HTTP error status).

    Raises:
        requests.RequestException: on connection errors or timeouts.
        OSError: when the archive cannot be written to disk.
    """
    detail_page = _get(DOWNLOAD_PAGE_URL + aid)
    links = DOWNLOAD_LINK_RE.findall(detail_page.text)
    if not links:
        # Previously this case crashed with IndexError on links[-1].
        print(f'跳过{title}:未找到下载地址')
        return False

    # Keep using the last match, as the script always has.
    archive = _get(links[-1])
    if not archive.ok:
        # Do not save an HTTP error page under a .zip name.
        print(f'跳过{title}:下载失败(HTTP {archive.status_code})')
        return False

    target = OUTPUT_DIR / f'{_safe_filename(title)}.zip'
    # write_bytes opens, writes and closes the file in one call.
    target.write_bytes(archive.content)
    print(f'已经下载{title}.zip~~~')
    return True


def main() -> None:
    """Crawl every listing page and download each template found on it.

    Stops at the first listing page that contains no template links.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    page = 1
    while True:
        listing = _get(_listing_url(page))
        # Decode the listing HTML as UTF-8 rather than relying on the
        # encoding that requests infers from the response.
        listing.encoding = 'utf-8'

        aids = AID_RE.findall(listing.text)
        if not aids:
            break
        titles = TITLE_RE.findall(listing.text)
        if len(aids) != len(titles):
            # The two patterns are matched independently; zip() below pairs
            # what it can instead of failing with IndexError.
            print(f'第{page}页的编号与标题数量不一致({len(aids)} != {len(titles)}),只处理能配对的部分')

        for aid, title in zip(aids, titles):
            try:
                _download_template(aid, title)
            except requests.RequestException as exc:
                # One unreachable template should not abort the whole crawl.
                print(f'跳过{title}:下载失败({exc})')

        page += 1


if __name__ == '__main__':
    main()