Python59行代码爬取某站中秋节ppt模板
- 网址:https://www.1ppt.com/moban/zhongqiujie/
- 用到的库:requests、re、os、lxml
- 请求方式:GET
- 项目需求:将所有页模板下载到对应的文件夹中。一共7页,分别将对应的模板下载到对应的文件夹中
- 代码如下:
import requests
import re
import os
from lxml import etree
def path_replace(path):
    """Normalize a filesystem path by turning backslash separators into forward slashes."""
    return path.replace("\\", "/")
def pre(o_url, agent, encoding="utf-8"):
    """Fetch a URL and parse the response body into an lxml HTML element tree.

    Args:
        o_url: URL to request with GET.
        agent: Request headers dict (e.g. {"User-Agent": ...}).
        encoding: Encoding to decode the response with (the target site
            serves gbk-encoded listing pages).

    Returns:
        The lxml element tree produced by etree.HTML from the decoded body.
    """
    # BUG FIX: requests.get's second positional parameter is `params`, not
    # `headers` — the original sent the User-Agent dict as a query string
    # instead of as request headers. Pass it by keyword.
    pre_response_data = requests.get(o_url, headers=agent)
    pre_response_data.encoding = encoding
    pre_solve_data = etree.HTML(pre_response_data.text)
    return pre_solve_data
def create_directory(path, e_name):
    """Create the directory `path` (and any missing parents) if it does not exist.

    Args:
        path: Directory path to create.
        e_name: Display name printed in the success message.
    """
    if not os.path.exists(path):
        # BUG FIX: os.mkdir raises FileNotFoundError when an intermediate
        # directory is missing; os.makedirs creates the whole chain.
        os.makedirs(path)
        print(f"{e_name}目录 创建成功!")
# Destination root for downloaded templates.
# BUG FIX: raw string — the original non-raw literal contained invalid
# escape sequences (\P, \p, \s, \z) and needed "\\f" to dodge a form feed.
w_path = r"F:\PycharmFiles\pachong\first_ppt\saves\ppts\zhongqiu"
w_path = path_replace(w_path)
site = "https://www.1ppt.com/moban/zhongqiujie/"
ua_agent = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:105.0) Gecko/20100101 Firefox/105.0"}
solve_data = pre(site, ua_agent, encoding='gbk')
# The href of the "末页" (last page) link carries the total page count.
page_data = solve_data.xpath("//a[text()='末页']/@href")
if page_data:
    # BUG FIX: extract the int here. The original kept the re.findall list
    # in one branch and a bare int 1 in the other, so the else branch
    # crashed on total_page[0] below.
    total_page = int(re.findall(r"\d+", page_data[0])[0])
else:
    total_page = 1
o_url = "https://www.1ppt.com/moban/zhongqiujie/ppt_zhongqiujie_"
# Pages 2..total_page use the ppt_zhongqiujie_<n>.html pattern; page 1 is
# the plain listing URL, inserted at the front.
page_hrefs = [o_url + str(page) + ".html" for page in range(2, total_page + 1)]
page_hrefs.insert(0, "https://www.1ppt.com/moban/zhongqiujie")
for page_href in page_hrefs:
    solve_data1 = pre(page_href, ua_agent, encoding='gbk')
    # Template title (img alt text) and its detail-page href, paired up.
    file_names = solve_data1.xpath("//ul[@class='tplist']/li/a/img/@alt")
    ppt_urls = solve_data1.xpath("//ul[@class='tplist']/li/a/@href")
    dict_datas = dict(zip(file_names, ppt_urls))
    for file_name, ppt_url in dict_datas.items():
        # One sub-directory per template, named after its title.
        create_directory(w_path + "/" + file_name, file_name)
        ppt_url1 = "https://www.1ppt.com"
        # Detail page -> download page -> final archive URL.
        solve_data2 = pre(ppt_url1 + ppt_url, ua_agent)
        download_urls = solve_data2.xpath("//ul[@class='downurllist']/li/a/@href")
        solve_data3 = pre(ppt_url1 + download_urls[0], ua_agent)
        end_urls = solve_data3.xpath("//li[@class='c1']/a/@href")
        # BUG FIX: pass the header dict by keyword — positionally it would
        # be sent as query params and the request would carry no User-Agent.
        ppt_zip = requests.get(end_urls[0], headers=ua_agent).content
        # Save under the template's folder, keeping the server-side filename.
        s_path = w_path + "/" + file_name + "/" + end_urls[0].split('/')[-1]
        print(s_path)
        with open(s_path, 'wb') as f:
            f.write(ppt_zip)
        print(f"{file_name}文件下载成功啦!")
运行结果展示
结语
- 至此,本次爬虫项目分享结束,欢迎各位道友批评指正,也欢迎小白交流讨论,因为我也是一个正在学习的小白。最后,如果觉得对您有用,盼请各位看官点赞收藏。