import os
from urllib.parse import urljoin

import requests
from lxml import etree
# Batch-download PPT templates from sc.chinaz.com.
#
# Flow: fetch the listing page, follow every entry to its detail page, take
# the first link inside the "download-url" container and save the response
# body under ./resume/<title>.pptx.
url = "https://sc.chinaz.com/ppt"
headers = {
    # The header has to be named "User-Agent". A key called "headers" is sent
    # as an unrelated custom header, so the server would still see the default
    # python-requests agent string.
    "User-Agent": "Mozilla/5.0 "
}

# requests never times out on its own; bound every call so that one stalled
# connection cannot hang the whole batch.
REQUEST_TIMEOUT = 30  # seconds

# Characters that are not allowed in file names on common file systems are
# replaced with "_" so a title such as "A/B: demo" cannot break open().
INVALID_FILENAME_CHARS = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})

r = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
r.raise_for_status()  # fail loudly instead of parsing an error page
r.encoding = 'utf8'
tree = etree.HTML(r.text)
url_list = tree.xpath("//div[@class='bot-div']")

# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
os.makedirs('./resume', exist_ok=True)

for u in url_list:
    # Extract the detail-page link and the title of this listing entry.
    hrefs = u.xpath("./a/@href")
    titles = u.xpath("./a/@title")
    if not hrefs or not titles:
        # Entry without a usable link/title: nothing to download.
        continue

    title = f"{titles[0].translate(INVALID_FILENAME_CHARS)}.pptx"
    # urljoin copes with root-relative, absolute and protocol-relative hrefs.
    ppt_url = urljoin(url, hrefs[0])

    try:
        # Open the detail page of this entry.
        r1 = requests.get(ppt_url, headers=headers, timeout=REQUEST_TIMEOUT)
        r1.raise_for_status()
        r1.encoding = 'utf8'
        tree_1 = etree.HTML(r1.text)
        download_links = tree_1.xpath("//div[@class='download-url']/a/@href")
        if not download_links:
            print(f"no download link found on {ppt_url}, skipped")
            continue
        ppt = urljoin(ppt_url, download_links[0])
        print(ppt)

        # Download the file itself.
        response = requests.get(ppt, headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        ppt_Data = response.content
    except requests.RequestException as exc:
        # One broken entry must not abort the remaining downloads.
        print(f"failed to download {ppt_url}: {exc}")
        continue

    with open(f"./resume/{title}", 'wb') as f:
        f.write(ppt_Data)
# Python crawler project: batch-download PPT files (using XPath)
# (Blog page residue: latest recommended article published on 2023-11-23 14:42:13)