Python爬虫项目：批量下载PPT（使用XPath）

最新推荐文章于 2023-11-23 14:42:13 发布

_fox

最新推荐文章于 2023-11-23 14:42:13 发布

阅读量156

点赞数

文章标签： 1024程序员节

本文链接：https://blog.csdn.net/qq_19599237/article/details/134022590

版权

import os
from urllib.parse import urljoin

import requests
from lxml import etree

# Batch-download the PPT templates linked from the listing page below.
# Flow: fetch the listing page -> visit each template's detail page ->
# read the first download link on it -> save the payload under ./resume.
url = "https://sc.chinaz.com/ppt"
# The browser identity must be sent under the "User-Agent" key; any other
# key is transmitted as an unrelated header and requests keeps announcing
# its default "python-requests/x.y" agent.
headers = {
    "User-Agent": "Mozilla/5.0"
}
# requests waits forever by default; bound every call (seconds) so that a
# stalled connection cannot hang the whole batch.
timeout = 10
# Characters that are path separators or are rejected in file names on
# common file systems; they are replaced so a page title cannot break or
# redirect the output path.
invalid_name_chars = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})

r = requests.get(url, headers=headers, timeout=timeout)
# Fail loudly on an HTTP error instead of parsing an error page.
r.raise_for_status()
r.encoding = 'utf8'
tree = etree.HTML(r.text)
url_list = tree.xpath("//div[@class='bot-div']")

# exist_ok avoids the check-then-create race of exists() + mkdir().
os.makedirs('./resume', exist_ok=True)

for u in url_list:
    # Extract the detail-page link and the display title of one entry.
    hrefs = u.xpath("./a/@href")
    titles = u.xpath("./a/@title")
    if not hrefs or not titles:
        # Entry without the expected <a href=... title=...>; nothing to fetch.
        continue
    title = f"{titles[0].translate(invalid_name_chars).strip()}.pptx"
    # urljoin resolves site-relative hrefs exactly like the previous string
    # concatenation did, and also copes with absolute or protocol-relative
    # links.
    ppt_url = urljoin(url, hrefs[0])

    try:
        # Fetch the detail page and locate the download link on it.
        r1 = requests.get(ppt_url, headers=headers, timeout=timeout)
        r1.raise_for_status()
        r1.encoding = 'utf8'
        tree_1 = etree.HTML(r1.text)
        links = tree_1.xpath("//div[@class='download-url']/a/@href")
        if not links:
            print(f"no download link found on {ppt_url}")
            continue
        ppt = urljoin(ppt_url, links[0])
        print(ppt)

        # Download the file itself.
        # NOTE(review): the payload is always saved with a .pptx suffix; if
        # the site serves an archive instead, the suffix will be misleading -
        # confirm against a real download.
        r2 = requests.get(ppt, headers=headers, timeout=timeout)
        r2.raise_for_status()
    except requests.RequestException as exc:
        # One unreachable page or file must not abort the remaining items.
        print(f"skipped {ppt_url}: {exc}")
        continue

    with open(os.path.join('./resume', title), 'wb') as f:
        f.write(r2.content)