import os
import re
from urllib.parse import urljoin

import requests
from lxml import etree
# Download the resume templates listed on the first two "free" listing pages
# of sc.chinaz.com/jianli and save each archive as ./简历/<template name>.rar.
# A template whose page cannot be fetched or parsed is reported and skipped
# so that one bad entry does not abort the whole run.

# Directory that receives the downloaded archives.
SAVE_DIR = "./简历"

# Request headers carrying a desktop-browser User-Agent string.
HEADERS = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.69"
}

# Seconds to wait for the server before giving up on a request; without a
# timeout a stalled connection would block the script indefinitely.
REQUEST_TIMEOUT = 30

# Captures the href of the "福建电信下载" mirror link on a template's detail page.
DOWNLOAD_LINK_RE = re.compile(
    r"""<li><a href='(.*?)' target="_blank">福建电信下载</a></li>"""
)

# Characters that cannot appear in a file name on common file systems.
INVALID_FILENAME_CHARS_RE = re.compile(r'[\\/:*?"<>|]')


def _fetch(url):
    """Return the response for a GET of *url*.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            4xx/5xx HTTP status.
    """
    response = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response


# Create the output directory if needed (no check-then-create race).
os.makedirs(SAVE_DIR, exist_ok=True)

for z in range(1, 3):
    if z == 1:
        # The first listing page has no page-number suffix.
        url = "https://sc.chinaz.com/jianli/free.html"
    else:
        # Later listing pages are numbered free_<n>.html.
        url = f"https://sc.chinaz.com/jianli/free_{z}.html"

    try:
        page_text = _fetch(url)
    except requests.RequestException as err:
        print(f"列表页请求失败,已跳过: {url} ({err})")
        continue
    # Force UTF-8 so the Chinese template names are not decoded as mojibake.
    page_text.encoding = "utf-8"

    # Parse the listing page; each matching <p> holds one template link.
    tree = etree.HTML(page_text.text)
    if tree is None:
        # lxml could not build a document (e.g. empty response body).
        print(f"列表页解析失败,已跳过: {url}")
        continue
    url_list = tree.xpath("//div[@class='box col3 ws_block']/p")

    for i in url_list:
        hrefs = i.xpath("./a/@href")
        names = i.xpath("./a/text()")
        if not hrefs or not names:
            # Entry without a link or a title: nothing to download.
            continue
        # Resolve relative / protocol-relative links against the listing page.
        resume_url = urljoin(url, hrefs[0])
        resumeName = names[0]

        # Load the template's detail page.
        try:
            page_text_2 = _fetch(resume_url)
        except requests.RequestException as err:
            print(f"{resumeName}详情页请求失败,已跳过 ({err})")
            continue
        page_text_2.encoding = "utf-8"

        # Extract the direct download link from the detail page.
        download_url = DOWNLOAD_LINK_RE.findall(page_text_2.text)
        if not download_url:
            print(resumeName + "未找到下载链接,已跳过")
            continue

        # Fetch the archive bytes into memory.
        try:
            expect = _fetch(urljoin(resume_url, download_url[0])).content
        except requests.RequestException as err:
            print(f"{resumeName}下载失败,已跳过 ({err})")
            continue

        # Replace characters that are illegal in file names so open() cannot
        # fail (or escape SAVE_DIR) because of the template title.
        safe_name = INVALID_FILENAME_CHARS_RE.sub("_", resumeName)
        with open(os.path.join(SAVE_DIR, safe_name + ".rar"), "wb") as fp:
            fp.write(expect)
        print(resumeName + "下载完成!!!")