# Without further ado — straight to the code.
from lxml import etree
import requests
import os
# Parse one listing page and download every free resume template it links to.
def cv_down(tree, headers):
    """Download all resume archives (.rar) linked from a listing page.

    Args:
        tree: lxml element tree of a chinaz listing page.
        headers: HTTP headers (User-Agent) sent with every request.

    Side effects: writes one .rar file per resume into ./免费简历/ and prints
    a progress line per file.  Detail pages missing the expected title or
    download link are skipped instead of raising IndexError.
    """
    cv_href = tree.xpath('//div[@class="sc_warp mt20"]/div/div/div/a/@href')
    for href in cv_href:
        act_response = requests.get(url=href, headers=headers).text
        act_tree = etree.HTML(act_response)
        titles = act_tree.xpath('//div[@class="ppt_tit clearfix"]/h1/text()')
        dow_urls = act_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li[1]/a/@href')
        if not titles or not dow_urls:
            # Page layout changed or the entry is malformed -- skip rather than crash.
            continue
        # requests guesses ISO-8859-1 for this site; re-decode the title as UTF-8.
        cv_title = titles[0].encode('ISO-8859-1').decode('utf-8') + '.rar'
        doc = requests.get(url=dow_urls[0], headers=headers).content
        cv_path = os.path.join('./免费简历', cv_title)
        with open(cv_path, 'wb') as fp:
            fp.write(doc)
        print(cv_title, '下载完成!!!')
# Ensure the output directory exists before any download is attempted.
# makedirs(exist_ok=True) avoids the check-then-create race of exists()+mkdir().
os.makedirs('./免费简历', exist_ok=True)

first_page_url = 'http://sc.chinaz.com/jianli/free.html'
headers = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'
}

# Fetch the first listing page once: it is both downloaded from and used to
# discover how many listing pages exist.
tree = etree.HTML(requests.get(url=first_page_url, headers=headers).text)
limit_nodes = tree.xpath('//div[@class="pagination fr clearfix clear"]/a[8]/b/text()')
# Fall back to a single page if the pagination widget is missing or changed.
page_limit = limit_nodes[0] if limit_nodes else '1'

# Prompt until the user enters a valid page count, then download that many pages.
while True:
    print(page_limit + " pages at most")
    page = input("Please enter how many page you want: ")
    if page.isdigit() and 1 <= int(page) <= int(page_limit):
        for i in range(1, int(page) + 1):
            if i == 1:
                # The first page was already fetched above.
                cv_down(tree, headers)
            else:
                # Pages 2+ use a different URL pattern: free_<n>.html
                other_page_url = f'http://sc.chinaz.com/jianli/free_{i}.html'
                page_tree = etree.HTML(requests.get(url=other_page_url, headers=headers).text)
                cv_down(page_tree, headers)
        break
    else:
        print("You can only enter right numbers!!!")