# 本文仅供学习使用,对相关网站没有任何恶意,也无意造成任何不良影响。
# 目标是爬取一些简历模板:先在 sc.chinaz.com 上选中“简历”栏目,
# 再随便挑一个类别(下面的代码以“工程师”类别为例):
import os
import re
from urllib.parse import urljoin

import requests
from lxml import etree
# Request headers sent with every HTTP call.  Replace the User-Agent below
# with the one copied from your own browser (and add cookies if you need them).
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    ),
}

SITE_ROOT = 'https://sc.chinaz.com'
LIST_URL_STEM = SITE_ROOT + '/tag_jianli/GongChengShi'
SAVE_DIR = 'd:/jianli'
PAGE_COUNT = 4
# Seconds to wait for a response; without a timeout a stalled server would
# block the script forever.
REQUEST_TIMEOUT = 15

# Characters that cannot appear in a Windows file name, plus control chars.
_ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def build_page_url(page):
    """Return the listing URL for the zero-based index *page*.

    The first page carries no numeric suffix; every later page appends
    ``_<page>`` before the ``.html`` extension.
    """
    # NOTE(review): page 1 maps to ``GongChengShi_1.html`` exactly as the
    # original script did; confirm against the site whether the second page
    # is really numbered 1 rather than 2.
    if page == 0:
        return LIST_URL_STEM + '.html'
    return LIST_URL_STEM + '_' + str(page) + '.html'


def sanitize_filename(name):
    """Return *name* with characters that are illegal in file names replaced.

    Surrounding whitespace and dots are trimmed; an empty result falls back
    to ``'resume'`` so the caller always gets a usable file name.
    """
    cleaned = _ILLEGAL_FILENAME_CHARS.sub('_', name.strip()).strip(' .')
    return cleaned or 'resume'


def _fetch(url):
    """GET *url* and return the raw body bytes.

    Raises ``requests.RequestException`` on network failures, timeouts and
    non-2xx responses, so an error page is never mistaken for real content.
    """
    response = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Bytes (not ``.text``) let lxml pick the charset itself, which avoids
    # garbled titles.
    return response.content


def _download_resume(detail_url):
    """Save the archive linked from the resume page at *detail_url*.

    Returns the resume title on success, or ``None`` when the page has no
    title or no download link (for example a paid resume).  Network, parse
    and file errors propagate to the caller.
    """
    detail_tree = etree.HTML(_fetch(detail_url))
    if detail_tree is None:
        return None
    titles = detail_tree.xpath('//div[@class="ppt_tit clearfix"]/h1/text()')
    links = detail_tree.xpath(
        '//div[@class="down_wrap"]/div[2]/ul/li[1]/a/@href')
    if not titles or not links:
        return None
    resume_name = titles[0]
    # urljoin keeps absolute links untouched and repairs relative or
    # protocol-relative ones.
    archive = _fetch(urljoin(detail_url, links[0]))
    target = os.path.join(SAVE_DIR, sanitize_filename(resume_name) + '.rar')
    with open(target, 'wb') as f:
        f.write(archive)
    return resume_name


def main():
    """Walk the listing pages and download every resume that offers a link.

    The run is best-effort: a page or resume that fails is reported and
    skipped instead of aborting the whole job.
    """
    os.makedirs(SAVE_DIR, exist_ok=True)
    # Page through the listing.
    for page in range(PAGE_COUNT):
        page_url = build_page_url(page)
        try:
            tree = etree.HTML(_fetch(page_url))
        except (requests.RequestException, etree.LxmlError) as exc:
            print('列表页获取失败,已跳过: ' + page_url + ' (' + str(exc) + ')')
            continue
        if tree is None:
            continue
        # This XPath must match the listing markup exactly.
        for href in tree.xpath('//div[@id="main"]/div/div/a/@href'):
            detail_url = urljoin(SITE_ROOT, href)
            try:
                resume_name = _download_resume(detail_url)
            except (requests.RequestException, etree.LxmlError,
                    OSError) as exc:
                print('下载失败,已跳过: ' + detail_url + ' (' + str(exc) + ')')
                continue
            if resume_name is None:
                print('未找到下载链接(可能是付费简历),已跳过: ' + detail_url)
            else:
                print(resume_name + '下载成功')


if __name__ == '__main__':
    main()