Python crawler tutorial for beginners, part 7: xpath + chinaz resume scraping/download + retry

import requests
from util import headers_utils as hd, download_util as dl
from lxml import html
import os

if __name__ == '__main__':
    page_num = 744  # total page count; there is no auto-detection logic here, so confirm the current count on the site before downloading
    dir_path = './resume'
    # create the output directory if it does not exist
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)

    # if lxml.etree cannot be resolved directly, grab etree via lxml.html (a side effect of newer lxml versions)
    etree = html.etree
    https_url = 'https:'
    # paginated crawl
    for page in range(1, page_num + 1):  # range excludes the right endpoint, so +1 to reach the last page
        url = 'https://sc.chinaz.com/jianli/free.html'
        # page 1 uses a different URL pattern from the other pages
        if page != 1:
            url = 'https://sc.chinaz.com/jianli/free_%s.html' % page
        print('Downloading page %d' % page)
        resp = requests.get(url=url, headers=hd.headers())
        resp.encoding = 'utf8'
        # parse the listing page
        tree = etree.HTML(resp.text)
        a_list = tree.xpath('//div[@id="container"]/div/a[1]')

        # loop over every resume entry on the page
        for a in a_list:
            resume_name = a.xpath('./img/@alt')[0] + '.zip'
            resume_url = https_url + a.xpath('./@href')[0]
            # fetch and parse the detail page
            resp = requests.get(url=resume_url, headers=hd.headers())
            resp.encoding = 'utf8'
            tree = etree.HTML(resp.text)
            download_url = tree.xpath('//ul[@class="clearfix"]/li/a/@href')[0]
            # links after roughly page 50 omit the 'https:' prefix, so patch it in
            if https_url not in download_url:
                download_url = https_url + download_url
            dl.download_file(dir_path, resume_name, download_url)
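
The hard-coded page_num above is the fragile part: the site's page count drifts over time. If you would rather not check it by hand, a helper along the lines below could read the count from page 1's pagination bar. This is only a sketch -- detect_page_num is an illustrative name, and the xpath for the pagination container is an assumption about chinaz's current markup, so inspect the live page and adjust the selector before trusting it.

import requests
from lxml import html
from util import headers_utils as hd

etree = html.etree

# Hypothetical helper: read the total page count from the pagination bar on page 1.
# ASSUMPTION: pagination links sit under a div with class "fenye" -- verify in dev tools.
def detect_page_num(default=744):
    resp = requests.get(url='https://sc.chinaz.com/jianli/free.html', headers=hd.headers())
    resp.encoding = 'utf8'
    tree = etree.HTML(resp.text)
    # keep only the purely numeric link texts and take the largest page number
    pages = [t for t in tree.xpath('//div[@class="fenye"]//a//text()') if t.strip().isdigit()]
    return int(pages[-1]) if pages else default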
# util/download_util.py
import requests
from util import headers_utils as hd
from retrying import retry
# download retry helper (author's note: not yet tested)
'''
retry parameter notes:
retry(wait_fixed=1000)                              # fixed wait between retries (ms; 1000 ms = 1 s)
retry(wait_random_min=1000, wait_random_max=2000)   # random wait between retries, here 1-2 s
retry(stop_max_attempt_number=3)                    # max attempts; after that the exception propagates normally
retry(stop_max_delay=2000)                          # max total delay; gives up and raises if not done within 2 s
retry(retry_on_exception=callable)                  # called with the raised exception; return True to retry on it
retry(retry_on_result=callable)                     # called with each result; return True to retry, otherwise the result is returned normally
'''
@retry(wait_random_min=5000, wait_random_max=10000, stop_max_attempt_number=10)
def download_file(dir_path, resume_name, download_url):
    print('Downloading', resume_name, download_url)
    file_data = requests.get(url=download_url, headers=hd.headers()).content
    file_path = dir_path + '/' + resume_name
    with open(file_path, 'wb') as fp:
        fp.write(file_data)
        print(resume_name, 'downloaded successfully')
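
As written, the decorator retries on every exception, including ones a retry cannot fix (a bad output path, for example). The retry_on_exception parameter from the notes above lets you restrict retries to transient network errors. A minimal sketch, assuming the same headers util; is_network_error and fetch are illustrative names, not part of the original code:

import requests
from retrying import retry
from util import headers_utils as hd

def is_network_error(exc):
    # retry only on connection/timeout problems; anything else propagates immediately
    return isinstance(exc, (requests.ConnectionError, requests.Timeout))

@retry(retry_on_exception=is_network_error,
       wait_random_min=5000, wait_random_max=10000,
       stop_max_attempt_number=10)
def fetch(url):
    resp = requests.get(url, headers=hd.headers(), timeout=30)
    resp.raise_for_status()  # HTTP 4xx/5xx raise HTTPError, which is deliberately NOT retried here
    return resp.content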
# util/headers_utils.py
# request headers kept together in one file; take what you need
def headers():
    return {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
    }
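
A single hard-coded user agent makes the crawler easy to fingerprint. One possible extension (a sketch, not part of the original util; random_headers and the second UA string are illustrative) is to rotate through a small pool per request:

import random

_USER_AGENTS = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
]

def random_headers():
    # pick a different user agent on each call
    return {'user-agent': random.choice(_USER_AGENTS)}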