import requests
from util import headers_utils as hd, download_util as dl
from lxml import html
import os
if __name__ == '__main__':
    # Crawl the free-resume listing pages of sc.chinaz.com and hand each
    # resume's real download URL to dl.download_file for saving.
    page_num = 744
    dir_path = './resume'
    # Create the output directory on first run.
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)
    etree = html.etree  # lxml.html re-exports etree; avoids a second import line
    https_url = 'https:'
    # NOTE(review): range(1, page_num) stops at page 743 — confirm whether the
    # last page should be included (range(1, page_num + 1)).
    for page in range(1, page_num):
        # Page 1 has no numeric suffix in the site's URL scheme.
        url = 'https://sc.chinaz.com/jianli/free.html'
        if page != 1:
            url = 'https://sc.chinaz.com/jianli/free_%s.html' % page
        print('开始下载第%d页' % page)
        resp = requests.get(url=url, headers=hd.headers())
        resp.encoding = 'utf8'
        tree = etree.HTML(resp.text)
        # One <a> per resume card on the listing page.
        a_list = tree.xpath('//div [@id="container"]/div/a[1]')
        for a in a_list:
            resume_name = a.xpath('./img/@alt')[0] + '.zip'
            # Listing hrefs are protocol-relative ("//sc.chinaz.com/...").
            resume_url = https_url + a.xpath('./@href')[0]
            # Use distinct names for the detail page so the outer listing
            # response/tree are not clobbered mid-iteration.
            detail_resp = requests.get(url=resume_url, headers=hd.headers())
            detail_resp.encoding = 'utf8'
            detail_tree = etree.HTML(detail_resp.text)
            download_url = detail_tree.xpath('//ul[@class="clearfix"]/li/a/@href')[0]
            # Some download links are also protocol-relative; prefix them.
            if https_url not in download_url:
                download_url = https_url + download_url
            dl.download_file(dir_path, resume_name, download_url)
import requests
from util import headers_utils as hd
from retrying import retry
# `retry` decorator parameter reference (from the `retrying` package):
#   wait_fixed=1000                          fixed wait between retries (ms)
#   wait_random_min=1000, wait_random_max=2000
#                                            random 1-2 s wait between retries
#   stop_max_attempt_number=3                re-raise after 3 failed attempts
#   stop_max_delay=2000                      give up if not done within 2 s
#   retry_on_exception=<callable>            retry only when the callable,
#                                            given the raised exception,
#                                            returns True
#   retry_on_result=<callable>               called on every result; retry
#                                            while it returns True
@retry(wait_random_min=5000, wait_random_max=10000, stop_max_attempt_number=10)
def download_file(dir_path, resume_name, download_url):
    """Fetch `download_url` and save it as `dir_path`/`resume_name`.

    Retries up to 10 times on failure, waiting a random 5-10 s between
    attempts (handled by the `@retry` decorator).

    :param dir_path: existing directory to write into
    :param resume_name: target file name, e.g. ``some-resume.zip``
    :param download_url: absolute URL of the file to fetch
    :raises requests.HTTPError: on a 4xx/5xx response (triggers a retry)
    """
    print('开始下载', resume_name, download_url)
    resp = requests.get(url=download_url, headers=hd.headers())
    # Fail (and therefore retry) on HTTP errors instead of silently saving
    # an error page's HTML as a .zip file.
    resp.raise_for_status()
    file_path = dir_path + '/' + resume_name
    with open(file_path, 'wb') as fp:
        fp.write(resp.content)
    print(resume_name, '下载成功')
def headers():
    """Return the default HTTP request headers for the crawler.

    A fresh dict is built on every call, so callers may safely add or
    override entries without affecting other requests.
    """
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
    return {'user-agent': user_agent}