Source code
import requests
import os
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
}

# Create the output directory on the first run
if not os.path.exists('resume_template'):
    os.mkdir('resume_template')

url_cnt = download_cnt = 0
template_url_list = []   # detail-page URL of every template
template_name = []       # template titles, reused as file names
download_url_list = []
url = 'http://sc.chinaz.com/jianli/free_%d.html'

# Walk the first five list pages; page 1 uses a different URL pattern
for i in range(1, 6):
    if i == 1:
        entrance_url = 'http://sc.chinaz.com/jianli/free.html'
    else:
        entrance_url = url % i
    entrance_page = requests.get(url=entrance_url, headers=headers)
    entrance_page.encoding = 'utf-8'  # keep the Chinese titles readable
    tree = etree.HTML(entrance_page.text)
    entrance_list = tree.xpath('//div[@id="main"]//p/a')
    for item in entrance_list:
        name = item.xpath('./text()')[0]
        addr = item.xpath('./@href')[0]
        # print(name)
        # print(addr)
        template_url_list.append(addr)
        template_name.append(name)
        url_cnt += 1
print(url_cnt)

# Open every detail page and take the 10th download mirror in the list
for i in range(len(template_url_list)):
    template_url = template_url_list[i]
    print(template_url)
    template_page = requests.get(url=template_url, headers=headers).text
    tree = etree.HTML(template_page)
    download_url = tree.xpath('//div[@class="down_wrap"]//ul/li[10]/a/@href')[0]
    path = './resume_template/' + template_name[i]
    # if not os.path.exists(path):
    #     os.mkdir(path)
    #     print("make")
    template = requests.get(url=download_url, headers=headers).content
    # z = zipfile.ZipFile(io.BytesIO(template))
    # z.extractall(path)
    with open(path + '.rar', 'wb') as f:  # write the raw bytes as an archive
        f.write(template)
    download_cnt += 1
    download_url_list.append(download_url)
print(download_cnt)
-
First, I was not very comfortable with XPath syntax.
XPath syntax is actually quite simple: go through it systematically once, and as long as you understand what each symbol means you will rarely run into trouble.
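For reference, here is a minimal sketch of the symbols this script relies on, run against a made-up HTML fragment (the fragment and names are only illustrative): // searches at any depth, [@attr=...] filters by attribute, ./ is relative to the current node, text() and @href pull out text and attribute values, and li[10] is a 1-based positional predicate.

from lxml import etree

# Made-up fragment, only to illustrate the XPath symbols used in the script
html = '''
<div id="main">
  <p><a href="/jianli/1.htm">Template A</a></p>
  <p><a href="/jianli/2.htm">Template B</a></p>
</div>
'''
tree = etree.HTML(html)

links = tree.xpath('//div[@id="main"]//p/a')   # // = search at any depth, [@id=...] = attribute filter
for a in links:
    print(a.xpath('./text()')[0])              # ./ = relative to the current node, text() = its text
    print(a.xpath('./@href')[0])               # @href = read an attribute value
print(tree.xpath('//p[2]/a/text()')[0])        # [2] = 1-based positional predicate, like li[10] above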
-
Second, when the URL points to a zip file, I did not know how to download it so that what ended up on disk was the archive itself; I eventually got it working by referring to the article "python爬虫爬取站长素材免费简历模板".
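The trick is simply to request the file and write the raw bytes (response.content) in binary mode; if the archive happens to be a zip, it can also be unpacked in memory, which is what the commented-out zipfile/io lines in the script were aiming at. A minimal sketch, assuming a placeholder download URL and output names:

import io
import zipfile
import requests

# Placeholder URL; in the script it is scraped from the detail page
download_url = 'http://example.com/some_template.zip'

data = requests.get(download_url).content   # raw bytes, not decoded text

# Option 1: save the archive as-is ('wb' = binary mode keeps it intact)
with open('some_template.zip', 'wb') as f:
    f.write(data)

# Option 2: if it really is a zip, unpack it in memory without saving first
with zipfile.ZipFile(io.BytesIO(data)) as z:
    z.extractall('./resume_template/some_template')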
Run results