1. Scraping images

The script below walks pages 1-9 of the yazhoumeinv tag on sc.chinaz.com, extracts each image's name and its lazy-loaded URL (the src2 attribute) with XPath, and saves the files to ./meinv.
import os

import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
}
dir_name = "./meinv"
if not os.path.exists(dir_name):
    os.mkdir(dir_name)

url = "http://sc.chinaz.com/tag_tupian/yazhoumeinv_{}.html"
for i in range(1, 10):
    # The first list page has no page number in its URL
    if i == 1:
        new_url = "http://sc.chinaz.com/tag_tupian/yazhoumeinv.html"
    else:
        new_url = url.format(i)
    ret = requests.get(url=new_url, headers=headers).text
    tree = etree.HTML(ret)
    img_name_list = tree.xpath("//div[@id='container']/div/p/a[@target='_blank']/text()")
    # The page lazy-loads images, so the real image URL sits in the src2 attribute
    img_path_list = tree.xpath("//div[@id='container']/div/div/a/img/@src2")
    for idx in range(len(img_name_list)):
        # Re-encode the name: requests decodes the page as ISO-8859-1 but it is actually UTF-8
        img_name = img_name_list[idx].encode('iso-8859-1').decode('utf-8')
        img_path = img_path_list[idx]
        img_dir = dir_name + "/" + img_name + ".jpg"
        img_data = requests.get(url=img_path, headers=headers).content
        with open(img_dir, 'wb') as f:
            f.write(img_data)
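If you prefer urllib for the download step, urllib.request.urlretrieve writes the file straight to disk. A minimal sketch, assuming the same img_path and img_dir as in the loop above (note the URL is passed on its own, not concatenated with the name):

from urllib import request

# Download the image directly to the target path; the first argument must be the bare URL
request.urlretrieve(img_path, img_dir)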
2. Scraping resume templates

This script crawls the free resume-template list pages on sc.chinaz.com, follows each template's detail page, picks one download mirror at random, and saves the archive to ./jianli.
import os
import random

import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
}
dir_name = "./jianli"
url = "http://sc.chinaz.com/jianli/free_{}.html"
if not os.path.exists(dir_name):
    os.mkdir(dir_name)

for i in range(1, 10):
    # The first list page has no page number in its URL
    if i == 1:
        new_url = "http://sc.chinaz.com/jianli/free.html"
    else:
        new_url = url.format(i)
    ret = requests.get(url=new_url, headers=headers).text
    tree = etree.HTML(ret)
    # Links to each template's detail page
    detail_url_list = tree.xpath("//div[@id='container']/div/p/a[@target='_blank']/@href")
    for detail_url in detail_url_list:
        rets = requests.get(url=detail_url, headers=headers).text
        detail_tree = etree.HTML(rets)
        # Re-encode the title: requests decodes the page as ISO-8859-1 but it is actually UTF-8
        name = detail_tree.xpath('//div[@class="ppt_tit clearfix"]/h1/text()')[0].encode('iso-8859-1').decode('utf-8')
        # Pick one download mirror at random from the download list
        download_links = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li/a[1]/@href')
        x_url = random.choice(download_links)
        try:
            data = requests.get(url=x_url, headers=headers).content
            with open(dir_name + "/" + name + ".rar", 'wb') as f:
                f.write(data)
            print(name, ":", "downloaded successfully")
        except requests.exceptions.ConnectionError:
            pass
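Both scripts re-encode extracted text with .encode('iso-8859-1').decode('utf-8') because requests guesses ISO-8859-1 from the response headers while the chinaz pages are actually UTF-8. A minimal alternative sketch, assuming the pages really are UTF-8 as above: set the response encoding once before reading .text, and the per-string re-encoding becomes unnecessary.

resp = requests.get(url=new_url, headers=headers)
resp.encoding = 'utf-8'   # tell requests the real charset before decoding
ret = resp.text           # names extracted from this text need no further re-encoding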