# https://www.zcool.com.cn/
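# Scrapes the works listed on the zcool.com.cn home page and saves every image of
# each work into imgs/<work title>/ (one sub-folder per work).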
import os
import re

import requests
from urllib import request
from lxml import etree


class ZkSpider(object):
    def __init__(self):
        self.url = 'https://www.zcool.com.cn/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
        }
    def get_html(self):
        html = requests.get(self.url, headers=self.headers).text
        # print(html)
        html_obj = etree.HTML(html)
        # title of each work (image group) on the home page
        titles = html_obj.xpath('//div[@class="work-list-box"]/div[@class="card-box"]/div/a/@title')
        # detail-page link of each work
        hrefs = html_obj.xpath('//div[@class="work-list-box"]/div[@class="card-box"]/div/a/@href')
        print('****', titles)  # all titles on the first page
        self.get_detail_img(hrefs, titles)  # hand the links and titles to the detail parser
    # Parse and download the images on each detail page
    def get_detail_img(self, hrefs, titles):
        for index, href in enumerate(hrefs):  # walk the detail links; the index matches each link to its title
            # strip characters that are not allowed (or awkward) in directory names
            title = (titles[index].replace(":", "").replace(" ", "_").replace("×", "_")
                     .replace("|", "_").replace(",", "_").replace('/', ''))
            print('*****', title)
            # if not os.path.exists('imgs'):
            #     os.mkdir('imgs')  # create the top-level folder that holds all sub-folders
            if not os.path.exists(f"imgs/{title}"):
                os.makedirs(f"imgs/{title}")  # one sub-folder per work, named after its title (makedirs also creates imgs/)
            response = requests.get(url=href, headers=self.headers)  # request the detail page of this work
            self.html = etree.HTML(response.text)  # build the element tree
            # imgs = self.html.xpath('//div[@class="work-show-box"]/div/img/@src')
            # URL fragment of every image inside this work
            imgs = re.findall(re.compile(r'.*?<img src="https://img(.*?)"', re.S), response.text)
            print(imgs)  # all image links of the work
            imgs = ["https://img" + url for url in imgs]  # rebuild the full image URLs
            for i, img in enumerate(imgs):
                print(img)
                if img:  # the image URL is non-empty
                    response = requests.get(img).content
                    # name each file after its index, keep the original extension
                    with open(f"imgs/{title}/{i}.{img.split('.')[-1]}", "wb") as w:
                        w.write(response)
                else:
                    print(f"problem with {img}")
            # Earlier drafts of the same download step, kept for reference but not executed
            # (they rely on an undefined self.mkdir and index single characters of title):
            # for index, title in enumerate(titles):
            #     print(title[index])
            #     for x, img in enumerate(imgs):
            #         print(x)
            #         # fetch the image data
            #         response = requests.get(img).content
            #         # open the image file and write the data into it
            #         with open(f"images/{title[index]}", "wb") as w:
            #             w.write(response)
            #         # print(f'{index}', img)
            # for index, title in enumerate(titles):
            #     # print(title)
            #     self.title = title
            #     path = self.mkdir + '/' + self.title
            #     if not os.path.exists(path):
            #         os.mkdir(path)
            #     print('Downloading image {}, please wait...'.format(index))
            #     request.urlretrieve(img, f'images/{title}')
images = ZkSpider()
images.get_html()
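
# A minimal alternative sketch, not part of the original flow: extract the image URLs of a
# work with the XPath that is commented out in get_detail_img instead of the regex.
# Whether '//div[@class="work-show-box"]/div/img/@src' still matches Zcool's current
# markup is an assumption.
def get_detail_img_urls(href, headers):
    response = requests.get(url=href, headers=headers)
    html = etree.HTML(response.text)
    return html.xpath('//div[@class="work-show-box"]/div/img/@src')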