《2018年8月26日》【连续327天】
标题:pyspider爬取照片实现;
内容:
参照了别人的轮子,把前几天的代码完善了一下:
参考:https://blog.csdn.net/u013919836/article/details/68066715
from pyspider.libs.base_handler import *
import os

# Destination directory for downloaded images.  Raw string so the backslash
# can never be misread as an escape sequence (e.g. '\t', '\n') if the path
# is later edited.
DIR_PATH = r'D:\honkai3'
class Handler(BaseHandler):
    """pyspider crawler that downloads event images from bcy.net.

    Flow: on_start queues the paginated listing pages -> index_page follows
    each work's detail link -> detail_page extracts the image URL and queues
    the download -> save_img writes the bytes to disk via Tool.
    """

    crawl_config = {
    }

    def __init__(self):
        # Listing pages are paginated as base_url + page number.
        self.base_url = 'https://bcy.net/huodong/176?&p='
        self.page_num = 1
        self.dir_path = DIR_PATH
        self.total_num = 20
        self.tool = Tool()

    @every(minutes=24 * 60)
    def on_start(self):
        """Entry point: queue every listing page from 1 to total_num."""
        while self.page_num <= self.total_num:
            url = self.base_url + str(self.page_num)
            print(url)
            # validate_cert=False: the site's TLS certificate fails
            # verification, so certificate checking is disabled everywhere.
            self.crawl(url, callback=self.index_page, validate_cert=False)
            self.page_num += 1

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Follow every work-detail link found on a listing page."""
        # NOTE(review): count restarts at 1 on every listing page, so images
        # from different pages share file names and can overwrite each other.
        count = 1
        for each in response.doc('.db.ovf').items():
            url = each.attr.href
            # Pass the running index along so detail_page can name the file.
            self.crawl(url, callback=self.detail_page,
                       validate_cert=False, save={"count": count})
            count += 1

    @config(priority=2)
    def detail_page(self, response):
        """Extract the image URL from a detail page and queue its download."""
        imgElem = response.doc(".detail_clickable")
        imgUrl = imgElem.attr.src
        if imgUrl:
            # BUG FIX: the original computed the real extension but then
            # always saved as ".jpg"; use the URL's actual extension.
            extension = self.tool.get_extension(imgUrl)
            file_name = str(response.save["count"]) + "." + extension
            self.crawl(imgUrl, callback=self.save_img,
                       save={"file_name": file_name}, validate_cert=False)

    def save_img(self, response):
        """Persist the downloaded image bytes under dir_path."""
        content = response.content
        file_name = response.save["file_name"]
        # os.path.join handles the separator portably instead of manual
        # concatenation with os.path.sep.
        file_path = os.path.join(self.dir_path, file_name)
        self.tool.save_img(content, file_path)
class Tool:
    """Filesystem helpers: ensure the download directory exists and save
    image bytes to disk."""

    def __init__(self):
        self.dir = DIR_PATH
        # exist_ok=True avoids the race between an exists() check and
        # makedirs(), and is a no-op when the directory is already there.
        os.makedirs(self.dir, exist_ok=True)

    def save_img(self, content, path):
        """Write raw image bytes *content* to the file at *path*.

        The context manager guarantees the handle is closed even if the
        write raises (the original leaked the handle on error).
        """
        with open(path, "wb") as f:
            f.write(content)

    def get_extension(self, url):
        """Return the file extension of *url* (text after the last '.')."""
        return url.split(".")[-1]