1:在settings.py 中设置文件存放的路径
IMAGES_STORE = "/home/xx/xx/xx/images/"
2:在pipelines.py 文件中的代码
import scrapy
from scrapy.utils.project import get_project_settings
from scrapy.pipelines.images import ImagesPipeline
import os
class LolspiderPipeline(ImagesPipeline):
    """Download each item's image and rename it to <name>.png under IMAGES_STORE.

    Fixes over the original: the string literal on the os.rename line was
    broken across two lines (a syntax error), item["imagePath"] did not
    include the ".png" suffix the file was actually renamed to, and the
    final print showed only the first character of the path.
    """

    # Resolve the storage root from the project settings once at class creation.
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    def get_media_requests(self, item, info):
        """Schedule the download of the single image URL carried by the item."""
        image_url = item["image"]
        yield scrapy.Request(image_url)

    def item_completed(self, result, item, info):
        """Rename the downloaded file to a readable name and record its path.

        `result` is a list of (ok, info_dict) pairs; only successful
        downloads (ok is True) carry a "path" relative to IMAGES_STORE.
        """
        image_path = [x["path"] for ok, x in result if ok]
        # Replace the content-hash filename with a human-readable <name>.png.
        os.rename(self.IMAGES_STORE + image_path[0],
                  self.IMAGES_STORE + item["name"] + ".png")
        # Store the final path, including the extension the file now has.
        item["imagePath"] = self.IMAGES_STORE + item["name"] + ".png"
        print(item["imagePath"])
        return item
from scrapy.utils.project import get_project_settings
from scrapy.pipelines.images import ImagesPipeline
import os
class LolspiderPipeline(ImagesPipeline):
    """Image pipeline that saves each item's image as <name>.png.

    NOTE(review): this class is a byte-for-byte duplicate of the one above —
    likely a paste artifact in the notes; only one copy belongs in
    pipelines.py. Same fixes applied: the broken string literal in the
    os.rename call is reassembled, item["imagePath"] now carries the ".png"
    suffix matching the renamed file, and the whole path is printed instead
    of its first character.
    """

    # Storage root taken from the project's IMAGES_STORE setting.
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    def get_media_requests(self, item, info):
        """Yield a download request for the item's image URL."""
        image_url = item["image"]
        yield scrapy.Request(image_url)

    def item_completed(self, result, item, info):
        """Rename the successfully downloaded file and record the path."""
        # Keep only the relative paths of downloads that succeeded.
        image_path = [x["path"] for ok, x in result if ok]
        os.rename(self.IMAGES_STORE + image_path[0],
                  self.IMAGES_STORE + item["name"] + ".png")
        item["imagePath"] = self.IMAGES_STORE + item["name"] + ".png"
        print(item["imagePath"])
        return item
3:items.py 中的代码
import scrapy
class xxxspiderItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()       # hero name scraped in parse(); used as the saved file name
    image = scrapy.Field()      # source URL of the image, downloaded by the pipeline
    imagePath = scrapy.Field()  # local path set by the pipeline after renaming the file
# NOTE(review): duplicate of the item class defined directly above — looks
# like a copy-paste artifact in the notes; only one definition is needed.
class xxxspiderItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()       # hero name
    image = scrapy.Field()      # image URL
    imagePath = scrapy.Field()  # local path of the downloaded image
4:在自己的爬虫代码中:
def parse(self, response):
    """Extract a name and image URL from each list entry and yield an item.

    Each <li> under div.mod-pic-bd is expected to contain the name in
    ./div/text() and the image URL in ./a/img/@src.
    (The unused counter ``i = 0`` from the original was removed.)
    """
    hero_list = response.xpath('//div[@class="mod-pic-bd"]//ul/li')
    for each in hero_list:
        item = xxxspiderItem()
        name = each.xpath('./div/text()').extract()[0]
        image = each.xpath('./a/img/@src').extract()[0]
        item['name'] = name
        item['image'] = image
        # Debug output kept from the original notes.
        print(name)
        print(image)
        yield item
# NOTE(review): the lines below duplicate the body of parse() above but sit
# outside any function (`response` is undefined here and `yield` is illegal
# at module level) — almost certainly a copy-paste artifact in these notes.
# Confirm and delete; kept verbatim here only for documentation.
hero_list = response.xpath('//div[@class="mod-pic-bd"]//ul/li')
i = 0
for each in hero_list:
    item = xxxspiderItem()
    name = each.xpath('./div/text()').extract()[0]
    image = each.xpath('./a/img/@src').extract()[0]
    item['name'] = name
    item['image'] = image
    print(name)
    print(image)
    yield item
5:在pipelines.py 文件中保存 json 文件的写法
import scrapy
import json
import json
class LolspiderPipeline(object):
    """Append each scraped item to lol.json as one JSON object per line.

    Fixes over the original: the file was opened in text mode ("w") but the
    code wrote ``text.encode("utf-8")`` (bytes), which raises TypeError at
    runtime; the file is now opened with an explicit UTF-8 encoding and the
    text is written directly. The duplicated, unreachable statements after
    ``return item`` and the doubled ``close()`` call were removed.
    """

    def __init__(self):
        # Explicit encoding so non-ASCII (Chinese) text is written correctly
        # regardless of the platform's default encoding.
        self.filename = open("lol.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # ensure_ascii=False keeps Chinese characters human-readable in the file.
        text = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.filename.write(text)
        return item

    def close_spider(self, spider):
        # Release the file handle once the spider finishes.
        self.filename.close()