目录
基本保存
- pipelines.py中书写管道的类
- settings.py中开启普通管道
pipelines.py
class MyScrapyPipeline:
    """Pipeline that appends scraped quote items to q.csv via the csv module."""

    def __init__(self):
        import csv  # local import keeps this snippet self-contained
        # newline='' is required by the csv module so it controls line
        # endings itself (otherwise blank lines appear on Windows).
        self.f = open('q.csv', mode='a', encoding='utf8', newline='')
        self.writer = csv.writer(self.f)

    # def from_crawler(cls, crawler):
    #     # from_crawler() reads values from settings so they can be
    #     # stored on the pipeline instance for later use
    #     pass

    def open_spider(self, spider):
        # Called automatically when the spider opens.
        # Write the header only for a fresh file: in append mode the stream
        # position starts at the end of the file, so tell() == 0 means the
        # file is empty (the original appended a new header on every run).
        if self.f.tell() == 0:
            self.writer.writerow(['text', 'author', 'Tag'])

    def close_spider(self, spider):
        # Called automatically when the spider closes.
        self.f.close()

    def process_item(self, item, spider):
        # Receives every item the spider yields. csv.writer quotes/escapes
        # commas and quotes inside the fields, which the original manual
        # string concatenation would silently corrupt.
        self.writer.writerow([item['text'], item['author'], '/'.join(item['Tag'])])
        return item
settings.py
ITEM_PIPELINES = {
    # enable the MyScrapyPipeline pipeline class (lower number = higher priority)
    'my_scrapy.pipelines.MyScrapyPipeline': 300,
}
照片管道
需求分析
- 目标网站:www.douyu.com
- 找到ajax隐藏的图片数据链接
- 将返回的json数据解析出图片数据
- 在循环里实例化对象迭代出图片数据到管道
- 在管道里启用图片专属管道
- 在settings.py中设置保存路径并启用管道
源代码
spider.py
import scrapy
import json
from douyu_img.items import DouyuImgItem
class SpiderSpider(scrapy.Spider):
    """Crawl the Douyu mobile room-list API and yield one item per room."""

    name = 'spider'
    # allowed_domains = ['douyu.com']  # remember to keep this commented out
    # AJAX endpoint that serves the otherwise hidden image data
    start_urls = ['https://m.douyu.com/api/room/list?page={}&type=yz']

    def start_requests(self):
        # Paginate: request pages 1 through 4 of the room list.
        template = self.start_urls[0]
        for page in range(1, 5):
            yield scrapy.Request(url=template.format(page), callback=self.parse)

    def parse(self, response):
        # json.loads decodes the JSON body into Python objects;
        # the rooms live under data -> list.
        rooms = json.loads(response.text)['data']['list']
        for room in rooms:
            # One item instance per room record.
            item = DouyuImgItem()
            item["nickname"] = room["nickname"]
            item['roomSrc'] = room['roomSrc']
            yield item
items.py
import scrapy
class DouyuImgItem(scrapy.Item):
    """Item carrying one Douyu room: streamer nickname and cover-image URL."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    nickname = scrapy.Field()  # streamer display name (used as the saved file name)
    roomSrc = scrapy.Field()   # URL of the room cover image to download
pipelines.py
from scrapy.pipelines.images import ImagesPipeline
import scrapy
import os
# PS:要继承ImagesPipeline
class DouyuImgPipeline(ImagesPipeline):
# 图片下载专业函数
def get_media_requests(self, item, info):
# 从爬虫中获取的图片链接
item_url = item['roomSrc']
yield scrapy.Request(url=item_url)
def item_completed(self, results, item, info):
# 图片路径
path = 'D:/爬虫项目/douyu_img/img'
# 图片原本名字
image_path = results[0][1]['path']
# 改变保存后的图片名字
os.rename(path + '/' + image_path, path + '/' + item["nickname"] + '.jpg')
return item
settings.py
# Raw string so the backslashes are never treated as escape sequences —
# the original non-raw literal relies on Python leaving unknown escapes
# alone, which emits a SyntaxWarning on modern Python.
IMAGES_STORE = r'D:\爬虫项目\douyu_img\img'

ITEM_PIPELINES = {
    # enable the image-downloading pipeline
    'douyu_img.pipelines.DouyuImgPipeline': 300,
}