Reverse-Engineering Crawlers 16: Persistent Storage in Scrapy
In Scrapy, scraped data can be persisted to four kinds of destinations: ① a CSV file, ② a MySQL database, ③ a MongoDB database, and ④ file storage (binary files such as images).
This section uses two hands-on cases to show how Scrapy performs persistent storage.
Note: when first learning a framework, I think the most important thing is to understand the order in which the framework runs your code. Because a framework splits different responsibilities into separate modules, functions, and files, prose alone is not enough to explain the flow, so this section uses screenshots with manually marked step numbers to illustrate the execution order.
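All of the storage targets in the two cases are implemented as item pipelines, which share the same three lifecycle hooks. The sketch below is an illustration only (it is not part of either project): Scrapy calls open_spider once when the spider starts, process_item once for every item the spider yields, and close_spider once when the spider finishes.

class DemoPipeline:
    def open_spider(self, spider):
        # runs once at startup: open files / connections here
        self.buffer = []

    def process_item(self, item, spider):
        # runs for every yielded item: store it, then pass it on
        self.buffer.append(dict(item))
        return item  # returning the item lets the next pipeline see it

    def close_spider(self, spider):
        # runs once at shutdown: close files / connections here
        print(f"collected {len(self.buffer)} items")

A pipeline only takes effect after it is registered in ITEM_PIPELINES in settings.py, which both cases below do.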
Case 1: Shuangseqiu (the two-color-ball lottery)
Goals:
- Explain the role of the items.py file, which standardizes the field names of the data to be scraped
- Implement persistent storage to ① a CSV file, ② a MySQL database, and ③ a MongoDB database
File layout:
- shuangseqiu.py
- items.py
- pipelines.py (CSV-saving part)
- pipelines.py (MySQL-saving part)
- pipelines.py (MongoDB-saving part)
- settings.py
shuangseqiu.py source:
import scrapy
from caipiao.items import CaipiaoItem  # import the item class first

class ShuangseqiuSpider(scrapy.Spider):
    name = 'shuangseqiu'
    allowed_domains = ['500.com']  # restrict requests to this domain
    start_urls = ['http://datachart.500.com/ssq/']  # starting URL

    def parse(self, resp, **kwargs):
        trs = resp.xpath('//*[@id="tdata"]/tr')
        # result = []  # don't do this -- buffering everything in a list is wasteful
        for tr in trs:
            if tr.xpath('./@class').extract_first() == 'tdbck':  # skip the blank spacer rows
                continue
            # red_ball = tr.xpath('./td[@class="chartBall01"]/text()').extract()
            # blue_ball = tr.xpath('./td[@class="chartBall02"]/text()').extract_first()
            qihao = tr.xpath('./td[1]/text()').extract_first().strip()
            red_ball = '_'.join(tr.css(".chartBall01::text").extract())
            blue_ball = tr.css(".chartBall02::text").extract_first()
            # print(qihao, red_ball, blue_ball)  # print to check the parsed values
            cai = CaipiaoItem()  # behaves like a dict: cai = dict()
            cai['qihao'] = qihao
            cai['red_ball'] = red_ball
            cai['blue_ball'] = blue_ball
            yield cai  # yield each item as soon as it is parsed -- this is the right way
            # dic = {
            #     'qihao': qihao,
            #     'red_ball': red_ball,
            #     'blue_ball': blue_ball
            # }
            # result.append(dic)
        # return result  # don't do this -- it buffers everything and hands the pipelines one big list
        # yield result   # don't do this either -- the pipelines expect one item at a time
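Before wiring up any pipelines, the spider can be sanity-checked on its own with Scrapy's built-in feed export, which dumps the yielded items straight to a file (the output file name here is arbitrary):

scrapy crawl shuangseqiu -o test.json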
items.py source:
import scrapy

class CaipiaoItem(scrapy.Item):
    # define the fields for your item here like:
    qihao = scrapy.Field()      # equivalent to a dict key
    red_ball = scrapy.Field()
    blue_ball = scrapy.Field()
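A scrapy.Item behaves like a dictionary whose allowed keys are exactly the declared Fields. A quick illustration with made-up sample values (not part of the project code):

cai = CaipiaoItem()
cai['qihao'] = '21001'                  # only declared fields may be used as keys
cai['red_ball'] = '01_05_12_18_22_33'
cai['blue_ball'] = '07'
print(dict(cai))                        # {'qihao': '21001', 'red_ball': '01_05_12_18_22_33', 'blue_ball': '07'}
# cai['foo'] = 1                        # would raise KeyError: CaipiaoItem does not support field: foo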
pipelines.py source:
from itemadapter import ItemAdapter
import pymysql
from caipiao.settings import MYSQL
import pymongo

"""
Storage options for the scraped data:
1. store the data in a CSV file
2. store the data in a MySQL database
3. store the data in a MongoDB database
4. file storage
"""

class CaipiaoPipeline:
    """
    What we want: open the file when the spider starts,
    keep writing data into it while the spider is running,
    and close it when the spider finishes.
    """
    def open_spider(self, spider):
        self.f = open("./双色球.csv", mode="a", encoding="utf-8")

    def close_spider(self, spider):
        if self.f:
            self.f.close()

    def process_item(self, item, spider):
        self.f.write(f"{item['qihao']},{item['red_ball']},{item['blue_ball']}\n")
        return item

class CaipiaoMySQLPipeline:
    def open_spider(self, spider):
        self.conn = pymysql.connect(
            host=MYSQL['host'],
            port=MYSQL['port'],
            user=MYSQL['user'],
            password=MYSQL['password'],
            database=MYSQL['database']
        )

    def close_spider(self, spider):
        if self.conn:
            self.conn.close()

    def process_item(self, item, spider):
        cursor = self.conn.cursor()
        try:
            sql = "insert into caipiao (qihao, red_ball, blue_ball) values (%s, %s, %s)"
            cursor.execute(sql, (item['qihao'], item['red_ball'], item['blue_ball']))
            self.conn.commit()
        except Exception:
            self.conn.rollback()
        finally:
            cursor.close()
        return item

class CaipiaoMongoDBPipeline:
    def open_spider(self, spider):
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        db = self.client['haha']         # select the database
        self.collection = db['caipiao']  # select the lottery collection

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # insert() was removed in PyMongo 4, so use insert_one() instead
        self.collection.insert_one({"qihao": item['qihao'], "red_ball": item['red_ball'], "blue_ball": item['blue_ball']})
        return item
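The MySQL pipeline assumes a caipiao table already exists in the spider database; the original post does not show the schema, so the column types below are an assumption. A one-off setup script run from the project root (so that caipiao.settings is importable) could look like this:

import pymysql
from caipiao.settings import MYSQL

conn = pymysql.connect(host=MYSQL['host'], port=MYSQL['port'], user=MYSQL['user'],
                       password=MYSQL['password'], database=MYSQL['database'])
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS caipiao (
            id INT AUTO_INCREMENT PRIMARY KEY,
            qihao VARCHAR(16),       -- draw number
            red_ball VARCHAR(64),    -- red balls joined with '_'
            blue_ball VARCHAR(8)     -- blue ball
        ) CHARACTER SET utf8mb4
    """)
conn.commit()
conn.close()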
settings.py source:
BOT_NAME = 'caipiao'
SPIDER_MODULES = ['caipiao.spiders']
NEWSPIDER_MODULE = 'caipiao.spiders'
LOG_LEVEL = "WARNING"

# MySQL configuration
MYSQL = {
    'host': 'localhost',
    'port': 3306,
    'user': 'root',
    'password': 'xxxxxx',
    'database': 'spider'
}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Lower numbers run earlier; every item passes through all three pipelines in this order.
ITEM_PIPELINES = {
    'caipiao.pipelines.CaipiaoPipeline': 300,
    'caipiao.pipelines.CaipiaoMySQLPipeline': 301,
    'caipiao.pipelines.CaipiaoMongoDBPipeline': 302
}

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
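With the spider, item, pipelines, and settings in place, the whole case is run from the project root with the Scrapy CLI:

scrapy crawl shuangseqiu

After the run, 双色球.csv should appear in the project directory, and matching rows should show up in the MySQL caipiao table and the MongoDB caipiao collection (assuming both servers are running locally with the configured credentials).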
Case 2: Tupianzhijia (tupianzj.com, an image gallery site)
Goals:
- Show how to collect the detail-page URLs from the start URL and then follow each detail URL to scrape its data
- Implement ② MySQL storage and ④ file storage (binary files such as images)
File layout:
Same as in the Shuangseqiu case
- meinv.py (start-page parsing part)
- meinv.py (detail-page parsing part)
- items.py
- pipelines.py (image download part)
- pipelines.py (save-results-to-MySQL part)
- settings.py
meinv.py source:
import scrapy
# from urllib.parse import urljoin
from tupianzhijia.items import MeinvItem

class MeinvSpider(scrapy.Spider):
    name = 'meinv'
    allowed_domains = ['tupianzj.com']
    start_urls = ['http://tupianzj.com/bizhi/DNmeinv/']

    def parse(self, resp, **kwargs):
        # print(resp.text)  # check whether the page source contains the data we want
        li_list = resp.xpath("//ul[@class='list_con_box_ul']/li")
        for li in li_list:
            href = li.xpath("./a/@href").extract_first()
            # print(href)  # print to check
            # In theory a new network request should happen here.
            # Following Scrapy's execution model, we wrap href in a Request and hand it to the engine.
            # print(resp.urljoin(href))  # print to check
            yield scrapy.Request(
                url=resp.urljoin(href),  # join the relative href with the URL of the current response
                method='get',
                callback=self.parse_detail  # callback: how to handle the response once it comes back
            )
        # Handle pagination: the next page has the same layout,
        # so the current parse method can process it as well.
        next_href = resp.xpath('//div[@class="pages"]/ul/li/a[contains(text(), "下一页")]/@href').extract_first()
        if next_href:  # the last page has no "next page" link
            yield scrapy.Request(
                url=resp.urljoin(next_href),  # join the relative href with the URL of the current response
                method='get',
                callback=self.parse  # same layout, so parse handles the next page again
            )

    def parse_detail(self, resp, **kwargs):
        # print(resp.text)  # check whether the page source contains the data we want
        name = resp.xpath('//*[@id="container"]/div/div/div[2]/h1/text()').extract_first()
        img_src = resp.xpath("//div[@id='bigpic']/a/img/@src").extract_first()
        # print(name, img_src)  # print to check
        mv = MeinvItem()
        mv['name'] = name
        mv['img_src'] = img_src
        yield mv
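resp.urljoin() is what turns the relative hrefs from the list page into absolute URLs: it essentially applies urllib.parse.urljoin with the response's own URL (or an HTML <base> tag, if present) as the base. A small illustration with made-up paths:

from urllib.parse import urljoin

print(urljoin('http://tupianzj.com/bizhi/DNmeinv/', '/bizhi/12345.html'))
# -> http://tupianzj.com/bizhi/12345.html
print(urljoin('http://tupianzj.com/bizhi/DNmeinv/', 'list_9_2.html'))
# -> http://tupianzj.com/bizhi/DNmeinv/list_9_2.html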
items.py source:
import scrapy

class MeinvItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    img_src = scrapy.Field()
    local_path = scrapy.Field()
pipelines.py source:
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline
import scrapy
import pymysql
from tupianzhijia.settings import MYSQL

class TupianzhijiaPipeline:
    def open_spider(self, spider):
        self.conn = pymysql.connect(
            host=MYSQL['host'],
            port=MYSQL['port'],
            user=MYSQL['user'],
            password=MYSQL['password'],
            database=MYSQL['database']
        )

    def close_spider(self, spider):
        if self.conn:
            self.conn.close()

    def process_item(self, item, spider):
        cursor = self.conn.cursor()
        try:
            sql = "insert into tu (name, img_src, local_path) values (%s, %s, %s)"
            cursor.execute(sql, (item['name'], item['img_src'], item['local_path']))
            self.conn.commit()
        except Exception:
            self.conn.rollback()
        finally:
            cursor.close()
        return item

# To use ImagesPipeline you must also set IMAGES_STORE in settings.py: the folder the files are saved into.
class MeinvSavePipeline(ImagesPipeline):  # let the built-in images pipeline do the downloading for us
    """ Override three methods of the parent class """
    def get_media_requests(self, item, info):  # responsible for downloading
        yield scrapy.Request(item['img_src'])  # just yield a request for the image URL

    def file_path(self, request, response=None, info=None, *, item=None):  # build the storage path
        file_name = request.url.split("/")[-1]  # request.url is the URL that was just requested
        return f"img/{file_name}"

    def item_completed(self, results, item, info):  # called when the download has finished
        ok, finfo = results[0]
        # finfo["path"]
        # print(results)
        item['local_path'] = finfo['path']
        return item
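For reference, the results argument of item_completed holds one (success, info) tuple per request yielded from get_media_requests; when success is True, info is a dict with the downloaded file's 'url', 'path' (relative to IMAGES_STORE), and 'checksum'. A slightly more defensive variant of the method above, shown here only as a sketch, could drop items whose image failed to download:

from scrapy.exceptions import DropItem

class MeinvSavePipeline(ImagesPipeline):
    # get_media_requests and file_path unchanged from the version above

    def item_completed(self, results, item, info):
        paths = [finfo['path'] for ok, finfo in results if ok]
        if not paths:
            raise DropItem(f"image download failed: {item['img_src']}")
        item['local_path'] = paths[0]  # path relative to IMAGES_STORE
        return item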
settings.py source:
BOT_NAME = 'tupianzhijia'
SPIDER_MODULES = ['tupianzhijia.spiders']
NEWSPIDER_MODULE = 'tupianzhijia.spiders'
LOG_LEVEL = "WARNING"

# MySQL configuration
MYSQL = {
    'host': 'localhost',
    'port': 3306,
    'user': 'root',
    'password': 'xxxxxxxx',
    'database': 'spider'
}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# MeinvSavePipeline gets the lower number (299), so it runs before the MySQL pipeline;
# this guarantees local_path is already filled in when the row is inserted.
ITEM_PIPELINES = {
    'tupianzhijia.pipelines.TupianzhijiaPipeline': 300,
    'tupianzhijia.pipelines.MeinvSavePipeline': 299
}
# Folder that the images pipeline saves files into (required by ImagesPipeline)
IMAGES_STORE = './meinvtupian'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
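Scrapy's ImagesPipeline depends on Pillow for image handling, so install it before running, then start the spider from the project root:

pip install pillow
scrapy crawl meinv

Downloaded images end up under ./meinvtupian/img/ (IMAGES_STORE plus the img/ prefix returned by file_path), and that IMAGES_STORE-relative path is what the MySQL pipeline writes into the local_path column.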
Summary:
The key points to take away from this section:
- The order in which the code of the two cases is executed
- How each kind of persistent storage is implemented