9. Saving Data
Use the item pipeline to process (and save) the scraped data.
9.1 Define the data operations in pipelines.py
1. Define a pipeline class.
2. Override the pipeline class's process_item method.
- After processing an item, process_item must return it to the engine.
#*******************************************pipelines.py*******************************************#
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
class Myspider1Pipeline(object):
    def process_item(self, item, spider):
        # item here is the temp dict yielded from itcast.py
        print('itcast:', item)
        # after the pipeline is done, the item must be returned to the engine
        return item
The execution flow of the code in pipelines.py is shown in the figure below.
9.2 Check whether the pipeline is enabled
The enabled-pipelines list in the crawl log is empty here, which means the pipeline has not been enabled yet.
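For reference, the relevant startup-log lines look roughly like this when no pipeline is enabled (exact wording may differ slightly between Scrapy versions):
#*******************************************crawl log (excerpt)*******************************************#
[scrapy.middleware] INFO: Enabled item pipelines:
[]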
9.3 Enable the pipeline in settings.py
Uncomment and edit the ITEM_PIPELINES block (around lines 65-68 of the generated settings.py):
#*******************************************settings.py*******************************************#
# Scrapy settings for myspider1 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'myspider1'
SPIDER_MODULES = ['myspider1.spiders']
NEWSPIDER_MODULE = 'myspider1.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'myspider1 (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'myspider1.middlewares.Myspider1SpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'myspider1.middlewares.Myspider1DownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'project_name.pipelines_module.PipelineClass': priority, an integer usually in the 0-1000 range
    # note: multiple pipelines can be listed; items pass through lower numbers first, higher numbers later
    'myspider1.pipelines.Myspider1Pipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
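To illustrate the ordering rule, a second pipeline could be registered alongside the first; items flow through the lower-numbered class first. (JsonWriterPipeline is a hypothetical class name used only for this sketch.)
#*******************************************settings.py (sketch)*******************************************#
ITEM_PIPELINES = {
    'myspider1.pipelines.Myspider1Pipeline': 300,   # runs first (lower number)
    'myspider1.pipelines.JsonWriterPipeline': 400,  # hypothetical second pipeline, runs later
}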
9.4 Pipeline enabled successfully
Run the itcast spider again from the command line to confirm that the pipeline is now enabled.
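The crawl uses the standard scrapy crawl <spider_name> command, run from the project directory:
scrapy crawl itcast
With the setting in place, the startup log should now list the pipeline, roughly:
#*******************************************crawl log (excerpt)*******************************************#
[scrapy.middleware] INFO: Enabled item pipelines:
['myspider1.pipelines.Myspider1Pipeline']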
9.5 Format the data in pipelines.py
#*******************************************pipelines.py*******************************************#
# format the data
import json

class Myspider1Pipeline(object):
    def __init__(self):
        self.file = open('itcast.json', 'w')
    def process_item(self, item, spider):
        # serialize the dict to JSON
        json_data = json.dumps(item) + ',\n'  # non-ASCII characters are escaped (\uXXXX)
        # write the data to the file
        self.file.write(json_data)
        # after the pipeline is done, the item must be returned to the engine
        return item
    def __del__(self):
        self.file.close()
After running the crawl command in the terminal, an itcast.json file is generated automatically in the project directory.
9.6 Viewing the itcast JSON file: ASCII-escaped output
9.7 Viewing the itcast JSON file: readable Chinese output
#*******************************************pipelines.py (modified)*******************************************#
……
# ensure_ascii defaults to True, so non-ASCII UTF-8 content is escaped to ASCII (\uXXXX) in the output;
# to get the characters' real representation, set this parameter to False.
json_data = json.dumps(item, ensure_ascii=False) + ',\n'  # readable Chinese output
……
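A minimal standalone demonstration of the difference (plain Python, with a made-up value):
#*******************************************ensure_ascii demo*******************************************#
import json

print(json.dumps({'name': '梁老师'}))
# {"name": "\u6881\u8001\u5e08"}   <- default ensure_ascii=True: ASCII escapes
print(json.dumps({'name': '梁老师'}, ensure_ascii=False))
# {"name": "梁老师"}                <- ensure_ascii=False: real characters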
10. Improving the Spider
10.1 Core concepts
10.2 Collecting data with the Item class
Modify the item template in items.py:
#******************************************items.py (modified)**************************************#
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy

class Myspider1Item(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    title = scrapy.Field()
    desc = scrapy.Field()

if __name__ == '__main__':
    item = Myspider1Item()
    item['name'] = "梁老师"  # the field names must match the ones used in itcast.py, otherwise an error is raised!!
    item['title'] = "打飞的达扎的"
    item['desc'] = "梁反对法都"
    print(item)
Output:
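Given the test values above, the printed item should look roughly like this (a scrapy Item renders like a dict, with keys sorted):
{'desc': '梁反对法都', 'name': '梁老师', 'title': '打飞的达扎的'}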
If item['name'] = "梁老师" is changed to item['nam'] = "梁老师", the error below is raised.
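scrapy.Item rejects any field that was not declared on the class; the error should look roughly like this (message wording may vary by Scrapy version):
KeyError: 'Myspider1Item does not support field: nam'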
10.3 Import the item class in itcast.py
Replace the temp dict with an item instance, so the data is wrapped in the item class.
#***************************itcast.py*****************************
import scrapy
from myspider1.items import Myspider1Item

# define the spider class
class ItcastSpider(scrapy.Spider):
    # define the spider's name
    name = 'itcast'
    # 2. check the allowed domains
    allowed_domains = ['itcast.cn']
    # 1. modify the start URL
    start_urls = ['http://www.itcast.cn/channel/teacher.shtml#javaee']

    # 3. implement the scraping logic in the parse method
    def parse(self, response):  # response corresponds to start_urls
        # get all teacher nodes
        node_list = response.xpath('//div[@class="main_bot"]')
        # iterate over the teacher node list
        for node in node_list:
            # instantiate the item (used like a dict)
            item = Myspider1Item()
            # xpath returns a list of selector objects; extract() pulls the data out of a selector
            item['name'] = node.xpath('./h2/text()').extract_first()  # take the first value!!
            item['title'] = node.xpath('./h3/span/text()')[0].extract()
            item['desc'] = node.xpath('./p/span/text()').extract_first()
            # if the xpath result is a single-value list, use extract_first(); for multiple values use extract()
            yield item  # yield hands the item over without ending parse (useful when scraping several pages)
Output: running the crawl now raises the error shown below.
Solution
To fix the error "TypeError: Object of type Myspider1Item is not JSON serializable":
1. Locate where the error occurs.
2. Modify pipelines.py:
In the process_item method of the Myspider1Pipeline class, add the following code:
# cast the item object to a dict
item = dict(item)
Complete code
#*******************************************pipelines.py (final)*******************************************#
# format the data
import json

class Myspider1Pipeline(object):
    def __init__(self):
        self.file = open('itcast.json', 'w')
    def process_item(self, item, spider):
        # cast the item object to a dict!!! (works because scrapy Items support the dict interface)
        item = dict(item)
        # serialize the dict to JSON
        # json_data = json.dumps(item) + ',\n'  # ASCII-escaped output
        # ensure_ascii defaults to True, so non-ASCII UTF-8 content is escaped to ASCII in the output;
        # set this parameter to False to get the characters' real representation.
        json_data = json.dumps(item, ensure_ascii=False) + ',\n'  # readable Chinese output
        # write the data to the file
        self.file.write(json_data)
        # after the pipeline is done, the item must be returned to the engine
        return item
    def __del__(self):
        self.file.close()
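Relying on __del__ to close the file is fragile; Scrapy's documented pipeline hooks open_spider and close_spider are the more reliable place for setup and teardown. A minimal sketch of the same pipeline using them (the explicit encoding='utf-8' is an added precaution, not part of the original code):
#*******************************************pipelines.py (alternative sketch)*******************************************#
import json

class Myspider1Pipeline(object):
    def open_spider(self, spider):
        # called once when the spider starts
        self.file = open('itcast.json', 'w', encoding='utf-8')
    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + ',\n')
        return item
    def close_spider(self, spider):
        # called once when the spider finishes, even after most errors
        self.file.close()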