Goal: save the data crawled by Scrapy into Elasticsearch (ES)!
1. Create a new .py file; the name is up to you (e.g. es_model.py)
# Field types (not all are used below; in newer elasticsearch-dsl versions
# DocType has been renamed to Document)
from elasticsearch_dsl import DocType, Completion, Keyword, Text, Boolean, Integer, Date
# Connection helper
from elasticsearch_dsl.connections import connections
# Analyzers
from elasticsearch_dsl.analysis import CustomAnalyzer

# 1. Create the ES connection; the argument is the ES host address
connections.create_connection(hosts=["127.0.0.1"])

# Custom analyzer wrapper: return an empty definition so that only the
# analyzer *name* is sent to ES. ik_max_word is already provided by the
# IK plugin on the server, so elasticsearch-dsl must not try to redefine
# it in the index settings.
class Analyzer(CustomAnalyzer):
    def get_analysis_definition(self):
        return {}

# Create the analyzer object
ik_analyzer = Analyzer('ik_max_word', filter=['lowercase'])

class Field(DocType):
    # Powers the autocomplete (suggest) feature of the search box
    suggest = Completion(analyzer=ik_analyzer)
    # ik_max_word: fine-grained IK tokenization
    # ik_smart:    coarse-grained IK tokenization
    name = Text(analyzer='ik_max_word')
    author = Text(analyzer='ik_max_word')
    content = Text()

    class Meta:
        index = 'novels'      # index name
        doc_type = 'novel'    # document type

if __name__ == '__main__':
    # Create the index and mapping in ES
    Field.init()
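To confirm that Field.init() really created the index and mapping, you can ask ES for what it stored. A quick sketch (create_connection returns a plain elasticsearch-py client, and get_mapping is a standard call on it):

from elasticsearch_dsl.connections import connections

es = connections.create_connection(hosts=['127.0.0.1'])
# Show the mapping that Field.init() registered for the 'novels' index
print(es.indices.get_mapping(index='novels'))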
2. Create a pipeline in pipelines.py
# Store the data crawled by the spider in the search server (ES)
class EsPipline(object):
    def process_item(self, item, spider):
        # Delegate the actual saving to the item itself (see step 4)
        item.save_es()
        return item
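This pipeline assumes every item passing through it defines save_es(). If other item types share the same pipeline, a slightly more defensive variant (a sketch, not part of the original code) can skip them:

class EsPipline(object):
    def process_item(self, item, spider):
        # Only persist items that know how to save themselves to ES
        if hasattr(item, 'save_es'):
            item.save_es()
        return item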
3. Register the pipeline and its priority in settings.py
ITEM_PIPELINES = {
'PaCongSpider.pipelines.EsPipline': 3,
}
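The number is the pipeline's priority: an integer from 0 to 1000, where lower values run first. For example, to run a (hypothetical) cleaning pipeline before the ES one:

ITEM_PIPELINES = {
    'PaCongSpider.pipelines.CleanPipeline': 1,  # hypothetical: cleans items first
    'PaCongSpider.pipelines.EsPipline': 3,      # then writes them to ES
}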
4. In items.py:
import scrapy
from .es_model import Field
from elasticsearch_dsl.connections import connections

# create_connection returns a plain elasticsearch-py client,
# which is needed below for the _analyze API
es = connections.create_connection(hosts=['127.0.0.1'])

class PacongspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

# Tokenize the given fields and build the payload for the completion suggester.
# *args collects the positional arguments into a tuple of (text, weight) pairs
# (**kwargs would collect keyword arguments into a dict).
def conduct_suggest(index, *args):
    """
    :param index: the ES index to run the analyzer against
    :param args: (text, weight) pairs whose text should be tokenized
    :return: the list of suggest entries built from the tokens
    """
    # Tokens already claimed by a previous (higher-weight) entry
    use_words = set()
    suggest = []
    for text, weight in args:
        # Call the _analyze API to tokenize the text with ik_max_word
        words = es.indices.analyze(
            index=index,
            body={
                'analyzer': 'ik_max_word',
                'text': text
            }
        )
        analyzer_word = set([x['token'] for x in words['tokens']])
        # Keep only tokens not seen before, so suggest contains no duplicates
        new_words = analyzer_word - use_words
        suggest.append({'input': list(new_words), 'weight': weight})
        # Accumulate used tokens with a union, so every later entry is
        # deduplicated against all earlier ones, not just the previous one
        use_words |= analyzer_word
    # e.g. [{'input': ['土豆', '豆', '逆', '袭'], 'weight': 10},
    #       {'input': ['天蚕'], 'weight': 8}]
    return suggest
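# A quick illustration (hypothetical tokens; the exact split depends on
# the IK plugin's dictionary):
#   conduct_suggest('novels', ('土豆的逆袭', 10), ('土豆', 8))
#   -> [{'input': ['土豆', '逆袭', ...], 'weight': 10},
#       {'input': [], 'weight': 8}]   # '土豆' was already used, so it is dropped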
class QiShuItem(scrapy.Item):
    novel_name = scrapy.Field()
    novel_author = scrapy.Field()
    novel_content = scrapy.Field()

    # Save this item to the ES search server
    def save_es(self):
        novel = Field()
        # Copy the values out of the item into the document
        novel.name = self['novel_name']
        novel.author = self['novel_author']
        novel.content = self['novel_content']
        # Tokenize selected fields and store the result in the suggest
        # field; the name is weighted higher than the author
        novel.suggest = conduct_suggest('novels', (novel.name, 10), (novel.author, 8))
        novel.save()
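For completeness: the spider only has to yield a QiShuItem and the pipeline does the rest. A minimal hypothetical sketch (spider name, URL, and CSS selectors are made up):

import scrapy
from PaCongSpider.items import QiShuItem

class QiShuSpider(scrapy.Spider):
    name = 'qishu'
    start_urls = ['http://example.com/novels']  # hypothetical URL

    def parse(self, response):
        item = QiShuItem()
        item['novel_name'] = response.css('h1::text').get()        # hypothetical selectors
        item['novel_author'] = response.css('.author::text').get()
        item['novel_content'] = response.css('.intro::text').get()
        yield item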
5. As long as your Scrapy crawl runs without errors, the data will now be saved into ES!
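To see the suggest field in action, you can query the completion suggester through elasticsearch_dsl. A minimal sketch (the suggestion name my_suggest and the search text are made up; Field.search() and Search.suggest are part of the elasticsearch_dsl API):

from es_model import Field

# Build a completion-suggester query against the 'suggest' field
s = Field.search()
s = s.suggest('my_suggest', '土豆', completion={'field': 'suggest'})
response = s.execute()
for option in response.suggest.my_suggest[0].options:
    print(option.to_dict())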