In the crawler project, create a models folder to hold the basic Elasticsearch data-definition code (i.e. creating the index mapping, the equivalent of creating a table)
from datetime import datetime
from elasticsearch_dsl import (DocType, Date, Nested, Boolean, analyzer,
                               InnerObjectWrapper, Completion, Keyword, Text, Integer)
from elasticsearch_dsl.connections import connections

# Create the connection to the Elasticsearch server (very important; it must
# exist before any DocType class is used)
connections.create_connection(hosts=["localhost"])

# Define the document class: inherit from DocType and declare the data type of
# each field. Import whichever field types you need from elasticsearch_dsl
# (string types, Integer, Boolean, and so on).
class LagouType(DocType):
    job_name = Text(analyzer="ik_max_word")
    company = Text(analyzer="ik_max_word")
    url = Keyword()
    job_id = Keyword()
    salary = Text(analyzer="ik_max_word")
    city = Keyword()
    experience = Text(analyzer="ik_max_word")
    education = Text(analyzer="ik_max_word")
    job_type = Keyword()
    label = Text(analyzer="ik_max_word")
    job_benefit = Text(analyzer="ik_max_word")
    job_description = Text(analyzer="ik_max_word")
    addr = Text(analyzer="ik_max_word")
    publish_time = Text(analyzer="ik_max_word")
    crawl_time = Date()

    # Bind the document to an index and doc type: define a nested class, which
    # must be named Meta, to carry the index name and the type ("table") name
    class Meta:
        index = "lagou"
        doc_type = "job"

if __name__ == "__main__":
    # Call init() to create the index and its mappings on the server
    LagouType.init()
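For reference, once LagouType.init() has created the mapping, storing a record is just a matter of instantiating the class, filling in its fields, and calling save(). A minimal sketch, assuming a local Elasticsearch instance with the ik_max_word (IK) analyzer plugin installed; the field values and URL here are made-up examples:

# Usage sketch with hypothetical values, run after LagouType.init()
lagou = LagouType()
lagou.job_name = "python后端工程师"
lagou.city = "北京"
lagou.url = "https://www.lagou.com/jobs/12345.html"  # hypothetical URL
lagou.crawl_time = datetime.now()
lagou.save()  # the document is written into index "lagou", type "job"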
Wiring the Elasticsearch connection into the pipeline
1. Write it directly in the pipeline. The drawback: not every crawled item necessarily goes into Elasticsearch (or into the same database), and the fields differ from item to item, so this approach quickly gets messy and offers little configurability.
# Written directly in the pipeline
class Elasticsearch_pipeline(object):
    def __init__(self):
        pass

    def process_item(self, item, spider):
        # Copy each item field onto a LagouType instance
        lagou = LagouType()
        lagou.job_name = item['job_name']
        lagou.company = item['company']
        lagou.url = item['url']
        lagou.job_id = item['job_id']
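        # The remaining fields (salary, city, experience, education, job_type,
        # label, job_benefit, job_description, addr, publish_time, crawl_time)
        # are copied the same way. A minimal sketch of how such a process_item
        # would typically finish (assumed, not shown above):
        lagou.save()  # write the document into Elasticsearch
        return item   # pass the item on to any later pipelines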