# NOTE: the Elasticsearch server version must match the installed client: pip install elasticsearch==x.x.x
import requests
import json
# 初始化es
from elasticsearch import Elasticsearch
class ESspider():
    """Crawl the NetEase domestic-news JSONP feed and index the articles
    into a local Elasticsearch instance, then run an interactive title search.

    Requires an Elasticsearch server on localhost:9200 with the IK analysis
    plugin installed (the mapping uses the ``ik_max_word`` analyzer).
    """

    # Shared client; connects to the default localhost:9200.
    es = Elasticsearch()

    # Index mapping: analyze article titles with the IK Chinese analyzer
    # at both index and search time.
    mapping = {
        "properties": {
            "title": {
                "type": "text",
                "analyzer": "ik_max_word",
                "search_analyzer": "ik_max_word"
            }
        }
    }

    def get_dates(self):
        """Fetch the news feed and return the decoded article list.

        :return: list of dicts as produced by the site's JSONP payload
                 (each item has at least 'title' and 'docurl' keys —
                 assumed from usage in es_insert; verify against the feed).
        """
        headers = {
            "cookie": "_ntes_nnid=ec99ba1927e39c47f7f92529ac357e5e,1630661255720; _ntes_nuid=ec99ba1927e39c47f7f92529ac357e5e; __oc_uuid=9db14250-1084-11ec-a725-f1a9c7f81d1d; OUTFOX_SEARCH_USER_ID_NCOO=644224530.1474793; mp_MA-BFF5-63705950A31C_hubble=%7B%22sessionReferrer%22%3A%20%22https%3A%2F%2Fke.study.163.com%2Fcourse%2Fdetail%2F75268%3FPdt%3DydkWeb%22%2C%22updatedTime%22%3A%201633868071502%2C%22sessionStartTime%22%3A%201633867930016%2C%22sendNumClass%22%3A%20%7B%22allNum%22%3A%205%2C%22errSendNum%22%3A%200%7D%2C%22deviceUdid%22%3A%20%227c43e0ea-9d70-4ff0-b02e-19e16bc62221%22%2C%22persistedTime%22%3A%201633867930013%2C%22LASTEVENT%22%3A%20%7B%22eventId%22%3A%20%22db879b02780efec52202b7dbf0cc55396731e9a0%22%2C%22time%22%3A%201633868071502%7D%2C%22sessionUuid%22%3A%20%22fe22a015-6e85-42fa-abbb-4a8b84f61265%22%7D; nts_mail_user=17670404326@163.com:-1:1; __root_domain_v=.163.com; _qddaz=QD.201442685373792; wyy_uid=cc35a0f5-5621-465b-8af0-a41e63c98ba6; locale=zh_CN; _ga=GA1.2.1546998699.1642685376; UM_distinctid=18046760d08a8a-00ff3f452f8af9-6b3e555b-1fa400-18046760d09407; NTES_PASSPORT=DDZMfzZnP8MEcrLalbaQV6J_XhqBGveUkuhS1TZzOfy0RcNfRVhy3uXwDESUwKLWes0pwA7KEPurBgFA7XKswNNLfBhynRzaX9MEtf_CZL2KKDgjk8nnaoN1LUCH9r.Ng6SscTQIfX0MZfOlrfYEyTbYFEj54caEIxePyt3vm._A9W.7lftxmZBk5vbtY5zq5; NTES_P_UTID=e7bqkr9keO44mtPpEPWSOw5oghxRTPXl|1652677431; P_INFO=m17670404326@163.com|1652677431|0|mail163|00&99|null&null&null#CN&null#10#0#0|176326&1|newsclient|17670404326@163.com; _antanalysis_s_id=1652951272274; cm_newmsg=user%3Dm17670404326%40163.com%26new%3D-1%26total%3D-1; BAIDU_SSP_lcr=https://www.baidu.com/link?url=8Lx9yu_hGauKmV_hogbUAZ0li069W6j8bcp4Qdd9Hte&wd=&eqid=ed04fd700000bcef00000006628608e5; Hm_lvt_210da436b3de9b40d52898d20bb51ebf=1652951311; pver_n_f_l_n3=a; UserAdLocation=%u6E56%u5357; UserProvince=%u5168%u56FD; s_n_f_l_n3=21c1439fdba0f6d11652953716973; NTES_PC_IP=%E9%95%BF%E6%B2%99%7C%E6%B9%96%E5%8D%97; Hm_lpvt_210da436b3de9b40d52898d20bb51ebf=1652954718; ne_analysis_trace_id=1652954790104; vinfo_n_f_l_n3=21c1439fdba0f6d1.1.3.1636701235898.1652951656389.1652954790106",
            'referer': "https://news.163.com/domestic/",
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
        }
        url = 'https://news.163.com/special/cm_guonei/?callback=data_callback'
        # Request the JSONP feed.
        html = requests.get(url=url, headers=headers)
        # The payload is wrapped as data_callback([...]); strip the function
        # name/open-paren and the trailing ')' to recover raw JSON.
        jsons = html.text.replace('data_callback(', '')[:-1]
        return json.loads(jsons)

    def es_insert(self, datas):
        """Recreate the 'news' index with the IK mapping and index articles.

        :param datas: iterable of article dicts with 'title' and 'docurl' keys
        """
        # Drop the previous index so re-runs start clean.
        # BUG FIX: the original deleted 'ceshi_news' while every other call
        # uses 'news', so the real index was never cleared. Also 404 (index
        # missing), not 401, is the status to ignore here.
        try:
            self.es.indices.delete(index='news', ignore=[400, 404])
        except Exception:
            # best-effort cleanup: a failed delete must not abort indexing
            pass
        # Create the index (ignore 400 "already exists").
        self.es.indices.create(index='news', ignore=[400, 404])
        # BUG FIX: the original passed bare `mapping`, which raises NameError
        # inside a method — the class attribute must be read via self.
        result = self.es.indices.put_mapping(index='news', body=self.mapping)
        print(result)
        # Index each article with just the fields we care about.
        for data in datas:
            doc = {
                'title': data['title'],
                'url': data['docurl'],
            }
            self.es.index(index='news', body=doc)
        print('ok')

    def search(self, kw):
        """Full-text match query on article titles; prints the raw response.

        :param kw: keyword(s) to search for (space-separated terms)
        """
        dsl = {
            'query': {
                'match': {
                    'title': kw
                }
            }
        }
        result = self.es.search(index='news', body=dsl)
        print(result)

    def start(self):
        """Run the whole pipeline: crawl, (re)index, then interactive search."""
        dates = self.get_dates()
        self.es_insert(dates)
        kw = input('请输入想查找的关键多关键字使用空格:')
        self.search(kw)