A simple Python + ES example

The Elasticsearch server version should match the client version installed with pip install elasticsearch==x.x.x (e.g. a 7.x client against a 7.x cluster).
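
As a quick sanity check, you can compare the two versions before running the crawler. The snippet below is a minimal sketch and assumes a local node reachable at the client's default http://localhost:9200:

import elasticsearch
from elasticsearch import Elasticsearch

es = Elasticsearch()  # defaults to localhost:9200
print('client:', elasticsearch.__version__)        # e.g. (7, 17, 0)
print('server:', es.info()['version']['number'])   # e.g. '7.17.0'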

import requests
import json
# initialize the ES client
from elasticsearch import Elasticsearch


class ESspider:
    # ES client; with no arguments it connects to http://localhost:9200
    es = Elasticsearch()
    mapping = {
        "properties": {
            "title": {
                "type": "text",
                "analyzer": "ik_max_word",
                "search_analyzer": "ik_max_word"
            }
        }
    }
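    # NOTE: "ik_max_word" is provided by the elasticsearch-analysis-ik plugin and must be
    # installed on the ES node, otherwise applying this mapping will fail.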

    def get_dates(self):
        headers = {
            "cookie": "_ntes_nnid=ec99ba1927e39c47f7f92529ac357e5e,1630661255720; _ntes_nuid=ec99ba1927e39c47f7f92529ac357e5e; __oc_uuid=9db14250-1084-11ec-a725-f1a9c7f81d1d; OUTFOX_SEARCH_USER_ID_NCOO=644224530.1474793; mp_MA-BFF5-63705950A31C_hubble=%7B%22sessionReferrer%22%3A%20%22https%3A%2F%2Fke.study.163.com%2Fcourse%2Fdetail%2F75268%3FPdt%3DydkWeb%22%2C%22updatedTime%22%3A%201633868071502%2C%22sessionStartTime%22%3A%201633867930016%2C%22sendNumClass%22%3A%20%7B%22allNum%22%3A%205%2C%22errSendNum%22%3A%200%7D%2C%22deviceUdid%22%3A%20%227c43e0ea-9d70-4ff0-b02e-19e16bc62221%22%2C%22persistedTime%22%3A%201633867930013%2C%22LASTEVENT%22%3A%20%7B%22eventId%22%3A%20%22db879b02780efec52202b7dbf0cc55396731e9a0%22%2C%22time%22%3A%201633868071502%7D%2C%22sessionUuid%22%3A%20%22fe22a015-6e85-42fa-abbb-4a8b84f61265%22%7D; nts_mail_user=17670404326@163.com:-1:1; __root_domain_v=.163.com; _qddaz=QD.201442685373792; wyy_uid=cc35a0f5-5621-465b-8af0-a41e63c98ba6; locale=zh_CN; _ga=GA1.2.1546998699.1642685376; UM_distinctid=18046760d08a8a-00ff3f452f8af9-6b3e555b-1fa400-18046760d09407; NTES_PASSPORT=DDZMfzZnP8MEcrLalbaQV6J_XhqBGveUkuhS1TZzOfy0RcNfRVhy3uXwDESUwKLWes0pwA7KEPurBgFA7XKswNNLfBhynRzaX9MEtf_CZL2KKDgjk8nnaoN1LUCH9r.Ng6SscTQIfX0MZfOlrfYEyTbYFEj54caEIxePyt3vm._A9W.7lftxmZBk5vbtY5zq5; NTES_P_UTID=e7bqkr9keO44mtPpEPWSOw5oghxRTPXl|1652677431; P_INFO=m17670404326@163.com|1652677431|0|mail163|00&99|null&null&null#CN&null#10#0#0|176326&1|newsclient|17670404326@163.com; _antanalysis_s_id=1652951272274; cm_newmsg=user%3Dm17670404326%40163.com%26new%3D-1%26total%3D-1; BAIDU_SSP_lcr=https://www.baidu.com/link?url=8Lx9yu_hGauKmV_hogbUAZ0li069W6j8bcp4Qdd9Hte&wd=&eqid=ed04fd700000bcef00000006628608e5; Hm_lvt_210da436b3de9b40d52898d20bb51ebf=1652951311; pver_n_f_l_n3=a; UserAdLocation=%u6E56%u5357; UserProvince=%u5168%u56FD; s_n_f_l_n3=21c1439fdba0f6d11652953716973; NTES_PC_IP=%E9%95%BF%E6%B2%99%7C%E6%B9%96%E5%8D%97; Hm_lpvt_210da436b3de9b40d52898d20bb51ebf=1652954718; ne_analysis_trace_id=1652954790104; vinfo_n_f_l_n3=21c1439fdba0f6d1.1.3.1636701235898.1652951656389.1652954790106",
            'referer': "https://news.163.com/domestic/",
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
        }
        url = 'https://news.163.com/special/cm_guonei/?callback=data_callback'
        # request the news list (a JSONP response)
        html = requests.get(url=url, headers=headers)
        # strip the data_callback( ... ) JSONP wrapper and parse the JSON payload
        html = html.text.strip()
        jsons = html.replace('data_callback(', '')[:-1]
        dates = json.loads(jsons)
        return dates

    def es_insert(self, datas):
        # connect to ES and write the scraped items

        # delete the previous index if it exists (404 = index not found)
        self.es.indices.delete(index='news', ignore=[400, 404])
        # create the index (400 = index already exists)
        self.es.indices.create(index='news', ignore=400)
        # apply the ik_max_word mapping to the title field
        result = self.es.indices.put_mapping(index='news', body=self.mapping)
        print(result)

        # index each news item (title + url)
        for data in datas:
            temp = {}
            temp['title'] = data['title']
            temp['url'] = data['docurl']
            self.es.index(index='news', body=temp)
        # refresh so the new documents are searchable immediately
        self.es.indices.refresh(index='news')
        print('ok')

    def search(self, kw):
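        # full-text match query against the analyzed title field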
        dsl = {
            'query': {
                'match': {
                    'title': kw
                }
            }
        }
        result = self.es.search(index='news', body=dsl)
        print(result)

    def start(self):
        '''
        Run the crawler, index the results into ES, then search interactively.
        :return: None
        '''
        dates = self.get_dates()
        self.es_insert(dates)
        kw = input('Enter the keyword(s) to search; separate multiple keywords with spaces: ')
        self.search(kw)
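
To run the whole flow end to end, a minimal entry point like the one below can be appended; it simply instantiates the class above and calls start():

if __name__ == '__main__':
    spider = ESspider()
    spider.start()

For larger batches, elasticsearch.helpers.bulk would be a better fit than calling es.index() once per document, but the per-document call keeps the example easy to follow.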
