1 Default query
If you use Chrome, you can install the Crap Api Debug extension to simulate sending HTTP requests. The simplest query, shown below, carries no conditions at all: in the URL, info is the index, customer is the type, and _search is the REST API search endpoint.
http://elk.test.com/info/customer/_search
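The same no-condition query can also be issued from a script. Below is a minimal sketch using the requests library (an assumption; it is not used elsewhere in this post), mirroring the URL above. With no parameters or body, Elasticsearch treats the request as match_all.

# Minimal sketch: GET the same URL as above; with nothing else supplied
# this is effectively a match_all over index "info", type "customer".
import requests

resp = requests.get('http://elk.test.com/info/customer/_search')
print(resp.json()['hits']['total'])  # total number of matching documents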
2 Exact-match query
The article "Elasticsearch 常用基本查询" shows how to do this with curl; "23 个很有用的 ElasticSearch 查询示例" is also worth a look.
http://elk.test.com/info/customer/_search?q=taxAuthId:9de3cd41a3a341d689f28a42d324a52c
You can also get the result via a POST request in Crap API.
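The same query-string search can be scripted too; a sketch with requests, passing q as a URL parameter exactly as in the browser:

# Query-string search: ?q=field:value, identical to the URL above.
import requests

resp = requests.get('http://elk.test.com/info/customer/_search',
                    params={'q': 'taxAuthId:9de3cd41a3a341d689f28a42d324a52c'})
print(resp.json()['hits']['total'])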
3 Elasticsearch backup
When the speed of writing data with pykafka becomes unbearable, you start to worry: after finally getting the data into Elasticsearch, what do you do if part of it turns out to be wrong? The first thought is to back up Elasticsearch. See:
Elasticsearch的备份和恢复
Elasticsearch–索引备份与迁移
elasticsearch配置文件详解
elasticsearch snapshot
Everyone starts with Elasticsearch from a different point and runs into different problems, so I wrote down my own trial-and-error in "elasticsearch备份过程记录". A minimal sketch of the workflow follows.
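This is a hedged sketch of the basic snapshot workflow those articles cover, using the elasticsearch-py client from the scripts later in this post. The repository name my_backup and the location /data/es_backup are placeholders; the location must be listed under path.repo in elasticsearch.yml before registration succeeds.

# Sketch: register a filesystem snapshot repository, then snapshot one index.
# "my_backup" and "/data/es_backup" are placeholders for illustration.
from elasticsearch import Elasticsearch

es = Elasticsearch(['192.168.5.150'])

# the location must appear under path.repo in elasticsearch.yml
es.snapshot.create_repository(repository='my_backup', body={
    'type': 'fs',
    'settings': {'location': '/data/es_backup'}
})

# snapshot only the "info" index and block until it finishes
es.snapshot.create(repository='my_backup', snapshot='snapshot_1',
                   body={'indices': 'info'}, wait_for_completion=True)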
4 Kibana
cd /usr/hadoop/application/kibana/bin/
nohup ./kibana &
tail -fn 100 nohup.out
kibana.yml configuration; the simplest possible setup is used here:
server.host: "192.168.5.185"
elasticsearch.url: "http://192.168.5.185:9200"
5 Bulk updates
Updating a document in Elasticsearch is involved: the document must be retrieved, its data read from _source, the old document removed, the changes applied, and the result indexed as a new document. So depending on which fields exist, full or partial updates may require writing your own program. Here is an example:
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf8')

import pandas as pd
import json
from MysqlDao import pss_engine
import udf_config
logger = udf_config.logger
from elasticsearch import Elasticsearch
from elasticsearch import helpers


class CrmCustomerService:
    def __init__(self):
        self.engine = pss_engine
        self.es = Elasticsearch(['192.168.5.150'], sniff_on_start=True, sniff_on_connection_fail=True, max_retries=3, retry_on_timeout=True)

    def get_customers(self, start=0, rows=1000):
        '''
        Fetch a page of ids from the old CRM database and queue a bulk
        update for each one that already exists in Elasticsearch.
        '''
        sql = "select id from crm_customer limit {},{}".format(start, rows)
        df = pd.read_sql(sql, self.engine)
        results = json.loads(df.to_json(orient='records'))
        actions = []
        for result in results:
            id = str(result['id'])
            logger.debug('id is {} '.format(id))
            if self.is_exist(id):
                action = {
                    '_op_type': 'update',
                    '_index': 'info',
                    '_type': 'customer',
                    '_id': id,
                    'doc': {'entId': '00000000000000000000000000000000'}
                }
                actions.append(action)
        if len(actions) > 0:
            success, msg = helpers.bulk(self.es, actions)
            return success, msg
        else:
            return "OK", "nothing to update on this page"

    def is_exist(self, id):
        # count documents whose id field matches; >=1 means it exists
        body = {
            "query": {
                "term": {
                    "id": str(id)
                }
            }
        }
        result = self.es.count(index='info', doc_type='customer', body=body)
        if result['count'] >= 1:
            return True
        else:
            return False

    def update_customer_paging(self):
        logger.debug('entering update_customer_paging')
        rows = 1000
        for page in range(0, 16):
            start = page * rows
            success, msg = self.get_customers(start=start, rows=rows)
            logger.debug('offset {}: success={}, msg={}'.format(str(start), success, msg))


if __name__ == '__main__':
    crmCustomerService = CrmCustomerService()
    crmCustomerService.update_customer_paging()
The script above drives updates from the database; you can also update by paging through Elasticsearch itself. The pagination below uses from+size; see "Elasticsearch——分页查询From&Size VS scroll".
With from+size you may hit the following exception (see also "from-size VS scroll-scan"):
elasticsearch.exceptions.TransportError: TransportError(500,
u'search_phase_execution_exception', u'Result window is too large, from + size
must be less than or equal to: [10000] but was [11000]. See the scroll api for a
more efficient way to request large data sets. This limit can be set by changing
the [index.max_result_window] index level setting.')
The limit can be raised via index.max_result_window; I ran this through Kibana, but the curl equivalent is:
curl -XPUT "http://127.0.0.1:9200/test-index/_settings" -d '{
    "index": {
        "max_result_window": 10000000
    }
}'
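The exception itself recommends the scroll API; here is a minimal sketch using the helpers.scan wrapper from elasticsearch-py (same host and index as the scripts in this post), which streams every hit without the max_result_window limit:

# Scroll through all customerDisk docs; scan() manages the scroll cursor.
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch(['192.168.5.150'])
for hit in helpers.scan(es, index='info', doc_type='customerDisk',
                        query={'query': {'match_all': {}}}):
    print(hit['_id'])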
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf8')

import udf_config
logger = udf_config.logger
from elasticsearch import Elasticsearch
from elasticsearch import helpers


class CrmCustomerDiskService:
    def __init__(self):
        self.es = Elasticsearch(['192.168.5.150'], sniff_on_start=True, sniff_on_connection_fail=True, max_retries=3, retry_on_timeout=True)

    def update_customer_disk(self, start=0, rows=1000):
        results = self.get_customer_disk(start, rows)
        actions = []
        for result in results:
            id = str(result['_id'])
            logger.debug('id is {} '.format(id))
            action = {
                '_op_type': 'update',
                '_index': 'info',
                '_type': 'customerDisk',
                '_id': id,
                'doc': {'entId': '00000000000000000000000000000000'}
            }
            actions.append(action)
        if len(actions) > 0:
            success, msg = helpers.bulk(self.es, actions)
            return success, msg
        else:
            return "OK", "nothing to update on this page"

    def get_customer_disk(self, start=0, rows=1000):
        # from+size paging; `start` here is the page number,
        # so the offset is start * rows
        body = {
            "query": {
                "match_all": {}
            },
            "from": start * rows,
            "size": rows
        }
        results = self.es.search(index='info', doc_type='customerDisk', body=body)
        return results['hits']['hits']

    def update_customer_disk_paging(self):
        rows = self.get_count()
        pages = rows / 1000 + 1  # integer division under Python 2
        for page in range(0, pages):
            self.update_customer_disk(start=page, rows=1000)

    def get_count(self):
        body = {
            "query": {
                "match_all": {}
            }
        }
        result = self.es.count(index='info', doc_type='customerDisk', body=body)
        return result['count']


if __name__ == '__main__':
    crmCustomerDiskService = CrmCustomerDiskService()
    crmCustomerDiskService.update_customer_disk_paging()
6 Updating individual fields
The operations below update just one field, or a few fields.
POST edata/title/9243100/_update
{
"doc":{
"status":"注销"
}
}
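The same partial update can be issued from Python with the client used elsewhere in this post; a sketch (the host is taken from the scripts above):

# Partial update of a single field via the _update endpoint.
from elasticsearch import Elasticsearch

es = Elasticsearch(['192.168.5.150'])
es.update(index='edata', doc_type='title', id='9243100',
          body={'doc': {'status': '注销'}})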
The syntax changed somewhat in ES 7:
POST invoice_title/_update/03bacd86492052b743fb71bda317adf6
{
"doc":{
"addresstel" : "深",
"bandkaccount" : "浙",
"date" : "2022-06-28 13:54:03"
}
}
Inserting data:
POST invoice_title/_doc/9a932cb96ddc1aa4ee87d7e4904a022b
{
"creditcode":"91411xxxxxxxx0",
"name":"xxx",
"addresstel" : "驻马店xxx",
"bandkaccount" : "中国xxx",
"date" : "2022-01-13 12:08:03"
}
7 Deleting by rule
The most hateful part of crawling: after scraping a large amount of data, you find that many sites' own data is bad, so you have to filter those records out and delete them.
It is sometimes baffling why examples found online fail in my Kibana, for instance "elasticsearch依据字段长度过滤"; perhaps it depends on the ES version (I am on 5.6.6).
Likewise "elastic search 多条件查询", a post from 2017, no longer works either.
Deleting by condition:
POST my_index/index_type/_delete_by_query
{
"query":{
"range": {
"date": {
"gte": &#