尝试了Java和Python批量导入数据到ES
每条数据34个字段
1.每一百万条数据【mac 到 mac本地】大概需要5分钟
2.【win 到 win 本地】未测时间
Java中可使用BulkRequestBuilder来add数据;
Python中可使用elasticsearch模块的helpers.bulk来添加数据
Java解决报错【org.elasticsearch.action.ActionRequestValidationException: Validation Failed: 1: no requests added;】
【Java源码中:最终执行的是requests.add(request);
,调用的是List中的add】
【final List<DocWriteRequest> requests = new ArrayList<>();
】
【super.request.add(request);
】
BulkRequestBuilder调用ActionRequestBuilder的request()方法获得Request对象,该对象再调用BulkRequest的requests()方法获得List< DocWriteRequest>
if (bulkRequest.request().requests().size() > 0 )
判断
utils—配置类
package util;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
/**
 * <p>package: util, description: loads Elasticsearch connection settings from
 * /es.properties on the classpath and exposes them as a Map.</p>
 *
 * @author 王海
 * @version V1.0
 * @since <pre>2018/5/30 20:03</pre>
 */
public class EsPropertiesUtils {

    // Configuration entries read from /es.properties, keyed by property name.
    private static Map<String, Object> properties = new HashMap<>();

    // Static initializer: runs once when the class is loaded.
    static {
        Properties prop = new Properties();
        // try-with-resources closes the stream (the original never closed it);
        // also guard against a missing resource, which would otherwise throw an
        // uncaught NullPointerException inside prop.load() during class init.
        try (InputStream input = EsPropertiesUtils.class.getResourceAsStream("/es.properties")) {
            if (input == null) {
                System.err.println("es.properties not found on classpath");
            } else {
                prop.load(input);
                properties.put("cluster.name", prop.getProperty("cluster.name"));
                properties.put("server", prop.getProperty("server"));
                properties.put("port", prop.getProperty("port"));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Look up a single configuration value.
     *
     * @param key property name
     * @return the configured value, or null when absent
     */
    static Object getProperty(String key) {
        return properties.get(key);
    }

    /**
     * @return the whole configuration map (empty when es.properties is missing)
     */
    public static Map<String, Object> getConf() {
        return properties;
    }
}
utils—open/close
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.transport.client.PreBuiltTransportClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.Map;
/**
 * <p>Description: opens the Elasticsearch transport client and closes it.</p>
 *
 * @author 王海
 * @version V1.0
 * @package PACKAGE_NAME
 * @date 2018/5/18 14:55
 * @since api1.0
 */
public class OpenClose {

    private static final Logger LOGGER = LoggerFactory.getLogger(OpenClose.class);

    // Singleton client, lazily created by getInstance().
    private static TransportClient client = null;

    /**
     * Build the transport client from a parameter map.
     *
     * @param mapParms must contain "cluster.name", "server" and "port"
     * @return the initialised client, or null when the host could not be resolved
     */
    private static TransportClient getClientWithMap(Map<String, Object> mapParms) {
        // A client normally needs three parameters: address, port, cluster name.
        String clusterName = mapParms.get("cluster.name").toString();
        String addressMaster = mapParms.get("server").toString();
        // ES transport port defaults to 9300
        int transport = Integer.valueOf(mapParms.get("port").toString());
        // Settings.Builder is a static inner class of Settings
        Settings settings = Settings.builder().put("cluster.name", clusterName).build();
        try {
            client = new PreBuiltTransportClient(settings)
                    .addTransportAddress(new InetSocketTransportAddress(InetAddress.getByName(addressMaster), transport));
            // log success only after the client was actually created; the original
            // logged "初始化client成功" unconditionally, even after a failure
            LOGGER.info("初始化client成功 ===== {}", System.currentTimeMillis());
        } catch (UnknownHostException e) {
            // pass the Throwable so the full stack trace is kept in the log
            LOGGER.error("初始化client失败 ===== ", e);
        }
        return client;
    }

    /**
     * Lazily create and return the shared client (thread-safe via synchronized).
     */
    public static synchronized TransportClient getInstance(Map<String, Object> mapParms) {
        if (client == null) {
            client = OpenClose.getClientWithMap(mapParms);
        }
        return client;
    }

    /**
     * Close the given client if it was ever opened.
     */
    public static void closeClient(TransportClient client) {
        if (client != null) {
            client.close();
            LOGGER.info("关闭client成功 ===== {}", System.currentTimeMillis());
        }
    }
}
CSV导入到ES——Java
import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.client.transport.TransportClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import util.EsPropertiesUtils;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
/**
* <p>package: PACKAGE_NAME,descirption:</p>
*
* @author 王海
* @version V1.0
* @since <pre>2018/5/30 19:32</pre>
*/
public class CsvToES {
private static final Logger LOGGER = LoggerFactory.getLogger(OpenClose.class);
public void csvToES(String[] fields, File[] files) {
Map<String, Object> hashMap = EsPropertiesUtils.getConf();
TransportClient client = OpenClose.getInstance(hashMap);
int idNum = 0;
int error = 0;
// 一、读取文件
for (File f : files) {
// 绝对路径读取文件
System.out.println(f.getAbsolutePath());
File file = new File(f.getAbsolutePath());
// 二、开始一行一行的写
try (BufferedReader br = new BufferedReader(new FileReader(file))) {
BulkRequestBuilder bulkRequest = client.prepareBulk();
Map<String, Object> valuesMap = new HashMap<>(64, 0.6f);
// 跳过文件第一行
System.out.println("略过文件第一行==========" + br.readLine());
String line;
while ((line = br.readLine()) != null) {
String[] filedValues = line.split(",");
if (filedValues.length != 34) {
error = error + 1;
continue;
}
for (int filedNum = 0; filedNum < filedValues.length; filedNum++) {
valuesMap.put(fields[filedNum], filedValues[filedNum]);
}
// 导入数据有很多种方式,比如:1.手动构建JSON风格的字符串;2.使用map;3.使用JackSon等工具包序列化Beans;3.使用ES的XContentBuilder ;4.BulkRequestBuilder ; 5.https://www.elastic.co/guide/en/elasticsearch/client/java-api/6.1/java-docs-index.html#java-docs-index
bulkRequest.add(client.prepareIndex("xueli", "new_orders", Integer.toString(idNum)).setSource(valuesMap));
valuesMap.clear();
/* 3.jsonBuilder
bulkRequest.add(client.prepareIndex("******", "********", Integer.toString(count)).setSource(jsonBuilder()
.startObject()
.field(fields[0], filedValues[0])
.field(fields[1], filedValues[1])
.endObject()));*/
if (idNum % 2000 == 0) {
System.out.println("++++++++++++++++++++++++++++++++++++");
// 三、批量导入
bulkRequest.get();
// 与上面的语法效果相同:bulkRequest.execute().actionGet();
}
idNum++;
}
// 四、导入每个文件最后不足2000条的数据(但是必须得有数据,否则报错: no requests added)
// 查阅源码可知,可通过拿到父类requests属性,判断其size来解决[子类调用父类方法获得属性]
if (bulkRequest.request().requests().size() > 0 ) {
bulkRequest.get();
}
// 五、操作下一个文件
} catch (IOException e) {
LOGGER.error(":ERROR:堆栈信息====={}", e.getMessage());
}
}
OpenClose.closeClient(client);
}
public static void main(String[] args) {
CsvToES csvToES = new CsvToES();
String[] fields = {"order_id",。。。。。。。。。};
File file = new File("D:\\U_pan\\work\\new_data\\code\\data_utf8\\");
// 目录下只有我需要读取的文件,故不再进行进一步处理
File[] files = file.listFiles();
csvToES.csvToES(fields, files);
}
}
CSV导入到ES——Python
#!/usr/bin/python
import time
import os
from itertools import islice
from elasticsearch import Elasticsearch
from elasticsearch import helpers
def get_files_to_be_imported(path):
    """Return the names of the .csv files directly under *path*.

    Each matching file name is printed as it is found.
    """
    csv_files = []
    for name in os.listdir(path):
        _, ext = os.path.splitext(name)
        if ext == '.csv':
            print(name)
            csv_files.append(name)
    return csv_files
if __name__ == '__main__':
    es = Elasticsearch()
    actions = []
    workspace = u'../total/utf8/es/'
    files = get_files_to_be_imported(workspace)
    # fallback timestamp used when a row's time column is malformed
    str_time = "1111-11-11 11:11:11"
    id_num = 0  # running document id across all files
    error = 0   # rows skipped for having fewer than 34 columns
    for csv in files:
        csv = workspace + csv
        print(time.strftime('%y-%m-%d %H:%M:%S', time.localtime()))
        # with-statement guarantees the handle is closed after each file
        # (the original opened the file and never closed it)
        with open(csv) as this_file:
            # ignore table's header with 'islice'
            for line in islice(this_file, 1, None):
                line = line.strip().split(',')
                if len(line) < 34:
                    error += 1
                    id_num += 1
                    print(id_num)
                    continue
                action = {
                    "_index": "your_index",
                    "_type": "your_type",
                    "_id": id_num,
                    "_source": {
                        "order_id": line[0].decode('utf8'),
                        # substitute the fallback unless the value looks like a
                        # 19-char "YYYY-MM-DD HH:MM:SS" timestamp
                        "order_time": str_time if len(line[5]) != 19 else line[5].decode('utf8'),
                        # ...remaining 32 fields elided in the original post...
                    }
                }
                id_num += 1
                actions.append(action)
                # send a full batch of 2000 docs, then reset the buffer
                if len(actions) == 2000:
                    helpers.bulk(es, actions)
                    del actions[0:len(actions)]
        # flush the final partial batch for this file
        if len(actions) > 0:
            helpers.bulk(es, actions)
            del actions[0:len(actions)]
        print("finish process file:%s" % csv)
    print(error)
    print("down!")
遇见的问题
- 不符合规范的时间
  自定义时间字段的format,会比原有的限制“宽松”很多
- 数组问题,就是说,某个字段有多个值,如何解析传递这么个数组
  TODO
- 字段中包含自定义的分隔符,我这里是","
  Python: dataframe['column'].str.replace(",", ";")
- Java解决报错【org.elasticsearch.action.ActionRequestValidationException: Validation Failed: 1: no requests added;】
【Java源码中:最终执行的是requests.add(request);
,调用的是List中的add】
【final List<DocWriteRequest> requests = new ArrayList<>();
】
【super.request.add(request);
】
当一个类继承于另一个类,而子类中没有父类的某个方法时:用子类的对象调用该方法,会首先在子类中查找,如果子类中没有该方法,再到父类中查找。
当一个方法只在父类中定义时,调用该方法时会使用父类中的属性
BulkRequestBuilder调用ActionRequestBuilder的request()方法获得Request对象,该对象再调用BulkRequest的requests()方法获得List< DocWriteRequest>
if (bulkRequest.request().requests().size() > 0 )
判断
参考:
[1]. 官方doc
[2]. CSDN
[3]. CSDN
[4]. try-with-resources