mac/win——测试Python/Java导入CSV文件到ElasticSearch

尝试了Java和Python批量导入数据到ES
每条数据34个字段
1.每一百万条数据【mac 到 mac本地】大概需要5分钟
2.【win 到 win 本地】未测时间

Java中可使用BulkRequestBuilder来add数据;
Python中可使用elasticsearch模块的helpers.bulk来添加数据

Java解决报错【org.elasticsearch.action.ActionRequestValidationException: Validation Failed: 1: no requests added;】
【Java源码中:最终执行的是requests.add(request);,调用的是List中的add】
final List<DocWriteRequest> requests = new ArrayList<>();
super.request.add(request);
BulkRequestBuilder调用ActionRequestBuilder的request()方法获得Request对象(即BulkRequest),该对象再调用BulkRequest的requests()方法获得List<DocWriteRequest>。
因此在提交之前,先用 if (bulkRequest.request().requests().size() > 0) 判断是否还有待提交的请求,列表为空时不提交,即可避免上述报错。

utils—配置类

package util;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;

/**
 * Loads the Elasticsearch connection settings from the classpath resource
 * {@code /es.properties} once, when the class is initialized, and exposes them as a map.
 *
 * <p>Only the keys {@code cluster.name}, {@code server} and {@code port} are copied.
 * If the resource is missing or cannot be read, the problem is reported on standard
 * error and the map stays empty, so the class itself always loads.</p>
 *
 * @author 王海
 * @version V1.0
 * @since <pre>2018/5/30 20:03</pre>
 */
public class EsPropertiesUtils {
    /** Classpath location of the settings file. */
    private static final String CONFIG_RESOURCE = "/es.properties";

    /** Settings copied from the properties file, keyed by property name. */
    private static final Map<String, Object> properties = new HashMap<>();

    /**
     * Looks up a single setting.
     *
     * @param key name of the setting
     * @return the configured value, or {@code null} when the key was not loaded
     */
    static Object getProperty(String key) {
        return properties.get(key);
    }

    // Runs once when the class is initialized.
    static {
        // try-with-resources closes the stream; a null resource (file not found) is simply skipped.
        try (InputStream input = EsPropertiesUtils.class.getResourceAsStream(CONFIG_RESOURCE)) {
            if (input == null) {
                // Properties.load(null) would throw and make the whole class unusable.
                System.err.println(CONFIG_RESOURCE + " not found on the classpath; no Elasticsearch settings loaded");
            } else {
                Properties prop = new Properties();
                prop.load(input);

                properties.put("cluster.name", prop.getProperty("cluster.name"));
                properties.put("server", prop.getProperty("server"));
                properties.put("port", prop.getProperty("port"));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the loaded settings.
     *
     * @return the live, mutable settings map (never {@code null}; empty when loading failed)
     */
    public static Map<String, Object> getConf() {
        return properties;
    }
}

utils—open/close

import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.transport.client.PreBuiltTransportClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.Map;

/**
 * Opens, caches and closes the shared Elasticsearch {@link TransportClient}.
 *
 * <p>{@link #getInstance(Map)} and {@link #closeClient(TransportClient)} are both
 * {@code synchronized} on this class, so creating and discarding the cached client
 * cannot interleave.</p>
 *
 * @author 王海
 * @version V1.0
 * @since api1.0
 */
public class OpenClose {

    private static final Logger LOGGER = LoggerFactory.getLogger(OpenClose.class);

    /** Cached client; {@code null} until created and again after it has been closed. */
    private static TransportClient client = null;

    /**
     * Builds a new client from the cluster name, host and port found in the map.
     *
     * @param mapParms settings map; must contain {@code cluster.name}, {@code server}
     *                 and {@code port}
     * @return the new client, or {@code null} when the host name cannot be resolved
     * @throws NullPointerException  if one of the three keys is absent
     * @throws NumberFormatException if {@code port} is not an integer
     */
    private static TransportClient getClientWithMap(Map<String, Object> mapParms) {
        String clusterName = mapParms.get("cluster.name").toString();
        String addressMaster = mapParms.get("server").toString();
        // Transport port (the original author notes the ES default is 9300).
        int transport = Integer.parseInt(mapParms.get("port").toString());

        // Resolve the host BEFORE constructing the client, so that a failed lookup
        // does not leave a half-built client behind that nobody closes.
        InetAddress address;
        try {
            address = InetAddress.getByName(addressMaster);
        } catch (UnknownHostException e) {
            LOGGER.error("初始化client失败 ===== " + e.getMessage(), e);
            return null;
        }

        Settings settings = Settings.builder().put("cluster.name", clusterName).build();
        TransportClient created = new PreBuiltTransportClient(settings)
                .addTransportAddress(new InetSocketTransportAddress(address, transport));
        // Only reached when the client really exists.
        LOGGER.info("初始化client成功 ===== " + System.currentTimeMillis());
        return created;
    }

    /**
     * Returns the cached client, creating it on first use (or after it was closed).
     *
     * @param mapParms settings used when a client has to be created
     * @return the shared client, or {@code null} when it could not be created
     */
    public static synchronized TransportClient getInstance(Map<String, Object> mapParms) {
        if (client == null) {
            client = OpenClose.getClientWithMap(mapParms);
        }
        return client;
    }

    /**
     * Closes the given client; does nothing for {@code null}.
     *
     * <p>When the given client is the cached one, the cache is cleared so that the next
     * {@link #getInstance(Map)} call builds a fresh client instead of handing out a
     * closed one.</p>
     *
     * @param client client to close, may be {@code null}
     */
    public static synchronized void closeClient(TransportClient client) {
        if (client == null) {
            return;
        }
        client.close();
        if (client == OpenClose.client) {
            OpenClose.client = null;
        }
        LOGGER.info("关闭client成功 ===== " + System.currentTimeMillis());
    }
}

CSV导入到ES——Java

import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.client.transport.TransportClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import util.EsPropertiesUtils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

/**
 * Bulk-imports comma-separated files into the Elasticsearch index {@code xueli}
 * (type {@code new_orders}), one document per data row.
 *
 * @author 王海
 * @version V1.0
 * @since <pre>2018/5/30 19:32</pre>
 */
public class CsvToES {
    private static final Logger LOGGER = LoggerFactory.getLogger(CsvToES.class);

    /** Number of columns a row must have to be imported. */
    private static final int EXPECTED_COLUMNS = 34;

    /** Number of documents sent per bulk request. */
    private static final int BATCH_SIZE = 2000;

    /**
     * Reads every file line by line, skips its first (header) line and indexes each
     * row that has exactly {@value #EXPECTED_COLUMNS} columns. Document ids are a running
     * counter starting at 0 that continues across files. Rows with a different column
     * count are skipped and counted. The client is closed when the import ends,
     * whether it succeeded or not.
     *
     * @param fields field names, one per column, in column order
     * @param files  files to import; {@code null} (e.g. from {@link File#listFiles()} on
     *               a missing directory) is logged and treated as nothing to do
     * @throws IllegalArgumentException if fewer than {@value #EXPECTED_COLUMNS} field
     *                                  names are supplied
     * @throws IllegalStateException    if no Elasticsearch client could be obtained
     */
    public void csvToES(String[] fields, File[] files) {
        if (fields == null || fields.length < EXPECTED_COLUMNS) {
            throw new IllegalArgumentException(
                    "expected at least " + EXPECTED_COLUMNS + " field names");
        }
        if (files == null) {
            LOGGER.error("no input files: the file array is null");
            return;
        }

        Map<String, Object> hashMap = EsPropertiesUtils.getConf();
        TransportClient client = OpenClose.getInstance(hashMap);
        if (client == null) {
            throw new IllegalStateException("Elasticsearch client could not be initialised");
        }

        int idNum = 0;
        int error = 0;
        try {
            // 1. Walk through the files.
            for (File f : files) {
                System.out.println(f.getAbsolutePath());
                // 2. Read row by row. The charset is given explicitly because FileReader
                //    would use the platform default, which differs between machines.
                try (BufferedReader br = new BufferedReader(
                        new InputStreamReader(new FileInputStream(f), StandardCharsets.UTF_8))) {
                    BulkRequestBuilder bulkRequest = client.prepareBulk();
                    // Skip the header line.
                    System.out.println("略过文件第一行==========" + br.readLine());
                    String line;
                    while ((line = br.readLine()) != null) {
                        // Limit -1 keeps trailing empty columns; plain split(",") drops
                        // them and would wrongly reject rows whose last fields are empty.
                        String[] filedValues = line.split(",", -1);
                        if (filedValues.length != EXPECTED_COLUMNS) {
                            error++;
                            continue;
                        }
                        // A fresh map per row, so no request shares state with later rows.
                        Map<String, Object> valuesMap = new HashMap<>(64, 0.6f);
                        for (int filedNum = 0; filedNum < filedValues.length; filedNum++) {
                            valuesMap.put(fields[filedNum], filedValues[filedNum]);
                        }
                        // The document source can be supplied in several ways:
                        // 1. a hand-built JSON string; 2. a Map (used here); 3. beans
                        // serialised with Jackson or similar; 4. XContentBuilder
                        // (jsonBuilder().startObject().field(...).endObject()). See
                        // https://www.elastic.co/guide/en/elasticsearch/client/java-api/6.1/java-docs-index.html#java-docs-index
                        bulkRequest.add(client.prepareIndex("xueli", "new_orders", Integer.toString(idNum))
                                .setSource(valuesMap));
                        idNum++;

                        // 3. Send a full batch.
                        if (bulkRequest.numberOfActions() >= BATCH_SIZE) {
                            System.out.println("++++++++++++++++++++++++++++++++++++");
                            executeBulk(bulkRequest);
                            // Start a new builder: the executed one still holds its
                            // requests, so reusing it would resend rows already indexed.
                            bulkRequest = client.prepareBulk();
                        }
                    }
                    // 4. Send the remainder of this file. An empty bulk request must not
                    //    be executed (it fails validation with "no requests added").
                    if (bulkRequest.numberOfActions() > 0) {
                        executeBulk(bulkRequest);
                    }
                    // 5. Continue with the next file.
                } catch (IOException e) {
                    LOGGER.error(":ERROR:堆栈信息====={}", e.getMessage(), e);
                }
            }
        } finally {
            OpenClose.closeClient(client);
        }
        LOGGER.info("import finished: {} rows indexed, {} rows skipped", idNum, error);
    }

    /** Executes one bulk request and logs when Elasticsearch reports item failures. */
    private static void executeBulk(BulkRequestBuilder bulkRequest) {
        // get() has the same effect as execute().actionGet().
        if (bulkRequest.get().hasFailures()) {
            LOGGER.error("bulk request with {} actions reported failures", bulkRequest.numberOfActions());
        }
    }

    /**
     * Imports every file found in a fixed local directory.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        CsvToES csvToES = new CsvToES();
        // The remaining field names were elided in the original post.
        String[] fields = {"order_id",。。。。。。。。。};
        File file = new File("D:\\U_pan\\work\\new_data\\code\\data_utf8\\");
        // The directory contains only the files to import, so no further filtering is done.
        File[] files = file.listFiles();
        csvToES.csvToES(fields, files);
    }
}

CSV导入到ES——Python

#!/usr/bin/python

import time
import os
from itertools import islice
from elasticsearch import Elasticsearch
from elasticsearch import helpers


def get_files_to_be_imported(path):
    """Return the names of the entries in ``path`` whose extension is exactly '.csv'.

    Each matching name is printed as it is found. Names are returned without the
    directory prefix, in the order ``os.listdir`` yields them.
    """
    csv_names = []
    for entry in os.listdir(path):
        _, extension = os.path.splitext(entry)
        if extension != '.csv':
            continue
        print(entry)
        csv_names.append(entry)
    return csv_names


if __name__ == '__main__':
    # Script entry point: bulk-index every .csv file of the workspace directory,
    # 2000 documents per request.

    # Local import: io.open decodes the file to text the same way on Python 2 and 3,
    # so the per-field .decode('utf8') calls (Python 2 only) are no longer needed.
    import io

    es = Elasticsearch()
    actions = []
    workspace = u'../total/utf8/es/'
    files = get_files_to_be_imported(workspace)
    # Placeholder stored when a row's time column is not 19 characters long.
    str_time = "1111-11-11 11:11:11"

    id_num = 0
    error = 0
    for csv_name in files:
        csv_path = workspace + csv_name
        print(time.strftime('%y-%m-%d %H:%M:%S', time.localtime()))
        # 'with' closes the file even when indexing raises.
        with io.open(csv_path, encoding='utf-8') as this_file:
            # islice(..., 1, None) skips the header line.
            for line in islice(this_file, 1, None):
                line = line.strip().split(',')
                if len(line) < 34:
                    # Short row: count it, consume an id and move on.
                    error += 1
                    id_num += 1
                    print(id_num)
                    continue
                action = {
                    "_index": "your_index",
                    "_type": "your_type",
                    "_id": id_num,
                    "_source": {
                        "order_id": line[0],

                        "order_time": str_time if len(line[5]) != 19 else line[5],
                        # ... remaining fields elided in the original post
                    }
                }
                id_num += 1
                actions.append(action)
                if len(actions) == 2000:
                    helpers.bulk(es, actions)
                    del actions[:]

        # Send whatever is left over from this file.
        if len(actions) > 0:
            helpers.bulk(es, actions)
            del actions[:]
        print("finish process file:%s" % csv_path)

    print(error)
    print("down!")

遇见的问题


  1. 不符合规范的时间
    自定义时间字段的format,会比原有的限制“宽松”很多
  2. 数组问题,就是说,某个字段有多个值,如何解析传递这么个数组
    TODO
  3. 字段中包含自定义的分隔符,我这里是","
    Python: dataframe['column'].str.replace(",", ";")
  4. Java解决报错【org.elasticsearch.action.ActionRequestValidationException: Validation Failed: 1: no requests added;】
    【Java源码中:最终执行的是requests.add(request);,调用的是List中的add】
    final List<DocWriteRequest> requests = new ArrayList<>();
    super.request.add(request);

当一个类继承于另一个类,且子类中没有重写父类的某个方法时,用子类的对象调用该方法,会首先在子类中查找;如果子类中没有该方法,再到父类中查找。
当一个方法只在父类中定义时,调用该方法时会使用父类中的属性。



BulkRequestBuilder调用ActionRequestBuilder的request()方法获得Request对象(即BulkRequest),该对象再调用BulkRequest的requests()方法获得List<DocWriteRequest>。
因此在提交之前,先用 if (bulkRequest.request().requests().size() > 0) 判断是否还有待提交的请求,列表为空时不提交,即可避免上述报错。

参考:

[1]. 官方doc
[2]. CSDN
[3]. CSDN
[4]. try-with-resources

  • 2
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值