java flink 读取ES

运行环境:Flink standalone 模式

1、main 入口

package es;

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.util.FileUtils;

import java.io.File;
import java.util.List;
import java.util.Map;

/**
 * Flink batch job that pages documents out of Elasticsearch via a scroll
 * cursor ({@link EsRestClientService#queryDeviceListPage}), unions the pages
 * into one DataSet, counts occurrences per name (field 0), and writes the
 * result to a local text file.
 */
public class EsReadTest {

    /** Page size requested from ES; a shorter page signals the scroll is exhausted. */
    private static final int BATCH_SIZE = 10000;
    /** Safety cap on the number of pages pulled in one run. */
    private static final int MAX_BATCHES = 4;

    private static EsRestClientService esRestClientService = new EsRestClientService();

    public static void main(String[] args) throws Exception {

        // Set up the batch execution environment.
        final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        String scrollId = null;
        DataSet<Tuple3<String, String, Integer>> dataSet = null;
        int count = 0;

        // "none" is the sentinel the service returns when the cursor is unusable.
        while (!"none".equals(scrollId) && count < MAX_BATCHES) {

            Map<String, Object> map = esRestClientService.queryDeviceListPage(scrollId);

            List<Tuple3<String, String, Integer>> dataList = null;
            if (map.get("tupleList") instanceof List) {
                dataList = (List<Tuple3<String, String, Integer>>) map.get("tupleList");
            }
            // BUG FIX: String.valueOf tolerates a missing/null scrollId; the
            // original .toString() threw NPE when the service omitted the key.
            scrollId = String.valueOf(map.get("scrollId"));

            if (dataList == null || dataList.isEmpty()) {
                break;
            }

            // BUG FIX: the original broke out of the loop BEFORE adding a partial
            // (< BATCH_SIZE) page, silently dropping the last batch of data — and
            // NPE'd below on groupBy when the very first page was partial.
            // Add the page first, then decide whether to keep scrolling.
            DataSet<Tuple3<String, String, Integer>> dataSetTemp = env.fromCollection(dataList);
            dataSet = (dataSet == null) ? dataSetTemp : dataSet.union(dataSetTemp);
            ++count;

            if (dataList.size() < BATCH_SIZE) {
                break; // short page: scroll exhausted, no need for another round-trip
            }
        }

        // BUG FIX: guard against an empty result set instead of NPE'ing on groupBy.
        if (dataSet == null) {
            System.out.println("no data read from ES, nothing to compute");
            return;
        }

        // Aggregation rule: group by name (field 0) and sum the per-record
        // counter (field 2), yielding an occurrence count per name.
        dataSet = dataSet.groupBy(0).sum(2);

        String output = "/opt/flink-data/esoutput2.txt";
        FileUtils.deleteFileOrDirectory(new File(output));
        dataSet.writeAsText(output);

        env.execute("read es");
    }
}

2、游标方式读取es

package es;

import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.http.HttpHost;
import org.elasticsearch.action.search.*;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestClientBuilder;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;


/**
 * 阿里云服务器搭建的ES服务
 *
 * @author lizixian
 * @date 2020/3/16 10:41
 */
/**
 * Reads documents from an Elasticsearch node (self-hosted on an Aliyun server)
 * using the scroll API, mapping each hit to a Flink {@code Tuple3<name, city, 1>}.
 *
 * <p>NOTE(review): uses the pre-7.x high-level REST client API
 * ({@code search(SearchRequest)} without {@code RequestOptions},
 * {@code setMaxRetryTimeoutMillis}) — confirm against the ES client version
 * on the classpath before upgrading.
 *
 * @author lizixian
 * @date 2020/3/16 10:41
 */
public class EsRestClientService {

    /** Connect / socket / request timeout, 10 minutes. */
    private static final int TIMEOUT_MILLIS = 10 * 60 * 1000;
    /** Documents requested per scroll page. */
    private static final int PAGE_SIZE = 10000;
    /** How long ES keeps the scroll context alive between pages. */
    private static final String SCROLL_KEEP_ALIVE = "2m";
    /** Sentinel scrollId telling callers the cursor is finished or unusable. */
    private static final String SCROLL_DONE = "none";

    private String host = "172.168.0.138:9200";
    private String scheme = "http";
    private String index = "es_index";
    private String type = "es_type";
    private RestClientBuilder builder = null;
    private RestHighLevelClient client = null;

    /**
     * Builds the high-level REST client with generous timeouts. Called lazily
     * from the query path; the client is kept open for the lifetime of this
     * service so successive scroll pages reuse the same connection.
     */
    public void init() {
        String[] nodeIpInfos = host.split(":");
        builder = RestClient.builder(new HttpHost(nodeIpInfos[0], Integer.parseInt(nodeIpInfos[1]), scheme))
                .setRequestConfigCallback(requestConfigBuilder -> {
                    requestConfigBuilder.setConnectTimeout(TIMEOUT_MILLIS);
                    requestConfigBuilder.setSocketTimeout(TIMEOUT_MILLIS);
                    requestConfigBuilder.setConnectionRequestTimeout(TIMEOUT_MILLIS);
                    return requestConfigBuilder;
                }).setMaxRetryTimeoutMillis(TIMEOUT_MILLIS);
        client = new RestHighLevelClient(builder);
    }

    /**
     * Fetches one page of the device application-install list via a scroll cursor.
     *
     * @param scrollId cursor from the previous page, or {@code null} to start a new scroll
     * @return map with keys {@code "scrollId"} (String, {@value #SCROLL_DONE} when the
     *         cursor is finished/unusable) and {@code "tupleList"}
     *         (List of {@code Tuple3<name, city, 1>}; empty on error or end of data)
     * @author lizixian
     * @date 2020/5/10 18:01
     */
    public Map<String, Object> queryDeviceListPage(String scrollId) {
        // Build the per-page search: match-all with the page size capped.
        SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
        sourceBuilder.size(PAGE_SIZE);
        BoolQueryBuilder bool = QueryBuilders.boolQuery();

        // Brand filter intentionally disabled; re-enable to restrict the platform:
//        bool.must(QueryBuilders.termQuery("brand", "CH"));

        sourceBuilder.query(bool);
        return queryDeviceListPageResult(sourceBuilder, scrollId);
    }

    private Map<String, Object> queryDeviceListPageResult(SearchSourceBuilder sourceBuilder, String scrollId) {
        SearchRequest searchRequest = new SearchRequest(index)
                .types(type)
                .scroll(SCROLL_KEEP_ALIVE)
                .source(sourceBuilder);
        if (client == null) {
            init();
        }
        Map<String, Object> resultMap = new HashMap<>(5);
        List<Tuple3<String, String, Integer>> tupleList = new ArrayList<>();
        // BUG FIX: pre-populate safe defaults so callers never NPE on a missing
        // "scrollId" key — the original returned an EMPTY map when an
        // IOException was thrown mid-request.
        resultMap.put("scrollId", SCROLL_DONE);
        resultMap.put("tupleList", tupleList);
        try {
            SearchResponse response;

            if (scrollId != null) {
                // Continue an existing scroll.
                SearchScrollRequest scrollRequest = new SearchScrollRequest(scrollId).scroll(SCROLL_KEEP_ALIVE);
                response = client.searchScroll(scrollRequest);
            } else {
                // First page: open a new scroll context.
                response = client.search(searchRequest);
            }

            if (response.status().getStatus() == RestStatus.OK.getStatus()) {
                SearchHit[] hits = response.getHits().getHits();
                scrollId = response.getScrollId();
                System.out.println("*********************查询es结果");
                if (hits != null) {
                    for (SearchHit hit : hits) {
                        System.out.println("*********************查询es结果:" + hit.getSourceAsString());
                        JSONObject json = JSONObject.parseObject(hit.getSourceAsString());
                        tupleList.add(new Tuple3<>(json.getString("name"), json.getString("city"), 1));
                    }
                }
                // Guard against a null cursor from the response.
                resultMap.put("scrollId", scrollId != null ? scrollId : SCROLL_DONE);
            } else {
                // Non-OK status: release the server-side scroll context.
                // addScrollId can be called repeatedly (or setScrollIds for a batch).
                ClearScrollRequest clearScrollRequest = new ClearScrollRequest();
                clearScrollRequest.addScrollId(scrollId);
                ClearScrollResponse clearScrollResponse = client.clearScroll(clearScrollRequest);
                if (!clearScrollResponse.isSucceeded()) {
                    System.out.println("clear scroll failed for scrollId=" + scrollId);
                }
                // BUG FIX: the cursor was just cleared — report SCROLL_DONE instead
                // of handing the now-invalid scrollId back to the caller.
            }
        } catch (IOException e) {
            // Best-effort: log and fall through with the safe defaults above.
            e.printStackTrace();
        }
        return resultMap;
    }
}
  • 3
    点赞
  • 20
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 2
    评论
我可以回答这个问题。以下是一个Java实现Flink读取HDFS下多目录文件的例子: ``` import org.apache.flink.api.common.functions.FlatMapFunction; import org.apache.flink.api.java.DataSet; import org.apache.flink.api.java.ExecutionEnvironment; import org.apache.flink.api.java.io.TextInputFormat; import org.apache.flink.api.java.tuple.Tuple2; import org.apache.flink.core.fs.Path; import org.apache.flink.util.Collector; public class FlinkHDFSExample { public static void main(String[] args) throws Exception { final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<String> text = env.readTextFile("hdfs://localhost:9000/path/to/directory1/,hdfs://localhost:9000/path/to/directory2/") .withParameters(new Configuration().setBoolean("recursive.file.enumeration", true)); DataSet<Tuple2<String, Integer>> counts = text.flatMap(new Tokenizer()) .groupBy(0) .sum(1); counts.print(); } public static final class Tokenizer implements FlatMapFunction<String, Tuple2<String, Integer>> { @Override public void flatMap(String value, Collector<Tuple2<String, Integer>> out) { String[] tokens = value.toLowerCase().split("\\W+"); for (String token : tokens) { if (token.length() > 0) { out.collect(new Tuple2<>(token, 1)); } } } } } ``` 这个例子使用Flink的`readTextFile`方法读取HDFS下的多个目录中的文件,并使用`Tokenizer`函数对文件进行分词,最后统计每个单词出现的次数。注意,需要在`readTextFile`方法中设置`recursive.file.enumeration`参数为`true`,以便递归地读取所有子目录中的文件。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

_小李哥

你的鼓励将是我创作的最大动力

¥2 ¥4 ¥6 ¥10 ¥20
输入1-500的整数
余额支付 (余额:-- )
扫码支付
扫码支付:¥2
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、C币套餐、付费专栏及课程。

余额充值