flink消费kafka消息,处理后保存入es
一、引入依赖
<properties>
    <flink.version>1.7.2</flink.version>
    <java.version>1.8</java.version>
    <scala.binary.version>2.11</scala.binary.version>
</properties>
<dependencies>
    <!-- Apache Flink dependencies -->
    <!-- These dependencies are provided, because they should not be packaged into the JAR file. -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-java</artifactId>
        <version>${flink.version}</version>
        <!--<scope>provided</scope>-->
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-clients -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-clients_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-table -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
        <!--<scope>provided</scope>-->
    </dependency>
    <!-- FIX: was hard-coded to _2.12, which mixes Scala binary versions with the
         _2.11 artifacts above and breaks at runtime; use the shared property. -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-kafka-0.11_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
        <!--<scope>compile</scope>-->
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-jdbc -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-jdbc_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-json</artifactId>
        <version>${flink.version}</version>
        <!--<scope>test</scope>-->
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-runtime_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
        <!--<scope>test</scope>-->
    </dependency>
    <dependency>
        <groupId>ch.qos.logback</groupId>
        <artifactId>logback-classic</artifactId>
        <version>1.2.3</version>
        <!--<scope>test</scope>-->
    </dependency>
    <!-- https://mvnrepository.com/artifact/ch.qos.logback/logback-core -->
    <dependency>
        <groupId>ch.qos.logback</groupId>
        <artifactId>logback-core</artifactId>
        <version>1.2.3</version>
    </dependency>
    <!-- Elasticsearch 6 connector -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-elasticsearch6_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka-clients</artifactId>
        <version>2.3.0</version>
    </dependency>
    <!-- json -->
    <!-- SECURITY: fastjson 1.2.5 is affected by multiple well-known deserialization
         RCE vulnerabilities; 1.2.83 is the patched release of the 1.2.x line. -->
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.83</version>
    </dependency>
</dependencies>
二、将日志保存入ES中
package com.test.monitor.util;
import com.imassbank.monitor.RestClientFactoryImpl;
import org.apache.commons.configuration2.Configuration;
import org.apache.commons.configuration2.builder.fluent.Configurations;
import org.apache.flink.api.common.functions.RuntimeContext;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.elasticsearch.ElasticsearchSinkFunction;
import org.apache.flink.streaming.connectors.elasticsearch.RequestIndexer;
import org.apache.flink.streaming.connectors.elasticsearch.util.RetryRejectedExecutionFailureHandler;
import org.apache.flink.streaming.connectors.elasticsearch6.ElasticsearchSink;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011;
import org.apache.http.HttpHost;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.Requests;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.util.*;
/**
 * Flink job that reads raw log lines from a Kafka topic and writes each line
 * into Elasticsearch 6 as a one-field document ({@code {"data": <line>}}).
 *
 * @since 2020/4/21 9:58
 */
public class SinkToES6 {
    private static final Logger log = LoggerFactory.getLogger(SinkToES6.class);

    /**
     * Builds and runs the streaming pipeline: Kafka source -> print -> Elasticsearch sink.
     *
     * @throws Exception if the Flink job fails to start or execute
     */
    public static void sinkEs() throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        Properties props = new Properties();
        props.put("bootstrap.servers", "localhost:9092");
        // NOTE(review): "zookeeper.connect" is ignored by the new Kafka consumer API;
        // kept only because the original configuration supplied it.
        props.put("zookeeper.connect", "localhost:2181");
        props.put("group.id", "test");
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("auto.offset.reset", "latest");

        // The topic name must match the topic used by the producer test utility.
        DataStreamSource<String> data = env.addSource(new FlinkKafkaConsumer011<>(
                "testTopic",
                new SimpleStringSchema(),
                props)).setParallelism(1);
        data.print();
        // FIX: parameterized logging instead of eager string concatenation.
        log.info("data:{}", data);

        List<HttpHost> esHttphost = new ArrayList<>();
        // FIX: HttpHost takes a primitive int port; Integer.valueOf boxing was redundant.
        esHttphost.add(new HttpHost("127.0.0.1", 9200, "http"));

        ElasticsearchSink.Builder<String> esSinkBuilder = new ElasticsearchSink.Builder<>(
                esHttphost,
                new ElasticsearchSinkFunction<String>() {
                    /** Wraps one raw log line into an index request for index-log/log-monitor. */
                    public IndexRequest createIndexRequest(String element) {
                        Map<String, String> json = new HashMap<>();
                        json.put("data", element);
                        log.info("data:{}", element);
                        return Requests.indexRequest()
                                .index("index-log")
                                .type("log-monitor")
                                .source(json);
                    }

                    @Override
                    public void process(String element, RuntimeContext ctx, RequestIndexer indexer) {
                        indexer.add(createIndexRequest(element));
                    }
                }
        );
        // Flush after every single action so documents show up immediately
        // (fine for a demo; raise this for production throughput).
        esSinkBuilder.setBulkFlushMaxActions(1);
        esSinkBuilder.setRestClientFactory(new RestClientFactoryImpl());
        // Retry bulk requests rejected by a full ES queue instead of failing the job.
        esSinkBuilder.setFailureHandler(new RetryRejectedExecutionFailureHandler());

        data.addSink(esSinkBuilder.build());
        env.execute("flink learning connectors kafka");
    }
}
配置builder
import org.apache.flink.streaming.connectors.elasticsearch6.RestClientFactory;
import org.apache.http.Header;
import org.apache.http.message.BasicHeader;
import org.elasticsearch.client.RestClientBuilder;
/**
 * Customizes the low-level Elasticsearch REST client used by the Flink ES6 sink:
 * sets a default JSON Content-Type header and a 90s max retry timeout.
 *
 * @author duanjx
 * @since 2020/4/21 10:02
 */
public class RestClientFactoryImpl implements RestClientFactory {

    @Override
    public void configureRestClientBuilder(RestClientBuilder restClientBuilder) {
        // Additional default headers can be appended to this array.
        restClientBuilder.setDefaultHeaders(
                new Header[]{new BasicHeader("Content-Type", "application/json")});
        restClientBuilder.setMaxRetryTimeoutMillis(90000);
    }
}
三、测试入口 kafka producer
import org.apache.commons.configuration2.Configuration;
import org.apache.commons.configuration2.builder.fluent.Configurations;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Properties;
/**
 * Local test driver: reads a log file line by line, publishes every line to the
 * "testTopic" Kafka topic, then launches the Flink job that sinks the topic into ES.
 */
public class KafkaProducer {

    /**
     * Reads the file at {@code path} line by line.
     *
     * @param path file-system path of the log file
     * @return all lines in file order; empty list if the file cannot be read
     */
    public static ArrayList<String> fileReader(String path) {
        ArrayList<String> arrayList = new ArrayList<>();
        // FIX: try-with-resources closes the reader even when readLine throws
        // (the original leaked the streams on IOException). An explicit charset
        // avoids platform-default decoding surprises.
        try (BufferedReader bf = new BufferedReader(
                new InputStreamReader(new FileInputStream(new File(path)), StandardCharsets.UTF_8))) {
            String str;
            while ((str = bf.readLine()) != null) {
                arrayList.add(str);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return arrayList;
    }

    /**
     * Publishes every string in {@code log} to the "testTopic" topic with key "log".
     *
     * @param log lines to send
     * @throws Exception if the producer cannot be created or closed
     */
    public static void kafkaClint(ArrayList<String> log) throws Exception {
        Properties props = new Properties();
        props.put("bootstrap.servers", "localhost:9092");
        // acks=all: wait for the leader AND in-sync replicas to acknowledge each record.
        props.put("acks", "all");
        // retries=0: never resend on failure, so record order within a partition
        // can never be reordered by retries.
        props.put("retries", 0);
        // Batch records per partition up to this many bytes to cut request count.
        props.put("batch.size", 16384);
        // Wait up to 1 ms for more records before sending a partial batch
        // (trades a little latency for fewer, larger requests).
        props.put("linger.ms", 1);
        // Total memory the producer may use to buffer unsent records; once full,
        // send() blocks or throws.
        props.put("buffer.memory", 33554432);
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");

        // FIX: try-with-resources guarantees close() (which flushes pending sends)
        // even if a send() throws; the original skipped close() on exception.
        try (Producer<String, String> producer =
                     new org.apache.kafka.clients.producer.KafkaProducer<>(props)) {
            for (String item : log) {
                // send() is asynchronous; close() below flushes outstanding records.
                producer.send(new ProducerRecord<String, String>("testTopic", "log", item));
            }
        }
    }

    public static void main(String[] args) {
        ArrayList<String> log = fileReader("E:\\catalina.log");
        try {
            // Publish the log lines to Kafka.
            kafkaClint(log);
            // Run the Flink job that sinks them into Elasticsearch.
            SinkToES6.sinkEs();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}