Flink: reading JSON data from Kafka and sinking it to Elasticsearch 7.x

1. Overview

Use Flink to read JSON data from Kafka, write it into Elasticsearch 7.x,
and then run a simple check on the result.
Last updated: January 24, 2021

2. Dependencies

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>cn.cloudfall</groupId>
    <artifactId>flink_elk</artifactId>
    <version>1.0-SNAPSHOT</version>
	
	<!-- Version management -->
    <properties>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <encoding>UTF-8</encoding>
        <scala.version>2.11.12</scala.version>
        <scala.compat.version>2.11</scala.compat.version>
        <flink.version>1.10.0</flink.version>
        <scala.binary.version>2.11</scala.binary.version>
        <iheart.version>1.4.3</iheart.version>
        <fastjson.version>1.2.7</fastjson.version>
    </properties>

    <dependencies>
        <!-- Scala library dependency -->
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <!-- Flink streaming Scala API dependency -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_${scala.compat.version}</artifactId>
            <version>${flink.version}</version>
            <exclusions>
                <exclusion>
                    <artifactId>scala-library</artifactId>
                    <groupId>org.scala-lang</groupId>
                </exclusion>
            </exclusions>
        </dependency>
        <!-- Flink Scala API dependency -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-scala_${scala.compat.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <!-- Flink client API -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_${scala.compat.version}</artifactId>
            <version>${flink.version}</version>
            <exclusions>
                <exclusion>
                    <artifactId>scala-parser-combinators_${scala.compat.version}</artifactId>
                    <groupId>org.scala-lang.modules</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>slf4j-api</artifactId>
                    <groupId>org.slf4j</groupId>
                </exclusion>
            </exclusions>
        </dependency>

        <!-- Optional: flink-table dependency -->
        <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-table -->
        <!--
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table</artifactId>
            <version>${flink.version}</version>
            <type>pom</type>
            <scope>provided</scope>
        </dependency>
        -->
        <!-- Flink Elasticsearch 7 connector dependency -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-elasticsearch7_${scala.compat.version}</artifactId>
            <version>${flink.version}</version>
            <exclusions>
                <exclusion>
                    <artifactId>slf4j-api</artifactId>
                    <groupId>org.slf4j</groupId>
                </exclusion>
            </exclusions>
        </dependency>

        <!-- Flink Kafka connector dependency -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka-0.11_${scala.compat.version}</artifactId>
            <version>${flink.version}</version>
            <exclusions>
                <exclusion>
                    <artifactId>slf4j-api</artifactId>
                    <groupId>org.slf4j</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>snappy-java</artifactId>
                    <groupId>org.xerial.snappy</groupId>
                </exclusion>
            </exclusions>
        </dependency>


        <!-- Optional: fastjson dependency -->
        <!--
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.60</version>
        </dependency>
        -->
        <!-- Google Gson dependency -->
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.3.1</version>
        </dependency>

    </dependencies>
   <!-- Build and packaging plugins -->
    <build>
        <sourceDirectory>src/main/scala</sourceDirectory>
        <testSourceDirectory>src/test/scala</testSourceDirectory>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.5.1</version>
                <configuration>
                    <source>${maven.compiler.source}</source>
                    <target>${maven.compiler.target}</target>
                    <!--<encoding>${project.build.sourceEncoding}</encoding>-->
                </configuration>
            </plugin>

            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.0</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                        <configuration>
                            <args>
                                <!--<arg>-make:transitive</arg>-->
                                <arg>-dependencyfile</arg>
                                <arg>${project.build.directory}/.scala_dependencies</arg>
                            </args>

                        </configuration>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.18.1</version>
                <configuration>
                    <useFile>false</useFile>
                    <disableXmlReport>true</disableXmlReport>
                    <includes>
                        <include>**/*Test.*</include>
                        <include>**/*Suite.*</include>
                    </includes>
                </configuration>
            </plugin>
			
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <!--
                                        zip -d learn_spark.jar META-INF/*.RSA META-INF/*.DSA META-INF/*.SF
                                        -->
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                            <transformers>
                                <transformer
                                        implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>FLink_Kafka_ES</mainClass>
                                </transformer>
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

</project>
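
With these dependencies in place, the Gson conversion used later in the map() operator can be sanity-checked locally. The following is a minimal standalone sketch (the object name GsonMapCheck is made up for illustration and is not part of the original job); it parses the sample record used later in this article:

import java.util
import com.google.gson.Gson
import scala.collection.JavaConverters._

// Standalone sketch, run outside of Flink, to verify the JSON-to-Map conversion.
object GsonMapCheck {
  def main(args: Array[String]): Unit = {
    // Sample record, the same one later fed through the Kafka console producer
    val json = """{"id":1,"completed":false,"title":"delectus aut autem","userId":1}"""
    // Same conversion as in the map() operator: JSON string -> java.util.Map -> immutable Scala Map
    val parsed: Map[String, AnyRef] =
      (new Gson).fromJson(json, classOf[util.Map[String, AnyRef]]).asScala.toMap
    // Note: Gson parses JSON numbers as java.lang.Double, so "id" prints as 1.0
    println(parsed)
  }
}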

3. Full code

import java.util
import java.util.Properties

import com.google.gson.Gson
import org.apache.flink.api.common.functions.RuntimeContext
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.elasticsearch.{ActionRequestFailureHandler, ElasticsearchSinkBase, ElasticsearchSinkFunction, RequestIndexer}
import org.apache.flink.streaming.connectors.elasticsearch7.{ElasticsearchSink, RestClientFactory}
import org.apache.flink.streaming.util.serialization.SimpleStringSchema
import org.apache.flink.util.ExceptionUtils
import org.apache.http.HttpHost
import org.apache.http.auth.{AuthScope, UsernamePasswordCredentials}
import org.apache.http.impl.client.BasicCredentialsProvider
import org.apache.http.impl.nio.client.HttpAsyncClientBuilder
import org.elasticsearch.ElasticsearchParseException
import org.elasticsearch.action.ActionRequest
import org.elasticsearch.action.index.IndexRequest
import org.elasticsearch.client.RestClientBuilder.HttpClientConfigCallback
import org.elasticsearch.client.{Requests, RestClientBuilder}
import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException

/**
* 红尘丶世界
*
*/
object FLink_Kafka_ES {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // Very important: be sure to enable checkpointing!!
    env.enableCheckpointing(1000)

    // Kafka topic
    val topic: String = "test"
    // Kafka consumer properties
    val props: Properties = new Properties
    props.setProperty("bootstrap.servers", "hadoop01:9092,hadoop02:9092,hadoop03:9092")
    props.setProperty("group.id", "test01")
    props.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    props.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    
    // Import implicit conversions
    import org.apache.flink.streaming.connectors.kafka._
    import org.apache.flink.api.scala._
    import scala.collection.JavaConverters._

    val consumer: FlinkKafkaConsumer011[String] = new FlinkKafkaConsumer011[String](topic, new SimpleStringSchema(), props)
    // Start consuming from the latest offsets
    consumer.setStartFromLatest()
    // Build the Kafka source
    val kafkaSource: DataStream[String] = env.addSource(consumer)
    // Transform: parse each JSON record
    val mapDS: DataStream[Map[String, AnyRef]] = kafkaSource.map(x => {
      // Use Gson to parse the JSON string into a Map
      (new Gson).fromJson(x, classOf[util.Map[String, AnyRef]]).asScala.toMap
    })

    // Elasticsearch nodes
    val httpHosts: util.ArrayList[HttpHost] = new java.util.ArrayList[HttpHost]
    httpHosts.add(new HttpHost("192.168.100.111", 9200, "http"))
    // Build the ES sink
    val esSinkBuilder: ElasticsearchSink.Builder[Map[String, AnyRef]] = new ElasticsearchSink.Builder[Map[String, AnyRef]](
      httpHosts,
      new ElasticsearchSinkFunction[Map[String, AnyRef]] {
        override def process(t: Map[String, AnyRef], runtimeContext: RuntimeContext, requestIndexer: RequestIndexer): Unit = {
          val map: util.Map[String, AnyRef] = t.asJava
          val indexRequest: IndexRequest = Requests
            .indexRequest()
            .index("flink_kafka")
            //.`type`("kafka_data") // optional; ES 7.x no longer requires a document type
            //.create(false) // whether to force index creation; not recommended, it is better to define the mapping in ES in advance, although ES can also auto-create the index if your fields (e.g. dates) are recognized correctly
            // Because of ES field-naming restrictions, dotted field names cannot be written directly
            // If you need the x.x naming format, consider nested maps or nested JSON
            // When using nested maps, make sure every map is converted to java.util.Map, otherwise a type exception is thrown
            .source(map)
          // Add the request to the indexer, which writes it to ES
          requestIndexer.add(indexRequest)
          // Log the write (printed when the request is queued, not when ES acknowledges it)
          println("data saved successfully")
        }
      })
    // The following settings are intended for production use; ES fault tolerance requires Flink checkpointing to be enabled
    // Flush after every element (disables buffering)
    //esSinkBuilder.setBulkFlushMaxActions(1)
    // Configure the underlying ES REST client
    esSinkBuilder.setRestClientFactory(
      new RestClientFactory {
        override def configureRestClientBuilder(restClientBuilder: RestClientBuilder): Unit = {
          restClientBuilder.setHttpClientConfigCallback(new HttpClientConfigCallback {
            override def customizeHttpClient(httpClientBuilder: HttpAsyncClientBuilder): HttpAsyncClientBuilder = {
              val provider: BasicCredentialsProvider = new BasicCredentialsProvider()
              // Set the username and password
              val credentials: UsernamePasswordCredentials = new UsernamePasswordCredentials("elastic", "123456")
              provider.setCredentials(AuthScope.ANY, credentials)
              httpClientBuilder.setDefaultCredentialsProvider(provider)
            }
          })
        }
      })
    // Delay between retries; for the exponential backoff type this is the initial base delay
    esSinkBuilder.setBulkFlushBackoffDelay(1)
    // Number of retries after a failed bulk request
    esSinkBuilder.setBulkFlushBackoffRetries(3)
    // Backoff type, one of two kinds:
    // a) EXPONENTIAL: the delay between retries grows exponentially, e.g. 2 -> 4 -> 8 ...
    // b) CONSTANT: the delay between retries is a fixed constant, e.g. 2 -> 2 -> 2 ...
    esSinkBuilder.setBulkFlushBackoffType(ElasticsearchSinkBase.FlushBackoffType.EXPONENTIAL)
    // Bulk flush interval in milliseconds
    //esSinkBuilder.setBulkFlushInterval(100)
    // Number of records per bulk request; 1 means a request is sent to ES for every element
    esSinkBuilder.setBulkFlushMaxActions(1)
    // Maximum bulk size in MB
    //esSinkBuilder.setBulkFlushMaxSizeMb(16)
    // ES failure handling
    esSinkBuilder.setFailureHandler(
      new ActionRequestFailureHandler {
        override def onFailure(actionRequest: ActionRequest, throwable: Throwable, i: Int, requestIndexer: RequestIndexer): Unit = {
          if (ExceptionUtils.findThrowable(throwable, classOf[EsRejectedExecutionException]).isPresent) {
            // full queue; re-add document for indexing
            requestIndexer.add(actionRequest)
          } else if (ExceptionUtils.findThrowable(throwable, classOf[ElasticsearchParseException]).isPresent) {
            // malformed document; simply drop request without failing sink
            println("WARN: malformed document, dropping the request")
          } else {
            // for all other failures, fail the sink;
            // here the failure is simply rethrown, but users can also choose to throw custom exceptions
            println("ES failure")
            throw throwable
          }
        }
      }
    )
    // Set the maximum parallelism
    mapDS.setMaxParallelism(1)
    // Sink the data to ES
    mapDS.addSink(esSinkBuilder.build())

    // Produce test data with the Kafka console producer:
    // $KAFKA_HOME/bin/kafka-console-producer.sh --broker-list hadoop01:9092,hadoop02:9092,hadoop03:9092 --topic test
    // Example record to type into the producer:
    // {"id":1,"completed":false,"title":"delectus aut autem","userId":1}

    // List the indices:
    // GET _cat/indices
    // Query the contents of the index:
    // GET flink_kafka/_search
    env.execute("Kafka_Flink")
  }
}
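
The commented-out .create(false) option above assumes the flink_kafka index and its mapping are defined in ES ahead of time. Below is a minimal sketch of how that could be done with the REST high-level client that the flink-connector-elasticsearch7 dependency already pulls in; the object name CreateFlinkKafkaIndex and the field types are assumptions derived from the sample record, not part of the original article:

import org.apache.http.HttpHost
import org.apache.http.auth.{AuthScope, UsernamePasswordCredentials}
import org.apache.http.impl.client.BasicCredentialsProvider
import org.apache.http.impl.nio.client.HttpAsyncClientBuilder
import org.elasticsearch.client.RestClientBuilder.HttpClientConfigCallback
import org.elasticsearch.client.indices.CreateIndexRequest
import org.elasticsearch.client.{RequestOptions, RestClient, RestHighLevelClient}
import org.elasticsearch.common.xcontent.XContentType

// Standalone sketch: create the flink_kafka index with an explicit mapping before starting the job.
object CreateFlinkKafkaIndex {
  def main(args: Array[String]): Unit = {
    // Same node and credentials as in the sink configuration above (elastic / 123456)
    val builder = RestClient.builder(new HttpHost("192.168.100.111", 9200, "http"))
      .setHttpClientConfigCallback(new HttpClientConfigCallback {
        override def customizeHttpClient(b: HttpAsyncClientBuilder): HttpAsyncClientBuilder = {
          val provider = new BasicCredentialsProvider()
          provider.setCredentials(AuthScope.ANY, new UsernamePasswordCredentials("elastic", "123456"))
          b.setDefaultCredentialsProvider(provider)
        }
      })
    val client = new RestHighLevelClient(builder)
    try {
      val request = new CreateIndexRequest("flink_kafka")
      // Assumed field types: Gson parses JSON numbers as doubles, so numeric fields are mapped as double here
      request.mapping(
        """{
          |  "properties": {
          |    "id":        { "type": "double" },
          |    "userId":    { "type": "double" },
          |    "completed": { "type": "boolean" },
          |    "title":     { "type": "text" }
          |  }
          |}""".stripMargin,
        XContentType.JSON)
      val response = client.indices().create(request, RequestOptions.DEFAULT)
      println(s"index created, acknowledged = ${response.isAcknowledged}")
    } finally {
      client.close()
    }
  }
}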

4. Final notes

Please credit the source when reposting.
If this article helped you, please give it a like!
If you run into any problems, leave a comment below and we can work through them together.
