自定义flink es source

1、 需求

       增量导入elasticsearch的数据到kafka。

2、 解决方式

      1) 自定义一个flume的essource

      2)使用spark 的 es rdd

      3) 自定义flink的es source

 

3、解决问题

1) 思路:es中的数据有一个sendTime。也就是发送到es的时间。我们就根据这个时间来增量采集数据。使用es的

transport api。并且使用scroll api来分页。所以我们使用自定义es source。首先我们是要继承SourceFunction这个类。在run方法中实现查找逻辑。
   
   

2)注意点

假如我们的程序挂掉了怎么办。怎么知道我们采集到了哪个时间段呢?~~

这个问题我是这样想的:首先我是5分钟采集一次。然后记录好每五分钟采集的条数、es的index、采集的时间段。采集成功了就写入到mysql表中做记录。失败也会写入一条失败记录。然后如果是因为异常采集失败了,那么就重新采集。采集三次还失败程序就直接退出。然后检查原因再重新启动程序。重新启动先去mysql读取上一次采集的位置,然后从下一次记录开始采集。

 

3)代码:es-source 是scala代码


   
   
  1. package com.rongan.source
  2. import java.util.Date
  3. import com.rongan.commos.{DateUtils, EsUtils, PropertiesUtil}
  4. import com.rongan.constants.Constants
  5. import com.rongan.dao.EsExportRecordDAO
  6. import com.rongan.model.EsExportRecord
  7. import org.apache.flink.streaming.api.functions.source.SourceFunction
  8. import org.elasticsearch.search.SearchHit
  9. import scala.util.control.Breaks.{ break, breakable}
  /**
   * Custom Flink source that incrementally exports Elasticsearch documents by walking the
   * `send_time.keyword` field forward in consecutive five-minute windows.
   *
   * Each window is recorded through `EsExportRecordDAO.updateRecord`; the sixth field of the
   * record is 1 for a completed window and 0 for a failed one. A failed window is retried, and
   * the source stops itself after three consecutive failures.
   *
   * @param clusterName Elasticsearch cluster name
   * @param esNode      Elasticsearch node host
   * @param esPort      Elasticsearch communication port
   * @param index       name of the index to read from
   * @param type1       document type
   * @param fromDate    start of the first window, as a string that `DateUtils.strToDate` can parse
   */
  class EsSource(val clusterName: String, val esNode: String, val esPort: Int, val index: String, val type1: String, var fromDate: String) extends SourceFunction[String] {

    // Written by cancel() from a different thread than the one looping in run(), so it must be
    // volatile for the loop to observe the change.
    @volatile var isRunning = true

    // NOTE: the client is intentionally NOT created here. run() creates it; creating one in the
    // constructor as well only produced a client that was replaced (and never closed) later.
    val properties = PropertiesUtil.getProperties(Constants.PROPERTIES_PATH)

    /**
     * Collects one five-minute window after another until the source is cancelled or three
     * consecutive windows fail.
     *
     * @param sourceContext Flink context that receives every document as a JSON string
     */
    override def run(sourceContext: SourceFunction.SourceContext[String]): Unit = {
      EsUtils.getClient(clusterName, esNode, esPort)

      var windowStart = fromDate
      // Stays false until a window succeeds, so the first pass and every retry reuse windowStart.
      var advance = false
      var errorCount = 0

      while (isRunning) {
        if (advance) {
          windowStart = DateUtils.targetFormat(DateUtils.add5Minute(DateUtils.strToDate(windowStart)))
        }
        // The window ends one second before the next window starts, so windows never overlap.
        val windowEnd = DateUtils.targetFormat(
          DateUtils.subtraction1second(DateUtils.add5Minute(DateUtils.strToDate(windowStart))))

        try {
          val startTime = DateUtils.targetFormat(new Date())
          println(s"start collection data index = $index send_time (start)= $windowStart send_time (end)= " +
            s"$windowEnd currentTime$startTime")

          val count: Int = collect(sourceContext, windowStart, windowEnd)

          // If cancel() arrived while scrolling, the window may be incomplete. Do not record it
          // as a success, otherwise a restart would skip the documents that were not read.
          if (isRunning) {
            val endTime = DateUtils.targetFormat(new Date())
            EsExportRecordDAO.updateRecord(
              EsExportRecord(windowStart, windowEnd, count, startTime, endTime, 1, index))
            errorCount = 0
            advance = true
            println(s"end of data collection index = $index send_time (start)= $windowStart send_time (end)= " +
              s"$windowEnd currentTime $endTime count data = $count")
            Thread.sleep(properties.getProperty(Constants.ES_COLLECT_INTERVAL).toLong)
          }
        } catch {
          case _: InterruptedException =>
            // The thread was interrupted (typically while sleeping during cancellation). This is
            // a shutdown signal, not a collection failure: keep the interrupt flag and stop.
            Thread.currentThread().interrupt()
            isRunning = false
          case e: Exception =>
            e.printStackTrace()
            errorCount += 1
            // Retry the same window on the next pass.
            advance = false
            println(s"采集数据出错 index = $index send_time (开始)= $windowStart send_time (结束)= $windowEnd")
            EsExportRecordDAO.updateRecord(
              EsExportRecord(windowStart, "00000000", 0, "00000000", "00000000", 0, index))
            // Give up after three consecutive failures.
            if (errorCount >= 3) {
              cancel()
            }
        }
      }
    }

    /**
     * Emits every document whose `send_time.keyword` lies in the given range, paging through
     * the result with the scroll API.
     *
     * @param sourceContext Flink context that receives every document as a JSON string
     * @param fromDate      lower bound of the range
     * @param toDate        upper bound of the range
     * @return number of documents emitted; may cover only part of the range if the source was
     *         cancelled while scrolling
     */
    def collect(sourceContext: SourceFunction.SourceContext[String], fromDate: String, toDate: String): Int = {
      val (firstHits, firstScrollId) =
        EsUtils.searchByScrollRangeQuery(index, type1, "send_time.keyword", fromDate, toDate)
      var scrollID = firstScrollId
      try {
        firstHits.foreach(hit => sourceContext.collect(hit.getSourceAsString))
        var count = firstHits.length

        var hasMore = true
        while (isRunning && hasMore) {
          val (hits, nextScrollId) = EsUtils.searchByScrollId(scrollID)
          if (hits.isEmpty) {
            // An empty page marks the end of the scroll.
            hasMore = false
          } else {
            hits.foreach(hit => sourceContext.collect(hit.getSourceAsString))
            count += hits.length
            scrollID = nextScrollId
          }
        }
        count
      } finally {
        // Always release the scroll context, even when emitting or scrolling throws.
        EsUtils.clearScroll(scrollID)
      }
    }

    /** Asks the collection loops in `run` and `collect` to stop. */
    override def cancel(): Unit = {
      isRunning = false
    }
  }
  // Kafka topic: roi-center.incident.detail.topic
  /** Placeholder object; it has no members yet. */
  object EsCollect {}

4.整个项目代码请留言~。暂时就是实现这么多。如有更好的想法可以讨论讨论~

esutil代码:


   
   
  1. package rongan. util
  2. import org. elasticsearch. action. search.{ ClearScrollResponse, SearchRequestBuilder, SearchResponse}
  3. import org. elasticsearch. client. transport. TransportClient
  4. import org. elasticsearch. common. transport. TransportAddress
  5. import org. elasticsearch. common. unit. TimeValue
  6. import org. elasticsearch. index. query. QueryBuilders
  7. import org. elasticsearch. search. SearchHit
  8. import org. elasticsearch. search. sort. SortOrder
  9. import rongan. business. tornado. RsdTornadoIpcDeviceEsToHbase. properties
  10. import rongan. config. Constans
  11. import scala. util. control. Breaks.{ break, breakable}
  /**
   * Helpers around one shared Elasticsearch `TransportClient`: client creation, scroll-based
   * range queries, fetching further scroll pages, and clearing a scroll.
   *
   * The client is held in a single mutable field, so every caller in the process uses the
   * client created by the most recent `getClient` call.
   */
  object EsUtils {
    import java.net.InetAddress
    import org.elasticsearch.common.settings.Settings
    import org.elasticsearch.transport.client.PreBuiltTransportClient

    /** Scroll keep-alive, in milliseconds, sent with every scroll request. */
    private val ScrollKeepAliveMillis = 30000L

    /** Number of hits requested per scroll page. */
    private val ScrollPageSize = 10000

    /** Shared client; null until `getClient` has been called. */
    var client: TransportClient = _

    /**
     * Creates the shared client, replacing (and closing) any client created earlier.
     *
     * @param clusterName value for the `cluster.name` setting
     * @param host        host name or address of an Elasticsearch node
     * @param port        port of that node
     */
    def getClient(clusterName: String, host: String, port: Int): Unit = {
      // Close the client being replaced; previously it was simply overwritten and leaked.
      if (client != null) {
        client.close()
      }
      val settings: Settings = Settings.builder().put("cluster.name", clusterName).build
      client = new PreBuiltTransportClient(settings)
        .addTransportAddress(new TransportAddress(InetAddress.getByName(host), port))
    }

    /**
     * Starts a scroll over the documents whose `field` lies between `fromData` and `toData`.
     *
     * @param index    index name
     * @param `type`   document type
     * @param field    field the range condition applies to
     * @param fromData lower bound of the range
     * @param toData   upper bound of the range
     * @return the first page of hits together with the scroll id for the following pages
     */
    def searchByScrollRangeQuery(index: String, `type`: String, field: String, fromData: Any, toData: Any): (Array[SearchHit], String) = {
      // NOTE(review): the sort field is fixed to "send_time.keyword" and ignores `field`.
      // Confirm whether sorting by `field` was intended.
      val response: SearchResponse = client.prepareSearch()
        .setIndices(index)
        .setTypes(`type`)
        .setScroll(new TimeValue(ScrollKeepAliveMillis))
        .setQuery(QueryBuilders.rangeQuery(field).from(fromData).to(toData))
        .setSize(ScrollPageSize)
        .addSort("send_time.keyword", SortOrder.ASC)
        .get
      (response.getHits.getHits, response.getScrollId)
    }

    /**
     * Fetches the next page of a scroll; only one page is read per call.
     *
     * @param scrollId1 scroll id from the previous page; may be null
     * @return the page of hits and the scroll id to use next; an empty array and a null id when
     *         `scrollId1` is null
     */
    def searchByScrollId(scrollId1: String): (Array[SearchHit], String) =
      if (scrollId1 == null) {
        (Array[SearchHit](), null)
      } else {
        val response = client.prepareSearchScroll(scrollId1)
          // Renew the keep-alive on every page.
          .setScroll(new TimeValue(ScrollKeepAliveMillis))
          .get
        (response.getHits.getHits, response.getScrollId)
      }

    /**
     * Clears the given scroll; does nothing when the id is null.
     *
     * @param scrollId scroll id to clear; may be null
     */
    def clearScroll(scrollId: String): Unit = {
      if (scrollId != null) {
        val response: ClearScrollResponse = client.prepareClearScroll.addScrollId(scrollId).get
        response.isSucceeded
      }
    }

    /** Manual check: scrolls through a fixed range, printing every document and the totals. */
    def main(args: Array[String]): Unit = {
      // Both range bounds are inclusive, so when moving on to the next five-minute window the
      // end of the previous window has to be pulled back by one second.
      EsUtils.getClient(properties.getProperty(Constans.ES_CLUSTER_NAME), properties.getProperty(Constans.ES_NODE),
        properties.getProperty(Constans.ES_PORT).toInt)

      // NOTE(review): the upper bound below looks truncated ("19:15:2") — confirm the intended value.
      val (firstHits, firstScrollId) = searchByScrollRangeQuery("firewall.ipc.info*",
        "alert", "send_time.keyword", "2019-01-28 19:15:20", "2019-09-28 19:15:2")
      var count = firstHits.length
      var scrollID = firstScrollId
      println(count)
      firstHits.foreach(hit => println(hit.getSourceAsString))

      try {
        var hasMore = true
        while (hasMore) {
          val (hits, nextScrollId) = searchByScrollId(scrollID)
          if (hits.isEmpty) {
            hasMore = false
          } else {
            count += hits.length
            hits.foreach(hit => println(hit.getSourceAsString))
            scrollID = nextScrollId
          }
        }
        // Print the final total. Previously this statement followed `while (true)` inside
        // `breakable`, so `break` always jumped past it and it never ran.
        println(count)
      } finally {
        clearScroll(scrollID)
      }
    }
  }

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值