Writing from Flink to HBase

POM

    <dependencies>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_${scala.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-scala_${scala.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_${scala.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_2.12</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner_2.12</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner-blink_2.12</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-scala-bridge_2.12</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <!-- Kafka dependency -->
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka_2.10</artifactId>
            <version>0.8.2.1</version>
        </dependency>

        <!-- Scala dependency -->
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>2.12.8</version>
        </dependency>

        <!-- Scala reflect dependency -->
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-reflect</artifactId>
            <version>2.12.8</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.19</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-common</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-hbase_2.12</artifactId>
            <version>1.7.2</version>
        </dependency>
    </dependencies>
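
The dependency list above relies on several Maven properties that are not shown. One possible <properties> block is sketched below; the version numbers are assumptions and should be replaced with the versions that match your cluster (note that scala.version is used here as the Scala binary-version suffix in the artifactIds):

    <properties>
        <!-- Assumed versions; substitute the ones that match your cluster -->
        <scala.version>2.12</scala.version>
        <flink.version>1.13.6</flink.version>
        <hadoop.version>2.7.7</hadoop.version>
        <hbase.version>1.3.6</hbase.version>
    </properties>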

Code

package cl.flink

import com.alibaba.fastjson._
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.sink.{RichSinkFunction, SinkFunction}
import org.apache.hadoop.hbase.{HBaseConfiguration, HConstants, TableName}
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.flink.api.common.serialization.SimpleStringSchema

import java.time.LocalDate
import java.util.Properties
import scala.collection.mutable

object Kafka2Hbase{

  val propertiesC = new Properties()
  def main(args: Array[String]): Unit = {
    val environment: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // Use Kafka as the data source
    propertiesC.load(this.getClass.getClassLoader.getResourceAsStream("kafka_test.properties"))

    val datas: DataStream[String] = environment.addSource(new FlinkKafkaConsumer[String]("idnum-topic-1", new SimpleStringSchema, propertiesC))


    datas.print()
    datas.addSink(new WriteHbaseRich)
    environment.execute()
  }
}


class WriteHbaseRich extends RichSinkFunction[String]{
  var conn: Connection = null
  var mutator: BufferedMutator = null
  var count = 0
  var flushtime=System.currentTimeMillis()

  val propertiesHbase = new Properties()
  propertiesHbase.load(this.getClass.getClassLoader.getResourceAsStream("hbase.properties"))

  val WI=Array(7,9,10,5,8,4,2,1,6,3,7,9,10,5,8,4,2)
  val WI_check = Array('1', '0', 'X', '9', '8', '7', '6', '5', '4', '3', '2')

  /**
   * Open the HBase connection and create a BufferedMutator for the target table.
   *
   * @param parameters Flink runtime configuration
   */
  override def open(parameters: Configuration): Unit = {
    val config:org.apache.hadoop.conf.Configuration = HBaseConfiguration.create
    config.set(HConstants.ZOOKEEPER_QUORUM, propertiesHbase.getProperty("test.cluster"))
    config.set(HConstants.ZOOKEEPER_CLIENT_PORT, "2181")
    config.setInt(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT, 30000)
    config.setInt(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 30000)
    conn = ConnectionFactory.createConnection(config)

    val tableName: TableName = TableName.valueOf("cdp_user_base_info")
    val params: BufferedMutatorParams = new BufferedMutatorParams(tableName)
    // Set the write buffer to 1 MB; once the buffer fills up, the data is flushed to HBase automatically
    params.writeBufferSize(1024 * 1024)
    mutator = conn.getBufferedMutator(params)
    count = 0
  }

  /**
   * Parse each incoming record and write it to HBase via the BufferedMutator.
   *
   * @param value   the raw JSON string read from Kafka
   * @param context sink context
   */
  override def invoke(value: String, context: SinkFunction.Context): Unit = {
    if(value.contains("idnum")){
      val datas: JSONObject = analysis_idcard(JSON.parseObject(value))

      println(datas)

      val put: Put = new Put(Bytes.toBytes(datas.getString("idnum")))
      import collection.JavaConverters._
      val keys: mutable.Set[String] = datas.keySet().asScala
      val strings: Array[String] = Array("idnum", "phone", "uid", "name", "birthday", "address_code", "update_time", "sex", "address")
      for(key <- keys if strings contains key ){
        put.addColumn("info".getBytes(),key.getBytes(),datas.getString(key).getBytes())
      }
      mutator.mutate(put)
      // Flush every 100 records, or when more than 20 seconds have passed since the last flush
      if (count >= 100 || System.currentTimeMillis()-flushtime > 20000){
        mutator.flush()
        count = 0
        flushtime=System.currentTimeMillis()
        println("flush")
      }
      count = count + 1
    }
  }

  /**
   * Flush any remaining buffered mutations and close the HBase connection.
   */
  override def close(): Unit = {
    if (mutator != null) {
      mutator.flush()
      mutator.close()
    }
    if (conn != null) conn.close()
  }

  def check_idcard(idcards:String): Boolean ={
    val b: Array[Char] = idcards.toCharArray
    var flag=false
    if(b.size==18){
      var x=0
      for(i <- 0 to 16){
        x+=(WI(i))*(b(i)-'0')
      }
      flag=WI_check(x%11).equals(b(17))
    }else{
      println(s"错误的身份证号:'${idcards}'")
    }
    flag
  }

  def analysis_idcard(datas:JSONObject):JSONObject= {

      val idnum: String = datas.getString("idnum")
      if (idnum != null && idnum != "") {
        val idcard: String = idnum.trim
        try {
          if (idcard.nonEmpty && check_idcard(idcard)) {
            if (idcard.substring(16, 17).toInt % 2 == 0) {
              datas.put("sex", "女")
            } else {
              datas.put("sex", "男")
            }
            val bd: String = idcard.substring(6, 14)
            val birthday = s"${bd.substring(0, 4)}-${bd.substring(4, 6)}-${bd.substring(6, 8)}"
            datas.put("birthday", birthday)
            datas.put("address_code", idcard.substring(0, 6))
            datas.put("update_time", LocalDate.now())
          }
        } catch {
          case _: Exception => println(s"Invalid ID card number: '${idnum}'")
        }
      }
      datas
  }
}
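
For reference, the two resource files loaded by this job, kafka_test.properties and hbase.properties, might contain entries like the following; all host names and the consumer group id here are placeholders, not values from the original setup:

    # kafka_test.properties (placeholder values)
    bootstrap.servers=kafka-broker1:9092,kafka-broker2:9092
    group.id=idnum-consumer-group

    # hbase.properties (placeholder values)
    test.cluster=zk-host1,zk-host2,zk-host3

The FlinkKafkaConsumer reads bootstrap.servers and group.id from the first file, and the sink's open() method reads the ZooKeeper quorum from the test.cluster key of the second file.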

Below is another example showing how to write data to HBase with Apache Flink, this time using the Java DataSet API:

    import org.apache.flink.api.common.functions.MapFunction;
    import org.apache.flink.api.java.DataSet;
    import org.apache.flink.api.java.ExecutionEnvironment;
    import org.apache.flink.api.java.tuple.Tuple2;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hbase.HBaseConfiguration;
    import org.apache.hadoop.hbase.TableName;
    import org.apache.hadoop.hbase.client.Connection;
    import org.apache.hadoop.hbase.client.ConnectionFactory;
    import org.apache.hadoop.hbase.client.Put;
    import org.apache.hadoop.hbase.client.Table;
    import org.apache.hadoop.hbase.util.Bytes;

    public class FlinkHBaseExample {
        public static void main(String[] args) throws Exception {
            // Create the Flink execution environment
            ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

            // Load the data set
            DataSet<Tuple2<String, String>> data = env.fromElements(
                    new Tuple2<>("row1", "value1"),
                    new Tuple2<>("row2", "value2"),
                    new Tuple2<>("row3", "value3")
            );

            // Configure the HBase connection
            Configuration config = HBaseConfiguration.create();
            config.set("hbase.zookeeper.quorum", "localhost");
            config.set("hbase.zookeeper.property.clientPort", "2181");
            Connection connection = ConnectionFactory.createConnection(config);
            Table table = connection.getTable(TableName.valueOf("mytable"));

            // Convert each record into a Put and hand it to a custom HBaseOutputFormat
            data.map(new MapFunction<Tuple2<String, String>, Put>() {
                @Override
                public Put map(Tuple2<String, String> value) throws Exception {
                    Put put = new Put(Bytes.toBytes(value.f0));
                    put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("col1"), Bytes.toBytes(value.f1));
                    return put;
                }
            }).output(new HBaseOutputFormat(table));

            // Execute the job
            env.execute();
        }
    }

In the code above, we first create the Flink execution environment and load a small data set. Next, we use the HBaseConfiguration class to configure the HBase connection and obtain a Table object. Finally, we use a map function to convert each record into a Put object and write it to HBase through the output method, which takes a custom HBaseOutputFormat responsible for writing the Put objects into the HBase table (a sketch of such an OutputFormat is given after the dependency list below).

Note that the code above does not include all of the required dependencies. In an actual project you would need to add the following:

    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-core</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-java</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-java_2.12</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-hbase_2.12</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-client</artifactId>
        <version>${hbase.version}</version>
    </dependency>

Here ${flink.version}, ${hadoop.version} and ${hbase.version} are placeholders for the Flink, Hadoop and HBase versions you have chosen.
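
The example above never shows the custom HBaseOutputFormat it passes to output(). A minimal sketch of what such an OutputFormat could look like is shown below, written in Scala to match the rest of this post; the class name HBasePutOutputFormat and its constructor arguments are assumptions for illustration, not part of any Flink or HBase API:

    import org.apache.flink.api.common.io.OutputFormat
    import org.apache.flink.configuration.Configuration
    import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
    import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory, Put, Table}

    // Hypothetical sketch: an OutputFormat that writes Put objects to an HBase table.
    // The connection is created in open() on each parallel task, so nothing
    // non-serializable has to be shipped from the driver.
    class HBasePutOutputFormat(zkQuorum: String, tableName: String) extends OutputFormat[Put] {

      @transient private var conn: Connection = _
      @transient private var table: Table = _

      override def configure(parameters: Configuration): Unit = {}

      override def open(taskNumber: Int, numTasks: Int): Unit = {
        val config = HBaseConfiguration.create()
        config.set("hbase.zookeeper.quorum", zkQuorum)
        conn = ConnectionFactory.createConnection(config)
        table = conn.getTable(TableName.valueOf(tableName))
      }

      // Write each incoming Put directly to the table
      override def writeRecord(record: Put): Unit = table.put(record)

      override def close(): Unit = {
        if (table != null) table.close()
        if (conn != null) conn.close()
      }
    }

With such a class, the job would call data.map(...).output(new HBasePutOutputFormat("localhost", "mytable")) instead of passing a Table handle through the constructor, which avoids having to serialize the connection objects created in the driver.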