I. Sink to HBase
1. Extending RichSinkFunction
1.1 Main class
package com.nfdw;
import com.nfdw.entity.Employees;
import com.nfdw.sink.MyHBaseSinkFunction;
import com.nfdw.utils.*;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011;
import java.util.Date;
import java.util.Properties;
public class App {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = GetStreamExecutionEnvironment.getEnv();

        // Kafka source configuration
        Properties prop = new Properties();
        prop.setProperty("bootstrap.servers", "cdh101:9092");
        prop.setProperty("group.id", "cloudera_mirrormaker");
        prop.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        prop.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");

        FlinkKafkaConsumer011<String> myConsumer = new FlinkKafkaConsumer011<>("luchangyin", new SimpleStringSchema(), prop);
        myConsumer.setStartFromLatest();

        DataStreamSource<String> dataStream = env.addSource(myConsumer);

        // Parse each JSON record into an Employees object and derive the time fields
        DataStream<Employees> result = dataStream.map(new MapFunction<String, Employees>() {
            @Override
            public Employees map(String s) throws Exception {
                Employees emp = MyJsonUtils.str2JsonObj(s);
                emp.setEmpStartTime(new Date(emp.getTs()));
                emp.setDt(MyDateUtils.getDate2Second(emp.getEmpStartTime()));
                return emp;
            }
        });

        result.print();
        result.addSink(new MyHBaseSinkFunction());

        env.execute("wo xi huan ni");
    }
}
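The Employees entity and the MyJsonUtils / MyDateUtils helpers referenced above are not included in the post. As a rough sketch (field names and types are assumptions inferred from the sink in 1.3, and the real com.nfdw classes may differ), the POJO could look like this, with MyJsonUtils.str2JsonObj parsing the Kafka JSON into it and MyDateUtils formatting dates to strings:

package com.nfdw.entity;
import java.util.Date;
// Hypothetical sketch only -- not the original com.nfdw.entity.Employees
public class Employees {
    private String eId;
    private String eName;
    private String eSal;
    private String eDept;
    private long ts;               // event timestamp (epoch millis) carried in the Kafka JSON
    private Date empStartTime;     // derived in the map(): new Date(ts)
    private String dt;             // derived in the map(): formatted date string
    public String getEId()              { return eId; }
    public String getEName()            { return eName; }
    public String getESal()             { return eSal; }
    public String getEDept()            { return eDept; }
    public long getTs()                 { return ts; }
    public Date getEmpStartTime()       { return empStartTime; }
    public void setEmpStartTime(Date d) { this.empStartTime = d; }
    public String getDt()               { return dt; }
    public void setDt(String dt)        { this.dt = dt; }
    // setters for the JSON-mapped fields omitted for brevity
}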
1.2 Obtaining the execution environment
package com.nfdw.utils;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
public class GetStreamExecutionEnvironment {

    public static StreamExecutionEnvironment getEnv() {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Checkpoint every 5 seconds with exactly-once semantics
        env.enableCheckpointing(5000);
        env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime);
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
        env.getCheckpointConfig().setCheckpointTimeout(60000);
        env.getCheckpointConfig().setMinPauseBetweenCheckpoints(500);
        env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
        // Keep externalized checkpoints when the job is cancelled
        env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
        env.setParallelism(3);
        return env;
    }
}
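The settings above retain externalized checkpoints on cancellation, but the post does not show where checkpoint data is stored. As an assumed addition (not in the original; the HDFS path is a placeholder), a Flink 1.x job would typically also configure a filesystem state backend inside getEnv():

// Assumption: not part of the original post; the checkpoint directory is a placeholder.
// import org.apache.flink.runtime.state.filesystem.FsStateBackend;
env.setStateBackend(new FsStateBackend("hdfs:///flink/checkpoints"));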
1.3 Custom sink to HBase
package com.nfdw.sink;
import com.nfdw.entity.Employees;
import com.nfdw.utils.MyDateUtils;
import com.nfdw.utils.SnowflakeIdUtil;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.MD5Hash;
public class MyHBaseSinkFunction extends RichSinkFunction<Employees> {

    private transient Connection conn = null;
    private transient Table table = null;

    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        // One HBase connection per parallel sink instance
        org.apache.hadoop.conf.Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "10.122.1.112");
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        if (null == conn) {
            this.conn = ConnectionFactory.createConnection(conf);
        }
    }

    @Override
    public void invoke(Employees value, Context context) throws Exception {
        TableName tableName = TableName.valueOf("employees");
        table = conn.getTable(tableName);

        // Rowkey = 4-char MD5 prefix + snowflake id: the prefix salts the otherwise
        // monotonically increasing id so writes spread across regions instead of
        // hotspotting on a single one.
        String pkId = String.valueOf(SnowflakeIdUtil.getdidId(SnowflakeIdUtil.DCD_SNOWFLAKE));
        byte[] originKey = Bytes.toBytes(pkId);
        String md5AsHex = MD5Hash.getMD5AsHex(originKey).substring(0, 4);
        String rowkey = md5AsHex + pkId;

        Put put = new Put(Bytes.toBytes(rowkey));
        put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("eId"), Bytes.toBytes(String.valueOf(value.getEId())));
        put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("eName"), Bytes.toBytes(value.getEName()));
        put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("eSal"), Bytes.toBytes(value.getESal()));
        put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("eDept"), Bytes.toBytes(value.getEDept()));
        put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("ts"), Bytes.toBytes(value.getTs()));
        put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("empStartTime"), Bytes.toBytes(MyDateUtils.getDate2Str(value.getEmpStartTime())));
        put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("dt"), Bytes.toBytes(value.getDt()));
        table.put(put);
    }

    @Override
    public void close() throws Exception {
        super.close();
        if (table != null) {
            table.close();
        }
        if (conn != null) {
            conn.close();
        }
    }
}
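invoke() above looks up the employees Table for every record; Table is a lightweight wrapper over the shared Connection, but the lookup can just as well happen once. A hedged variant (not from the original post) acquires the table in open() and reuses it, leaving invoke() to build and put the Put exactly as above:

// Sketch of an alternative open(); invoke() then drops its getTable() call.
@Override
public void open(Configuration parameters) throws Exception {
    super.open(parameters);
    org.apache.hadoop.conf.Configuration conf = HBaseConfiguration.create();
    conf.set("hbase.zookeeper.quorum", "10.122.1.112");
    conf.set("hbase.zookeeper.property.clientPort", "2181");
    this.conn = ConnectionFactory.createConnection(conf);
    this.table = conn.getTable(TableName.valueOf("employees"));  // reused by every invoke()
}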
1.4 Snowflake ID utility for generating rowkeys
package com.nfdw.utils;
import xyz.downgoon.snowflake.Snowflake;
public class SnowflakeIdUtil {

    public static long groupId = Long.parseLong("6");
    public static long workId = Long.parseLong("10");

    public static Snowflake DCD_SNOWFLAKE = new Snowflake(groupId, workId);

    public static long getdidId(Snowflake snowflake) {
        return snowflake.nextId();
    }
}
1.5 Verifying the data
- First create the corresponding table and column family in the HBase shell:
create 'employees','cf'
- Run the job and check the result with scan 'employees'
![scan result](https://i-blog.csdnimg.cn/blog_migrate/b4c68ccd4c609ea13974308d06f8dcd8.png)
2. Implementing the OutputFormat interface
2.1 Custom HBaseOutputFormat (Scala)
package cn.swordfall.hbaseOnFlink
import org.apache.flink.api.common.io.OutputFormat
import org.apache.flink.configuration.Configuration
import org.apache.hadoop.hbase.{HBaseConfiguration, HConstants, TableName}
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.util.Bytes
class HBaseOutputFormat extends OutputFormat[String] {

  val zkServer = "192.168.187.201"
  val port = "2181"
  var conn: Connection = null
  var mutator: BufferedMutator = null
  var count = 0

  override def configure(configuration: Configuration): Unit = {
  }

  override def open(i: Int, i1: Int): Unit = {
    val config: org.apache.hadoop.conf.Configuration = HBaseConfiguration.create
    config.set(HConstants.ZOOKEEPER_QUORUM, zkServer)
    config.set(HConstants.ZOOKEEPER_CLIENT_PORT, port)
    config.setInt(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT, 30000)
    config.setInt(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 30000)
    conn = ConnectionFactory.createConnection(config)

    val tableName: TableName = TableName.valueOf("test")
    val params: BufferedMutatorParams = new BufferedMutatorParams(tableName)
    // Buffer up to 1 MB of mutations on the client before they are written to HBase
    params.writeBufferSize(1024 * 1024)
    mutator = conn.getBufferedMutator(params)
    count = 0
  }

  override def writeRecord(it: String): Unit = {
    val cf1 = "cf1"
    val array: Array[String] = it.split(",")
    val put: Put = new Put(Bytes.toBytes(array(0)))
    put.addColumn(Bytes.toBytes(cf1), Bytes.toBytes("name"), Bytes.toBytes(array(1)))
    put.addColumn(Bytes.toBytes(cf1), Bytes.toBytes("age"), Bytes.toBytes(array(2)))
    mutator.mutate(put)
    // Flush every few records in addition to the size-based write buffer
    if (count >= 4) {
      mutator.flush()
      count = 0
    }
    count = count + 1
  }

  override def close(): Unit = {
    try {
      // Flush and close the mutator first so buffered mutations are not lost
      if (mutator != null) {
        mutator.flush()
        mutator.close()
      }
      if (conn != null) conn.close()
    } catch {
      case e: Exception => println(e.getMessage)
    }
  }
}
2.2 Flink Streaming (stream processing)
def write2HBaseWithOutputFormat(): Unit = {
  val topic = "test"
  val props = new Properties
  props.put("bootstrap.servers", "192.168.187.201:9092")
  props.put("group.id", "kv_flink")
  props.put("enable.auto.commit", "true")
  props.put("auto.commit.interval.ms", "1000")
  props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
  props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")

  val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
  env.enableCheckpointing(5000)
  env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
  env.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE)

  val myConsumer = new FlinkKafkaConsumer[String](topic, new SimpleStringSchema, props)
  val dataStream: DataStream[String] = env.addSource(myConsumer)
  dataStream.writeUsingOutputFormat(new HBaseOutputFormat)
  env.execute()
}
2.3 Flink DataSet (batch processing)
With the DataSet API the same HBaseOutputFormat is reused on a bounded input. A minimal batch sketch (the sample elements and the implicit-providing import org.apache.flink.api.scala._ are assumptions, since the post does not show this method's surrounding class):
def write2HBaseWithOutputFormat(): Unit = {
  // Batch variant: a bounded DataSet written through the same HBaseOutputFormat
  val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
  // Placeholder sample records in the "rowkey,name,age" layout expected by writeRecord
  val dataSet: DataSet[String] = env.fromElements("100,zhangsan,20", "101,lisi,21")
  dataSet.output(new HBaseOutputFormat)
  env.execute()
}
}