Flink Connector Ecosystem
Flink Connectors
JDBC (Read/Write)
pom
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-jdbc_2.11</artifactId>
    <version>1.11.2</version>
</dependency>
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.47</version>
</dependency>
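The examples in this section reference an AccessLog POJO that is not shown in the original text. Below is a minimal sketch, reconstructed from the getters and setters the code calls; the field types are assumptions:
public class AccessLog {
    // Fields inferred from the accessors used in the examples below
    private String ip;
    private String time;
    private String type;
    private String api;
    private Integer num; // set to 1 per record in the map function

    public String getIp() { return ip; }
    public void setIp(String ip) { this.ip = ip; }
    public String getTime() { return time; }
    public void setTime(String time) { this.time = time; }
    public String getType() { return type; }
    public void setType(String type) { this.type = type; }
    public String getApi() { return api; }
    public void setApi(String api) { this.api = api; }
    public Integer getNum() { return num; }
    public void setNum(Integer num) { this.num = num; }

    @Override
    public String toString() {
        // Used by dataStream.print() in the source example
        return "AccessLog{ip='" + ip + "', time='" + time + "', type='" + type + "', api='" + api + "'}";
    }
}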
Write Sink
public class CustomSinkApplication {
    public static void main(String[] args) throws Exception {
        // 1. Create the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // 2. Read the socket data source
        DataStreamSource<String> socketStr = env.socketTextStream("192.168.19.102", 9911, "\n");
        // 3. Transform the stream
        SingleOutputStreamOperator<AccessLog> outputStream = socketStr.map(new MapFunction<String, AccessLog>() {
            @Override
            public AccessLog map(String value) throws Exception {
                System.out.println(value);
                // Split the record on the tab delimiter
                String[] arrValue = value.split("\t");
                // Assemble the fields into an AccessLog object
                AccessLog log = new AccessLog();
                log.setNum(1);
                for (int i = 0; i < arrValue.length; i++) {
                    if (i == 0) {
                        log.setIp(arrValue[i]);
                    } else if (i == 1) {
                        log.setTime(arrValue[i]);
                    } else if (i == 2) {
                        log.setType(arrValue[i]);
                    } else if (i == 3) {
                        log.setApi(arrValue[i]);
                    }
                }
                return log;
            }
        });
        // 4. Attach the custom sink
        outputStream.addSink(new MySQLSinkFunction());
        // 5. Execute the job
        env.execute("job");
    }
}
public class MySQLSinkFunction extends RichSinkFunction<AccessLog> {
    /**
     * Database connection
     */
    private Connection connection;
    /**
     * Prepared SQL statement
     */
    private PreparedStatement preparedStatement;

    @Override
    public void open(Configuration parameters) throws Exception {
        // Open the MySQL connection
        String jdbcUrl = "jdbc:mysql://192.168.19.150:3306/flink?useSSL=false";
        String executeSql = "insert into t_access_log (ip, time, type, api) values(?, ?, ?, ?)";
        connection = DriverManager.getConnection(jdbcUrl, "root", "654321");
        preparedStatement = connection.prepareStatement(executeSql);
    }

    @Override
    public void close() throws Exception {
        // Release the statement and the database connection
        if (null != preparedStatement) {
            try {
                preparedStatement.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
            preparedStatement = null;
        }
        if (null != connection) {
            try {
                connection.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
            connection = null;
        }
    }

    @Override
    public void invoke(AccessLog value, Context context) throws Exception {
        // Bind the fields and execute the insert
        preparedStatement.setString(1, value.getIp());
        preparedStatement.setString(2, value.getTime());
        preparedStatement.setString(3, value.getType());
        preparedStatement.setString(4, value.getApi());
        preparedStatement.executeUpdate();
    }
}
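The flink-connector-jdbc dependency declared in the pom also provides an official JdbcSink, which handles batching and connection management. A minimal sketch of the same insert through that API (the classes live in org.apache.flink.connector.jdbc; the connection details repeat the assumptions above), which could replace the addSink call in step 4:
// Equivalent sink using the official JDBC connector instead of a custom RichSinkFunction
outputStream.addSink(JdbcSink.sink(
        "insert into t_access_log (ip, time, type, api) values (?, ?, ?, ?)",
        (ps, log) -> {
            // Bind one AccessLog per prepared statement invocation
            ps.setString(1, log.getIp());
            ps.setString(2, log.getTime());
            ps.setString(3, log.getType());
            ps.setString(4, log.getApi());
        },
        JdbcExecutionOptions.builder()
                .withBatchSize(100)        // flush after 100 buffered records
                .withBatchIntervalMs(2000) // or after 2 seconds, whichever comes first
                .build(),
        new JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
                .withUrl("jdbc:mysql://192.168.19.150:3306/flink?useSSL=false")
                .withDriverName("com.mysql.jdbc.Driver")
                .withUsername("root")
                .withPassword("654321")
                .build()));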
Read Source
public class CustomSourceApplication {
    public static void main(String[] args) throws Exception {
        // 1. Create the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // 2. Attach the custom MySQL source
        DataStreamSource<AccessLog> dataStream = env.addSource(new MySQLSourceFunction());
        // 3. Print with parallelism 1
        dataStream.print().setParallelism(1);
        // 4. Execute the job
        env.execute("custom jdbc source.");
    }
}
public class MySQLSourceFunction extends RichSourceFunction<AccessLog> {
    /**
     * Database connection
     */
    private Connection connection;
    /**
     * Prepared SQL statement
     */
    private PreparedStatement preparedStatement;

    @Override
    public void open(Configuration parameters) throws Exception {
        // Open the MySQL connection
        String jdbcUrl = "jdbc:mysql://192.168.19.150:3306/flink?useSSL=false";
        String executeSql = "select ip, time, type, api from t_access_log";
        connection = DriverManager.getConnection(jdbcUrl, "root", "654321");
        preparedStatement = connection.prepareStatement(executeSql);
    }

    @Override
    public void close() throws Exception {
        // Release the statement and the database connection
        if (null != preparedStatement) {
            try {
                preparedStatement.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
            preparedStatement = null;
        }
        if (null != connection) {
            try {
                connection.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
            connection = null;
        }
    }

    @Override
    public void run(SourceContext<AccessLog> ctx) throws Exception {
        // Query the table and emit each row downstream; run() returns
        // once the result set is exhausted, so this is a bounded source
        ResultSet rs = preparedStatement.executeQuery();
        while (rs.next()) {
            AccessLog accessLog = new AccessLog();
            accessLog.setIp(rs.getString("ip"));
            accessLog.setTime(rs.getString("time"));
            accessLog.setType(rs.getString("type"));
            accessLog.setApi(rs.getString("api"));
            ctx.collect(accessLog);
        }
    }

    @Override
    public void cancel() {
        System.out.println("cancel method.");
    }
}
HDFS (Read/Write)
Write Sink
public class HdfsSinkApplication {
    public static void main(String[] args) throws Exception {
        // 1. Create the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // 2. Read the socket data source
        DataStreamSource<String> socketStr = env.socketTextStream("localhost", 9911, "\n");
        // 3. Configure the bucketing sink (a local path here; an hdfs:// URI works the same way)
        BucketingSink<String> sink = new BucketingSink<String>("d:/tmp/hdfs");
        sink.setBucketer(new DateTimeBucketer<>("yyyy-MM-dd--HHmm"));
        sink.setWriter(new StringWriter())
                .setBatchSize(5 * 1024)                    // roll to a new file once 5 KB is written
                .setBatchRolloverInterval(5 * 1000)        // roll to a new file every 5 seconds
                .setInactiveBucketCheckInterval(30 * 1000) // check for inactive buckets every 30 seconds
                .setInactiveBucketThreshold(60 * 1000);    // close buckets idle for 60 seconds
        socketStr.addSink(sink).setParallelism(1);
        // 4. Execute the job
        env.execute("job");
    }
}
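BucketingSink is deprecated in Flink 1.11 in favor of StreamingFileSink. Below is a minimal sketch of an equivalent row-format sink; the hdfs:// output path is an assumption, while the bucket format and rolling thresholds mirror the example above. Note that StreamingFileSink finalizes part files on checkpoints, so enable checkpointing when running this for real:
// Equivalent sink with StreamingFileSink
// (classes from org.apache.flink.streaming.api.functions.sink.filesystem)
StreamingFileSink<String> fileSink = StreamingFileSink
        .forRowFormat(new Path("hdfs://10.10.20.132:9000/tmp/flink"), new SimpleStringEncoder<String>("UTF-8"))
        .withBucketAssigner(new DateTimeBucketAssigner<>("yyyy-MM-dd--HHmm")) // one bucket per minute
        .withRollingPolicy(DefaultRollingPolicy.builder()
                .withMaxPartSize(5 * 1024)          // roll once a part file reaches 5 KB
                .withRolloverInterval(5 * 1000)     // roll every 5 seconds
                .withInactivityInterval(60 * 1000)  // roll after 60 seconds without writes
                .build())
        .build();
socketStr.addSink(fileSink).setParallelism(1);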
Read Source
public class HdfsSourceApplication {
    public static void main(String[] args) throws Exception {
        // 1. Create the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // 2. Read the HDFS file as a data source
        DataStreamSource<String> hdfsStr = env.readTextFile("hdfs://10.10.20.132:9090/hadoop-env.sh");
        // 3. Print the file contents
        hdfsStr.print().setParallelism(1);
        // 4. Execute the job
        env.execute("job");
    }
}
Elasticsearch (Write)
public class ElasticSinkApplication {
    public static void main(String[] args) throws Exception {
        // 1. Create the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // 2. Read the socket data source
        DataStreamSource<String> socketStr = env.socketTextStream("localhost", 9911, "\n");
        // 3. Configure the Elasticsearch hosts
        List<HttpHost> httpHosts = new ArrayList<>();
        httpHosts.add(new HttpHost("10.10.20.132", 9200, "http"));
        // 4. Parse each record and build an index request
        ElasticsearchSink.Builder<String> esSinkBuilder = new ElasticsearchSink.Builder<>(
                httpHosts,
                new ElasticsearchSinkFunction<String>() {
                    public IndexRequest createIndexRequest(String element) {
                        Map<String, String> json = new HashMap<>();
                        // Split the record on the tab delimiter
                        String[] arrValue = String.valueOf(element).split("\t");
                        for (int i = 0; i < arrValue.length; i++) {
                            if (i == 0) {
                                json.put("ip", arrValue[i]);
                            } else if (i == 1) {
                                json.put("time", arrValue[i]);
                            } else if (i == 2) {
                                json.put("type", arrValue[i]);
                            } else if (i == 3) {
                                json.put("api", arrValue[i]);
                            }
                        }
                        return Requests.indexRequest()
                                .index("flink-es")
                                .type("access-log")
                                .source(json);
                    }

                    @Override
                    public void process(String element, RuntimeContext ctx, RequestIndexer indexer) {
                        indexer.add(createIndexRequest(element));
                    }
                }
        );
        // 5. Elasticsearch write options
        esSinkBuilder.setBulkFlushMaxActions(1); // flush after every request
        esSinkBuilder.setRestClientFactory(
                restClientBuilder -> restClientBuilder.setMaxRetryTimeoutMillis(5000)
        );
        // 6. Attach the Elasticsearch sink
        socketStr.addSink(esSinkBuilder.build());
        socketStr.print().setParallelism(1);
        // 7. Execute the job
        env.execute("job");
    }
}
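For reference, the example above targets the Elasticsearch 6 connector: it sets an index type and calls setMaxRetryTimeoutMillis, both of which were removed from the Elasticsearch 7 client. Assuming the same Flink version as the JDBC example, the dependency would be:
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-elasticsearch6_2.11</artifactId>
    <version>1.11.2</version>
</dependency>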
Kafka (Read/Write)
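pom (assuming the same Flink and Scala versions as the JDBC example above):
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka_2.11</artifactId>
    <version>1.11.2</version>
</dependency>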
Write Sink
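A minimal sketch of writing the socket stream out to a Kafka topic with FlinkKafkaProducer; the broker address mirrors the read example below, and the topic name flink-sink is an assumption:
public class KafkaSinkApplication {
    public static void main(String[] args) throws Exception {
        // 1. Create the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // 2. Read the socket data source
        DataStreamSource<String> socketStr = env.socketTextStream("localhost", 9911, "\n");
        // 3. Configure the Kafka connection
        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", "10.10.20.132:9092");
        // 4. Create the Kafka producer and attach it as a sink
        FlinkKafkaProducer<String> kafkaProducer = new FlinkKafkaProducer<>(
                "flink-sink",             // target topic (assumed name)
                new SimpleStringSchema(), // serialization schema
                properties);
        socketStr.addSink(kafkaProducer);
        // 5. Execute the job
        env.execute("job");
    }
}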
Read Source
public class KafkaSourceApplication {
    public static void main(String[] args) throws Exception {
        // 1. Create the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // 2. Configure the Kafka connection
        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", "10.10.20.132:9092");
        properties.setProperty("group.id", "fink_group");
        // 3. Create the Kafka consumer
        FlinkKafkaConsumer<String> kafkaConsumer = new FlinkKafkaConsumer<>(
                "flink-source",           // source topic
                new SimpleStringSchema(), // deserialization schema
                properties);
        kafkaConsumer.setStartFromEarliest();        // start from the earliest record available
        // kafkaConsumer.setStartFromLatest();       // start from the latest record
        // kafkaConsumer.setStartFromTimestamp(...); // start from a given epoch timestamp (milliseconds)
        // kafkaConsumer.setStartFromGroupOffsets(); // the default: start from committed group offsets
        // 4. Read from Kafka as a data source
        DataStreamSource<String> kafkaStr = env.addSource(kafkaConsumer);
        kafkaStr.print().setParallelism(1);
        // 5. Execute the job
        env.execute("job");
    }
}
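One caveat: the consumer commits offsets back to Kafka only as part of Flink's checkpoints, and setStartFromGroupOffsets resumes from those committed offsets. Checkpointing is off by default, so enabling it is a one-line addition to the environment setup, e.g.:
// Checkpoint every 5 seconds; the Kafka consumer commits its offsets with each completed checkpoint
env.enableCheckpointing(5000);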