Basics
A TableSource provides access to data stored in external systems (message queues, key-value stores, databases, file systems, and so on). Once registered in a TableEnvironment, it can be queried with the Table API or SQL.
A TableSink emits the data of a Table registered in Flink to an external system (Elasticsearch, HBase, message queues, databases, file systems, and so on).
A TableFactory creates TableSource and TableSink instances. How does it decide which concrete type (KafkaTableSink, CsvTableSink, ...) to create? By matching against normalized string-based properties. These properties can be produced programmatically with Descriptors, or from a YAML file when using the SQL Client.
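For illustration, here is a minimal sketch of such a factory; the "my-csv" connector type, the property keys, and the hard-coded schema are made up, and the concrete source it creates is the CsvTableSource shown at the end of this article.

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.table.factories.StreamTableSourceFactory;
import org.apache.flink.table.sources.CsvTableSource;
import org.apache.flink.table.sources.StreamTableSource;
import org.apache.flink.types.Row;

public class MyCsvTableSourceFactory implements StreamTableSourceFactory<Row> {

    @Override
    public Map<String, String> requiredContext() {
        // matched against the normalized properties produced by Descriptors or a YAML file
        Map<String, String> context = new HashMap<>();
        context.put("connector.type", "my-csv");
        context.put("connector.property-version", "1");
        return context;
    }

    @Override
    public List<String> supportedProperties() {
        // every property key the factory accepts must be declared here
        return Arrays.asList("connector.path");
    }

    @Override
    public StreamTableSource<Row> createStreamTableSource(Map<String, String> properties) {
        // build the concrete source from the string properties
        return new CsvTableSource(
            properties.get("connector.path"),
            new String[] {"word"},
            new TypeInformation<?>[] {Types.STRING});
    }
}

Factories are discovered via Java's ServiceLoader, so the fully qualified class name must also be listed in META-INF/services/org.apache.flink.table.factories.TableFactory.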
Defining a TableSource, or adding optional capabilities to one, means implementing the interfaces below.
Defining a BatchTableSource for batch processing
BatchTableSource<T> implements TableSource<T> {
public DataSet<T> getDataSet(ExecutionEnvironment execEnv);
}
Note that the type of the DataSet returned by getDataSet must match the type returned by TableSource.getReturnType().
A BatchTableSource is usually implemented with an InputFormat or a batch connector; see the DataSet API documentation for the available ways to create a DataSet.
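A minimal sketch of a BatchTableSource that serves two hard-coded rows (the class name and schema are made up for illustration):

import java.util.Arrays;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.typeutils.RowTypeInfo;
import org.apache.flink.table.api.TableSchema;
import org.apache.flink.table.sources.BatchTableSource;
import org.apache.flink.types.Row;

public class StaticWordCountSource implements BatchTableSource<Row> {

    private static final String[] FIELD_NAMES = {"word", "frequency"};
    private static final TypeInformation<?>[] FIELD_TYPES = {Types.STRING, Types.LONG};

    @Override
    public DataSet<Row> getDataSet(ExecutionEnvironment execEnv) {
        // the element type of the DataSet must match getReturnType()
        return execEnv.fromCollection(
            Arrays.asList(Row.of("hello", 1L), Row.of("world", 2L)),
            getReturnType());
    }

    @Override
    public TypeInformation<Row> getReturnType() {
        return new RowTypeInfo(FIELD_TYPES, FIELD_NAMES);
    }

    @Override
    public TableSchema getTableSchema() {
        return new TableSchema(FIELD_NAMES, FIELD_TYPES);
    }

    @Override
    public String explainSource() {
        return "StaticWordCountSource";
    }
}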
Defining a StreamTableSource for stream processing
StreamTableSource<T> implements TableSource<T> {
public DataStream<T> getDataStream(StreamExecutionEnvironment execEnv);
}
The type of the DataStream returned by getDataStream must match the type returned by TableSource.getReturnType().
A StreamTableSource is usually implemented with a SourceFunction or a stream connector; see the Flink connector documentation for details.
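A minimal sketch of a StreamTableSource backed by a custom SourceFunction that emits one made-up row per second (class name and schema are illustrative):

import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.typeutils.RowTypeInfo;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.table.api.TableSchema;
import org.apache.flink.table.sources.StreamTableSource;
import org.apache.flink.types.Row;

public class TickStreamTableSource implements StreamTableSource<Row> {

    private static final String[] FIELD_NAMES = {"id", "eventTime"};
    private static final TypeInformation<?>[] FIELD_TYPES = {Types.LONG, Types.LONG};

    @Override
    public DataStream<Row> getDataStream(StreamExecutionEnvironment execEnv) {
        // the element type of the DataStream must match getReturnType()
        return execEnv.addSource(new TickSource(), "tick-source", getReturnType());
    }

    @Override
    public TypeInformation<Row> getReturnType() {
        return new RowTypeInfo(FIELD_TYPES, FIELD_NAMES);
    }

    @Override
    public TableSchema getTableSchema() {
        return new TableSchema(FIELD_NAMES, FIELD_TYPES);
    }

    @Override
    public String explainSource() {
        return "TickStreamTableSource";
    }

    /** Emits one (id, timestamp) row per second until cancelled. */
    private static class TickSource implements SourceFunction<Row> {
        private volatile boolean running = true;

        @Override
        public void run(SourceContext<Row> ctx) throws Exception {
            long id = 0;
            while (running) {
                ctx.collect(Row.of(id++, System.currentTimeMillis()));
                Thread.sleep(1000L);
            }
        }

        @Override
        public void cancel() {
            running = false;
        }
    }
}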
Defining a TableSource with projection push-down
ProjectableTableSource<T> {
public TableSource<T> projectFields(int[] fields);
}
projectFields returns a copy of the TableSource whose physical return type is adjusted to the selected fields; the returned TableSource must still report the same TableSchema as the original. The CsvTableSource at the end of this article shows a concrete implementation.
If the custom TableSource also implements the DefinedFieldMapping interface, the field mapping must be adjusted to the new return type as well.
Defining a TableSource with filter push-down
FilterableTableSource<T> {
public TableSource<T> applyPredicate(List<Expression> predicates);
public boolean isFilterPushedDown();
}
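applyPredicate is called once during planning with a list of conjunctive predicates. The source should remove every predicate it can evaluate itself from the given list and return a copy of itself; predicates left in the list are still evaluated by Flink after the scan, and isFilterPushedDown must return true once applyPredicate has been called. A minimal sketch of this contract (the class name and the canEvaluateInSource helper are made up, and getDataStream is stubbed to keep the example compilable):

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.typeutils.RowTypeInfo;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.TableSchema;
import org.apache.flink.table.expressions.Expression;
import org.apache.flink.table.sources.FilterableTableSource;
import org.apache.flink.table.sources.StreamTableSource;
import org.apache.flink.table.sources.TableSource;
import org.apache.flink.types.Row;

public class MyFilterableSource implements StreamTableSource<Row>, FilterableTableSource<Row> {

    private final List<Expression> pushedPredicates;
    private final boolean filterPushedDown;

    public MyFilterableSource() {
        this(new ArrayList<Expression>(), false);
    }

    private MyFilterableSource(List<Expression> pushedPredicates, boolean filterPushedDown) {
        this.pushedPredicates = pushedPredicates;
        this.filterPushedDown = filterPushedDown;
    }

    @Override
    public TableSource<Row> applyPredicate(List<Expression> predicates) {
        List<Expression> accepted = new ArrayList<>();
        Iterator<Expression> it = predicates.iterator();
        while (it.hasNext()) {
            Expression predicate = it.next();
            if (canEvaluateInSource(predicate)) {
                accepted.add(predicate);
                // predicates removed here are evaluated by the source itself;
                // whatever remains in the list is still evaluated by Flink
                it.remove();
            }
        }
        // never mutate this source: return a configured copy
        return new MyFilterableSource(accepted, true);
    }

    @Override
    public boolean isFilterPushedDown() {
        // must be true once applyPredicate has been called
        return filterPushedDown;
    }

    // made-up helper: decide whether the external system can evaluate the predicate
    private boolean canEvaluateInSource(Expression predicate) {
        return false; // placeholder
    }

    @Override
    public DataStream<Row> getDataStream(StreamExecutionEnvironment execEnv) {
        // a real source would translate pushedPredicates into, e.g., a scan filter
        return execEnv.fromElements(Row.of(0L)).returns(getReturnType());
    }

    @Override
    public TypeInformation<Row> getReturnType() {
        return new RowTypeInfo(new TypeInformation<?>[] {Types.LONG}, new String[] {"id"});
    }

    @Override
    public TableSchema getTableSchema() {
        return new TableSchema(new String[] {"id"}, new TypeInformation<?>[] {Types.LONG});
    }

    @Override
    public String explainSource() {
        return "MyFilterableSource(pushed: " + pushedPredicates + ")";
    }
}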
Defining a TableSource with a rowtime attribute
DefinedRowtimeAttributes {
public List<RowtimeAttributeDescriptor> getRowtimeAttributeDescriptors();
}
getRowtimeAttributeDescriptors returns a list of rowtime attributes, although only a single rowtime attribute is supported at the moment. A RowtimeAttributeDescriptor has the following properties:
(1) attributeName: the name of the rowtime attribute in the table schema; the attribute must be of type Types.SQL_TIMESTAMP.
(2) timestampExtractor: extracts the timestamp from the records, e.g., by converting a Long field into a timestamp or by parsing a string-encoded timestamp. Flink comes with a set of built-in TimestampExtractor implementations, and a custom implementation is also possible.
(3)watermarkStrategy: The watermark strategy defines how watermarks are generated for the rowtime attribute. Flink comes with a set of built-in WatermarkStrategy implementations for common use cases. It is also possible to provide a custom implementation.
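A minimal sketch using built-in implementations (the "ts" field name is made up; a real source would also implement StreamTableSource and expose ts in its schema as Types.SQL_TIMESTAMP):

import java.util.Collections;
import java.util.List;
import org.apache.flink.table.sources.DefinedRowtimeAttributes;
import org.apache.flink.table.sources.RowtimeAttributeDescriptor;
import org.apache.flink.table.sources.tsextractors.ExistingField;
import org.apache.flink.table.sources.wmstrategies.BoundedOutOfOrderTimestamps;

public class UserActionRowtimeSupport implements DefinedRowtimeAttributes {

    @Override
    public List<RowtimeAttributeDescriptor> getRowtimeAttributeDescriptors() {
        RowtimeAttributeDescriptor descriptor = new RowtimeAttributeDescriptor(
            "ts",                                      // name of the rowtime attribute in the table schema
            new ExistingField("ts"),                   // built-in TimestampExtractor reusing an existing field
            new BoundedOutOfOrderTimestamps(30000L));  // built-in WatermarkStrategy: watermarks trail the max timestamp by 30s
        return Collections.singletonList(descriptor);
    }
}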
Defining a TableSource with a processing-time attribute
DefinedProctimeAttribute {
public String getProctimeAttribute();
}
getProctimeAttribute returns the name of the processing-time attribute, or null if the source does not expose one.
Defining a TableSink
TableSink<T> {
public TypeInformation<T> getOutputType();
public String[] getFieldNames();
public TypeInformation[] getFieldTypes();
public TableSink<T> configure(String[] fieldNames, TypeInformation[] fieldTypes);
}
configure is called to obtain a copy of the sink configured with the field names and types of the table to emit; the remaining methods describe that configured schema.
Defining a BatchTableSink for batch processing
BatchTableSink<T> implements TableSink<T> {
public void emitDataSet(DataSet<T> dataSet);
}
Implement emitDataSet to write the DataSet to the external system.
Defining an AppendStreamTableSink for insert-only streams
AppendStreamTableSink<T> implements TableSink<T> {
public void emitDataStream(DataStream<T> dataStream);
}
Implement emitDataStream to write the DataStream to the external system. An AppendStreamTableSink can only consume tables that are modified by insert changes; update or delete changes cause an error.
Defining a RetractStreamTableSink for insert and delete changes
RetractStreamTableSink<T> implements TableSink<Tuple2<Boolean, T>> {
public TypeInformation<T> getRecordType();
public void emitDataStream(DataStream<Tuple2<Boolean, T>> dataStream);
}
The records of the emitted DataStream are Tuple2 values: the first (Boolean) field flags the type of change, true for insert and false for delete; the second field carries the record itself.
Defining an UpsertStreamTableSink for insert, delete, and update changes
UpsertStreamTableSink<T> implements TableSink<Tuple2<Boolean, T>> {
public void setKeyFields(String[] keys);
public void setIsAppendOnly(boolean isAppendOnly);
public TypeInformation<T> getRecordType();
public void emitDataStream(DataStream<Tuple2<Boolean, T>> dataStream);
}
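setKeyFields receives the unique key fields of the result table (null if it has none), and setIsAppendOnly signals whether the query produces insert-only changes. As with the retract sink, the Boolean field of each Tuple2 record flags the change: true for an upsert, false for a delete. A minimal sketch that simply prints the change stream (the class name is made up; note that the actual interface declares setIsAppendOnly with a boxed Boolean):

import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.typeutils.TupleTypeInfo;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.functions.sink.PrintSinkFunction;
import org.apache.flink.table.sinks.TableSink;
import org.apache.flink.table.sinks.UpsertStreamTableSink;
import org.apache.flink.types.Row;

public class PrintUpsertTableSink implements UpsertStreamTableSink<Row> {

    private String[] fieldNames;
    private TypeInformation<?>[] fieldTypes;
    private String[] keyFields;
    private boolean appendOnly;

    @Override
    public void setKeyFields(String[] keys) {
        // unique key of the result table; null if the result has no unique key
        this.keyFields = keys;
    }

    @Override
    public void setIsAppendOnly(Boolean isAppendOnly) {
        // true if the query only produces (true, row) insert messages
        this.appendOnly = Boolean.TRUE.equals(isAppendOnly);
    }

    @Override
    public TypeInformation<Row> getRecordType() {
        return Types.ROW_NAMED(fieldNames, fieldTypes);
    }

    @Override
    public TupleTypeInfo<Tuple2<Boolean, Row>> getOutputType() {
        return new TupleTypeInfo<>(Types.BOOLEAN, getRecordType());
    }

    @Override
    public String[] getFieldNames() {
        return fieldNames;
    }

    @Override
    public TypeInformation<?>[] getFieldTypes() {
        return fieldTypes;
    }

    @Override
    public TableSink<Tuple2<Boolean, Row>> configure(String[] fieldNames, TypeInformation<?>[] fieldTypes) {
        PrintUpsertTableSink copy = new PrintUpsertTableSink();
        copy.fieldNames = fieldNames;
        copy.fieldTypes = fieldTypes;
        return copy;
    }

    @Override
    public void emitDataStream(DataStream<Tuple2<Boolean, Row>> dataStream) {
        // prints records as (true, row) for upserts and (false, row) for deletes
        dataStream.addSink(new PrintSinkFunction<>());
    }
}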
A custom AppendStreamTableSink that writes the data to the console or to the log
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.functions.sink.PrintSinkFunction;
import org.apache.flink.streaming.api.functions.sink.SinkFunction;
import org.apache.flink.table.sinks.AppendStreamTableSink;
import org.apache.flink.table.sinks.BatchTableSink;
import org.apache.flink.table.sinks.TableSink;
import org.apache.flink.types.Row;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Printer sink for testing.
 *
 * @author hehuiyuan
 * @since 2019-07-16
 */
public class PrintTableSink implements AppendStreamTableSink<Row>, BatchTableSink<Row> {
private static final Logger logger = LoggerFactory.getLogger(PrintTableSink.class);
@Override
public void emitDataSet(DataSet<Row> dataSet) {
    try {
        // print() collects the DataSet and writes it to stdout
        dataSet.print();
    } catch (Exception e) {
        logger.error("Failed to print DataSet", e);
    }
}

private String[] fieldNames;
private TypeInformation<?>[] fieldTypes;

@Override
public TableSink<Row> configure(String[] fieldNames, TypeInformation<?>[] fieldTypes) {
    // return a configured copy; the original sink stays unmodified
    PrintTableSink configuredSink = new PrintTableSink();
    configuredSink.fieldNames = fieldNames;
    configuredSink.fieldTypes = fieldTypes;
    return configuredSink;
}
@Override
public void emitDataStream(DataStream<Row> dataStream) {
    // print to the console:
    //dataStream.print();
    //dataStream.addSink(new PrintSinkFunction<>());
    // write each record to the log:
    dataStream.addSink(new SinkFunction<Row>() {
        @Override
        public void invoke(Row value, Context context) throws Exception {
            logger.info(value.toString());
        }
    });
}
@Override
public String[] getFieldNames() {
return fieldNames;
}
@Override
public TypeInformation<?>[] getFieldTypes() {
return fieldTypes;
}
@Override
public TypeInformation<Row> getOutputType() {
return Types.ROW_NAMED(fieldNames,fieldTypes);
}
}
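A hypothetical end-to-end use of PrintTableSink with the pre-1.9 Table API used throughout this article (the table and field names are made up):

import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.java.StreamTableEnvironment;

public class PrintTableSinkDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tEnv = TableEnvironment.getTableEnvironment(env);

        // a tiny in-memory source table
        DataStream<Tuple2<String, Long>> words =
            env.fromElements(Tuple2.of("hello", 1L), Tuple2.of("world", 2L));
        tEnv.registerDataStream("source_table", words, "word, cnt");

        // register the sink together with the schema it accepts
        tEnv.registerTableSink(
            "printSink",
            new String[] {"word", "cnt"},
            new TypeInformation<?>[] {Types.STRING, Types.LONG},
            new PrintTableSink());

        tEnv.sqlQuery("SELECT word, cnt FROM source_table").insertInto("printSink");
        env.execute("print-sink-demo");
    }
}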
Defining a RetractStreamTableSink
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.typeutils.TupleTypeInfo;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.experimental.CollectSink;
import org.apache.flink.table.sinks.RetractStreamTableSink;
import org.apache.flink.table.sinks.TableSink;
import org.apache.flink.types.Row;
import java.net.InetAddress;
/**
* Table sink for collecting the results locally using sockets.
*/
public class CollectStreamTableSink implements RetractStreamTableSink<Row> {
private final InetAddress targetAddress;
private final int targetPort;
private final TypeSerializer<Tuple2<Boolean, Row>> serializer;
private String[] fieldNames;
private TypeInformation<?>[] fieldTypes;
public CollectStreamTableSink(InetAddress targetAddress, int targetPort, TypeSerializer<Tuple2<Boolean, Row>> serializer) {
this.targetAddress = targetAddress;
this.targetPort = targetPort;
this.serializer = serializer;
}
@Override
public String[] getFieldNames() {
return fieldNames;
}
@Override
public TypeInformation<?>[] getFieldTypes() {
return fieldTypes;
}
@Override
public TableSink<Tuple2<Boolean, Row>> configure(String[] fieldNames, TypeInformation<?>[] fieldTypes) {
final CollectStreamTableSink copy = new CollectStreamTableSink(targetAddress, targetPort, serializer);
copy.fieldNames = fieldNames;
copy.fieldTypes = fieldTypes;
return copy;
}
@Override
public TypeInformation<Row> getRecordType() {
return Types.ROW_NAMED(fieldNames, fieldTypes);
}
@Override
public void emitDataStream(DataStream<Tuple2<Boolean, Row>> stream) {
// add sink
stream
.addSink(new CollectSink<>(targetAddress, targetPort, serializer))
.name("SQL Client Stream Collect Sink")
.setParallelism(1);
}
@Override
public TupleTypeInfo<Tuple2<Boolean, Row>> getOutputType() {
return new TupleTypeInfo<>(Types.BOOLEAN, getRecordType());
}
}
The CollectSink function used by the table sink above, with the imports it needs:
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.memory.DataOutputViewStreamWrapper;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import java.io.IOException;
import java.io.OutputStream;
import java.net.InetAddress;
import java.net.Socket;
public class CollectSink<IN> extends RichSinkFunction<IN> {
private static final long serialVersionUID = 1L;
private final InetAddress hostIp;
private final int port;
private final TypeSerializer<IN> serializer;
private transient Socket client;
private transient OutputStream outputStream;
private transient DataOutputViewStreamWrapper streamWriter;
/**
* Creates a CollectSink that will send the data to the specified host.
*
* @param hostIp IP address of the Socket server.
* @param port Port of the Socket server.
* @param serializer A serializer for the data.
*/
public CollectSink(InetAddress hostIp, int port, TypeSerializer<IN> serializer) {
this.hostIp = hostIp;
this.port = port;
this.serializer = serializer;
}
@Override
public void invoke(IN value, Context context) throws Exception {
try {
serializer.serialize(value, streamWriter);
}
catch (Exception e) {
throw new IOException("Error sending data back to client (" + hostIp.toString() + ":" + port + ')', e);
}
}
/**
* Initialize the connection with the Socket in the server.
* @param parameters Configuration.
*/
@Override
public void open(Configuration parameters) throws Exception {
try {
client = new Socket(hostIp, port);
outputStream = client.getOutputStream();
streamWriter = new DataOutputViewStreamWrapper(outputStream);
}
catch (IOException e) {
throw new IOException("Cannot connect to the client to send back the stream", e);
}
}
/**
* Closes the connection with the Socket server.
*/
@Override
public void close() throws Exception {
try {
if (outputStream != null) {
outputStream.flush();
outputStream.close();
}
// first regular attempt to cleanly close. Failing that will escalate
if (client != null) {
client.close();
}
}
catch (Exception e) {
throw new IOException("Error while closing connection that streams data back to client at "
+ hostIp.toString() + ":" + port, e);
}
finally {
// if we failed prior to closing the client, close it
if (client != null) {
try {
client.close();
}
catch (Throwable t) {
// best effort to close, we do not care about an exception here any more
}
}
}
}
}
The CsvTableSource implementation
// CsvTableSource implements BatchTableSource for batch queries, StreamTableSource for streaming queries, and ProjectableTableSource for projection push-down
class CsvTableSource private (
private val path: String,
private val fieldNames: Array[String],
private val fieldTypes: Array[TypeInformation[_]],
private val selectedFields: Array[Int],
private val fieldDelim: String,
private val rowDelim: String,
private val quoteCharacter: Character,
private val ignoreFirstLine: Boolean,
private val ignoreComments: String,
private val lenient: Boolean)
extends BatchTableSource[Row]
with StreamTableSource[Row]
with ProjectableTableSource[Row] {
/**
 * @param path the path of the CSV file
 * @param fieldNames the names of the table fields
 * @param fieldTypes the types of the table fields
 * @param fieldDelim the field delimiter
 * @param rowDelim the row delimiter
 * @param quoteCharacter an optional quote character for String values, null by default
 * @param ignoreFirstLine flag to ignore the first line, e.g., a CSV header carrying the field names
 * @param ignoreComments an optional prefix to indicate comments, null by default
 * @param lenient flag to skip records that cannot be parsed instead of failing on them
 */
def this(
path: String,
fieldNames: Array[String],
fieldTypes: Array[TypeInformation[_]],
fieldDelim: String = CsvInputFormat.DEFAULT_FIELD_DELIMITER,
rowDelim: String = CsvInputFormat.DEFAULT_LINE_DELIMITER,
quoteCharacter: Character = null,
ignoreFirstLine: Boolean = false,
ignoreComments: String = null,
lenient: Boolean = false) = {
this(
path,
fieldNames,
fieldTypes,
fieldTypes.indices.toArray, // initially, all fields are returned
fieldDelim,
rowDelim,
quoteCharacter,
ignoreFirstLine,
ignoreComments,
lenient)
}
def this(path: String, fieldNames: Array[String], fieldTypes: Array[TypeInformation[_]]) = {
this(path, fieldNames, fieldTypes, CsvInputFormat.DEFAULT_FIELD_DELIMITER,
CsvInputFormat.DEFAULT_LINE_DELIMITER, null, false, null, false)
}
if (fieldNames.length != fieldTypes.length) {
throw new TableException("Number of field names and field types must be equal.")
}
private val selectedFieldTypes = selectedFields.map(fieldTypes(_))
private val selectedFieldNames = selectedFields.map(fieldNames(_))
private val returnType: RowTypeInfo = new RowTypeInfo(selectedFieldTypes, selectedFieldNames)
/**
* Returns the data of the table as a [[DataSet]] of [[Row]].
*/
override def getDataSet(execEnv: ExecutionEnvironment): DataSet[Row] = {
execEnv.createInput(createCsvInput(), returnType).name(explainSource())
}
/** Returns the [[RowTypeInfo]] for the return type of the [[CsvTableSource]]. */
override def getReturnType: RowTypeInfo = returnType
/**
* Returns the data of the table as a [[DataStream]] of [[Row]].
*/
override def getDataStream(streamExecEnv: StreamExecutionEnvironment): DataStream[Row] = {
streamExecEnv.createInput(createCsvInput(), returnType).name(explainSource())
}
/** Returns the schema of the produced table. */
override def getTableSchema = new TableSchema(fieldNames, fieldTypes)
/** Returns a copy of [[TableSource]] with ability to project fields */
override def projectFields(fields: Array[Int]): CsvTableSource = {
val selectedFields = if (fields.isEmpty) Array(0) else fields
new CsvTableSource(
path,
fieldNames,
fieldTypes,
selectedFields,
fieldDelim,
rowDelim,
quoteCharacter,
ignoreFirstLine,
ignoreComments,
lenient)
}
private def createCsvInput(): RowCsvInputFormat = {
val inputFormat = new RowCsvInputFormat(
new Path(path),
selectedFieldTypes,
rowDelim,
fieldDelim,
selectedFields)
inputFormat.setSkipFirstLineAsHeader(ignoreFirstLine)
inputFormat.setLenient(lenient)
if (quoteCharacter != null) {
inputFormat.enableQuotedStringParsing(quoteCharacter)
}
if (ignoreComments != null) {
inputFormat.setCommentPrefix(ignoreComments)
}
inputFormat
}
override def equals(other: Any): Boolean = other match {
case that: CsvTableSource => returnType == that.returnType &&
path == that.path &&
fieldDelim == that.fieldDelim &&
rowDelim == that.rowDelim &&
quoteCharacter == that.quoteCharacter &&
ignoreFirstLine == that.ignoreFirstLine &&
ignoreComments == that.ignoreComments &&
lenient == that.lenient
case _ => false
}
override def hashCode(): Int = {
returnType.hashCode()
}
override def explainSource(): String = {
s"CsvTableSource(" +
s"read fields: ${getReturnType.getFieldNames.mkString(", ")})"
}
}
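A hypothetical usage of CsvTableSource from Java (the file path and schema are made up). Because the source implements ProjectableTableSource, selecting a single column lets the planner call projectFields so that only that column is physically read:

import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.java.StreamTableEnvironment;
import org.apache.flink.table.sources.CsvTableSource;
import org.apache.flink.types.Row;

public class CsvTableSourceDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tEnv = TableEnvironment.getTableEnvironment(env);

        CsvTableSource csvSource = new CsvTableSource(
            "/tmp/words.csv",
            new String[] {"word", "cnt"},
            new TypeInformation<?>[] {Types.STRING, Types.LONG});
        tEnv.registerTableSource("csv_table", csvSource);

        // only "word" is selected, so the planner pushes the projection into the source
        Table result = tEnv.sqlQuery("SELECT word FROM csv_table");
        tEnv.toAppendStream(result, Row.class).print();
        env.execute("csv-source-demo");
    }
}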