1.11.2 flinksql自定义sls connector 连接器

背景

需要连接阿里日志服务sls,获取日志

阿里提供了对应的flink-log-connector(详见阿里云官方文档)

不过它提供的是flink stream api, 我们需要使用flinksql, 所以需要自定义编写对应的table source

注意 

flink-log-connector一次获取的是多条日志格式RawLogGroupList, 想要更好的结合flinksql还可能需要去修改对应的源码

编码

依赖

这里用的自己checkout源码打包的

<dependency>
    <groupId>com.aliyun.openservices</groupId>
    <artifactId>flink-log-connector</artifactId>
    <version>0.1.24-SNAPSHOT</version>
</dependency>

format

编写一个格式处理器(format), 负责把sls返回的日志反序列化成RowData

 SlsFormatFactory

/**
 * Format factory registered under the identifier {@code "sls"}.
 *
 * <p>It declares neither required nor optional options and always hands out a fresh
 * {@link SlsFormat}.
 */
public class SlsFormatFactory implements DecodingFormatFactory<LogDeserializationSchema<RowData>> {

    /** Identifier under which this format factory is registered. */
    @Override
    public String factoryIdentifier() {
        return "sls";
    }

    /** This format has no mandatory options. */
    @Override
    public Set<ConfigOption<?>> requiredOptions() {
        return Collections.emptySet();
    }

    /** This format has no optional options either; a new, empty, mutable set is returned. */
    @Override
    public Set<ConfigOption<?>> optionalOptions() {
        return new HashSet<>();
    }

    /**
     * Validates the format options with the Flink helper and creates the decoding format.
     *
     * @param context        table factory context (not used here)
     * @param readableConfig format options to validate
     * @return a new {@link SlsFormat}
     */
    @Override
    public DecodingFormat<LogDeserializationSchema<RowData>> createDecodingFormat(DynamicTableFactory.Context context, ReadableConfig readableConfig) {
        FactoryUtil.validateFactoryOptions(this, readableConfig);
        return new SlsFormat();
    }
}

SlsFormat

/**
 * Decoding format that builds the runtime {@link SlsDeserializer} for a given produced data type.
 */
public class SlsFormat implements DecodingFormat<LogDeserializationSchema<RowData>> {

    /** Only INSERT rows are declared; no other row kind is added to the changelog mode. */
    @Override
    public ChangelogMode getChangelogMode() {
        return ChangelogMode.newBuilder().addContainedKind(RowKind.INSERT).build();
    }

    /**
     * Creates the deserializer that is handed to the SLS consumer.
     *
     * @param context  planner context used to derive type information and the data structure converter
     * @param dataType data type of the rows the source produces
     * @return a new {@link SlsDeserializer}
     */
    @Override
    public LogDeserializationSchema<RowData> createRuntimeDecoder(DynamicTableSource.Context context, DataType dataType) {
        // Type information reported by the deserializer for the rows it produces.
        final TypeInformation<RowData> rowTypeInfo =
                (TypeInformation<RowData>) context.createTypeInformation(dataType);

        // The deserializer builds external Row objects; this converter turns them into internal RowData.
        final DynamicTableSource.DataStructureConverter rowConverter =
                context.createDataStructureConverter(dataType);

        // Logical types of the row's fields.
        final List<LogicalType> fieldTypes = dataType.getLogicalType().getChildren();

        return new SlsDeserializer(fieldTypes, rowConverter, rowTypeInfo);
    }
}

SlsDeserializer

/**
 * Deserializer that packs one fetched batch of SLS log groups into a single {@link RowData}.
 *
 * <p>Every log of every group becomes a key/value map; the list of maps is serialized to one
 * JSON string and stored in field 0 of the produced row. All other fields of the row are left
 * unset.
 */
public class SlsDeserializer implements LogDeserializationSchema<RowData> {
    private final List<LogicalType> parsingTypes;
    private final DynamicTableSource.DataStructureConverter converter;
    private final TypeInformation<RowData> producedTypeInfo;

    /**
     * @param parsingTypes     logical types of the produced row's fields; only the count is used
     * @param converter        converts the external {@link Row} into the internal {@link RowData}
     * @param producedTypeInfo type information returned by {@link #getProducedType()}
     */
    public SlsDeserializer(List<LogicalType> parsingTypes, DynamicTableSource.DataStructureConverter converter, TypeInformation<RowData> producedTypeInfo) {
        this.parsingTypes = parsingTypes;
        this.converter = converter;
        this.producedTypeInfo = producedTypeInfo;
    }

    @Override
    public TypeInformation<RowData> getProducedType() {
        return producedTypeInfo;
    }

    /**
     * Converts the log groups received by the SLS consumer into one row whose first field is the
     * JSON array of all contained logs.
     *
     * @param logGroups log groups of one fetch
     * @return an INSERT row with the JSON string in field 0
     */
    @Override
    public RowData deserialize(List<LogGroupData> logGroups) {
        // Flatten all groups into one list holding a key -> value map per log.
        List<Map<String, String>> logs = logGroups.stream()
                .map(LogGroupData::GetFastLogGroup)
                .map(FastLogGroup::getLogs)
                .flatMap(Collection::stream)
                .map(fastLog -> {
                    int count = fastLog.getContentsCount();
                    Map<String, String> log = new HashMap<>();
                    for (int cIdx = 0; cIdx < count; ++cIdx) {
                        FastLogContent content = fastLog.getContents(cIdx);
                        log.put(content.getKey(), content.getValue());
                    }
                    return log;
                })
                .collect(Collectors.toList());

        // The row is sized after the table schema, but only field 0 carries data.
        final Row row = new Row(RowKind.INSERT, parsingTypes.size());
        row.setField(0, JSONObject.toJSONString(logs));
        return (RowData) converter.toInternal(row);
    }
}

flinksql connector

SlsDynamicTableSourceFactory

/**
 * Table source factory registered under the connector identifier {@code "sls"}.
 *
 * <p>All options declared below are required; there are no optional options.
 */
public class SlsDynamicTableSourceFactory implements DynamicTableSourceFactory {

    public static final ConfigOption<String> PROJECT = ConfigOptions.key("project").stringType().noDefaultValue();
    public static final ConfigOption<String> ACCESS_ID = ConfigOptions.key("access.id").stringType().noDefaultValue();
    public static final ConfigOption<String> ACCESS_KEY = ConfigOptions.key("access.key").stringType().noDefaultValue();
    public static final ConfigOption<String> ENDPOINT = ConfigOptions.key("endpoint").stringType().noDefaultValue();
    public static final ConfigOption<String> LOGSTORE = ConfigOptions.key("logstore").stringType().noDefaultValue();
    public static final ConfigOption<String> CONSUMER_BEGINPOSITION = ConfigOptions.key("consumer.beginposition").stringType().noDefaultValue();
    public static final ConfigOption<String> FORMAT = ConfigOptions.key("format").stringType().noDefaultValue();

    public SlsDynamicTableSourceFactory() {}

    /** Identifier under which this connector factory is registered. */
    @Override
    public String factoryIdentifier() {
        return "sls";
    }

    /** Every declared option, including {@code format}, is mandatory. */
    @Override
    public Set<ConfigOption<?>> requiredOptions() {
        Set<ConfigOption<?>> options = new HashSet<>();
        options.add(PROJECT);
        options.add(ACCESS_ID);
        options.add(ACCESS_KEY);
        options.add(ENDPOINT);
        options.add(LOGSTORE);
        options.add(CONSUMER_BEGINPOSITION);
        options.add(FORMAT);
        return options;
    }

    @Override
    public Set<ConfigOption<?>> optionalOptions() {
        return new HashSet<>();
    }

    /**
     * Discovers the decoding format, validates the options and builds the table source.
     *
     * @param context factory context giving access to the catalog table and its options
     * @return a new {@link SlsDynamicTableSource}
     */
    @Override
    public DynamicTableSource createDynamicTableSource(Context context) {
        final FactoryUtil.TableFactoryHelper helper = FactoryUtil.createTableFactoryHelper(this, context);

        // The format has to be discovered before validate() is called.
        final DecodingFormat<LogDeserializationSchema<RowData>> decodingFormat = helper.discoverDecodingFormat(
                DecodingFormatFactory.class,
                FactoryUtil.FORMAT);

        helper.validate();

        final ReadableConfig options = helper.getOptions();
        final String project = options.get(PROJECT);
        final String accessId = options.get(ACCESS_ID);
        final String accessKey = options.get(ACCESS_KEY);
        final String endpoint = options.get(ENDPOINT);
        final String logstore = options.get(LOGSTORE);
        final String consumerBeginposition = options.get(CONSUMER_BEGINPOSITION);

        // Derive the produced data type (excluding computed columns) from the catalog table.
        final TableSchema schema = context.getCatalogTable().getSchema();
        final DataType producedDataType = schema.toPhysicalRowDataType();

        return new SlsDynamicTableSource(project, accessId, accessKey, endpoint, logstore, consumerBeginposition,
                decodingFormat, producedDataType, schema);
    }
}

SlsDynamicTableSource

/**
 * Scan table source that reads from one or more SLS logstores through {@link FlinkLogConsumer}.
 *
 * <p>The source is insert-only and is registered as an unbounded source function.
 */
public class SlsDynamicTableSource implements ScanTableSource {
    private final String project;
    private final String accessId;
    private final String accessKey;
    private final String endpoint;
    private final String logstore;
    private final String consumerBeginposition;
    private final DecodingFormat<LogDeserializationSchema<RowData>> decodingFormat;
    private final DataType producedDataType;
    private final TableSchema schema;

    /**
     * @param project               SLS project name
     * @param accessId              access key id
     * @param accessKey             access key secret
     * @param endpoint              SLS endpoint
     * @param logstore              logstore name, or several names separated by commas
     * @param consumerBeginposition start position handed to the consumer unchanged
     * @param decodingFormat        format used to create the runtime deserializer
     * @param producedDataType      data type of the produced rows
     * @param schema                table schema; stored and passed on by {@link #copy()}
     */
    public SlsDynamicTableSource(String project, String accessId, String accessKey, String endpoint, String logstore, String consumerBeginposition,
                                 DecodingFormat<LogDeserializationSchema<RowData>> decodingFormat, DataType producedDataType,
                                 TableSchema schema
    ) {
        this.project = project;
        this.accessId = accessId;
        this.accessKey = accessKey;
        this.endpoint = endpoint;
        this.logstore = logstore;
        this.consumerBeginposition = consumerBeginposition;
        this.decodingFormat = decodingFormat;
        this.producedDataType = producedDataType;
        this.schema = schema;
    }

    /** Only INSERT rows are produced. */
    @Override
    public ChangelogMode getChangelogMode() {
        return ChangelogMode.newBuilder()
                .addContainedKind(RowKind.INSERT)
                .build();
    }

    /**
     * Creates the runtime classes that are shipped to the cluster.
     *
     * @param scanContext planner context passed on to the decoding format
     * @return a provider wrapping the SLS consumer, flagged as unbounded
     */
    @Override
    public ScanRuntimeProvider getScanRuntimeProvider(ScanContext scanContext) {
        final LogDeserializationSchema<RowData> deserializer = decodingFormat.createRuntimeDecoder(
                scanContext,
                producedDataType);

        // Several logstores may be given as a comma separated list.
        final List<String> logstores = Arrays.asList(this.logstore.split(","));

        final Properties slsProperties = new Properties();
        // Endpoint of the log service.
        slsProperties.put(ConfigConstants.LOG_ENDPOINT, this.endpoint);
        // Access key id / secret.
        slsProperties.put(ConfigConstants.LOG_ACCESSSKEYID, this.accessId);
        slsProperties.put(ConfigConstants.LOG_ACCESSKEY, this.accessKey);
        // Start position: begin_cursor, end_cursor, a unix timestamp or consumer_from_checkpoint.
        slsProperties.put(ConfigConstants.LOG_CONSUMER_BEGIN_POSITION, this.consumerBeginposition);
        // Consumer group, fetch interval, max logs per fetch, checkpoint mode and commit interval
        // are not set here.

        final FlinkLogConsumer<RowData> flinkLogConsumer =
                new FlinkLogConsumer<>(project, logstores, deserializer, slsProperties);
        // false = unbounded source.
        return SourceFunctionProvider.of(flinkLogConsumer, false);
    }

    /**
     * Returns a copy that carries the same configuration, including the decoding format, so the
     * copy can still create its runtime provider.
     */
    @Override
    public DynamicTableSource copy() {
        return new SlsDynamicTableSource(project, accessId, accessKey, endpoint, logstore, consumerBeginposition,
                decodingFormat, producedDataType, schema);
    }

    @Override
    public String asSummaryString() {
        return "sls Table Source";
    }

}

流程

FlinkLogConsumer接收到的数据格式是List<LogGroupData>, 然后会进入我们写的SlsDeserializer的deserialize方法, 在这个方法中我把List<LogGroupData>转成一个RowData, RowData只有第一个字段有值, 这个字段的内容是这一批所有日志的json数组字符串.

到了这里其实就可以接收数据了, 在ddl创建source的时候, 每个RowData的第一个字段是一组log的json字符串, 后面可能需要使用udtf解析每个RowData.

比如下面的sql

CREATE FUNCTION ParseUriRow AS 'flinksql.function.udtf.ParseUriRow';
-- Each source row has a single STRING column holding the JSON array of one batch of logs.
-- Option keys match the ones declared by SlsDynamicTableSourceFactory; 'format' is required too.
CREATE TABLE sourceTable (
    request_uri STRING
) WITH (
    'connector' = 'sls',
    'format' = 'sls',
    'endpoint' = '',
    'project' = '',
    'access.id' = '',
    'access.key' = '',
    'logstore' = '',
    'consumer.beginposition' = '1585670400'
);
CREATE TABLE sinktable (
    platform STRING,
    aaaa STRING,
    bbbb STRING
) WITH (
    'connector' = 'print'
);
-- NOTE(review): assumes ParseUriRow emits three columns (platform, aaaa, bbbb) - confirm against the UDTF.
insert into sinktable
 select
    platform,
    aaaa,
    bbbb
 from sourceTable, LATERAL TABLE(ParseUriRow(request_uri)) as T(
        platform,
        aaaa,
        bbbb
     )
 where 1 = POSITION('/log.gif?' IN request_uri);

不过这样的话,每次都要写udtf

所以这里我修改了一下, checkout 阿里云的flink-log-connector源码

https://github.com/aliyun/aliyun-log-flink-connector.git

修改com.aliyun.openservices.log.flink.model.LogDataFetcher的emitRecordAndUpdateState

在这个方法里面解析了数据返回多个RowData

/**
 * Splits one batch record into one row per log, emits the rows and then advances the shard state.
 *
 * <p>Field 0 of {@code record} is expected to hold a JSON array of log objects. For every log a
 * row is built with one string field per produced field name, looked up in the log by that name;
 * a key that is absent from the log yields a null field.
 *
 * @param record          batch record whose first field is the JSON array string
 * @param recordTimestamp timestamp attached to every emitted row
 * @param shardStateIndex index of the shard state to update
 * @param cursor          new offset stored in the shard state
 */
void emitRecordAndUpdateState(T record, long recordTimestamp, int shardStateIndex, String cursor) {
        synchronized (checkpointLock) {
            // toString() works no matter how the string data is backed internally.
            final String batchJson = ((GenericRowData) record).getString(0).toString();
            final JSONArray logs = JSONObject.parseArray(batchJson);

            // The produced field names do not change per log, so resolve them once.
            final RowDataTypeInfo rowTypeInfo = (RowDataTypeInfo) deserializer.getProducedType();
            final String[] fieldNames = rowTypeInfo.getFieldNames();

            for (int i = 0; i < logs.size(); i++) {
                final JSONObject log = logs.getJSONObject(i);
                final GenericRowData oneRow = new GenericRowData(fieldNames.length);
                for (int j = 0; j < fieldNames.length; j++) {
                    // fromString returns null for a missing key instead of wrapping a null string.
                    oneRow.setField(j, BinaryStringData.fromString(log.getString(fieldNames[j])));
                }
                sourceContext.collectWithTimestamp((T) oneRow, recordTimestamp);
            }

            // Record the new offset only after all rows of the batch were emitted.
            LogstoreShardState state = subscribedShardsState.get(shardStateIndex);
            state.setOffset(cursor);
            if (state.hasMoreData()) {
                return;
            }
            if (this.numberOfActiveShards.decrementAndGet() == 0) {
                LOG.info("Subtask {} has reached the end of all currently subscribed shards; marking the subtask as temporarily idle ...",
                        indexOfThisSubtask);
                sourceContext.markAsTemporarilyIdle();
            }
        }
    }

最终使用类似如下

-- Column names are used as the keys looked up in every log, so request_uri must be declared
-- for the WHERE clause below to resolve.
-- Option keys match the ones declared by SlsDynamicTableSourceFactory; 'format' is required too.
CREATE TABLE sourceTable (
    request_uri STRING,
    platform STRING,
    aaaa STRING,
    bbbb STRING
) WITH (
    'connector' = 'sls',
    'format' = 'sls',
    'endpoint' = '',
    'project' = '',
    'access.id' = '',
    'access.key' = '',
    'logstore' = '',
    'consumer.beginposition' = '1585670400'
);
CREATE TABLE sinktable (
    platform STRING,
    aaaa STRING,
    bbbb STRING
) WITH (
    'connector' = 'print'
);
insert into sinktable
 select
    platform,
    aaaa,
    bbbb
 from sourceTable
 where 1 = POSITION('/log.gif?' IN request_uri);
 

 

©️2020 CSDN 皮肤主题: 大白 设计师:CSDN官方博客 返回首页